From c3c1ffc3a5946d4fba3876ba2e1b0a5d8c0dcf84 Mon Sep 17 00:00:00 2001 From: ryx <2736755949@qq.com> Date: Sat, 27 Dec 2025 00:50:59 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20=E6=96=87=E6=9C=AC?= =?UTF-8?q?=E7=9B=B8=E4=BC=BC=E5=BA=A6=E9=97=AE=E9=A2=98.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 文本相似度问题.py | 663 ---------------------------------------------- 1 file changed, 663 deletions(-) delete mode 100644 文本相似度问题.py diff --git a/文本相似度问题.py b/文本相似度问题.py deleted file mode 100644 index 384628b..0000000 --- a/文本相似度问题.py +++ /dev/null @@ -1,663 +0,0 @@ -import os#用于处理文件和目录路径 -import math -import tkinter as tk#用于创建图形用户界面(GUI) -from tkinter import ttk, filedialog, messagebox, scrolledtext -#提供提供现代风格的 GUI 组件,用于打开文件或文件夹选择对话框,用于显示消息框,用于创建带滚动条的文本框 -from collections import defaultdict, Counter -#当访问不存在的键时,会自动创建一个默认值。用于统计元素的出现次数 -from tkinter import font#字体 -import nltk#自然语言处理工具包 -from nltk.corpus import stopwords#用于获取停用词列表 -import re#正则表达式模块,用于文本匹配和处理 -from pyecharts import options as opts#用于创建交互式图表,这里用于生成词云,options:用于设置图表的选项 -from pyecharts.charts import WordCloud#用于生成词云图 -import webbrowser#用于在浏览器中打开文件 -import tempfile#用于创建临时文件 -import matplotlib.pyplot as plt#用于绘制图表,热力图和柱状图 -import numpy as np#用于处理数组和矩阵 -from matplotlib.colors import LinearSegmentedColormap#用于创建自定义颜色映射 -plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]#设置 matplotlib 使用的字体,确保中文能够正常显示 -plt.rcParams["axes.unicode_minus"] = False # 解决负号显示问题 -nltk.download('stopwords')# 下载停用词所需数据 -nltk.download('averaged_perceptron_tagger')#下载词性标注所需的数据 -# 获取英文停用词列表 -ENGLISH_STOPWORDS = set(stopwords.words('english')) -# 用于匹配数词的正则表达式 -NUMBER_REGEX = re.compile(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?%?$') -# 英文数词列表 -ENGLISH_NUMBERS = { - 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', - 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', - 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', - 'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million', - 'billion', 'trillion' -}#数词列表 -CUSTOM_STOPWORDS = { - "year", "years", "month", "months", "day", "days", - "whose", "who", "what", "when", "where", "why", "how", -}#停用词列表 -# 过滤逻辑函数 -def should_include_term(term):#过滤不需要的单词 - term_lower = term.lower()#把单词都转化成小写 - if term_lower in ENGLISH_STOPWORDS or term_lower in CUSTOM_STOPWORDS:#检查这些单词是否在英文停用词列表里 - return False - if NUMBER_REGEX.match(term) or term_lower in ENGLISH_NUMBERS:#检查这些单词是否在数词列表 - return False - if len(term) <= 2:#小于等于两个字母的排除 - return False - return True -#定义加载文档函数 -def load_documents(folder_path): - documents = {}#初始化字典存储每个文件的单词列表 - for root, dirs, files in os.walk(folder_path):#遍历指定文件夹及其子文件夹中的所有文件,os.walk(folder_path) 是 Python 内置的文件夹遍历函数 - #每次循环返回一个元组 (root, dirs, files) - #dirs:当前 root 文件夹下的所有子文件夹名称(列表类型) - for file in files: - if file.endswith(('.txt', '.csv', '.json')): - file_path = os.path.join(root, file)## 拼接文件夹路径和文件名,得到文件的绝对路径 - try: - with open(file_path, 'r', encoding='utf-8') as f: - lines = f.readlines() - words = []#将处理后的单词添加到 words 列表中 - for line_num, line in enumerate(lines, start=1): - line_words = preprocess_text(line)#调用函数进行预处理 - for word in line_words: - words.append(word) - documents[file] = words#将处理后的单词列表存储到字典中 - except Exception as e: - print(f"读取{file_path}错误: {e}")#如果读取文件时出现错误,打印错误信息 - return documents - -# 文本预处理函数 -def preprocess_text(text): - text = text.lower() - words = re.findall(r'\w+', text)#使用正则表达式 \w+ 提取文本中的所有单词,存储在 words 列表中 - return [word for word in words if should_include_term(word)]#判断每个单词是否应该被包含,返回过滤后的单词列表 -#计算相似度矩阵函数 -def compute_similarity_matrix(documents):#用于计算文本相似度矩阵 - term_vectors = {file: Counter(words) for file, words in documents.items()}#计算每个文件的词频向量。 - similarity_matrix = defaultdict(dict)#初始化相似度矩阵 - filenames = list(documents.keys())#获取所有文件名 - for i, file1 in enumerate(filenames):#双重循环遍历所有文件对 - for j, file2 in enumerate(filenames): - if i == j: - similarity_matrix[file1][file2] = 1.0#如果是同一个文件,相似度为 1.0 - else: - similarity_matrix[file1][file2] = cosine_similarity(term_vectors[file1], term_vectors[file2])#调用 cosine_similarity 函数计算两个文件的余弦相似度 - return similarity_matrix, term_vectors#返回相似度和词频 - -#余弦相似度计算 -def cosine_similarity(vec1, vec2): - intersection = set(vec1.keys()) & set(vec2.keys())#计算两个向量的交集 - numerator = sum(vec1[x] * vec2[x] for x in intersection)#计算分子 - sum1 = sum(v ** 2 for v in vec1.values())#计算两个向量的模 - sum2 = sum(v ** 2 for v in vec2.values()) - denominator = math.sqrt(sum1) * math.sqrt(sum2)#计算分母 - return numerator / denominator if denominator else 0.0#如果分母不为零,则返回余弦相似度;否则返回 0.0 -#共同词汇查询函数 -def get_common_terms(vec1, vec2): - return set(vec1.keys()) & set(vec2.keys())#计算两个向量的交集,共同词汇 -# TOP10高频词获取函数 -def get_top10_words(words): - counter = Counter(words)#全部计数 - return [word for word, _ in counter.most_common(10)]#返回前十 -#TOP10查重矩阵 -def compute_top10_similarity(documents): - """计算基于TOP10高频词的相似度矩阵""" - top10_words_dict = {file: get_top10_words(words) for file, words in documents.items()}#计算每个文件的 TOP10 高频词 - similarity_matrix = defaultdict(dict)#初始化相似度矩阵 - filenames = list(documents.keys())#获取所有文件名 - for i, file1 in enumerate(filenames): - for j, file2 in enumerate(filenames): - if i == j: - similarity_matrix[file1][file2] = 1.0 - else: - # 计算两个文件TOP10高频词的交集比例 - common_words = set(top10_words_dict[file1]) & set(top10_words_dict[file2]) - similarity_matrix[file1][file2] = len(common_words) / 10.0#计算相似度 - return similarity_matrix, top10_words_dict -#窗口GUI设计 -class TextSimilarityAnalyzer: - def __init__(self, root): - self.root = root#保存主窗口对象 - self.root.title("文本相似度分析工具")#设置窗口标题 - self.root.geometry("1000x800")#尺寸 - self.root.resizable(True, True)#允许窗口调整大小 - self.default_font = font.nametofont("TkDefaultFont") - self.default_font.configure(family="SimHei", size=10) - self.root.option_add("*Font", self.default_font) - - # 初始化所有实例变量(修复外部定义错误) - self.documents = {} - self.similarity_matrix = {} - self.term_vectors = {} - self.filenames = [] - self.word_positions = {} - self.top10_similarity_matrix = {} - self.top10_words_dict = {} - - # 界面控件变量(统一在__init__中定义) - self.folder_path_var = tk.StringVar() # 修复实例特性在外部定义的错误 - self.base_file_var = tk.StringVar() # 修复实例特性在外部定义的错误 - self.file1_var = tk.StringVar() - self.file2_var = tk.StringVar() - - self.create_widgets() - - def create_widgets(self): - main_frame = ttk.Frame(self.root, padding="10") - main_frame.pack(fill=tk.BOTH, expand=True) - - # 顶部文件选择区 - top_frame = ttk.LabelFrame(main_frame, text="文件选择", padding="10") - top_frame.pack(fill=tk.X, pady=(0, 10)) - ttk.Label(top_frame, text="文件夹路径:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W) - ttk.Entry(top_frame, textvariable=self.folder_path_var, width=60).grid(row=0, column=1, padx=5, pady=5) - ttk.Button(top_frame, text="浏览...", command=self.browse_folder).grid(row=0, column=2, padx=5, pady=5) - ttk.Button(top_frame, text="分析", command=self.analyze_files, style='Accent.TButton').grid(row=0, column=3, - padx=5, pady=5) - ttk.Button(top_frame, text="查看热力图", command=self.show_heatmap, style='Accent.TButton').grid(row=0, - column=4, - padx=5, pady=5) - - # TOP10高频词查重功能UI - top10_frame = ttk.LabelFrame(main_frame, text="TOP10高频词预查重", padding="10") - top10_frame.pack(fill=tk.X, pady=(0, 10)) - ttk.Label(top10_frame, text="选择基准文件:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W) - self.base_file_combo = ttk.Combobox(top10_frame, textvariable=self.base_file_var, state="disabled", width=40) - self.base_file_combo.grid(row=0, column=1, padx=5, pady=5) - ttk.Button(top10_frame, text="查看TOP10高频词", command=self.show_top10_words, style='Accent.TButton').grid( - row=0, column=2, padx=5, pady=5) - ttk.Button(top10_frame, text="生成预查重报告", command=self.show_top10_similarity, style='Accent.TButton').grid( - row=0, - column=3, - padx=5, - pady=5) - - # 中间相似度矩阵区 - middle_frame = ttk.LabelFrame(main_frame, text="相似度矩阵", padding="10") - middle_frame.pack(fill=tk.BOTH, expand=True, pady=(0, 10)) - columns = ["File"] - self.similarity_tree = ttk.Treeview(middle_frame, columns=columns, show="headings") - self.similarity_tree.heading("File", text="文件") - self.similarity_tree.column("File", width=150, anchor=tk.CENTER) - tree_scroll_y = ttk.Scrollbar(middle_frame, orient=tk.VERTICAL, command=self.similarity_tree.yview) - tree_scroll_x = ttk.Scrollbar(middle_frame, orient=tk.HORIZONTAL, command=self.similarity_tree.xview) - self.similarity_tree.configure(yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set) - tree_scroll_y.pack(side=tk.RIGHT, fill=tk.Y) - tree_scroll_x.pack(side=tk.BOTTOM, fill=tk.X) - self.similarity_tree.pack(fill=tk.BOTH, expand=True) - - # 底部公共词汇查询区 - bottom_frame = ttk.LabelFrame(main_frame, text="公共词汇查询", padding="10") - bottom_frame.pack(fill=tk.X, pady=(0, 10)) - ttk.Label(bottom_frame, text="选择两个文件:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W) - self.file1_combo = ttk.Combobox(bottom_frame, textvariable=self.file1_var, state="disabled", width=30) - self.file1_combo.grid(row=0, column=1, padx=5, pady=5) - ttk.Label(bottom_frame, text="和").grid(row=0, column=2, padx=5, pady=5) - self.file2_combo = ttk.Combobox(bottom_frame, textvariable=self.file2_var, state="disabled", width=30) - self.file2_combo.grid(row=0, column=3, padx=5, pady=5) - ttk.Button(bottom_frame, text="查询公共词汇", command=self.query_common_terms, style='Accent.TButton').grid( - row=0, column=4, padx=5, pady=5) - ttk.Button(bottom_frame, text="查看词云", command=self.show_wordcloud, style='Accent.TButton').grid(row=0, - column=5, - padx=5, - pady=5) - - # 结果显示区 - result_frame = ttk.LabelFrame(main_frame, text="公共词汇结果", padding="10") - result_frame.pack(fill=tk.BOTH, expand=True) - self.result_text = scrolledtext.ScrolledText(result_frame, wrap=tk.WORD, height=6, font=('SimHei', 14)) - self.result_text.pack(fill=tk.BOTH, expand=True) - - # 样式设置 - style = ttk.Style() - style.configure('Accent.TButton', font=('SimHei', 10, 'bold')) - style.configure('Treeview', rowheight=25, font=('SimHei', 14), background='white', borderwidth=1) - style.configure('Treeview.Heading', font=('SimHei', 14, 'bold'), borderwidth=1) - style.layout('Treeview', [('Treeview.treearea', {'sticky': 'nswe'})]) - - def browse_folder(self): - folder_path = filedialog.askdirectory()#打开文件夹选择对话框 - if folder_path: - self.folder_path_var.set(folder_path)#将选择的文件夹路径设置到 folder_path_var 中 - - def analyze_files(self): - folder_path = self.folder_path_var.get()#获取选择的文件夹路径 - if not folder_path or not os.path.isdir(folder_path):#检查文件夹路径是否有效 - messagebox.showerror("错误", "请选择有效的文件夹路径") - return - for item in self.similarity_tree.get_children():#清空相似度矩阵树形视图 - self.similarity_tree.delete(item) - self.documents= load_documents(folder_path)#加载指定文件夹中的文本文件 - if not self.documents:#检查是否找到文本文件 - messagebox.showerror("错误", "未找到文本文件") - return - - # 计算文本相似度矩阵 - self.similarity_matrix, self.term_vectors = compute_similarity_matrix(self.documents) - - # 计算TOP10高频词相似度矩阵 - self.top10_similarity_matrix, self.top10_words_dict = compute_top10_similarity(self.documents) - - self.filenames = sorted(list(self.documents.keys()), - key=lambda s: [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)]) - columns = ["File"] + self.filenames - self.similarity_tree.configure(columns=columns)#设置相似度矩阵树形视图的列 - for col in self.filenames: - self.similarity_tree.heading(col, text=col) - self.similarity_tree.column(col, width=100, anchor=tk.CENTER) - for file1 in self.filenames: - values = [file1] + [f"{self.similarity_matrix[file1][file2]:.4f}" for file2 in self.filenames] - self.similarity_tree.insert("", tk.END, values=values)#将相似度矩阵插入到树形视图中 - self.file1_combo['values'] = self.filenames#设置下拉框的选项 - self.file2_combo['values'] = self.filenames - self.file1_combo['state'] = 'readonly' - self.file2_combo['state'] = 'readonly'#设置下拉框为只读模式 - - # 设置TOP10高频词下拉菜单 - self.base_file_combo['values'] = self.filenames#设置 TOP10 高频词下拉菜单的选项和状态 - self.base_file_combo['state'] = 'readonly' - if self.filenames: - self.file1_var.set(self.filenames[0]) - self.file2_var.set(self.filenames[1]) - self.base_file_var.set(self.filenames[0]) - - messagebox.showinfo("成功", f"分析完成,共处理 {len(self.filenames)} 个文件") - - def query_common_terms(self): - file1, file2 = self.file1_var.get(), self.file2_var.get() - if not file1 or not file2 or file1 == file2: - messagebox.showerror("错误", "请选择不同的两个文件") - return - if file1 not in self.term_vectors or file2 not in self.term_vectors: - messagebox.showerror("错误", "文件未分析") - return - common_terms = get_common_terms(self.term_vectors[file1], self.term_vectors[file2]) - filtered = [term for term in common_terms if should_include_term(term)] - self.result_text.delete(1.0, tk.END) - if not filtered: - self.result_text.insert(tk.END, f"{file1} 和 {file2} 无公共词汇") - return - self.result_text.insert(tk.END, f"{file1} 和 {file2} 的公共词汇 ({len(filtered)} 个):\n\n") - grouped = defaultdict(list) - for term in sorted(filtered): - clean = term.strip('"\'') - grouped[clean[0].upper() if clean else '#'].append(clean) - for initial in sorted(grouped.keys()): - terms = grouped[initial] - for i in range(0, len(terms), 10): - self.result_text.insert(tk.END, f"{initial}: {' '.join(terms[i:i + 10])}\n")#每行显示十个 - - def show_wordcloud(self): - """生成类似图片的紧凑词云(公共词汇)""" - file1, file2 = self.file1_var.get(), self.file2_var.get() - if not file1 or not file2 or file1 == file2: - messagebox.showerror("错误", "请选择不同的两个文件") - return - if file1 not in self.term_vectors or file2 not in self.term_vectors: - messagebox.showerror("错误", "文件未分析") - return - # 获取过滤后的公共词汇及词频 - common_terms = get_common_terms(self.term_vectors[file1], self.term_vectors[file2]) - filtered_terms = [term for term in common_terms if should_include_term(term)] - if not filtered_terms: - messagebox.showinfo("提示", "无符合条件的公共词汇") - return - word_freq = {term: min(self.term_vectors[file1][term], self.term_vectors[file2][term]) for term in - filtered_terms} - # 使用 pyecharts 生成词云 - words = [(word, freq) for word, freq in word_freq.items()] - ( - WordCloud() - .add("", words, word_size_range=[20, 100]) - .set_global_opts( - title_opts=opts.TitleOpts(title=f"{file1} 与 {file2} 的公共词汇词云"), - tooltip_opts=opts.TooltipOpts(is_show=True) - ) - .render("wordcloud_interactive.html") - ) - webbrowser.open_new_tab("wordcloud_interactive.html") - - def show_heatmap(self):#self 同样用于访问实例的属性和调用其他实例方法 - if not self.similarity_matrix: - messagebox.showerror("错误", "请先分析文件") - return - matrix_size = len(self.filenames) - heatmap_matrix = np.zeros((matrix_size, matrix_size)) - for i, file1 in enumerate(self.filenames): - for j, file2 in enumerate(self.filenames): - heatmap_matrix[i, j] = self.similarity_matrix[file1][file2] - fig, ax = plt.subplots(figsize=(14, 10)) - cmap = LinearSegmentedColormap.from_list("custom_cmap", ["blue", "white", "red"]) - im = ax.imshow(heatmap_matrix, cmap=cmap) - ax.set_xticks(np.arange(matrix_size)) - ax.set_yticks(np.arange(matrix_size)) - ax.set_xticklabels(self.filenames, rotation=45, ha="right", fontsize=10) - ax.set_yticklabels(self.filenames, fontsize=10) - for i in range(matrix_size): - for j in range(matrix_size): - ax.text(j, i, f"{heatmap_matrix[i, j]:.0%}", ha="center", va="center", - color="black" if heatmap_matrix[i, j] < 0.5 else "white", fontsize=16) - cbar = ax.figure.colorbar(im, ax=ax) - cbar.ax.set_ylabel("相似度", rotation=-90, va="bottom", fontsize=12) - ax.set_title("文本相似度热力图", fontsize=16) - plt.tight_layout() - plt.show() - - # TOP10高频词相关功能 - def show_top10_words(self): - file = self.base_file_var.get() - if not file or file not in self.documents: - messagebox.showerror("错误", "请选择有效的文件") - return - # 严格过滤后重新统计词频 - filtered_words = [word for word in self.documents[file] if should_include_term(word)] - word_counts = Counter(filtered_words) - top10 = word_counts.most_common(10) - words = [w for w, _ in top10] - counts = [c for _, c in top10] - # 优化图表大小和布局 - plt.figure(figsize=(14, 8)) - # 根据词频设置颜色 - colors = [] - for count in counts: - if count >= 15: - colors.append('#FF4136') # 红:词频>=15 - elif count >= 10: - colors.append('#FFDC00') # 黄:10<=词频<15 - else: - colors.append('#0074D9') # 蓝:词频<10 - bars = plt.bar(words, counts, color=colors) - # 添加网格线增强可读性 - plt.grid(axis='y', linestyle='--', alpha=0.7) - # 调整x轴标签旋转角度和字体大小 - plt.xticks(rotation=30, ha='right', fontsize=14) - # 优化数值标注位置和样式 - for bar, count in zip(bars, counts): - height = bar.get_height() - plt.text( - bar.get_x() + bar.get_width() / 2, - height + max(0.02 * max(counts), 0.3), # 自适应位置 - f'{count}', - ha='center', - va='bottom', - fontsize=14, - fontweight='bold', - color='darkslategrey' - ) - - # 添加颜色图例说明 - from matplotlib.patches import Patch - legend_elements = [ - Patch(facecolor='#FF4136', label='词频 ≥ 15'), - Patch(facecolor='#FFDC00', label='10 ≤ 词频 < 15'), - Patch(facecolor='#0074D9', label='词频 < 10') - ] - plt.legend(handles=legend_elements, fontsize=12) - - # 设置更清晰的标题和标签 - plt.ylabel('词频', fontsize=16) - plt.title(f'{file} 的TOP10高频词(过滤后)', fontsize=20, pad=15) - - # 调整布局并显示 - plt.tight_layout() - plt.show() - - def show_top10_similarity(self): - """生成优化后的基于TOP10高频词的可视化查重报告""" - file = self.base_file_var.get() - if not file or file not in self.top10_words_dict: - messagebox.showerror("错误", "请选择有效的文件") - return - - # 获取相似度数据并过滤 - similarities = self.top10_similarity_matrix[file] - sorted_files = sorted(similarities.items(), key=lambda x: x[1], reverse=True) - valid_files = [item for item in sorted_files if item[0] != file and item[1] > 0] - excluded_count = len(similarities) - 1 - len(valid_files) # 计算排除的文件数(减去自身) - - # 创建临时HTML文件(修复temp_file_path作用域错误) - temp_file_path = None - with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f: - html_content = f""" - - -
- - -| 文件名称 | -相似度 | -相似度直观展示 | -共同高频词 | -
|---|---|---|---|
| {other_file} | -{sim:.2%} | -- - | -{common_words_str} | -
本次预查重针对基准文件 {file} 完成以下分析:
-