From c3c1ffc3a5946d4fba3876ba2e1b0a5d8c0dcf84 Mon Sep 17 00:00:00 2001
From: ryx <2736755949@qq.com>
Date: Sat, 27 Dec 2025 00:50:59 +0800
Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20=E6=96=87=E6=9C=AC?=
 =?UTF-8?q?=E7=9B=B8=E4=BC=BC=E5=BA=A6=E9=97=AE=E9=A2=98.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 文本相似度问题.py | 663 ----------------------------------------------
 1 file changed, 663 deletions(-)
 delete mode 100644 文本相似度问题.py

diff --git a/文本相似度问题.py b/文本相似度问题.py
deleted file mode 100644
index 384628b..0000000
--- a/文本相似度问题.py
+++ /dev/null
@@ -1,663 +0,0 @@
-import os#用于处理文件和目录路径
-import math
-import tkinter as tk#用于创建图形用户界面（GUI）
-from tkinter import ttk, filedialog, messagebox, scrolledtext
-#提供提供现代风格的 GUI 组件，用于打开文件或文件夹选择对话框，用于显示消息框，用于创建带滚动条的文本框
-from collections import defaultdict, Counter
-#当访问不存在的键时，会自动创建一个默认值。用于统计元素的出现次数
-from tkinter import font#字体
-import nltk#自然语言处理工具包
-from nltk.corpus import stopwords#用于获取停用词列表
-import re#正则表达式模块，用于文本匹配和处理
-from pyecharts import options as opts#用于创建交互式图表，这里用于生成词云，options：用于设置图表的选项
-from pyecharts.charts import WordCloud#用于生成词云图
-import webbrowser#用于在浏览器中打开文件
-import tempfile#用于创建临时文件
-import matplotlib.pyplot as plt#用于绘制图表，热力图和柱状图
-import numpy as np#用于处理数组和矩阵
-from matplotlib.colors import LinearSegmentedColormap#用于创建自定义颜色映射
-plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]#设置 matplotlib 使用的字体，确保中文能够正常显示
-plt.rcParams["axes.unicode_minus"] = False  # 解决负号显示问题
-nltk.download('stopwords')# 下载停用词所需数据
-nltk.download('averaged_perceptron_tagger')#下载词性标注所需的数据
-# 获取英文停用词列表
-ENGLISH_STOPWORDS = set(stopwords.words('english'))
-# 用于匹配数词的正则表达式
-NUMBER_REGEX = re.compile(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?%?$')
-# 英文数词列表
-ENGLISH_NUMBERS = {
-    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
-    'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
-    'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty',
-    'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million',
-    'billion', 'trillion'
-}#数词列表
-CUSTOM_STOPWORDS = {
-    "year", "years", "month", "months", "day", "days",
-    "whose", "who", "what", "when", "where", "why", "how",
-}#停用词列表
-# 过滤逻辑函数
-def should_include_term(term):#过滤不需要的单词
-    term_lower = term.lower()#把单词都转化成小写
-    if term_lower in ENGLISH_STOPWORDS or term_lower in CUSTOM_STOPWORDS:#检查这些单词是否在英文停用词列表里
-        return False
-    if NUMBER_REGEX.match(term) or term_lower in ENGLISH_NUMBERS:#检查这些单词是否在数词列表
-        return False
-    if len(term) <= 2:#小于等于两个字母的排除
-        return False
-    return True
-#定义加载文档函数
-def load_documents(folder_path):
-    documents = {}#初始化字典存储每个文件的单词列表
-    for root, dirs, files in os.walk(folder_path):#遍历指定文件夹及其子文件夹中的所有文件，os.walk(folder_path) 是 Python 内置的文件夹遍历函数
-        #每次循环返回一个元组 (root, dirs, files)
-        #dirs：当前 root 文件夹下的所有子文件夹名称（列表类型）
-        for file in files:
-            if file.endswith(('.txt', '.csv', '.json')):
-                file_path = os.path.join(root, file)## 拼接文件夹路径和文件名，得到文件的绝对路径
-                try:
-                    with open(file_path, 'r', encoding='utf-8') as f:
-                        lines = f.readlines()
-                        words = []#将处理后的单词添加到 words 列表中
-                        for line_num, line in enumerate(lines, start=1):
-                            line_words = preprocess_text(line)#调用函数进行预处理
-                            for word in line_words:
-                                words.append(word)
-                        documents[file] = words#将处理后的单词列表存储到字典中
-                except Exception as e:
-                    print(f"读取{file_path}错误: {e}")#如果读取文件时出现错误，打印错误信息
-    return documents
-
-# 文本预处理函数
-def preprocess_text(text):
-    text = text.lower()
-    words = re.findall(r'\w+', text)#使用正则表达式 \w+ 提取文本中的所有单词，存储在 words 列表中
-    return [word for word in words if should_include_term(word)]#判断每个单词是否应该被包含，返回过滤后的单词列表
-#计算相似度矩阵函数
-def compute_similarity_matrix(documents):#用于计算文本相似度矩阵
-    term_vectors = {file: Counter(words) for file, words in documents.items()}#计算每个文件的词频向量。
-    similarity_matrix = defaultdict(dict)#初始化相似度矩阵
-    filenames = list(documents.keys())#获取所有文件名
-    for i, file1 in enumerate(filenames):#双重循环遍历所有文件对
-        for j, file2 in enumerate(filenames):
-            if i == j:
-                similarity_matrix[file1][file2] = 1.0#如果是同一个文件，相似度为 1.0
-            else:
-                similarity_matrix[file1][file2] = cosine_similarity(term_vectors[file1], term_vectors[file2])#调用 cosine_similarity 函数计算两个文件的余弦相似度
-    return similarity_matrix, term_vectors#返回相似度和词频
-
-#余弦相似度计算
-def cosine_similarity(vec1, vec2):
-    intersection = set(vec1.keys()) & set(vec2.keys())#计算两个向量的交集
-    numerator = sum(vec1[x] * vec2[x] for x in intersection)#计算分子
-    sum1 = sum(v ** 2 for v in vec1.values())#计算两个向量的模
-    sum2 = sum(v ** 2 for v in vec2.values())
-    denominator = math.sqrt(sum1) * math.sqrt(sum2)#计算分母
-    return numerator / denominator if denominator else 0.0#如果分母不为零，则返回余弦相似度；否则返回 0.0
-#共同词汇查询函数
-def get_common_terms(vec1, vec2):
-    return set(vec1.keys()) & set(vec2.keys())#计算两个向量的交集，共同词汇
-# TOP10高频词获取函数
-def get_top10_words(words):
-    counter = Counter(words)#全部计数
-    return [word for word, _ in counter.most_common(10)]#返回前十
-#TOP10查重矩阵
-def compute_top10_similarity(documents):
-    """计算基于TOP10高频词的相似度矩阵"""
-    top10_words_dict = {file: get_top10_words(words) for file, words in documents.items()}#计算每个文件的 TOP10 高频词
-    similarity_matrix = defaultdict(dict)#初始化相似度矩阵
-    filenames = list(documents.keys())#获取所有文件名
-    for i, file1 in enumerate(filenames):
-        for j, file2 in enumerate(filenames):
-            if i == j:
-                similarity_matrix[file1][file2] = 1.0
-            else:
-                # 计算两个文件TOP10高频词的交集比例
-                common_words = set(top10_words_dict[file1]) & set(top10_words_dict[file2])
-                similarity_matrix[file1][file2] = len(common_words) / 10.0#计算相似度
-    return similarity_matrix, top10_words_dict
-#窗口GUI设计
-class TextSimilarityAnalyzer:
-    def __init__(self, root):
-        self.root = root#保存主窗口对象
-        self.root.title("文本相似度分析工具")#设置窗口标题
-        self.root.geometry("1000x800")#尺寸
-        self.root.resizable(True, True)#允许窗口调整大小
-        self.default_font = font.nametofont("TkDefaultFont")
-        self.default_font.configure(family="SimHei", size=10)
-        self.root.option_add("*Font", self.default_font)
-
-        # 初始化所有实例变量（修复外部定义错误）
-        self.documents = {}
-        self.similarity_matrix = {}
-        self.term_vectors = {}
-        self.filenames = []
-        self.word_positions = {}
-        self.top10_similarity_matrix = {}
-        self.top10_words_dict = {}
-
-        # 界面控件变量（统一在__init__中定义）
-        self.folder_path_var = tk.StringVar()  # 修复实例特性在外部定义的错误
-        self.base_file_var = tk.StringVar()  # 修复实例特性在外部定义的错误
-        self.file1_var = tk.StringVar()
-        self.file2_var = tk.StringVar()
-
-        self.create_widgets()
-
-    def create_widgets(self):
-        main_frame = ttk.Frame(self.root, padding="10")
-        main_frame.pack(fill=tk.BOTH, expand=True)
-
-        # 顶部文件选择区
-        top_frame = ttk.LabelFrame(main_frame, text="文件选择", padding="10")
-        top_frame.pack(fill=tk.X, pady=(0, 10))
-        ttk.Label(top_frame, text="文件夹路径:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W)
-        ttk.Entry(top_frame, textvariable=self.folder_path_var, width=60).grid(row=0, column=1, padx=5, pady=5)
-        ttk.Button(top_frame, text="浏览...", command=self.browse_folder).grid(row=0, column=2, padx=5, pady=5)
-        ttk.Button(top_frame, text="分析", command=self.analyze_files, style='Accent.TButton').grid(row=0, column=3,
-                                                                                                    padx=5, pady=5)
-        ttk.Button(top_frame, text="查看热力图", command=self.show_heatmap, style='Accent.TButton').grid(row=0,
-                                                                                                         column=4,
-                                                                                                         padx=5, pady=5)
-
-        # TOP10高频词查重功能UI
-        top10_frame = ttk.LabelFrame(main_frame, text="TOP10高频词预查重", padding="10")
-        top10_frame.pack(fill=tk.X, pady=(0, 10))
-        ttk.Label(top10_frame, text="选择基准文件:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W)
-        self.base_file_combo = ttk.Combobox(top10_frame, textvariable=self.base_file_var, state="disabled", width=40)
-        self.base_file_combo.grid(row=0, column=1, padx=5, pady=5)
-        ttk.Button(top10_frame, text="查看TOP10高频词", command=self.show_top10_words, style='Accent.TButton').grid(
-            row=0, column=2, padx=5, pady=5)
-        ttk.Button(top10_frame, text="生成预查重报告", command=self.show_top10_similarity, style='Accent.TButton').grid(
-            row=0,
-            column=3,
-            padx=5,
-            pady=5)
-
-        # 中间相似度矩阵区
-        middle_frame = ttk.LabelFrame(main_frame, text="相似度矩阵", padding="10")
-        middle_frame.pack(fill=tk.BOTH, expand=True, pady=(0, 10))
-        columns = ["File"]
-        self.similarity_tree = ttk.Treeview(middle_frame, columns=columns, show="headings")
-        self.similarity_tree.heading("File", text="文件")
-        self.similarity_tree.column("File", width=150, anchor=tk.CENTER)
-        tree_scroll_y = ttk.Scrollbar(middle_frame, orient=tk.VERTICAL, command=self.similarity_tree.yview)
-        tree_scroll_x = ttk.Scrollbar(middle_frame, orient=tk.HORIZONTAL, command=self.similarity_tree.xview)
-        self.similarity_tree.configure(yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set)
-        tree_scroll_y.pack(side=tk.RIGHT, fill=tk.Y)
-        tree_scroll_x.pack(side=tk.BOTTOM, fill=tk.X)
-        self.similarity_tree.pack(fill=tk.BOTH, expand=True)
-
-        # 底部公共词汇查询区
-        bottom_frame = ttk.LabelFrame(main_frame, text="公共词汇查询", padding="10")
-        bottom_frame.pack(fill=tk.X, pady=(0, 10))
-        ttk.Label(bottom_frame, text="选择两个文件:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W)
-        self.file1_combo = ttk.Combobox(bottom_frame, textvariable=self.file1_var, state="disabled", width=30)
-        self.file1_combo.grid(row=0, column=1, padx=5, pady=5)
-        ttk.Label(bottom_frame, text="和").grid(row=0, column=2, padx=5, pady=5)
-        self.file2_combo = ttk.Combobox(bottom_frame, textvariable=self.file2_var, state="disabled", width=30)
-        self.file2_combo.grid(row=0, column=3, padx=5, pady=5)
-        ttk.Button(bottom_frame, text="查询公共词汇", command=self.query_common_terms, style='Accent.TButton').grid(
-            row=0, column=4, padx=5, pady=5)
-        ttk.Button(bottom_frame, text="查看词云", command=self.show_wordcloud, style='Accent.TButton').grid(row=0,
-                                                                                                            column=5,
-                                                                                                            padx=5,
-                                                                                                            pady=5)
-
-        # 结果显示区
-        result_frame = ttk.LabelFrame(main_frame, text="公共词汇结果", padding="10")
-        result_frame.pack(fill=tk.BOTH, expand=True)
-        self.result_text = scrolledtext.ScrolledText(result_frame, wrap=tk.WORD, height=6, font=('SimHei', 14))
-        self.result_text.pack(fill=tk.BOTH, expand=True)
-
-        # 样式设置
-        style = ttk.Style()
-        style.configure('Accent.TButton', font=('SimHei', 10, 'bold'))
-        style.configure('Treeview', rowheight=25, font=('SimHei', 14), background='white', borderwidth=1)
-        style.configure('Treeview.Heading', font=('SimHei', 14, 'bold'), borderwidth=1)
-        style.layout('Treeview', [('Treeview.treearea', {'sticky': 'nswe'})])
-
-    def browse_folder(self):
-        folder_path = filedialog.askdirectory()#打开文件夹选择对话框
-        if folder_path:
-            self.folder_path_var.set(folder_path)#将选择的文件夹路径设置到 folder_path_var 中
-
-    def analyze_files(self):
-        folder_path = self.folder_path_var.get()#获取选择的文件夹路径
-        if not folder_path or not os.path.isdir(folder_path):#检查文件夹路径是否有效
-            messagebox.showerror("错误", "请选择有效的文件夹路径")
-            return
-        for item in self.similarity_tree.get_children():#清空相似度矩阵树形视图
-            self.similarity_tree.delete(item)
-        self.documents= load_documents(folder_path)#加载指定文件夹中的文本文件
-        if not self.documents:#检查是否找到文本文件
-            messagebox.showerror("错误", "未找到文本文件")
-            return
-
-        # 计算文本相似度矩阵
-        self.similarity_matrix, self.term_vectors = compute_similarity_matrix(self.documents)
-
-        # 计算TOP10高频词相似度矩阵
-        self.top10_similarity_matrix, self.top10_words_dict = compute_top10_similarity(self.documents)
-
-        self.filenames = sorted(list(self.documents.keys()),
-                                key=lambda s: [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)])
-        columns = ["File"] + self.filenames
-        self.similarity_tree.configure(columns=columns)#设置相似度矩阵树形视图的列
-        for col in self.filenames:
-            self.similarity_tree.heading(col, text=col)
-            self.similarity_tree.column(col, width=100, anchor=tk.CENTER)
-        for file1 in self.filenames:
-            values = [file1] + [f"{self.similarity_matrix[file1][file2]:.4f}" for file2 in self.filenames]
-            self.similarity_tree.insert("", tk.END, values=values)#将相似度矩阵插入到树形视图中
-        self.file1_combo['values'] = self.filenames#设置下拉框的选项
-        self.file2_combo['values'] = self.filenames
-        self.file1_combo['state'] = 'readonly'
-        self.file2_combo['state'] = 'readonly'#设置下拉框为只读模式
-
-        # 设置TOP10高频词下拉菜单
-        self.base_file_combo['values'] = self.filenames#设置 TOP10 高频词下拉菜单的选项和状态
-        self.base_file_combo['state'] = 'readonly'
-        if self.filenames:
-            self.file1_var.set(self.filenames[0])
-            self.file2_var.set(self.filenames[1])
-            self.base_file_var.set(self.filenames[0])
-
-        messagebox.showinfo("成功", f"分析完成，共处理 {len(self.filenames)} 个文件")
-
-    def query_common_terms(self):
-        file1, file2 = self.file1_var.get(), self.file2_var.get()
-        if not file1 or not file2 or file1 == file2:
-            messagebox.showerror("错误", "请选择不同的两个文件")
-            return
-        if file1 not in self.term_vectors or file2 not in self.term_vectors:
-            messagebox.showerror("错误", "文件未分析")
-            return
-        common_terms = get_common_terms(self.term_vectors[file1], self.term_vectors[file2])
-        filtered = [term for term in common_terms if should_include_term(term)]
-        self.result_text.delete(1.0, tk.END)
-        if not filtered:
-            self.result_text.insert(tk.END, f"{file1} 和 {file2} 无公共词汇")
-            return
-        self.result_text.insert(tk.END, f"{file1} 和 {file2} 的公共词汇 ({len(filtered)} 个):\n\n")
-        grouped = defaultdict(list)
-        for term in sorted(filtered):
-            clean = term.strip('"\'')
-            grouped[clean[0].upper() if clean else '#'].append(clean)
-        for initial in sorted(grouped.keys()):
-            terms = grouped[initial]
-            for i in range(0, len(terms), 10):
-                self.result_text.insert(tk.END, f"{initial}: {' '.join(terms[i:i + 10])}\n")#每行显示十个
-
-    def show_wordcloud(self):
-        """生成类似图片的紧凑词云（公共词汇）"""
-        file1, file2 = self.file1_var.get(), self.file2_var.get()
-        if not file1 or not file2 or file1 == file2:
-            messagebox.showerror("错误", "请选择不同的两个文件")
-            return
-        if file1 not in self.term_vectors or file2 not in self.term_vectors:
-            messagebox.showerror("错误", "文件未分析")
-            return
-        # 获取过滤后的公共词汇及词频
-        common_terms = get_common_terms(self.term_vectors[file1], self.term_vectors[file2])
-        filtered_terms = [term for term in common_terms if should_include_term(term)]
-        if not filtered_terms:
-            messagebox.showinfo("提示", "无符合条件的公共词汇")
-            return
-        word_freq = {term: min(self.term_vectors[file1][term], self.term_vectors[file2][term]) for term in
-                     filtered_terms}
-        # 使用 pyecharts 生成词云
-        words = [(word, freq) for word, freq in word_freq.items()]
-        (
-            WordCloud()
-            .add("", words, word_size_range=[20, 100])
-            .set_global_opts(
-                title_opts=opts.TitleOpts(title=f"{file1} 与 {file2} 的公共词汇词云"),
-                tooltip_opts=opts.TooltipOpts(is_show=True)
-            )
-            .render("wordcloud_interactive.html")
-        )
-        webbrowser.open_new_tab("wordcloud_interactive.html")
-
-    def show_heatmap(self):#self 同样用于访问实例的属性和调用其他实例方法
-        if not self.similarity_matrix:
-            messagebox.showerror("错误", "请先分析文件")
-            return
-        matrix_size = len(self.filenames)
-        heatmap_matrix = np.zeros((matrix_size, matrix_size))
-        for i, file1 in enumerate(self.filenames):
-            for j, file2 in enumerate(self.filenames):
-                heatmap_matrix[i, j] = self.similarity_matrix[file1][file2]
-        fig, ax = plt.subplots(figsize=(14, 10))
-        cmap = LinearSegmentedColormap.from_list("custom_cmap", ["blue", "white", "red"])
-        im = ax.imshow(heatmap_matrix, cmap=cmap)
-        ax.set_xticks(np.arange(matrix_size))
-        ax.set_yticks(np.arange(matrix_size))
-        ax.set_xticklabels(self.filenames, rotation=45, ha="right", fontsize=10)
-        ax.set_yticklabels(self.filenames, fontsize=10)
-        for i in range(matrix_size):
-            for j in range(matrix_size):
-                ax.text(j, i, f"{heatmap_matrix[i, j]:.0%}", ha="center", va="center",
-                        color="black" if heatmap_matrix[i, j] < 0.5 else "white", fontsize=16)
-        cbar = ax.figure.colorbar(im, ax=ax)
-        cbar.ax.set_ylabel("相似度", rotation=-90, va="bottom", fontsize=12)
-        ax.set_title("文本相似度热力图", fontsize=16)
-        plt.tight_layout()
-        plt.show()
-
-    # TOP10高频词相关功能
-    def show_top10_words(self):
-        file = self.base_file_var.get()
-        if not file or file not in self.documents:
-            messagebox.showerror("错误", "请选择有效的文件")
-            return
-        # 严格过滤后重新统计词频
-        filtered_words = [word for word in self.documents[file] if should_include_term(word)]
-        word_counts = Counter(filtered_words)
-        top10 = word_counts.most_common(10)
-        words = [w for w, _ in top10]
-        counts = [c for _, c in top10]
-        # 优化图表大小和布局
-        plt.figure(figsize=(14, 8))
-        # 根据词频设置颜色
-        colors = []
-        for count in counts:
-            if count >= 15:
-                colors.append('#FF4136')  # 红：词频>=15
-            elif count >= 10:
-                colors.append('#FFDC00')  # 黄：10<=词频<15
-            else:
-                colors.append('#0074D9')  # 蓝：词频<10
-        bars = plt.bar(words, counts, color=colors)
-        # 添加网格线增强可读性
-        plt.grid(axis='y', linestyle='--', alpha=0.7)
-        # 调整x轴标签旋转角度和字体大小
-        plt.xticks(rotation=30, ha='right', fontsize=14)
-        # 优化数值标注位置和样式
-        for bar, count in zip(bars, counts):
-            height = bar.get_height()
-            plt.text(
-                bar.get_x() + bar.get_width() / 2,
-                height + max(0.02 * max(counts), 0.3),  # 自适应位置
-                f'{count}',
-                ha='center',
-                va='bottom',
-                fontsize=14,
-                fontweight='bold',
-                color='darkslategrey'
-            )
-
-        # 添加颜色图例说明
-        from matplotlib.patches import Patch
-        legend_elements = [
-            Patch(facecolor='#FF4136', label='词频 ≥ 15'),
-            Patch(facecolor='#FFDC00', label='10 ≤ 词频 < 15'),
-            Patch(facecolor='#0074D9', label='词频 < 10')
-        ]
-        plt.legend(handles=legend_elements, fontsize=12)
-
-        # 设置更清晰的标题和标签
-        plt.ylabel('词频', fontsize=16)
-        plt.title(f'{file} 的TOP10高频词（过滤后）', fontsize=20, pad=15)
-
-        # 调整布局并显示
-        plt.tight_layout()
-        plt.show()
-
-    def show_top10_similarity(self):
-        """生成优化后的基于TOP10高频词的可视化查重报告"""
-        file = self.base_file_var.get()
-        if not file or file not in self.top10_words_dict:
-            messagebox.showerror("错误", "请选择有效的文件")
-            return
-
-        # 获取相似度数据并过滤
-        similarities = self.top10_similarity_matrix[file]
-        sorted_files = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
-        valid_files = [item for item in sorted_files if item[0] != file and item[1] > 0]
-        excluded_count = len(similarities) - 1 - len(valid_files)  # 计算排除的文件数（减去自身）
-
-        # 创建临时HTML文件（修复temp_file_path作用域错误）
-        temp_file_path = None
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
-            html_content = f"""
-            <!DOCTYPE html>
-            <html lang="zh-CN">
-            <head>
-                <meta charset="UTF-8">
-                <meta name="viewport" content="width=device-width, initial-scale=1.0">
-                <title>{file} 基于TOP10高频词的预查重报告</title>
-                <style>
-                    :root {{
-                        --primary: #2c3e50;
-                        --secondary: #3498db;
-                        --highlight: #e74c3c;
-                        --medium: #f39c12;
-                        --low: #2ecc71;
-                        --light-bg: #f8f9fa;
-                        --border: #dee2e6;
-                        --base-font-size: 18px; /* 基础字体大小，统一调整 */
-                        --heading-font-weight: 700; /* 标题字体加粗 */
-                        --text-font-weight: 400; /* 正文字体粗细 */
-                    }}
-                    body {{ 
-                        font-family: "SimHei", "Microsoft YaHei", sans-serif; 
-                        margin: 0; 
-                        padding: 20px; 
-                        color: var(--primary);
-                        background-color: #f5f7fa;
-                        font-size: var(--base-font-size); /* 应用基础字体大小 */
-                    }}
-                    .container {{ 
-                        max-width: 1200px; 
-                        margin: 0 auto; 
-                        background: white;
-                        padding: 30px;
-                        border-radius: 10px;
-                        box-shadow: 0 2px 15px rgba(0,0,0,0.1);
-                    }}
-                    h1 {{ 
-                        color: var(--primary); 
-                        text-align: center; 
-                        margin-bottom: 30px;
-                        padding-bottom: 15px;
-                        border-bottom: 2px solid var(--secondary);
-                        font-size: 28px; /* 加大标题字体 */
-                        font-weight: var(--heading-font-weight);
-                    }}
-                    .report-section {{ 
-                        margin-bottom: 35px; 
-                        padding: 20px;
-                        background-color: var(--light-bg);
-                        border-radius: 8px;
-                    }}
-                    .report-section h2 {{ 
-                        color: var(--secondary); 
-                        margin-top: 0;
-                        padding-bottom: 10px;
-                        border-bottom: 1px dashed var(--border);
-                        font-size: 22px; /* 加大二级标题字体 */
-                        font-weight: var(--heading-font-weight);
-                    }}
-                    .top10-words {{ 
-                        display: flex; 
-                        flex-wrap: wrap; 
-                        gap: 12px; 
-                        margin-top: 15px;
-                    }}
-                    .word-item {{ 
-                        background-color: white;
-                        border: 2px solid var(--secondary);
-                        padding: 8px 15px; 
-                        border-radius: 20px;
-                        font-weight: var(--heading-font-weight); /* 高频词标签加粗 */
-                        display: inline-flex;
-                        align-items: center;
-                        box-shadow: 0 2px 5px rgba(0,0,0,0.05);
-                        font-size: 18px; /* 高频词标签字体大小 */
-                    }}
-                    .word-item::before {{ 
-                        content: attr(data-index);
-                        background: var(--secondary);
-                        color: white;
-                        width: 24px; /* 加大圆形背景 */
-                        height: 24px;
-                        border-radius: 50%;
-                        display: inline-block;
-                        text-align: center;
-                        font-size: 15px; /* 编号字体大小 */
-                        margin-right: 8px;
-                        line-height: 24px; /* 垂直居中 */
-                    }}
-                    .similarity-table {{ 
-                        width: 100%; 
-                        border-collapse: collapse; 
-                        margin-top: 15px;
-                        overflow: hidden;
-                        border-radius: 8px;
-                        box-shadow: 0 2px 8px rgba(0,0,0,0.08);
-                    }}
-                    .similarity-table th {{ 
-                        background-color: var(--primary);
-                        color: white;
-                        text-align: center;
-                        padding: 12px 10px;
-                        font-weight: var(--heading-font-weight);
-                        font-size: 20px; /* 表头字体大小 */
-                    }}
-                    .similarity-table td {{ 
-                        border: 1px solid var(--border); 
-                        padding: 12px 10px; 
-                        text-align: center;
-                        font-size: 18px; /* 表格内容字体大小 */
-                        font-weight: var(--text-font-weight);
-                    }}
-                    .similarity-table tr:nth-child(even) {{ 
-                        background-color: #f9f9f9;
-                    }}
-                    .similarity-table tr:hover {{ 
-                        background-color: #f1f7ff;
-                    }}
-                    .progress-bar {{ 
-                        height: 24px; 
-                        background-color: #e9ecef; 
-                        border-radius: 12px; 
-                        overflow: hidden;
-                        position: relative;
-                        box-shadow: inset 0 1px 3px rgba(0,0,0,0.1);
-                    }}
-                    .progress {{ 
-                        height: 100%; 
-                        transition: width 0.5s ease;
-                    }}
-                    .high {{ background-color: var(--highlight); }}
-                    .medium {{ background-color: var(--medium); }}
-                    .low {{ background-color: var(--low); }}
-                    .common-words {{ 
-                        font-size: 16px; /* 共同高频词字体大小，稍小于基础 */
-                        color: #495057;
-                        word-break: break-all;
-                        font-weight: var(--text-font-weight);
-                    }}
-                    .conclusion {{
-                        background-color: white;
-                        border-left: 4px solid var(--secondary);
-                        padding: 15px 20px;
-                        margin-top: 15px;
-                        border-radius: 0 5px 5px 0;
-                        line-height: 1.6;
-                        font-size: 18px; /* 结论字体大小 */
-                        font-weight: var(--text-font-weight);
-                    }}
-                    .stats {{
-                        font-weight: var(--heading-font-weight);
-                        color: var(--highlight);
-                    }}
-                </style>
-            </head>
-            <body>
-                <div class="container">
-                    <h1>{file} 的TOP10高频词预查重报告</h1>
-                    <div class="report-section">
-                        <h2>基准文件TOP10高频词</h2>
-                        <div class="top10-words">
-            """
-            # 生成带编号的高频词标签
-            base_top10 = self.top10_words_dict[file]
-            for i, word in enumerate(base_top10, 1):
-                html_content += f"""
-                                <div class="word-item" data-index="{i}">
-                                    {word}
-                                </div>
-                    """
-            html_content += f"""
-                        </div>
-                    </div>
-                    <div class="report-section">
-                        <h2>与其他文件的相似度比较</h2>
-                        <table class="similarity-table">
-                            <tr>
-                                <th>文件名称</th>
-                                <th>相似度</th>
-                                <th>相似度直观展示</th>
-                                <th>共同高频词</th>
-                            </tr>
-            """
-            # 生成带颜色分级的相似度表格
-            for other_file, sim in valid_files:
-                common_words = list(set(self.top10_words_dict[file]) & set(self.top10_words_dict[other_file]))
-                common_words_str = ", ".join(common_words)
-
-                # 根据相似度设置进度条颜色
-                if sim >= 0.7:
-                    progress_class = "high"
-                elif sim >= 0.3:
-                    progress_class = "medium"
-                else:
-                    progress_class = "low"
-
-                html_content += f"""
-                                <tr>
-                                    <td>{other_file}</td>
-                                    <td>{sim:.2%}</td>
-                                    <td>
-                                        <div class="progress-bar">
-                                            <div class="progress {progress_class}" style="width: {sim * 100}%"></div>
-                                        </div>
-                                    </td>
-                                    <td class="common-words">{common_words_str}</td>
-                                </tr>
-                    """
-            html_content += f"""
-                        </table>
-                    </div>
-                    <div class="report-section">
-                        <h2>查重结论与建议</h2>
-                        <div class="conclusion">
-                            <p>本次预查重针对基准文件 <strong>{file}</strong> 完成以下分析：</p>
-                            <ul>
-                                <li>共检测到 {len(similarities) - 1} 个对比文件，其中 <span class="stats">{excluded_count} 个文件与基准文件重复度为0</span>，已排除无需后续查重</li>
-                                <li>剩余 {len(valid_files)} 个文件中：
-                                    <ul>
-                                        <li>高相似度文件（≥70%）：{sum(1 for f, s in valid_files if s >= 0.7)} 个，建议重点核查</li>
-                                        <li>中等相似度文件（30%-70%）：{sum(1 for f, s in valid_files if 0.3 <= s < 0.7)} 个，建议适当关注</li>
-                                        <li>低相似度文件（<30%）：{sum(1 for f, s in valid_files if s < 0.3)} 个，相关性较低</li>
-                                    </ul>
-                                </li>
-                                <li>共同高频词分析显示：{base_top10[0]}、{base_top10[1]} 等核心词汇在多个文件中出现频率较高，可作为重点比对特征词</li>
-                            </ul>
-                        </div>
-                    </div>
-                </div>
-            </body>
-            </html>
-            """
-            f.write(html_content)
-            temp_file_path = f.name  # 在with块内正确赋值
-        # 修复作用域问题：确保在方法内部使用变量
-        if temp_file_path:
-            webbrowser.open_new_tab(temp_file_path)
-if __name__ == "__main__":
-    root = tk.Tk()#创建主窗口对象
-    app = TextSimilarityAnalyzer(root)#创建文本相似度分析器对象
-    root.mainloop()#进入主事件循环，等待用户交互
\ No newline at end of file