rj23/文本相似度问题.py

import os  # file and directory path handling
import math
import tkinter as tk  # GUI toolkit
from tkinter import ttk, filedialog, messagebox, scrolledtext
# ttk: themed widgets; filedialog: file/folder pickers; messagebox: message dialogs; scrolledtext: scrollable text box
from collections import defaultdict, Counter
# defaultdict auto-creates a default value for missing keys; Counter counts element occurrences
from tkinter import font  # font handling
import nltk  # Natural Language Toolkit
from nltk.corpus import stopwords  # English stopword list
import re  # regular expressions for tokenising and matching text
from pyecharts import options as opts  # interactive charts; options configures chart settings
from pyecharts.charts import WordCloud  # word-cloud chart
import webbrowser  # open generated files in the browser
import tempfile  # create temporary files
import matplotlib.pyplot as plt  # heatmap and bar-chart plotting
import numpy as np  # array/matrix handling
from matplotlib.colors import LinearSegmentedColormap  # custom colour maps
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]  # fonts that can render the Chinese labels
plt.rcParams["axes.unicode_minus"] = False  # keep minus signs rendering correctly with these fonts
nltk.download('stopwords')  # data required by the stopword list
nltk.download('averaged_perceptron_tagger')  # POS-tagger data (downloaded here but not used by this script)
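# Third-party dependencies (usual PyPI names): nltk, pyecharts, matplotlib, numpy; the nltk
# downloads above need network access only on the first run, after which the data is cached locally.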
# English stopword set
ENGLISH_STOPWORDS = set(stopwords.words('english'))
# Regex matching numerals: integers, decimals, scientific notation, optional trailing %
NUMBER_REGEX = re.compile(r'^[-+]?\d*\.?\d+([eE][-+]?\d+)?%?$')
# English number words
ENGLISH_NUMBERS = {
'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty',
'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million',
'billion', 'trillion'
}  # spelled-out number words
CUSTOM_STOPWORDS = {
"year", "years", "month", "months", "day", "days",
"whose", "who", "what", "when", "where", "why", "how",
}  # additional custom stopwords (generic time and question words)
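# Illustrative examples of what these filters catch: NUMBER_REGEX matches tokens such as "42",
# "-3.14", "1e-5" and "85%"; ENGLISH_NUMBERS catches spelled-out numbers like "seven";
# CUSTOM_STOPWORDS drops generic time/question words such as "years" and "how".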
# Token filtering predicate
def should_include_term(term):  # decide whether a token is kept for analysis
    term_lower = term.lower()  # normalise to lowercase
    if term_lower in ENGLISH_STOPWORDS or term_lower in CUSTOM_STOPWORDS:  # drop stopwords
        return False
    if NUMBER_REGEX.match(term) or term_lower in ENGLISH_NUMBERS:  # drop numerals and number words
        return False
    if len(term) <= 2:  # drop tokens of two characters or fewer
        return False
    return True
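# Quick sanity checks for the filter (illustrative tokens only):
#   should_include_term("analysis") -> True
#   should_include_term("the")      -> False  (English stopword)
#   should_include_term("2024")     -> False  (matches NUMBER_REGEX)
#   should_include_term("ok")       -> False  (two characters or fewer)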
# Load every text file under a folder
def load_documents(folder_path):
    documents = {}  # maps file name -> list of preprocessed tokens
    for root, dirs, files in os.walk(folder_path):  # walk the folder and all of its subfolders
        # os.walk yields one (root, dirs, files) tuple per directory: root is the current folder,
        # dirs its subfolder names and files its file names (both lists)
        for file in files:
            if file.endswith(('.txt', '.csv', '.json')):
                file_path = os.path.join(root, file)  # join folder path and file name into the full path
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = f.readlines()
                        words = []  # accumulates the preprocessed tokens of this file
                        for line_num, line in enumerate(lines, start=1):
                            line_words = preprocess_text(line)  # tokenise and filter this line
                            for word in line_words:
                                words.append(word)
                        documents[file] = words  # keyed by file name only, so duplicate names in subfolders overwrite each other
                except Exception as e:
                    print(f"读取{file_path}错误: {e}")  # report the unreadable file and continue
    return documents
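# Resulting shape (with hypothetical file names): {"report1.txt": ["similarity", "matrix", ...], ...}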
# Text preprocessing: lowercase, tokenise, filter
def preprocess_text(text):
    text = text.lower()
    words = re.findall(r'\w+', text)  # \w+ extracts runs of word characters as tokens
    return [word for word in words if should_include_term(word)]  # keep only tokens that pass the filter
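# Example (illustrative): preprocess_text("The 3 quick tests passed in 2024!")
# returns ["quick", "tests", "passed"]  ("the"/"in" are stopwords, "3"/"2024" are numerals).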
# Build the pairwise similarity matrix
def compute_similarity_matrix(documents):  # cosine similarity between every pair of documents
    term_vectors = {file: Counter(words) for file, words in documents.items()}  # term-frequency vector per file
    similarity_matrix = defaultdict(dict)  # nested dict: similarity_matrix[file1][file2]
    filenames = list(documents.keys())  # all file names
    for i, file1 in enumerate(filenames):  # compare every ordered pair of files
        for j, file2 in enumerate(filenames):
            if i == j:
                similarity_matrix[file1][file2] = 1.0  # a file is identical to itself
            else:
                similarity_matrix[file1][file2] = cosine_similarity(term_vectors[file1], term_vectors[file2])  # cosine similarity of the two term vectors
    return similarity_matrix, term_vectors  # similarities plus the term-frequency vectors
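# Note: the matrix is symmetric, so every pair is currently computed twice; for larger corpora one
# could fill only the upper triangle and mirror it, which is left out here to keep the code simple.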
# Cosine similarity between two term-frequency vectors
def cosine_similarity(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())  # terms shared by both vectors
    numerator = sum(vec1[x] * vec2[x] for x in intersection)  # dot product
    sum1 = sum(v ** 2 for v in vec1.values())  # squared magnitude of each vector
    sum2 = sum(v ** 2 for v in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)  # product of the vector norms
    return numerator / denominator if denominator else 0.0  # define the similarity involving an empty vector as 0.0
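# Worked example (illustrative): Counter({'data': 2, 'set': 1}) and Counter({'data': 1}) share
# only 'data', so the similarity is 2 / (sqrt(5) * sqrt(1)) ≈ 0.894.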
# Shared vocabulary of two term vectors
def get_common_terms(vec1, vec2):
    return set(vec1.keys()) & set(vec2.keys())  # intersection of the two key sets
# TOP10 most frequent words of a document
def get_top10_words(words):
    counter = Counter(words)  # count every token
    return [word for word, _ in counter.most_common(10)]  # keep the ten most frequent
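# Note: Counter.most_common breaks ties by first-seen order, so documents with many equal counts
# can yield slightly different TOP10 lists depending on token order.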
# Pre-screening matrix based on TOP10 high-frequency words
def compute_top10_similarity(documents):
    """Compute a similarity matrix from the overlap of each pair's TOP10 high-frequency words."""
    top10_words_dict = {file: get_top10_words(words) for file, words in documents.items()}  # TOP10 list per file
    similarity_matrix = defaultdict(dict)  # nested dict, as in compute_similarity_matrix
    filenames = list(documents.keys())  # all file names
    for i, file1 in enumerate(filenames):
        for j, file2 in enumerate(filenames):
            if i == j:
                similarity_matrix[file1][file2] = 1.0
            else:
                # fraction of the TOP10 words the two files have in common
                common_words = set(top10_words_dict[file1]) & set(top10_words_dict[file2])
                similarity_matrix[file1][file2] = len(common_words) / 10.0
    return similarity_matrix, top10_words_dict
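# Example (illustrative): two files sharing 4 of their TOP10 words score 4 / 10.0 = 0.4.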
# GUI application
class TextSimilarityAnalyzer:
    def __init__(self, root):
        self.root = root  # keep a reference to the main window
        self.root.title("文本相似度分析工具")  # window title
        self.root.geometry("1000x800")  # initial window size
        self.root.resizable(True, True)  # allow resizing
        self.default_font = font.nametofont("TkDefaultFont")
        self.default_font.configure(family="SimHei", size=10)
        self.root.option_add("*Font", self.default_font)
        # Initialise every instance attribute here (avoids attribute-defined-outside-__init__ warnings)
        self.documents = {}
        self.similarity_matrix = {}
        self.term_vectors = {}
        self.filenames = []
        self.word_positions = {}
        self.top10_similarity_matrix = {}
        self.top10_words_dict = {}
        # Tk control variables for the widgets, also defined up front in __init__
        self.folder_path_var = tk.StringVar()
        self.base_file_var = tk.StringVar()
        self.file1_var = tk.StringVar()
        self.file2_var = tk.StringVar()
        self.create_widgets()
def create_widgets(self):
main_frame = ttk.Frame(self.root, padding="10")
main_frame.pack(fill=tk.BOTH, expand=True)
        # Top area: folder selection
top_frame = ttk.LabelFrame(main_frame, text="文件选择", padding="10")
top_frame.pack(fill=tk.X, pady=(0, 10))
ttk.Label(top_frame, text="文件夹路径:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W)
ttk.Entry(top_frame, textvariable=self.folder_path_var, width=60).grid(row=0, column=1, padx=5, pady=5)
ttk.Button(top_frame, text="浏览...", command=self.browse_folder).grid(row=0, column=2, padx=5, pady=5)
ttk.Button(top_frame, text="分析", command=self.analyze_files, style='Accent.TButton').grid(row=0, column=3,
padx=5, pady=5)
ttk.Button(top_frame, text="查看热力图", command=self.show_heatmap, style='Accent.TButton').grid(row=0,
column=4,
padx=5, pady=5)
        # TOP10 high-frequency-word pre-screening UI
top10_frame = ttk.LabelFrame(main_frame, text="TOP10高频词预查重", padding="10")
top10_frame.pack(fill=tk.X, pady=(0, 10))
ttk.Label(top10_frame, text="选择基准文件:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W)
self.base_file_combo = ttk.Combobox(top10_frame, textvariable=self.base_file_var, state="disabled", width=40)
self.base_file_combo.grid(row=0, column=1, padx=5, pady=5)
ttk.Button(top10_frame, text="查看TOP10高频词", command=self.show_top10_words, style='Accent.TButton').grid(
row=0, column=2, padx=5, pady=5)
ttk.Button(top10_frame, text="生成预查重报告", command=self.show_top10_similarity, style='Accent.TButton').grid(
row=0,
column=3,
padx=5,
pady=5)
        # Middle area: similarity matrix
middle_frame = ttk.LabelFrame(main_frame, text="相似度矩阵", padding="10")
middle_frame.pack(fill=tk.BOTH, expand=True, pady=(0, 10))
columns = ["File"]
self.similarity_tree = ttk.Treeview(middle_frame, columns=columns, show="headings")
self.similarity_tree.heading("File", text="文件")
self.similarity_tree.column("File", width=150, anchor=tk.CENTER)
tree_scroll_y = ttk.Scrollbar(middle_frame, orient=tk.VERTICAL, command=self.similarity_tree.yview)
tree_scroll_x = ttk.Scrollbar(middle_frame, orient=tk.HORIZONTAL, command=self.similarity_tree.xview)
self.similarity_tree.configure(yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set)
tree_scroll_y.pack(side=tk.RIGHT, fill=tk.Y)
tree_scroll_x.pack(side=tk.BOTTOM, fill=tk.X)
self.similarity_tree.pack(fill=tk.BOTH, expand=True)
        # Bottom area: common-vocabulary query
bottom_frame = ttk.LabelFrame(main_frame, text="公共词汇查询", padding="10")
bottom_frame.pack(fill=tk.X, pady=(0, 10))
ttk.Label(bottom_frame, text="选择两个文件:").grid(row=0, column=0, padx=5, pady=5, sticky=tk.W)
self.file1_combo = ttk.Combobox(bottom_frame, textvariable=self.file1_var, state="disabled", width=30)
self.file1_combo.grid(row=0, column=1, padx=5, pady=5)
ttk.Label(bottom_frame, text="").grid(row=0, column=2, padx=5, pady=5)
self.file2_combo = ttk.Combobox(bottom_frame, textvariable=self.file2_var, state="disabled", width=30)
self.file2_combo.grid(row=0, column=3, padx=5, pady=5)
ttk.Button(bottom_frame, text="查询公共词汇", command=self.query_common_terms, style='Accent.TButton').grid(
row=0, column=4, padx=5, pady=5)
ttk.Button(bottom_frame, text="查看词云", command=self.show_wordcloud, style='Accent.TButton').grid(row=0,
column=5,
padx=5,
pady=5)
        # Result display area
result_frame = ttk.LabelFrame(main_frame, text="公共词汇结果", padding="10")
result_frame.pack(fill=tk.BOTH, expand=True)
self.result_text = scrolledtext.ScrolledText(result_frame, wrap=tk.WORD, height=6, font=('SimHei', 14))
self.result_text.pack(fill=tk.BOTH, expand=True)
        # Style settings
style = ttk.Style()
style.configure('Accent.TButton', font=('SimHei', 10, 'bold'))
style.configure('Treeview', rowheight=25, font=('SimHei', 14), background='white', borderwidth=1)
style.configure('Treeview.Heading', font=('SimHei', 14, 'bold'), borderwidth=1)
style.layout('Treeview', [('Treeview.treearea', {'sticky': 'nswe'})])
    def browse_folder(self):
        folder_path = filedialog.askdirectory()  # open a folder-selection dialog
        if folder_path:
            self.folder_path_var.set(folder_path)  # store the chosen path in the entry variable
    def analyze_files(self):
        folder_path = self.folder_path_var.get()  # selected folder path
        if not folder_path or not os.path.isdir(folder_path):  # validate the path
            messagebox.showerror("错误", "请选择有效的文件夹路径")
            return
        for item in self.similarity_tree.get_children():  # clear any previous similarity table
            self.similarity_tree.delete(item)
        self.documents = load_documents(folder_path)  # load the text files in the chosen folder
        if not self.documents:  # nothing usable was found
            messagebox.showerror("错误", "未找到文本文件")
            return
        # Full cosine-similarity matrix
        self.similarity_matrix, self.term_vectors = compute_similarity_matrix(self.documents)
        # TOP10 high-frequency-word similarity matrix
        self.top10_similarity_matrix, self.top10_words_dict = compute_top10_similarity(self.documents)
        # natural sort so that, e.g., "file2.txt" comes before "file10.txt"
        self.filenames = sorted(list(self.documents.keys()),
                                key=lambda s: [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)])
        columns = ["File"] + self.filenames
        self.similarity_tree.configure(columns=columns)  # one column per file in the matrix view
        for col in self.filenames:
            self.similarity_tree.heading(col, text=col)
            self.similarity_tree.column(col, width=100, anchor=tk.CENTER)
        for file1 in self.filenames:
            values = [file1] + [f"{self.similarity_matrix[file1][file2]:.4f}" for file2 in self.filenames]
            self.similarity_tree.insert("", tk.END, values=values)  # one row of the matrix
        self.file1_combo['values'] = self.filenames  # populate the file pickers
        self.file2_combo['values'] = self.filenames
        self.file1_combo['state'] = 'readonly'
        self.file2_combo['state'] = 'readonly'  # read-only: selection from the list only
        # Populate the TOP10 base-file picker
        self.base_file_combo['values'] = self.filenames
        self.base_file_combo['state'] = 'readonly'
        if self.filenames:
            self.file1_var.set(self.filenames[0])
            self.file2_var.set(self.filenames[1] if len(self.filenames) > 1 else self.filenames[0])  # guard against single-file folders
            self.base_file_var.set(self.filenames[0])
        messagebox.showinfo("成功", f"分析完成,共处理 {len(self.filenames)} 个文件")
def query_common_terms(self):
file1, file2 = self.file1_var.get(), self.file2_var.get()
if not file1 or not file2 or file1 == file2:
messagebox.showerror("错误", "请选择不同的两个文件")
return
if file1 not in self.term_vectors or file2 not in self.term_vectors:
messagebox.showerror("错误", "文件未分析")
return
common_terms = get_common_terms(self.term_vectors[file1], self.term_vectors[file2])
filtered = [term for term in common_terms if should_include_term(term)]
self.result_text.delete(1.0, tk.END)
if not filtered:
self.result_text.insert(tk.END, f"{file1}{file2} 无公共词汇")
return
self.result_text.insert(tk.END, f"{file1}{file2} 的公共词汇 ({len(filtered)} 个):\n\n")
grouped = defaultdict(list)
for term in sorted(filtered):
clean = term.strip('"\'')
grouped[clean[0].upper() if clean else '#'].append(clean)
for initial in sorted(grouped.keys()):
terms = grouped[initial]
for i in range(0, len(terms), 10):
self.result_text.insert(tk.END, f"{initial}: {' '.join(terms[i:i + 10])}\n")#每行显示十个
def show_wordcloud(self):
"""生成类似图片的紧凑词云(公共词汇)"""
file1, file2 = self.file1_var.get(), self.file2_var.get()
if not file1 or not file2 or file1 == file2:
messagebox.showerror("错误", "请选择不同的两个文件")
return
if file1 not in self.term_vectors or file2 not in self.term_vectors:
messagebox.showerror("错误", "文件未分析")
return
        # Collect the filtered common vocabulary and its frequencies
common_terms = get_common_terms(self.term_vectors[file1], self.term_vectors[file2])
filtered_terms = [term for term in common_terms if should_include_term(term)]
if not filtered_terms:
messagebox.showinfo("提示", "无符合条件的公共词汇")
return
word_freq = {term: min(self.term_vectors[file1][term], self.term_vectors[file2][term]) for term in
filtered_terms}
        # Build the word cloud with pyecharts
words = [(word, freq) for word, freq in word_freq.items()]
(
WordCloud()
.add("", words, word_size_range=[20, 100])
.set_global_opts(
                title_opts=opts.TitleOpts(title=f"{file1} 与 {file2} 的公共词汇词云"),
tooltip_opts=opts.TooltipOpts(is_show=True)
)
.render("wordcloud_interactive.html")
)
webbrowser.open_new_tab("wordcloud_interactive.html")
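        # Note: render() writes wordcloud_interactive.html into the current working directory; if the
        # browser does not resolve the bare relative name, passing os.path.abspath("wordcloud_interactive.html")
        # to webbrowser.open_new_tab is a safer alternative.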
    def show_heatmap(self):  # render the cosine-similarity matrix as a matplotlib heatmap
if not self.similarity_matrix:
messagebox.showerror("错误", "请先分析文件")
return
matrix_size = len(self.filenames)
heatmap_matrix = np.zeros((matrix_size, matrix_size))
for i, file1 in enumerate(self.filenames):
for j, file2 in enumerate(self.filenames):
heatmap_matrix[i, j] = self.similarity_matrix[file1][file2]
fig, ax = plt.subplots(figsize=(14, 10))
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["blue", "white", "red"])
im = ax.imshow(heatmap_matrix, cmap=cmap)
ax.set_xticks(np.arange(matrix_size))
ax.set_yticks(np.arange(matrix_size))
ax.set_xticklabels(self.filenames, rotation=45, ha="right", fontsize=10)
ax.set_yticklabels(self.filenames, fontsize=10)
for i in range(matrix_size):
for j in range(matrix_size):
ax.text(j, i, f"{heatmap_matrix[i, j]:.0%}", ha="center", va="center",
color="black" if heatmap_matrix[i, j] < 0.5 else "white", fontsize=16)
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel("相似度", rotation=-90, va="bottom", fontsize=12)
ax.set_title("文本相似度热力图", fontsize=16)
plt.tight_layout()
plt.show()
    # TOP10 high-frequency-word features
    def show_top10_words(self):
        file = self.base_file_var.get()
        if not file or file not in self.documents:
            messagebox.showerror("错误", "请选择有效的文件")
            return
        # Re-count frequencies after applying the strict filter
        filtered_words = [word for word in self.documents[file] if should_include_term(word)]
        word_counts = Counter(filtered_words)
        top10 = word_counts.most_common(10)
        words = [w for w, _ in top10]
        counts = [c for _, c in top10]
        # Figure size and layout
        plt.figure(figsize=(14, 8))
        # Colour the bars by frequency band
        colors = []
        for count in counts:
            if count >= 15:
                colors.append('#FF4136')  # red: count >= 15
            elif count >= 10:
                colors.append('#FFDC00')  # yellow: 10 <= count < 15
            else:
                colors.append('#0074D9')  # blue: count < 10
        bars = plt.bar(words, counts, color=colors)
        # Horizontal grid lines for readability
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        # Rotate the x-axis labels and enlarge the font
        plt.xticks(rotation=30, ha='right', fontsize=14)
        # Value labels above each bar
        for bar, count in zip(bars, counts):
            height = bar.get_height()
            plt.text(
                bar.get_x() + bar.get_width() / 2,
                height + max(0.02 * max(counts), 0.3),  # offset adapts to the tallest bar
                f'{count}',
                ha='center',
                va='bottom',
                fontsize=14,
                fontweight='bold',
                color='darkslategrey'
            )
        # Colour legend
        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor='#FF4136', label='词频 ≥ 15'),
            Patch(facecolor='#FFDC00', label='10 ≤ 词频 < 15'),
            Patch(facecolor='#0074D9', label='词频 < 10')
        ]
        plt.legend(handles=legend_elements, fontsize=12)
        # Title and axis label
        plt.ylabel('词频', fontsize=16)
        plt.title(f'{file} 的TOP10高频词(过滤后)', fontsize=20, pad=15)
        # Apply the layout and show the chart
        plt.tight_layout()
        plt.show()
def show_top10_similarity(self):
"""生成优化后的基于TOP10高频词的可视化查重报告"""
file = self.base_file_var.get()
if not file or file not in self.top10_words_dict:
messagebox.showerror("错误", "请选择有效的文件")
return
        # Fetch and filter the similarity scores for the base file
similarities = self.top10_similarity_matrix[file]
sorted_files = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
valid_files = [item for item in sorted_files if item[0] != file and item[1] > 0]
        excluded_count = len(similarities) - 1 - len(valid_files)  # files excluded because their overlap with the base file is zero (the base file itself is also subtracted)
        # Create a temporary HTML report file (temp_file_path is assigned inside the with-block and used afterwards)
temp_file_path = None
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
html_content = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{file} 基于TOP10高频词的预查重报告</title>
<style>
:root {{
--primary: #2c3e50;
--secondary: #3498db;
--highlight: #e74c3c;
--medium: #f39c12;
--low: #2ecc71;
--light-bg: #f8f9fa;
--border: #dee2e6;
--base-font-size: 18px; /* 基础字体大小,统一调整 */
--heading-font-weight: 700; /* 标题字体加粗 */
--text-font-weight: 400; /* 正文字体粗细 */
}}
body {{
font-family: "SimHei", "Microsoft YaHei", sans-serif;
margin: 0;
padding: 20px;
color: var(--primary);
background-color: #f5f7fa;
font-size: var(--base-font-size); /* 应用基础字体大小 */
}}
.container {{
max-width: 1200px;
margin: 0 auto;
background: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 2px 15px rgba(0,0,0,0.1);
}}
h1 {{
color: var(--primary);
text-align: center;
margin-bottom: 30px;
padding-bottom: 15px;
border-bottom: 2px solid var(--secondary);
font-size: 28px; /* 加大标题字体 */
font-weight: var(--heading-font-weight);
}}
.report-section {{
margin-bottom: 35px;
padding: 20px;
background-color: var(--light-bg);
border-radius: 8px;
}}
.report-section h2 {{
color: var(--secondary);
margin-top: 0;
padding-bottom: 10px;
border-bottom: 1px dashed var(--border);
font-size: 22px; /* 加大二级标题字体 */
font-weight: var(--heading-font-weight);
}}
.top10-words {{
display: flex;
flex-wrap: wrap;
gap: 12px;
margin-top: 15px;
}}
.word-item {{
background-color: white;
border: 2px solid var(--secondary);
padding: 8px 15px;
border-radius: 20px;
font-weight: var(--heading-font-weight); /* 高频词标签加粗 */
display: inline-flex;
align-items: center;
box-shadow: 0 2px 5px rgba(0,0,0,0.05);
font-size: 18px; /* 高频词标签字体大小 */
}}
.word-item::before {{
content: attr(data-index);
background: var(--secondary);
color: white;
width: 24px; /* 加大圆形背景 */
height: 24px;
border-radius: 50%;
display: inline-block;
text-align: center;
font-size: 15px; /* 编号字体大小 */
margin-right: 8px;
line-height: 24px; /* 垂直居中 */
}}
.similarity-table {{
width: 100%;
border-collapse: collapse;
margin-top: 15px;
overflow: hidden;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.08);
}}
.similarity-table th {{
background-color: var(--primary);
color: white;
text-align: center;
padding: 12px 10px;
font-weight: var(--heading-font-weight);
font-size: 20px; /* 表头字体大小 */
}}
.similarity-table td {{
border: 1px solid var(--border);
padding: 12px 10px;
text-align: center;
font-size: 18px; /* 表格内容字体大小 */
font-weight: var(--text-font-weight);
}}
.similarity-table tr:nth-child(even) {{
background-color: #f9f9f9;
}}
.similarity-table tr:hover {{
background-color: #f1f7ff;
}}
.progress-bar {{
height: 24px;
background-color: #e9ecef;
border-radius: 12px;
overflow: hidden;
position: relative;
box-shadow: inset 0 1px 3px rgba(0,0,0,0.1);
}}
.progress {{
height: 100%;
transition: width 0.5s ease;
}}
.high {{ background-color: var(--highlight); }}
.medium {{ background-color: var(--medium); }}
.low {{ background-color: var(--low); }}
.common-words {{
font-size: 16px; /* 共同高频词字体大小,稍小于基础 */
color: #495057;
word-break: break-all;
font-weight: var(--text-font-weight);
}}
.conclusion {{
background-color: white;
border-left: 4px solid var(--secondary);
padding: 15px 20px;
margin-top: 15px;
border-radius: 0 5px 5px 0;
line-height: 1.6;
font-size: 18px; /* 结论字体大小 */
font-weight: var(--text-font-weight);
}}
.stats {{
font-weight: var(--heading-font-weight);
color: var(--highlight);
}}
</style>
</head>
<body>
<div class="container">
<h1>{file} 的TOP10高频词预查重报告</h1>
<div class="report-section">
<h2>基准文件TOP10高频词</h2>
<div class="top10-words">
"""
            # Emit numbered tags for the base file's TOP10 words
base_top10 = self.top10_words_dict[file]
for i, word in enumerate(base_top10, 1):
html_content += f"""
<div class="word-item" data-index="{i}">
{word}
</div>
"""
html_content += f"""
</div>
</div>
<div class="report-section">
<h2>与其他文件的相似度比较</h2>
<table class="similarity-table">
<tr>
<th>文件名称</th>
<th>相似度</th>
<th>相似度直观展示</th>
<th>共同高频词</th>
</tr>
"""
            # Emit one table row per compared file, with a colour-graded progress bar
for other_file, sim in valid_files:
common_words = list(set(self.top10_words_dict[file]) & set(self.top10_words_dict[other_file]))
common_words_str = ", ".join(common_words)
                # Progress-bar colour by similarity band
if sim >= 0.7:
progress_class = "high"
elif sim >= 0.3:
progress_class = "medium"
else:
progress_class = "low"
html_content += f"""
<tr>
<td>{other_file}</td>
<td>{sim:.2%}</td>
<td>
<div class="progress-bar">
<div class="progress {progress_class}" style="width: {sim * 100}%"></div>
</div>
</td>
<td class="common-words">{common_words_str}</td>
</tr>
"""
html_content += f"""
</table>
</div>
<div class="report-section">
<h2>查重结论与建议</h2>
<div class="conclusion">
<p>本次预查重针对基准文件 <strong>{file}</strong> 完成以下分析:</p>
<ul>
<li>共检测到 {len(similarities) - 1} 个对比文件,其中 <span class="stats">{excluded_count} 个文件与基准文件重复度为0</span>,已排除无需后续查重</li>
<li>剩余 {len(valid_files)} 个文件中:
<ul>
<li>高相似度文件≥70%{sum(1 for f, s in valid_files if s >= 0.7)} 个,建议重点核查</li>
<li>中等相似度文件30%-70%{sum(1 for f, s in valid_files if 0.3 <= s < 0.7)} 个,建议适当关注</li>
<li>低相似度文件(<30%{sum(1 for f, s in valid_files if s < 0.3)} 个,相关性较低</li>
</ul>
</li>
<li>共同高频词分析显示:{base_top10[0]}{base_top10[1]} 等核心词汇在多个文件中出现频率较高,可作为重点比对特征词</li>
</ul>
</div>
</div>
</div>
</body>
</html>
"""
f.write(html_content)
            temp_file_path = f.name  # keep the path so the report can be opened after the with-block
        # Open the generated report in the default browser
if temp_file_path:
webbrowser.open_new_tab(temp_file_path)
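        # Note: because delete=False is used above, generated reports accumulate in the system
        # temp directory until they are removed manually.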
if __name__ == "__main__":
    root = tk.Tk()  # create the main window
    app = TextSimilarityAnalyzer(root)  # build the analyzer UI
    root.mainloop()  # enter the Tk event loop and wait for user interaction