Spaces:
Sleeping
Sleeping
app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# app_refactored.py - 重构后的展示系统
|
| 2 |
import gradio as gr
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
|
@@ -10,6 +9,7 @@ from openai import OpenAI
|
|
| 10 |
import re
|
| 11 |
import spacy
|
| 12 |
from spellchecker import SpellChecker
|
|
|
|
| 13 |
|
| 14 |
# ======================== API配置 ========================
|
| 15 |
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
|
|
@@ -60,6 +60,109 @@ Next, please correct the following sentence according to the above requirements.
|
|
| 60 |
|
| 61 |
[input]: """
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# ======================== 工具函数 ========================
|
| 64 |
def check_api_key():
|
| 65 |
if not DEEPSEEK_API_KEY:
|
|
@@ -164,52 +267,40 @@ def calculate_spelling_error_density(sentences):
|
|
| 164 |
|
| 165 |
# ======================== Leaderboard数据处理 ========================
|
| 166 |
def load_leaderboard_data():
    """Load the leaderboard table from ``leaderboard.json``.

    The JSON file is expected to already carry a ``Category`` field for each
    entry, so the parsed data is handed to pandas without post-processing.

    Returns:
        A ``pandas.DataFrame`` built from the parsed JSON. If the file cannot
        be read or parsed, the error is printed and an empty frame is
        returned. That empty frame still carries the leaderboard columns so
        that later column selection and filtering do not raise ``KeyError``.
    """
    json_path = "leaderboard.json"
    # Columns the UI selects from the leaderboard when searching/filtering.
    expected_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return pd.DataFrame(data)
    except Exception as e:
        # Best-effort boundary: the app should still start without the file.
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame(columns=expected_columns)
|
| 178 |
|
| 179 |
-
def make_clickable_download(download_text):
    """Convert a Markdown ``[下载](url)`` link into an HTML anchor.

    Args:
        download_text: Cell content that may contain a Markdown download link.

    Returns:
        An ``<a>`` element pointing at the link target when ``download_text``
        is a string containing ``[下载](``; otherwise ``download_text`` is
        returned unchanged (including non-string values such as ``None``).
    """
    from html import escape

    marker = '[下载]('
    if not isinstance(download_text, str) or marker not in download_text:
        return download_text

    # Start scanning right after the marker, so any parentheses that appear
    # before the link cannot be mistaken for the URL.
    remainder = download_text.partition(marker)[2]

    # Walk to the parenthesis that closes the link, allowing balanced
    # parentheses inside the URL itself. If no closing parenthesis exists,
    # the whole remainder is used as the URL.
    depth = 0
    url_chars = []
    for ch in remainder:
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                break
            depth -= 1
        url_chars.append(ch)
    url = ''.join(url_chars)

    # Escape the URL because it is placed inside a double-quoted attribute.
    return f'<a href="{escape(url, quote=True)}" class="download-link" target="_blank">下载</a>'
|
| 185 |
-
|
| 186 |
def filter_leaderboard(df, query):
    """Restrict the leaderboard to a single category.

    Args:
        df: Leaderboard frame with a ``Category`` column.
        query: Category code to keep, or ``"all"`` to disable filtering.

    Returns:
        ``df`` itself when ``query`` is ``"all"``; otherwise the rows whose
        ``Category`` equals ``query``.
    """
    if query != "all":
        category_mask = df['Category'] == query
        return df[category_mask]
    return df
|
| 192 |
|
| 193 |
def search_leaderboard(df, query):
    """Search the leaderboard by benchmark name.

    Args:
        df: Leaderboard frame with a ``Benchmark`` column.
        query: Text typed by the user; an empty/falsy value disables the
            search.

    Returns:
        ``df`` itself when ``query`` is empty; otherwise the rows whose
        ``Benchmark`` contains ``query`` as a case-insensitive substring.
        Rows with a missing benchmark name never match.
    """
    if not query:
        return df
    # regex=False: the query is literal user input. Characters such as
    # "+", "(" or "." must not be interpreted as a pattern, where they could
    # raise re.error or match unrelated rows.
    matches = df['Benchmark'].str.contains(query, case=False, na=False, regex=False)
    return df[matches]
|
| 198 |
|
| 199 |
-
# ======================== 数据清洗函数
|
| 200 |
def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
|
| 201 |
try:
|
| 202 |
try:
|
| 203 |
check_api_key()
|
| 204 |
except ValueError as e:
|
| 205 |
-
return str(e), None, None
|
| 206 |
|
| 207 |
progress(0.05, desc="📁 读取数据文件...")
|
| 208 |
df = pd.read_parquet(file_path)
|
| 209 |
|
| 210 |
if question_column not in df.columns:
|
| 211 |
available_columns = ", ".join(df.columns.tolist())
|
| 212 |
-
return f"❌ 列名 '{question_column}' 不存在!\n可用列名: {available_columns}", None, None
|
| 213 |
|
| 214 |
data_ori = df[question_column].tolist()[:int(max_samples)]
|
| 215 |
total = len(data_ori)
|
|
@@ -320,6 +411,9 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 320 |
log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
|
| 321 |
log_text += f"{'='*50}\n"
|
| 322 |
|
|
|
|
|
|
|
|
|
|
| 323 |
preview_df = pd.DataFrame({
|
| 324 |
'原始问题': [str(x)[:100] for x in data_ori[:5]],
|
| 325 |
'清洗后问题': [str(x)[:100] for x in lst_final[:5]]
|
|
@@ -327,12 +421,12 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 327 |
|
| 328 |
progress(1.0, desc="✅ 完成!")
|
| 329 |
|
| 330 |
-
return log_text, output_path, preview_df
|
| 331 |
|
| 332 |
except Exception as e:
|
| 333 |
import traceback
|
| 334 |
error_detail = traceback.format_exc()
|
| 335 |
-
return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
|
| 336 |
|
| 337 |
# ======================== 文本内容 ========================
|
| 338 |
ABOUT_TEXT = """
|
|
@@ -369,11 +463,18 @@ ABOUT_TEXT = """
|
|
| 369 |
- **CoQA**: 对话问答
|
| 370 |
- 以及更多...
|
| 371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
### 技术栈
|
| 373 |
|
| 374 |
- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
|
| 375 |
- **前端**: Gradio 4.16.0
|
| 376 |
- **数据处理**: Pandas + PyArrow (Parquet)
|
|
|
|
| 377 |
- **API调用**: OpenAI SDK
|
| 378 |
- **部署**: Hugging Face Spaces
|
| 379 |
|
|
@@ -382,63 +483,15 @@ ABOUT_TEXT = """
|
|
| 382 |
- **WAR (Whitespace Anomaly Rate)**: 空白符异常率
|
| 383 |
- **SED (Spelling Error Density)**: 拼写错误密度
|
| 384 |
|
| 385 |
-
### 数据集分类
|
| 386 |
-
|
| 387 |
-
- **BT (Basic Tasks)**: 基础任务 - MRPC, RTE, SST2
|
| 388 |
-
- **RA (Reasoning Abilities)**: 推理能力 - ARC, GSM8K, MMLU
|
| 389 |
-
- **TG (Text Generation)**: 文本生成 - CoQA, DROP, Truthful_QA
|
| 390 |
-
- **SU (Speech Understanding)**: 语音理解 - WNLI, Natural_questions
|
| 391 |
-
- **ME (Medical)**: 医学领域 - MedMCQA, MedQA, PubMedQA
|
| 392 |
-
- **GR (Grammatical)**: 语法领域 - BEA-2019, CoNLL-2014
|
| 393 |
-
|
| 394 |
-
### 使用说明
|
| 395 |
-
|
| 396 |
-
1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
|
| 397 |
-
2. **上传数据集**: 选择 `.parquet` 文件
|
| 398 |
-
3. **指定列名**: 输入包含问题的列名(通常是 `question`)
|
| 399 |
-
4. **调整参数**: 选择模型、temperature等
|
| 400 |
-
5. **开始清洗**: 点击按钮开始处理
|
| 401 |
-
6. **下载结果**: 下载 `XXX-Denoising.parquet` 文件
|
| 402 |
-
|
| 403 |
-
⚠️ **重要提示**:
|
| 404 |
-
- Demo版本限制最多处理100个样本
|
| 405 |
-
- 完整版本可处理数万样本
|
| 406 |
-
- 建议 temperature=0.1 以获得稳定输出
|
| 407 |
-
|
| 408 |
---
|
| 409 |
|
| 410 |
**研究生毕业论文成果展示** | Powered by DeepSeek API
|
| 411 |
"""
|
| 412 |
|
| 413 |
-
SUBMISSION_TEXT = """
|
| 414 |
-
## 提交说明
|
| 415 |
-
|
| 416 |
-
### 如何提交新的去噪结果
|
| 417 |
-
|
| 418 |
-
1. **准备数据**: 使用本系统对benchmark数据集进行去噪
|
| 419 |
-
2. **记录指标**: 记录WAR和SED指标
|
| 420 |
-
3. **提交PR**: 在GitHub上提交Pull Request
|
| 421 |
-
4. **审核**: 等待维护者审核
|
| 422 |
-
|
| 423 |
-
### 数据格式要求
|
| 424 |
-
|
| 425 |
-
提交的数据需要包含以下字段:
|
| 426 |
-
- ID: 序号
|
| 427 |
-
- Category: 类别 (BT/RA/TG/SU/ME/GR)
|
| 428 |
-
- Benchmark名称
|
| 429 |
-
- WAR (%)
|
| 430 |
-
- SED
|
| 431 |
-
- Download: 下载链接
|
| 432 |
-
|
| 433 |
-
### 联系方式
|
| 434 |
-
|
| 435 |
-
如有问题,请通过以下方式联系:
|
| 436 |
-
- GitHub Issues
|
| 437 |
-
- Email: your-email@example.com
|
| 438 |
-
"""
|
| 439 |
-
|
| 440 |
# ======================== Gradio界面 ========================
|
| 441 |
-
demo = gr.Blocks(title="数据集清洗框架展示系统"
|
|
|
|
|
|
|
| 442 |
|
| 443 |
with demo:
|
| 444 |
gr.Markdown(
|
|
@@ -449,11 +502,9 @@ with demo:
|
|
| 449 |
elem_classes="markdown-text"
|
| 450 |
)
|
| 451 |
|
| 452 |
-
# 加载leaderboard数据
|
| 453 |
leaderboard_data = load_leaderboard_data()
|
| 454 |
|
| 455 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 456 |
-
# ==================== Tab 1: Evaluation Table ====================
|
| 457 |
with gr.TabItem("📊 Evaluation Table", id=0):
|
| 458 |
with gr.Column():
|
| 459 |
gr.Markdown("### 清洗效果排行榜")
|
|
@@ -484,7 +535,6 @@ with demo:
|
|
| 484 |
visible=False
|
| 485 |
)
|
| 486 |
|
| 487 |
-
# 绑定搜索和筛选
|
| 488 |
search_bar.submit(
|
| 489 |
lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
|
| 490 |
[hidden_leaderboard, search_bar],
|
|
@@ -502,22 +552,15 @@ with demo:
|
|
| 502 |
- **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
|
| 503 |
- **WAR**: 空白符异常率变化 (正值表示改善)
|
| 504 |
- **SED**: 拼写错误密度变化 (负值表示改善)
|
| 505 |
-
- 绿色: 正向提升 | 红色: 负向影响
|
| 506 |
""", elem_classes="markdown-text")
|
| 507 |
|
| 508 |
-
# ==================== Tab 2: Performance Plot ====================
|
| 509 |
with gr.TabItem("📈 Performance Plot", id=1):
|
| 510 |
gr.Markdown("### 性能可视化分析")
|
| 511 |
gr.Markdown("**注意**: 性能图表功能开发中,敬请期待。")
|
| 512 |
-
|
| 513 |
-
# 这里可以添加性能图表
|
| 514 |
-
# 例如: WAR和SED的对比图、不同方法的效果对比等
|
| 515 |
|
| 516 |
-
# ==================== Tab 3: About ====================
|
| 517 |
with gr.TabItem("📝 About", id=2):
|
| 518 |
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
|
| 519 |
|
| 520 |
-
# ==================== Tab 4: Submit Results ====================
|
| 521 |
with gr.TabItem("🚀 Submit Results", id=3):
|
| 522 |
gr.Markdown("## 提交去噪结果")
|
| 523 |
|
|
@@ -565,28 +608,29 @@ with demo:
|
|
| 565 |
max_lines=15
|
| 566 |
)
|
| 567 |
|
| 568 |
-
preview_df = gr.Dataframe(
|
| 569 |
-
label="🔍 结果预览",
|
| 570 |
-
wrap=True
|
| 571 |
-
)
|
| 572 |
-
|
| 573 |
download_file = gr.File(label="📥 下载去噪后的数据集")
|
| 574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
clean_btn.click(
|
| 576 |
fn=clean_dataset,
|
| 577 |
inputs=[file_input, question_column, model_choice, temperature, max_samples],
|
| 578 |
-
outputs=[output_text, download_file, preview_df]
|
| 579 |
)
|
| 580 |
-
|
| 581 |
-
gr.Markdown("""
|
| 582 |
-
### WAC-GEC方法 (开发中)
|
| 583 |
-
|
| 584 |
-
WAC-GEC (Whitespace Anomaly Correction - Grammar Error Correction) 方法结合了:
|
| 585 |
-
- 空白符异常检测与修正
|
| 586 |
-
- 语法错误检测与修正
|
| 587 |
-
|
| 588 |
-
该功能即将上线,敬请期待!
|
| 589 |
-
""", elem_classes="markdown-text")
|
| 590 |
|
| 591 |
if __name__ == "__main__":
|
| 592 |
demo.launch(
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import json
|
| 3 |
import pandas as pd
|
|
|
|
| 9 |
import re
|
| 10 |
import spacy
|
| 11 |
from spellchecker import SpellChecker
|
| 12 |
+
import difflib
|
| 13 |
|
| 14 |
# ======================== API配置 ========================
|
| 15 |
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
|
|
|
|
| 60 |
|
| 61 |
[input]: """
|
| 62 |
|
| 63 |
+
# ======================== 新增:颜色对比函数 ========================
|
| 64 |
+
def generate_colored_diff(original, cleaned):
    """Build word-level, color-annotated HTML for a before/after text pair.

    Both texts are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. Unchanged words are emitted as plain text;
    replaced or deleted words in the original are wrapped in red spans
    (deleted ones with a strike-through); replaced or inserted words in the
    cleaned text are wrapped in green spans.

    Every word is HTML-escaped before being emitted, so dataset text that
    contains ``<``, ``>`` or ``&`` is displayed literally instead of being
    interpreted as markup.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.

    Returns:
        A ``(original_html, cleaned_html)`` tuple of HTML fragments whose
        words are joined by single spaces.
    """
    from html import escape

    def _plain(words):
        # Unchanged words: escaped, no styling.
        return [escape(w, quote=False) for w in words]

    def _styled(words, style):
        # Changed words: escaped and wrapped in a span carrying ``style``.
        return [f'<span style="{style}">{escape(w, quote=False)}</span>' for w in words]

    red_bold = "color: #dc3545; font-weight: bold;"
    red_strike = "color: #dc3545; text-decoration: line-through;"
    green_bold = "color: #28a745; font-weight: bold;"

    # Word-level tokenization.
    original_words = original.split()
    cleaned_words = cleaned.split()

    matcher = difflib.SequenceMatcher(None, original_words, cleaned_words)

    original_html = []
    cleaned_html = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            original_html.extend(_plain(original_words[i1:i2]))
            cleaned_html.extend(_plain(cleaned_words[j1:j2]))
        elif tag == 'replace':
            # Replacement: old words in red, new words in green.
            original_html.extend(_styled(original_words[i1:i2], red_bold))
            cleaned_html.extend(_styled(cleaned_words[j1:j2], green_bold))
        elif tag == 'delete':
            # Deletion: only the original side changes (red, struck through).
            original_html.extend(_styled(original_words[i1:i2], red_strike))
        elif tag == 'insert':
            # Insertion: only the cleaned side changes (green).
            cleaned_html.extend(_styled(cleaned_words[j1:j2], green_bold))

    return ' '.join(original_html), ' '.join(cleaned_html)
|
| 101 |
+
|
| 102 |
+
def create_comparison_html(original_list, cleaned_list):
    """Render an HTML table comparing original and cleaned texts row by row.

    Each pair is converted to ``str`` and passed to ``generate_colored_diff``;
    the two returned fragments fill the second and third columns, and the
    first column shows a 1-based row index. Pairs are formed with ``zip``, so
    extra items in the longer list are ignored.

    Args:
        original_list: Iterable of texts before cleaning.
        cleaned_list: Iterable of texts after cleaning.

    Returns:
        A single HTML string containing the embedded stylesheet and the table.
    """
    # Fragments are collected in a list and joined once at the end instead of
    # growing one string with ``+=`` on every row.
    parts = ["""
    <div style="font-family: 'Segoe UI', Arial, sans-serif; max-width: 100%; overflow-x: auto;">
        <style>
            .comparison-table {
                width: 100%;
                border-collapse: collapse;
                margin: 20px 0;
                box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            }
            .comparison-table th {
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
                padding: 12px;
                text-align: left;
                font-weight: 600;
            }
            .comparison-table td {
                padding: 12px;
                border-bottom: 1px solid #e0e0e0;
                line-height: 1.6;
            }
            .comparison-table tr:hover {
                background-color: #f8f9fa;
            }
            .index-col {
                width: 50px;
                text-align: center;
                font-weight: bold;
                color: #6c757d;
            }
        </style>
        <table class="comparison-table">
            <thead>
                <tr>
                    <th class="index-col">#</th>
                    <th>原始问题 (红色=错误)</th>
                    <th>清洗后问题 (绿色=修正)</th>
                </tr>
            </thead>
            <tbody>
    """]

    for idx, (orig, clean) in enumerate(zip(original_list, cleaned_list), 1):
        orig_colored, clean_colored = generate_colored_diff(str(orig), str(clean))
        parts.append(f"""
                <tr>
                    <td class="index-col">{idx}</td>
                    <td>{orig_colored}</td>
                    <td>{clean_colored}</td>
                </tr>
        """)

    parts.append("""
            </tbody>
        </table>
    </div>
    """)

    return ''.join(parts)
|
| 165 |
+
|
| 166 |
# ======================== 工具函数 ========================
|
| 167 |
def check_api_key():
|
| 168 |
if not DEEPSEEK_API_KEY:
|
|
|
|
| 267 |
|
| 268 |
# ======================== Leaderboard数据处理 ========================
|
| 269 |
def load_leaderboard_data():
    """Load the leaderboard table from ``leaderboard.json``.

    Returns:
        A ``pandas.DataFrame`` built from the parsed JSON. If the file cannot
        be read or parsed, the error is printed and an empty frame is
        returned. That empty frame still carries the leaderboard columns so
        that later column selection and filtering do not raise ``KeyError``.
    """
    json_path = "leaderboard.json"
    # Columns the UI selects from the leaderboard when searching/filtering.
    expected_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return pd.DataFrame(data)
    except Exception as e:
        # Best-effort boundary: the app should still start without the file.
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame(columns=expected_columns)
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
def filter_leaderboard(df, query):
    """Restrict the leaderboard to a single category.

    Args:
        df: Leaderboard frame with a ``Category`` column.
        query: Category code to keep, or ``"all"`` to disable filtering.

    Returns:
        ``df`` itself when ``query`` is ``"all"``; otherwise the rows whose
        ``Category`` equals ``query``.
    """
    if query != "all":
        category_mask = df['Category'] == query
        return df[category_mask]
    return df
|
| 284 |
|
| 285 |
def search_leaderboard(df, query):
    """Search the leaderboard by benchmark name.

    Args:
        df: Leaderboard frame with a ``Benchmark`` column.
        query: Text typed by the user; an empty/falsy value disables the
            search.

    Returns:
        ``df`` itself when ``query`` is empty; otherwise the rows whose
        ``Benchmark`` contains ``query`` as a case-insensitive substring.
        Rows with a missing benchmark name never match.
    """
    if not query:
        return df
    # regex=False: the query is literal user input. Characters such as
    # "+", "(" or "." must not be interpreted as a pattern, where they could
    # raise re.error or match unrelated rows.
    matches = df['Benchmark'].str.contains(query, case=False, na=False, regex=False)
    return df[matches]
|
| 289 |
|
| 290 |
+
# ======================== 数据清洗函数(修改版)========================
|
| 291 |
def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
|
| 292 |
try:
|
| 293 |
try:
|
| 294 |
check_api_key()
|
| 295 |
except ValueError as e:
|
| 296 |
+
return str(e), None, None, ""
|
| 297 |
|
| 298 |
progress(0.05, desc="📁 读取数据文件...")
|
| 299 |
df = pd.read_parquet(file_path)
|
| 300 |
|
| 301 |
if question_column not in df.columns:
|
| 302 |
available_columns = ", ".join(df.columns.tolist())
|
| 303 |
+
return f"❌ 列名 '{question_column}' 不存在!\n可用列名: {available_columns}", None, None, ""
|
| 304 |
|
| 305 |
data_ori = df[question_column].tolist()[:int(max_samples)]
|
| 306 |
total = len(data_ori)
|
|
|
|
| 411 |
log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
|
| 412 |
log_text += f"{'='*50}\n"
|
| 413 |
|
| 414 |
+
# 生成带颜色的对比HTML
|
| 415 |
+
preview_html = create_comparison_html(data_ori[:5], lst_final[:5])
|
| 416 |
+
|
| 417 |
preview_df = pd.DataFrame({
|
| 418 |
'原始问题': [str(x)[:100] for x in data_ori[:5]],
|
| 419 |
'清洗后问题': [str(x)[:100] for x in lst_final[:5]]
|
|
|
|
| 421 |
|
| 422 |
progress(1.0, desc="✅ 完成!")
|
| 423 |
|
| 424 |
+
return log_text, output_path, preview_df, preview_html
|
| 425 |
|
| 426 |
except Exception as e:
|
| 427 |
import traceback
|
| 428 |
error_detail = traceback.format_exc()
|
| 429 |
+
return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None, ""
|
| 430 |
|
| 431 |
# ======================== 文本内容 ========================
|
| 432 |
ABOUT_TEXT = """
|
|
|
|
| 463 |
- **CoQA**: 对话问答
|
| 464 |
- 以及更多...
|
| 465 |
|
| 466 |
+
### 颜色标注说明
|
| 467 |
+
|
| 468 |
+
- 🔴 **红色**: 原始文本中的错误(拼写、语法、空格等)
|
| 469 |
+
- 🟢 **绿色**: 清洗后的修正内容
|
| 470 |
+
- ⚫ **黑色**: 未修改的正确部分
|
| 471 |
+
|
| 472 |
### 技术栈
|
| 473 |
|
| 474 |
- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
|
| 475 |
- **前端**: Gradio 4.16.0
|
| 476 |
- **数据处理**: Pandas + PyArrow (Parquet)
|
| 477 |
+
- **差异对比**: Python difflib
|
| 478 |
- **API调用**: OpenAI SDK
|
| 479 |
- **部署**: Hugging Face Spaces
|
| 480 |
|
|
|
|
| 483 |
- **WAR (Whitespace Anomaly Rate)**: 空白符异常率
|
| 484 |
- **SED (Spelling Error Density)**: 拼写错误密度
|
| 485 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
---
|
| 487 |
|
| 488 |
**研究生毕业论文成果展示** | Powered by DeepSeek API
|
| 489 |
"""
|
| 490 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
# ======================== Gradio界面 ========================
|
| 492 |
+
demo = gr.Blocks(title="数据集清洗框架展示系统", css="""
|
| 493 |
+
.markdown-text { font-size: 16px; line-height: 1.6; }
|
| 494 |
+
""")
|
| 495 |
|
| 496 |
with demo:
|
| 497 |
gr.Markdown(
|
|
|
|
| 502 |
elem_classes="markdown-text"
|
| 503 |
)
|
| 504 |
|
|
|
|
| 505 |
leaderboard_data = load_leaderboard_data()
|
| 506 |
|
| 507 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
|
| 508 |
with gr.TabItem("📊 Evaluation Table", id=0):
|
| 509 |
with gr.Column():
|
| 510 |
gr.Markdown("### 清洗效果排行榜")
|
|
|
|
| 535 |
visible=False
|
| 536 |
)
|
| 537 |
|
|
|
|
| 538 |
search_bar.submit(
|
| 539 |
lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
|
| 540 |
[hidden_leaderboard, search_bar],
|
|
|
|
| 552 |
- **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
|
| 553 |
- **WAR**: 空白符异常率变化 (正值表示改善)
|
| 554 |
- **SED**: 拼写错误密度变化 (负值表示改善)
|
|
|
|
| 555 |
""", elem_classes="markdown-text")
|
| 556 |
|
|
|
|
| 557 |
with gr.TabItem("📈 Performance Plot", id=1):
|
| 558 |
gr.Markdown("### 性能可视化分析")
|
| 559 |
gr.Markdown("**注意**: 性能图表功能开发中,敬请期待。")
|
|
|
|
|
|
|
|
|
|
| 560 |
|
|
|
|
| 561 |
with gr.TabItem("📝 About", id=2):
|
| 562 |
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
|
| 563 |
|
|
|
|
| 564 |
with gr.TabItem("🚀 Submit Results", id=3):
|
| 565 |
gr.Markdown("## 提交去噪结果")
|
| 566 |
|
|
|
|
| 608 |
max_lines=15
|
| 609 |
)
|
| 610 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
download_file = gr.File(label="📥 下载去噪后的数据集")
|
| 612 |
|
| 613 |
+
# 新增:颜色对比预览区域
|
| 614 |
+
gr.Markdown("### 🎨 清洗效果对比预览")
|
| 615 |
+
gr.Markdown("""
|
| 616 |
+
**颜色说明**:
|
| 617 |
+
- 🔴 <span style="color: #dc3545;">红色</span> = 原始文本中的错误
|
| 618 |
+
- 🟢 <span style="color: #28a745;">绿色</span> = 清洗后的修正
|
| 619 |
+
- ⚫ 黑色 = 未修改的正确部分
|
| 620 |
+
""")
|
| 621 |
+
|
| 622 |
+
colored_preview = gr.HTML(label="带颜色标注的对比")
|
| 623 |
+
|
| 624 |
+
preview_df = gr.Dataframe(
|
| 625 |
+
label="🔍 原始对比表格",
|
| 626 |
+
wrap=True
|
| 627 |
+
)
|
| 628 |
+
|
| 629 |
clean_btn.click(
|
| 630 |
fn=clean_dataset,
|
| 631 |
inputs=[file_input, question_column, model_choice, temperature, max_samples],
|
| 632 |
+
outputs=[output_text, download_file, preview_df, colored_preview]
|
| 633 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
|
| 635 |
if __name__ == "__main__":
|
| 636 |
demo.launch(
|