lllouo commited on
Commit
7ee1568
·
1 Parent(s): 27ccef7
Files changed (1) hide show
  1. app.py +139 -95
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # app_refactored.py - 重构后的展示系统
2
  import gradio as gr
3
  import json
4
  import pandas as pd
@@ -10,6 +9,7 @@ from openai import OpenAI
10
  import re
11
  import spacy
12
  from spellchecker import SpellChecker
 
13
 
14
  # ======================== API配置 ========================
15
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
@@ -60,6 +60,109 @@ Next, please correct the following sentence according to the above requirements.
60
 
61
  [input]: """
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  # ======================== 工具函数 ========================
64
  def check_api_key():
65
  if not DEEPSEEK_API_KEY:
@@ -164,52 +267,40 @@ def calculate_spelling_error_density(sentences):
164
 
165
  # ======================== Leaderboard数据处理 ========================
166
  def load_leaderboard_data():
167
- """从JSON加载Leaderboard数据(现在包含Category字段)"""
168
  json_path = "leaderboard.json"
169
  try:
170
  with open(json_path, 'r', encoding='utf-8') as f:
171
  data = json.load(f)
172
-
173
- # Category已经在JSON中定义,直接加载即可
174
  return pd.DataFrame(data)
175
  except Exception as e:
176
  print(f"Error loading leaderboard: {e}")
177
  return pd.DataFrame()
178
 
179
- def make_clickable_download(download_text):
180
- """将Markdown链接转换为HTML链接"""
181
- if '[下载](' in download_text:
182
- url = download_text.split('(')[1].rstrip(')')
183
- return f'<a href="{url}" class="download-link" target="_blank">下载</a>'
184
- return download_text
185
-
186
  def filter_leaderboard(df, query):
187
- """根据Category筛选Leaderboard"""
188
  if query == "all":
189
  return df
190
  else:
191
  return df[df['Category'] == query]
192
 
193
  def search_leaderboard(df, query):
194
- """搜索Leaderboard"""
195
  if not query:
196
  return df
197
  return df[df['Benchmark'].str.contains(query, case=False, na=False)]
198
 
199
- # ======================== 数据清洗函数 ========================
200
  def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
201
  try:
202
  try:
203
  check_api_key()
204
  except ValueError as e:
205
- return str(e), None, None
206
 
207
  progress(0.05, desc="📁 读取数据文件...")
208
  df = pd.read_parquet(file_path)
209
 
210
  if question_column not in df.columns:
211
  available_columns = ", ".join(df.columns.tolist())
212
- return f"❌ 列名 '{question_column}' 不存在!\n可用列名: {available_columns}", None, None
213
 
214
  data_ori = df[question_column].tolist()[:int(max_samples)]
215
  total = len(data_ori)
@@ -320,6 +411,9 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
320
  log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
321
  log_text += f"{'='*50}\n"
322
 
 
 
 
323
  preview_df = pd.DataFrame({
324
  '原始问题': [str(x)[:100] for x in data_ori[:5]],
325
  '清洗后问题': [str(x)[:100] for x in lst_final[:5]]
@@ -327,12 +421,12 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
327
 
328
  progress(1.0, desc="✅ 完成!")
329
 
330
- return log_text, output_path, preview_df
331
 
332
  except Exception as e:
333
  import traceback
334
  error_detail = traceback.format_exc()
335
- return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
336
 
337
  # ======================== 文本内容 ========================
338
  ABOUT_TEXT = """
@@ -369,11 +463,18 @@ ABOUT_TEXT = """
369
  - **CoQA**: 对话问答
370
  - 以及更多...
371
 
 
 
 
 
 
 
372
  ### 技术栈
373
 
374
  - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
375
  - **前端**: Gradio 4.16.0
376
  - **数据处理**: Pandas + PyArrow (Parquet)
 
377
  - **API调用**: OpenAI SDK
378
  - **部署**: Hugging Face Spaces
379
 
@@ -382,63 +483,15 @@ ABOUT_TEXT = """
382
  - **WAR (Whitespace Anomaly Rate)**: 空白符异常率
383
  - **SED (Spelling Error Density)**: 拼写错误密度
384
 
385
- ### 数据集分类
386
-
387
- - **BT (Basic Tasks)**: 基础任务 - MRPC, RTE, SST2
388
- - **RA (Reasoning Abilities)**: 推理能力 - ARC, GSM8K, MMLU
389
- - **TG (Text Generation)**: 文本生成 - CoQA, DROP, Truthful_QA
390
- - **SU (Speech Understanding)**: 语音理解 - WNLI, Natural_questions
391
- - **ME (Medical)**: 医学领域 - MedMCQA, MedQA, PubMedQA
392
- - **GR (Grammatical)**: 语法领域 - BEA-2019, CoNLL-2014
393
-
394
- ### 使用说明
395
-
396
- 1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
397
- 2. **上传数据集**: 选择 `.parquet` 文件
398
- 3. **指定列名**: 输入包含问题的列名(通常是 `question`)
399
- 4. **调整参数**: 选择模型、temperature等
400
- 5. **开始清洗**: 点击按钮开始处理
401
- 6. **下载结果**: 下载 `XXX-Denoising.parquet` 文件
402
-
403
- ⚠️ **重要提示**:
404
- - Demo版本限制最多处理100个样本
405
- - 完整版本可处理数万样本
406
- - 建议 temperature=0.1 以获得稳定输出
407
-
408
  ---
409
 
410
  **研究生毕业论文成果展示** | Powered by DeepSeek API
411
  """
412
 
413
- SUBMISSION_TEXT = """
414
- ## 提交说明
415
-
416
- ### 如何提交新的去噪结果
417
-
418
- 1. **准备数据**: 使用本系统对benchmark数据集进行去噪
419
- 2. **记录指标**: 记录WAR和SED指标
420
- 3. **提交PR**: 在GitHub上提交Pull Request
421
- 4. **审核**: 等待维护者审核
422
-
423
- ### 数据格式要求
424
-
425
- 提交的数据需要包含以下字段:
426
- - ID: 序号
427
- - Category: 类别 (BT/RA/TG/SU/ME/GR)
428
- - Benchmark名称
429
- - WAR (%)
430
- - SED
431
- - Download: 下载链接
432
-
433
- ### 联系方式
434
-
435
- 如有问题,请通过以下方式联系:
436
- - GitHub Issues
437
- - Email: your-email@example.com
438
- """
439
-
440
  # ======================== Gradio界面 ========================
441
- demo = gr.Blocks(title="数据集清洗框架展示系统")
 
 
442
 
443
  with demo:
444
  gr.Markdown(
@@ -449,11 +502,9 @@ with demo:
449
  elem_classes="markdown-text"
450
  )
451
 
452
- # 加载leaderboard数据
453
  leaderboard_data = load_leaderboard_data()
454
 
455
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
456
- # ==================== Tab 1: Evaluation Table ====================
457
  with gr.TabItem("📊 Evaluation Table", id=0):
458
  with gr.Column():
459
  gr.Markdown("### 清洗效果排行榜")
@@ -484,7 +535,6 @@ with demo:
484
  visible=False
485
  )
486
 
487
- # 绑定搜索和筛选
488
  search_bar.submit(
489
  lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
490
  [hidden_leaderboard, search_bar],
@@ -502,22 +552,15 @@ with demo:
502
  - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
503
  - **WAR**: 空白符异常率变化 (正值表示改善)
504
  - **SED**: 拼写错误密度变化 (负值表示改善)
505
- - 绿色: 正向提升 | 红色: 负向影响
506
  """, elem_classes="markdown-text")
507
 
508
- # ==================== Tab 2: Performance Plot ====================
509
  with gr.TabItem("📈 Performance Plot", id=1):
510
  gr.Markdown("### 性能可视化分析")
511
  gr.Markdown("**注意**: 性能图表功能开发中,敬请期待。")
512
-
513
- # 这里可以添加性能图表
514
- # 例如: WAR和SED的对比图、不同方法的效果对比等
515
 
516
- # ==================== Tab 3: About ====================
517
  with gr.TabItem("📝 About", id=2):
518
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
519
 
520
- # ==================== Tab 4: Submit Results ====================
521
  with gr.TabItem("🚀 Submit Results", id=3):
522
  gr.Markdown("## 提交去噪结果")
523
 
@@ -565,28 +608,29 @@ with demo:
565
  max_lines=15
566
  )
567
 
568
- preview_df = gr.Dataframe(
569
- label="🔍 结果预览",
570
- wrap=True
571
- )
572
-
573
  download_file = gr.File(label="📥 下载去噪后的数据集")
574
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  clean_btn.click(
576
  fn=clean_dataset,
577
  inputs=[file_input, question_column, model_choice, temperature, max_samples],
578
- outputs=[output_text, download_file, preview_df]
579
  )
580
-
581
- gr.Markdown("""
582
- ### WAC-GEC方法 (开发中)
583
-
584
- WAC-GEC (Whitespace Anomaly Correction - Grammar Error Correction) 方法结合了:
585
- - 空白符异常检测与修正
586
- - 语法错误检测与修正
587
-
588
- 该功能即将上线,敬请期待!
589
- """, elem_classes="markdown-text")
590
 
591
  if __name__ == "__main__":
592
  demo.launch(
 
 
1
  import gradio as gr
2
  import json
3
  import pandas as pd
 
9
  import re
10
  import spacy
11
  from spellchecker import SpellChecker
12
+ import difflib
13
 
14
  # ======================== API配置 ========================
15
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
 
60
 
61
  [input]: """
62
 
63
+ # ======================== 新增:颜色对比函数 ========================
64
def generate_colored_diff(original, cleaned):
    """Build word-level, color-annotated HTML for an original/cleaned text pair.

    Both texts are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. Unchanged words are emitted as plain text;
    replaced or deleted words of the original are wrapped in red spans
    (deleted ones with a strike-through); replaced or inserted words of the
    cleaned text are wrapped in green spans.

    Every word is HTML-escaped before being embedded, so characters such as
    ``<``, ``>`` and ``&`` coming from the dataset are displayed literally
    instead of being interpreted as markup.

    Args:
        original: The original text (``str``).
        cleaned: The cleaned text (``str``).

    Returns:
        A ``(original_html, cleaned_html)`` tuple of space-joined HTML strings.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import section.
    from html import escape

    red_bold = "color: #dc3545; font-weight: bold;"
    red_strike = "color: #dc3545; text-decoration: line-through;"
    green_bold = "color: #28a745; font-weight: bold;"

    def paint(words, style):
        # Wrap each (escaped) word in a span carrying the given inline style.
        return [f'<span style="{style}">{escape(w)}</span>' for w in words]

    original_words = original.split()
    cleaned_words = cleaned.split()
    matcher = difflib.SequenceMatcher(None, original_words, cleaned_words)

    original_html = []
    cleaned_html = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            # Unchanged words keep the default text color.
            original_html.extend(escape(w) for w in original_words[i1:i2])
            cleaned_html.extend(escape(w) for w in cleaned_words[j1:j2])
        elif tag == 'replace':
            # Replaced words: red on the original side, green on the cleaned side.
            original_html.extend(paint(original_words[i1:i2], red_bold))
            cleaned_html.extend(paint(cleaned_words[j1:j2], green_bold))
        elif tag == 'delete':
            # Words present only in the original: red with strike-through.
            original_html.extend(paint(original_words[i1:i2], red_strike))
        elif tag == 'insert':
            # Words present only in the cleaned text: green.
            cleaned_html.extend(paint(cleaned_words[j1:j2], green_bold))

    return ' '.join(original_html), ' '.join(cleaned_html)
101
+
102
def create_comparison_html(original_list, cleaned_list):
    """Render original/cleaned text pairs as a styled HTML comparison table.

    Each pair is converted with ``str()``, passed to
    ``generate_colored_diff`` and emitted as one table row numbered from 1.
    Pairs are formed with ``zip``, so if the two inputs differ in length the
    extra items of the longer one are ignored.

    Args:
        original_list: Iterable of original texts.
        cleaned_list: Iterable of cleaned texts, aligned with ``original_list``.

    Returns:
        A single HTML string containing the embedded stylesheet and the table.
    """
    header = """
    <div style="font-family: 'Segoe UI', Arial, sans-serif; max-width: 100%; overflow-x: auto;">
    <style>
    .comparison-table {
        width: 100%;
        border-collapse: collapse;
        margin: 20px 0;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    }
    .comparison-table th {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 12px;
        text-align: left;
        font-weight: 600;
    }
    .comparison-table td {
        padding: 12px;
        border-bottom: 1px solid #e0e0e0;
        line-height: 1.6;
    }
    .comparison-table tr:hover {
        background-color: #f8f9fa;
    }
    .index-col {
        width: 50px;
        text-align: center;
        font-weight: bold;
        color: #6c757d;
    }
    </style>
    <table class="comparison-table">
    <thead>
    <tr>
    <th class="index-col">#</th>
    <th>原始问题 (红色=错误)</th>
    <th>清洗后问题 (绿色=修正)</th>
    </tr>
    </thead>
    <tbody>
    """

    footer = """
    </tbody>
    </table>
    </div>
    """

    # Collect fragments and join once instead of growing a string with `+=`;
    # this also avoids a local named `html` shadowing the stdlib module name.
    parts = [header]
    for idx, (orig, clean) in enumerate(zip(original_list, cleaned_list), 1):
        orig_colored, clean_colored = generate_colored_diff(str(orig), str(clean))
        parts.append(f"""
    <tr>
    <td class="index-col">{idx}</td>
    <td>{orig_colored}</td>
    <td>{clean_colored}</td>
    </tr>
    """)
    parts.append(footer)

    return "".join(parts)
165
+
166
  # ======================== 工具函数 ========================
167
  def check_api_key():
168
  if not DEEPSEEK_API_KEY:
 
267
 
268
  # ======================== Leaderboard数据处理 ========================
269
  def load_leaderboard_data():
 
270
  json_path = "leaderboard.json"
271
  try:
272
  with open(json_path, 'r', encoding='utf-8') as f:
273
  data = json.load(f)
 
 
274
  return pd.DataFrame(data)
275
  except Exception as e:
276
  print(f"Error loading leaderboard: {e}")
277
  return pd.DataFrame()
278
 
 
 
 
 
 
 
 
279
  def filter_leaderboard(df, query):
 
280
  if query == "all":
281
  return df
282
  else:
283
  return df[df['Category'] == query]
284
 
285
  def search_leaderboard(df, query):
 
286
  if not query:
287
  return df
288
  return df[df['Benchmark'].str.contains(query, case=False, na=False)]
289
 
290
+ # ======================== 数据清洗函数(修改版)========================
291
  def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
292
  try:
293
  try:
294
  check_api_key()
295
  except ValueError as e:
296
+ return str(e), None, None, ""
297
 
298
  progress(0.05, desc="📁 读取数据文件...")
299
  df = pd.read_parquet(file_path)
300
 
301
  if question_column not in df.columns:
302
  available_columns = ", ".join(df.columns.tolist())
303
+ return f"❌ 列名 '{question_column}' 不存在!\n可用列名: {available_columns}", None, None, ""
304
 
305
  data_ori = df[question_column].tolist()[:int(max_samples)]
306
  total = len(data_ori)
 
411
  log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
412
  log_text += f"{'='*50}\n"
413
 
414
+ # 生成带颜色的对比HTML
415
+ preview_html = create_comparison_html(data_ori[:5], lst_final[:5])
416
+
417
  preview_df = pd.DataFrame({
418
  '原始问题': [str(x)[:100] for x in data_ori[:5]],
419
  '清洗后问题': [str(x)[:100] for x in lst_final[:5]]
 
421
 
422
  progress(1.0, desc="✅ 完成!")
423
 
424
+ return log_text, output_path, preview_df, preview_html
425
 
426
  except Exception as e:
427
  import traceback
428
  error_detail = traceback.format_exc()
429
+ return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None, ""
430
 
431
  # ======================== 文本内容 ========================
432
  ABOUT_TEXT = """
 
463
  - **CoQA**: 对话问答
464
  - 以及更多...
465
 
466
+ ### 颜色标注说明
467
+
468
+ - 🔴 **红色**: 原始文本中的错误(拼写、语法、空格等)
469
+ - 🟢 **绿色**: 清洗后的修正内容
470
+ - ⚫ **黑色**: 未修改的正确部分
471
+
472
  ### 技术栈
473
 
474
  - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
475
  - **前端**: Gradio 4.16.0
476
  - **数据处理**: Pandas + PyArrow (Parquet)
477
+ - **差异对比**: Python difflib
478
  - **API调用**: OpenAI SDK
479
  - **部署**: Hugging Face Spaces
480
 
 
483
  - **WAR (Whitespace Anomaly Rate)**: 空白符异常率
484
  - **SED (Spelling Error Density)**: 拼写错误密度
485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  ---
487
 
488
  **研究生毕业论文成果展示** | Powered by DeepSeek API
489
  """
490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  # ======================== Gradio界面 ========================
492
+ demo = gr.Blocks(title="数据集清洗框架展示系统", css="""
493
+ .markdown-text { font-size: 16px; line-height: 1.6; }
494
+ """)
495
 
496
  with demo:
497
  gr.Markdown(
 
502
  elem_classes="markdown-text"
503
  )
504
 
 
505
  leaderboard_data = load_leaderboard_data()
506
 
507
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
508
  with gr.TabItem("📊 Evaluation Table", id=0):
509
  with gr.Column():
510
  gr.Markdown("### 清洗效果排行榜")
 
535
  visible=False
536
  )
537
 
 
538
  search_bar.submit(
539
  lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
540
  [hidden_leaderboard, search_bar],
 
552
  - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
553
  - **WAR**: 空白符异常率变化 (正值表示改善)
554
  - **SED**: 拼写错误密度变化 (负值表示改善)
 
555
  """, elem_classes="markdown-text")
556
 
 
557
  with gr.TabItem("📈 Performance Plot", id=1):
558
  gr.Markdown("### 性能可视化分析")
559
  gr.Markdown("**注意**: 性能图表功能开发中,敬请期待。")
 
 
 
560
 
 
561
  with gr.TabItem("📝 About", id=2):
562
  gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
563
 
 
564
  with gr.TabItem("🚀 Submit Results", id=3):
565
  gr.Markdown("## 提交去噪结果")
566
 
 
608
  max_lines=15
609
  )
610
 
 
 
 
 
 
611
  download_file = gr.File(label="📥 下载去噪后的数据集")
612
 
613
+ # 新增:颜色对比预览区域
614
+ gr.Markdown("### 🎨 清洗效果对比预览")
615
+ gr.Markdown("""
616
+ **颜色说明**:
617
+ - 🔴 <span style="color: #dc3545;">红色</span> = 原始文本中的错误
618
+ - 🟢 <span style="color: #28a745;">绿色</span> = 清洗后的修正
619
+ - ⚫ 黑色 = 未修改的正确部分
620
+ """)
621
+
622
+ colored_preview = gr.HTML(label="带颜色标注的对比")
623
+
624
+ preview_df = gr.Dataframe(
625
+ label="🔍 原始对比表格",
626
+ wrap=True
627
+ )
628
+
629
  clean_btn.click(
630
  fn=clean_dataset,
631
  inputs=[file_input, question_column, model_choice, temperature, max_samples],
632
+ outputs=[output_text, download_file, preview_df, colored_preview]
633
  )
 
 
 
 
 
 
 
 
 
 
634
 
635
  if __name__ == "__main__":
636
  demo.launch(