lllouo commited on
Commit
b96d100
·
1 Parent(s): cc7eba8
Files changed (1) hide show
  1. app.py +392 -450
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - 使用 requests 调用 DeepSeek API(稳定版本)
2
  import gradio as gr
3
  import json
4
  import pandas as pd
@@ -11,62 +11,158 @@ import re
11
  import spacy
12
  from spellchecker import SpellChecker
13
 
14
- # DeepSeek API配置
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
16
  DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
17
 
18
- # 在全局初始化 spaCy 和 SpellChecker(放在文件开头,DEEPSEEK_API_KEY 定义之后)
19
  try:
20
  nlp = spacy.load("en_core_web_sm")
21
  except OSError:
22
- # 如果模型未安装,自动下载
23
  import subprocess
24
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
25
  nlp = spacy.load("en_core_web_sm")
26
 
27
  spell = SpellChecker()
28
 
29
- # 空白符异常检测的正则模式
30
  WHITESPACE_PATTERNS = [
31
- re.compile(r'[ \t]{2,}'), # 连续空格或 tab
32
- re.compile(r'\u200B|\u2060'), # zero-width space / word joiner
33
- re.compile(r'\s+([.,!?;:])'), # 标点前多余空格
34
- re.compile(r'([.,!?;:])\s{2,}'), # 标点后异常空格
35
  ]
36
 
37
- def check_api_key():
38
- """检查API Key是否配置"""
39
- if not DEEPSEEK_API_KEY:
40
- raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY!\n\n前往:Settings → Repository secrets → New secret")
41
-
42
- def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
43
- """使用 OpenAI 客户端调用 DeepSeek API"""
44
- check_api_key()
45
-
46
- client = OpenAI(
47
- api_key=DEEPSEEK_API_KEY,
48
- base_url=DEEPSEEK_BASE_URL,
49
- )
50
-
51
- completion = client.chat.completions.create(
52
- model="deepseek-r1-distill-llama-8b",
53
- messages=[{"role": "user", "content": prompt}],
54
- temperature=temperature,
55
- stream=stream
56
- )
57
-
58
- if stream:
59
- # 流式响应处理
60
- response_content = ""
61
- for chunk in completion:
62
- if chunk.choices and chunk.choices[0].delta.content:
63
- response_content += chunk.choices[0].delta.content
64
- return response_content
65
- else:
66
- # 非流式响应
67
- return completion.choices[0].message.content
68
-
69
- # 系统Prompt模板
70
  PROMPT_TEMPLATE = """## Positioning
71
  You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
72
  Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
@@ -87,10 +183,6 @@ Example 2: No errors, reply with a copy of the original sentence, don't fill in
87
  [input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
88
  [output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
89
 
90
- Example 3: No errors, reply with a copy of the original sentence, don't fill in the contents of ___.
91
- [input]: The Sun is the largest body in the solar system. The Sun is a ___.
92
- [output]: The Sun is the largest body in the solar system. The Sun is a ___.
93
-
94
  ## Task
95
  Next, please correct the following sentence according to the above requirements.
96
  **If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
@@ -98,21 +190,35 @@ Next, please correct the following sentence according to the above requirements.
98
 
99
  [input]: """
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def process_sentence(sentence):
102
- """检查问题是否完整,不完整则添加标记"""
103
  sentence = sentence.strip()
104
-
105
- # 判断是否为多行文本
106
  lines = [line.strip() for line in sentence.split('\n') if line.strip()]
107
  is_multiline = len(lines) > 1
108
-
109
- # 根据是否多行选择处理逻辑
110
- if is_multiline:
111
- target_line = lines[-1]
112
- else:
113
- target_line = sentence
114
-
115
- # 检查最后一个字符是否是标点符号
116
  last_char = target_line[-1] if target_line else ''
117
  if last_char in {'.', '?', '!', ';', ','}:
118
  return target_line
@@ -120,27 +226,15 @@ def process_sentence(sentence):
120
  return target_line + " ___."
121
 
122
  def is_valid_output(content_2, content_1, content_0):
123
- """检查输出格式是否符合要求"""
124
- # 检查基本格式
125
  if not (content_2.startswith('[output]:') and '\n' not in content_2):
126
  return False
127
-
128
- # 原始句子有下划线,但生成的句子没有下划线 => 返回False
129
  if ('___' in content_0 or '___' in content_1) and '___' not in content_2:
130
  return False
131
-
132
- # content_2 的字符数不能超过 content_1 的两倍
133
- if len(content_2) > 2 * len(content_1):
134
- return False
135
-
136
- # content_1 的字符数不能超过 content_2 的两倍
137
- if len(content_1) > 2 * len(content_2):
138
  return False
139
-
140
  return True
141
 
142
  def extract_output_content(item):
143
- """提取输出内容"""
144
  if item.startswith('[output]:'):
145
  output_content = item[len('[output]:'):].strip()
146
  if output_content and output_content[0] == '"' and output_content[-1] == '"':
@@ -155,102 +249,114 @@ def extract_output_content(item):
155
  return None
156
 
157
  def has_missing_spaces(sentence):
158
- """
159
- 启发式检测:长度足够 + 多个词形变化 + 无空格
160
- """
161
  if ' ' in sentence:
162
  return False
163
  doc = nlp(sentence)
164
- # 多个 alpha token 且原文无空格
165
  alpha_tokens = [t for t in doc if t.is_alpha]
166
  return len(alpha_tokens) >= 2
167
 
168
  def calculate_whitespace_anomaly_rate(sentences):
169
- """
170
- 计算空白符异常率(WAR)
171
- """
172
  if not sentences:
173
  return 0.0
174
-
175
  anomaly_count = 0
176
-
177
  for sent in sentences:
178
- # 检测缺少空格
179
  if has_missing_spaces(sent):
180
  anomaly_count += 1
181
  continue
182
-
183
- # 检测其他空白异常
184
  if any(p.search(sent) for p in WHITESPACE_PATTERNS):
185
  anomaly_count += 1
186
-
187
  return anomaly_count / len(sentences) * 100
188
 
189
  def normalize_tokens(text):
190
- """
191
- 标准化文本token,用于拼写检查
192
- """
193
  doc = nlp(text)
194
  tokens = []
195
  for t in doc:
196
- if not t.is_alpha:
197
- continue
198
- if len(t.text) <= 2:
199
- continue
200
- if t.text.isupper():
201
  continue
202
  tokens.append(t.text.lower())
203
  return tokens
204
 
205
  def calculate_spelling_error_density(sentences):
206
- """
207
- 计算拼写错误密度(SED)
208
- """
209
  total_words = 0
210
  total_errors = 0
211
-
212
  for sent in sentences:
213
- # missing-space 单独处理:不计入拼写错误
214
  if has_missing_spaces(sent):
215
  continue
216
-
217
  tokens = normalize_tokens(sent)
218
  if not tokens:
219
  continue
220
-
221
  misspelled = spell.unknown(tokens)
222
-
223
  total_errors += len(misspelled)
224
  total_words += len(tokens)
225
-
226
  if total_words == 0:
227
  return 0.0
228
-
229
  return total_errors / total_words * 100
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
232
- """清洗数据集的核心函数(增强版:包含WAR和SED指标计算)"""
233
  try:
234
- # 检查 API Key
235
  try:
236
  check_api_key()
237
  except ValueError as e:
238
  return str(e), None, None
239
 
240
- # 读取 parquet 文件
241
  progress(0.05, desc="📁 读取数据文件...")
242
  df = pd.read_parquet(file_path)
243
 
244
- # 检查列名是否存在
245
  if question_column not in df.columns:
246
  available_columns = ", ".join(df.columns.tolist())
247
  return f"❌ 列名 '{question_column}' 不存在!\n可用列名: {available_columns}", None, None
248
 
249
- # 提取问题数据
250
  data_ori = df[question_column].tolist()[:int(max_samples)]
251
  total = len(data_ori)
252
 
253
- # === 计算原始数据的 WAR 和 SED ===
254
  progress(0.08, desc="📊 计算原始指标...")
255
  original_sentences = [str(item) for item in data_ori]
256
  war_original = calculate_whitespace_anomaly_rate(original_sentences)
@@ -258,13 +364,9 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
258
 
259
  progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
260
 
261
- # 预处理:添加标记
262
  data_corrupt = [process_sentence(str(item)) for item in data_ori]
263
-
264
- # 清洗结果
265
  results = []
266
  max_retries = 5
267
-
268
  log_text = f"🚀 开始处理 {total} 个样本...\n\n"
269
 
270
  for idx in range(total):
@@ -277,14 +379,12 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
277
 
278
  while retry_count < max_retries:
279
  try:
280
- # 调用 DeepSeek API
281
  response_content = call_deepseek_api(
282
  PROMPT_TEMPLATE + original_text,
283
  model=model_choice,
284
  temperature=float(temperature)
285
  )
286
 
287
- # 验证输出格式
288
  if is_valid_output(response_content, original_text, unprocess_text):
289
  results.append(response_content)
290
  break
@@ -295,13 +395,11 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
295
  retry_count += 1
296
  log_text += f"⚠️ 样本 {idx+1} API错误,重试 {retry_count}/{max_retries}: {str(e)}\n"
297
  else:
298
- # 重试次数用尽
299
  results.append(f"[ERROR] Failed to process: {original_text}")
300
  log_text += f"❌ 样本 {idx+1} 处理失败\n"
301
 
302
  progress(0.85, desc="📊 后处理中...")
303
 
304
- # 提取清洗后的内容
305
  lst_extracted = []
306
  error_count = 0
307
  unknown_count = 0
@@ -316,7 +414,6 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
316
  if item.startswith('[ERROR]'):
317
  error_count += 1
318
 
319
- # 恢复多行格式
320
  lst_final = []
321
  for i in range(len(data_ori)):
322
  item = str(data_ori[i])
@@ -327,32 +424,26 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
327
  else:
328
  lst_final.append(lst_extracted[i])
329
 
330
- # === 计算清洗后的 WAR 和 SED ===
331
  progress(0.90, desc="📊 计算清洗后指标...")
332
  cleaned_sentences = [str(item) for item in lst_final]
333
  war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
334
  sed_cleaned = calculate_spelling_error_density(cleaned_sentences)
335
 
336
- # 计算变化
337
  delta_war = war_cleaned - war_original
338
  delta_sed = sed_cleaned - sed_original
339
 
340
  progress(0.95, desc="💾 保存结果...")
341
 
342
- # 创建新的DataFrame
343
  df_cleaned = df.copy()
344
  df_cleaned[question_column + '_cleaned'] = lst_final[:len(df)]
345
 
346
- # 生成输出文件名
347
  original_filename = os.path.basename(file_path)
348
  base_name = original_filename.replace('.parquet', '')
349
  output_filename = f"{base_name}-Denoising.parquet"
350
  output_path = os.path.join(tempfile.gettempdir(), output_filename)
351
 
352
- # 保存为 parquet
353
  df_cleaned.to_parquet(output_path, index=False)
354
 
355
- # 生成统计信息(增强版)
356
  log_text += f"\n\n📊 处理完成!\n"
357
  log_text += f"{'='*50}\n"
358
  log_text += f"【基础统计】\n"
@@ -372,7 +463,6 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
372
  log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
373
  log_text += f"{'='*50}\n"
374
 
375
- # 生成预览数据
376
  preview_df = pd.DataFrame({
377
  '原始问题': [str(x)[:100] for x in data_ori[:5]],
378
  '清洗后问题': [str(x)[:100] for x in lst_final[:5]]
@@ -387,200 +477,182 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
387
  error_detail = traceback.format_exc()
388
  return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
389
 
390
- def show_leaderboard():
391
- """显示Leaderboard,从 JSON 文件加载并格式化为HTML"""
392
- json_path = "leaderboard.json"
393
- try:
394
- with open(json_path, 'r', encoding='utf-8') as f:
395
- data = json.load(f)
396
-
397
- # 构建HTML表格
398
- html = """
399
- <style>
400
- .leaderboard-table {
401
- width: 100%;
402
- border-collapse: collapse;
403
- margin: 20px 0;
404
- font-size: 14px;
405
- box-shadow: 0 2px 8px rgba(0,0,0,0.1);
406
- border-radius: 8px;
407
- overflow: hidden;
408
- }
409
- .leaderboard-table thead tr {
410
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
411
- color: white;
412
- text-align: left;
413
- font-weight: bold;
414
- }
415
- .leaderboard-table th,
416
- .leaderboard-table td {
417
- padding: 12px 15px;
418
- text-align: center;
419
- }
420
- .leaderboard-table tbody tr {
421
- border-bottom: 1px solid #dddddd;
422
- transition: all 0.2s ease;
423
- }
424
- .leaderboard-table tbody tr:nth-of-type(even) {
425
- background-color: #f9fafb;
426
- }
427
- .leaderboard-table tbody tr:hover {
428
- background-color: #e8eaf6;
429
- transform: scale(1.01);
430
- box-shadow: 0 2px 4px rgba(0,0,0,0.1);
431
- }
432
- .metric-positive {
433
- color: #10b981;
434
- font-weight: bold;
435
- }
436
- .metric-negative {
437
- color: #ef4444;
438
- font-weight: bold;
439
- }
440
- .download-link {
441
- color: #667eea;
442
- text-decoration: none;
443
- font-weight: 500;
444
- padding: 4px 12px;
445
- border-radius: 4px;
446
- border: 1px solid #667eea;
447
- transition: all 0.2s;
448
- display: inline-block;
449
- }
450
- .download-link:hover {
451
- background-color: #667eea;
452
- color: white;
453
- transform: translateY(-1px);
454
- }
455
- .benchmark-name {
456
- font-weight: 500;
457
- color: #1f2937;
458
- }
459
- </style>
460
- <table class="leaderboard-table">
461
- <thead>
462
- <tr>
463
- <th>ID</th>
464
- <th>Benchmark</th>
465
- <th>ΔWAR (%)</th>
466
- <th>ΔSED</th>
467
- <th>下载</th>
468
- </tr>
469
- </thead>
470
- <tbody>
471
- """
472
-
473
- for item in data:
474
- # 提取下载链接
475
- download_text = item['Download']
476
- # 从 Markdown 格中提取 URL
477
- if '[下载](' in download_text:
478
- url = download_text.split('(')[1].rstrip(')')
479
- download_html = f'<a href="{url}" class="download-link" target="_blank">下载</a>'
480
- else:
481
- download_html = download_text
482
-
483
- # 格式化 ΔWAR 和 ΔSED
484
- war_value = item['ΔWAR']
485
- sed_value = item['ΔSED']
486
-
487
- war_class = 'metric-positive' if war_value > 0 else 'metric-negative'
488
- sed_class = 'metric-positive' if sed_value < 0 else 'metric-negative'
489
-
490
- html += f"""
491
- <tr>
492
- <td>{item['ID']}</td>
493
- <td class="benchmark-name">{item['Benchmark']}</td>
494
- <td class="{war_class}">{war_value:+.2f}</td>
495
- <td class="{sed_class}">{sed_value:+.2f}</td>
496
- <td>{download_html}</td>
497
- </tr>
498
- """
499
-
500
- html += """
501
- </tbody>
502
- </table>
503
- """
504
-
505
- return html
506
-
507
- except FileNotFoundError:
508
- return "<p style='color: red;'>❌ leaderboard.json 文件未找到</p>"
509
- except json.JSONDecodeError:
510
- return "<p style='color: red;'>❌ JSON 格式无效</p>"
511
-
512
- # 创建 Gradio 界面
513
- with gr.Blocks(title="数据集清洗框架展示系统", css="""
514
- .gradio-container {
515
- max-width: 1400px !important;
516
- }
517
- .stats-box {
518
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
519
- color: white;
520
- padding: 20px;
521
- border-radius: 10px;
522
- text-align: center;
523
- margin: 10px 0;
524
- }
525
- .stats-box h3 {
526
- margin: 0;
527
- font-size: 24px;
528
- }
529
- .stats-box p {
530
- margin: 5px 0;
531
- font-size: 14px;
532
- opacity: 0.9;
533
- }
534
- """) as demo:
535
 
536
- gr.Markdown("""
537
- # 🚀 基于基准去噪框架的去噪工厂展示系统
538
- """)
539
 
540
- with gr.Tabs():
541
-
542
- # Tab 1: Leaderboard
543
- with gr.Tab("📊 Leaderboard"):
544
- gr.Markdown("""
545
- ## 清洗效果排行榜
546
- 展示主流benchmark数据集的去噪结果(按数据集排序)
547
- """)
548
-
549
- with gr.Row():
550
- with gr.Column(scale=2):
551
- gr.HTML("""
552
- <div class="stats-box">
553
- <h3>📈 关键指标</h3>
554
- <p><strong>数据集总数:</strong> 14个主流Benchmark</p>
555
- <p><strong>去噪方法:</strong> 2种</p>
556
- <p><strong>总配置:</strong> 28种</p>
557
- </div>
558
- """)
559
-
560
- gr.Markdown("""
561
- ### 📝 指标说明
562
- - **ΔWAR**: Word Accuracy Rate变化 (↑越高越好)
563
- - **ΔSED**: Sentence Edit Distance变化 (↓越低越好)
564
- - **绿色**: 正向提升
565
- - **红色**: 负向影响
566
- """)
567
 
568
- with gr.Column(scale=5):
569
- leaderboard_html = gr.HTML(
570
- value=show_leaderboard(),
571
- label="数据集去噪结果对比"
 
572
  )
573
-
574
- # Tab 2: 数据集上传与清洗 (DeepSeek-R1)
575
- with gr.Tab("🔧 数据集去噪 (DeepSeek-R1-denoising)"):
576
- gr.Markdown("""
577
- ## 上传 Parquet 数据集进行去噪
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
 
579
- **支持的数据集格式**:
580
- - MMLU / GSM8K / ARC-Challenge / MedMCQA
581
- - 文件格式: `.parquet`
582
- - 清洗后文件命名: `原文件名-Denoising.parquet`
583
- """)
 
 
 
 
 
584
 
585
  with gr.Row():
586
  with gr.Column():
@@ -617,7 +689,7 @@ with gr.Blocks(title="数据集清洗框架展示系统", css="""
617
  label="📊 处理样本数 (Demo限制)"
618
  )
619
 
620
- clean_btn = gr.Button("🚀 开始清洗", variant="primary", size="lg")
621
 
622
  with gr.Column():
623
  output_text = gr.Textbox(
@@ -638,150 +710,20 @@ with gr.Blocks(title="数据集清洗框架展示系统", css="""
638
  inputs=[file_input, question_column, model_choice, temperature, max_samples],
639
  outputs=[output_text, download_file, preview_df]
640
  )
641
-
642
- # Tab 3: 数据集上传与清洗 (WAC-GEC)
643
- with gr.Tab("🔧 数据集去噪 (WAC-GEC)"):
644
- gr.Markdown("""
645
- ## 上传 Parquet 数据集进行去噪
646
-
647
- **支持的数据集格式**:
648
- - MMLU / GSM8K / ARC-Challenge / MedMCQA 等
649
- - 文件格式: `.parquet`
650
- - 清洗后文件命名: `原文件名-Denoising.parquet`
651
- """)
652
-
653
- with gr.Row():
654
- with gr.Column():
655
- file_input_wac = gr.File(
656
- label="📁 上传 Parquet 文件",
657
- file_types=[".parquet"]
658
- )
659
-
660
- question_column_wac = gr.Textbox(
661
- label="📝 问题列名",
662
- value="question",
663
- placeholder="例如: question, input_text, prompt"
664
- )
665
-
666
- model_choice_wsc = gr.Dropdown(
667
- choices=["eo_larger_byte", "ed_larger_byte"],
668
- value="eo_larger_byte",
669
- label="🤖 选择WSC模型"
670
- )
671
-
672
- model_choice_gec = gr.Dropdown(
673
- choices=["Chat-Llama-2-13B", "T5-11B", "GECToR-Roberta-L"],
674
- value="Chat-Llama-2-13B",
675
- label="🤖 选择GEC模型"
676
- )
677
-
678
- temperature_wac = gr.Slider(
679
- minimum=0.0,
680
- maximum=1.0,
681
- value=0.1,
682
- step=0.1,
683
- label="🌡️ Temperature"
684
- )
685
-
686
- max_samples_wac = gr.Slider(
687
- minimum=1,
688
- maximum=100,
689
- value=5,
690
- step=1,
691
- label="📊 处理样本数"
692
- )
693
-
694
- clean_btn_wac = gr.Button("🚀 开始去噪", variant="primary", size="lg")
695
-
696
- with gr.Column():
697
- output_text_wac = gr.Textbox(
698
- label="⏳ 处理进度",
699
- lines=10,
700
- max_lines=15
701
- )
702
-
703
- preview_df_wac = gr.Dataframe(
704
- label="🔍 结果预览",
705
- wrap=True
706
- )
707
-
708
- download_file_wac = gr.File(label="📥 下载去噪后的数据集")
709
 
710
- # Note: This would need a separate function for WAC-GEC processing
711
- gr.Markdown("⚠️ WAC-GEC 功能需要额外实现对应的处理函数")
712
-
713
- # Tab 4: 关于
714
- with gr.Tab("ℹ️ 关于"):
715
  gr.Markdown("""
716
- ## 清洗流程说明
717
-
718
- ### 核心算法
719
-
720
- 1. **预处理 (process_sentence)**
721
- - 检测句子完整性
722
- - 为不完整的句子添加标记 `___`
723
- - 保留多行文本格式
724
-
725
- 2. **LLM清洗**
726
- 使用 DeepSeek API 进行语法、拼写、空格错误修正
727
- - 重试机制:最多重试5次
728
- - 稳定的 REST API 调用
729
-
730
- 3. **格式验证 (is_valid_output)**
731
- - 验证输出格式正确性
732
- - 检查是否保留了 `___` 标记
733
- - 长度合理性检查
734
-
735
- 4. **后处理**
736
- - 提取清洗后的内容
737
- - 恢复原始多行格式
738
- - 生成 `XXX-Denoising.parquet` 文件
739
-
740
- ### 支持的数据集
741
-
742
- - **MMLU**: 57个学科的多选题
743
- - **GSM8K**: 数学推理题
744
- - **ARC-Challenge**: 科学问答
745
- - **MedMCQA**: 医学选择题
746
- - **CoQA**: 对话问答
747
- - 以及更多...
748
-
749
- ### 技术栈
750
-
751
- - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
752
- - **前端**: Gradio 4.16.0
753
- - **数据处理**: Pandas + PyArrow (Parquet)
754
- - **API调用**: OpenAI SDK
755
- - **部署**: Hugging Face Spaces
756
-
757
- ### 研究成果
758
-
759
- 本框架在多个主流benchmark上取得了显著的性能提升,
760
- 通过两种不同的去噪方法(DeepSeek-R1和WAC-GEC)实现数据质量优化。
761
-
762
- ### 使用说明
763
-
764
- 1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
765
- 2. **上传数据集**: 选择 `.parquet` 文件
766
- 3. **指定列名**: 输入包含问题的列名(通常是 `question`)
767
- 4. **调整参数**: 选择模型、temperature等
768
- 5. **开始清洗**: 点击按钮开始处理
769
- 6. **下载结果**: 下载 `XXX-Denoising.parquet` 文件
770
-
771
- ⚠️ **重要提示**:
772
- - Demo版本限制最多处理100个样本
773
- - 完整版本可处理数万样本
774
- - 建议 temperature=0.1 以获得稳定输出
775
 
776
- ---
 
 
777
 
778
- **研究生毕业论文成果展示** | Powered by DeepSeek API
779
- """)
780
 
781
- # 启动应用
782
  if __name__ == "__main__":
783
  demo.launch(
784
- server_name="0.0.0.0",
785
  server_port=7860,
786
  ssr_mode=False
787
  )
 
1
+ # app_refactored.py - 重构后的展示系统
2
  import gradio as gr
3
  import json
4
  import pandas as pd
 
11
  import spacy
12
  from spellchecker import SpellChecker
13
 
14
+ # ======================== CSS样式 ========================
15
+ custom_css = """
16
+ .gradio-container {
17
+ max-width: 1400px !important;
18
+ }
19
+
20
+ .markdown-text {
21
+ font-size: 16px;
22
+ line-height: 1.6;
23
+ }
24
+
25
+ .markdown-text h1 {
26
+ text-align: center;
27
+ margin-bottom: 1em;
28
+ }
29
+
30
+ .tab-buttons button {
31
+ font-size: 18px;
32
+ font-weight: 600;
33
+ padding: 12px 24px;
34
+ }
35
+
36
+ #leaderboard-table {
37
+ margin-top: 20px;
38
+ }
39
+
40
+ #search-bar {
41
+ width: 100%;
42
+ font-size: 16px;
43
+ }
44
+
45
+ #filter-columns {
46
+ margin-top: 10px;
47
+ }
48
+
49
+ #column-select {
50
+ font-size: 14px;
51
+ }
52
+
53
+ .stats-box {
54
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
55
+ color: white;
56
+ padding: 20px;
57
+ border-radius: 10px;
58
+ text-align: center;
59
+ margin: 10px 0;
60
+ }
61
+
62
+ .stats-box h3 {
63
+ margin: 0;
64
+ font-size: 24px;
65
+ }
66
+
67
+ .stats-box p {
68
+ margin: 5px 0;
69
+ font-size: 14px;
70
+ opacity: 0.9;
71
+ }
72
+
73
+ .leaderboard-table {
74
+ width: 100%;
75
+ border-collapse: collapse;
76
+ margin: 20px 0;
77
+ font-size: 14px;
78
+ box-shadow: 0 2px 8px rgba(0,0,0,0.1);
79
+ border-radius: 8px;
80
+ overflow: hidden;
81
+ }
82
+
83
+ .leaderboard-table thead tr {
84
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
85
+ color: white;
86
+ text-align: left;
87
+ font-weight: bold;
88
+ }
89
+
90
+ .leaderboard-table th,
91
+ .leaderboard-table td {
92
+ padding: 12px 15px;
93
+ text-align: center;
94
+ }
95
+
96
+ .leaderboard-table tbody tr {
97
+ border-bottom: 1px solid #dddddd;
98
+ transition: all 0.2s ease;
99
+ }
100
+
101
+ .leaderboard-table tbody tr:nth-of-type(even) {
102
+ background-color: #f9fafb;
103
+ }
104
+
105
+ .leaderboard-table tbody tr:hover {
106
+ background-color: #e8eaf6;
107
+ transform: scale(1.01);
108
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
109
+ }
110
+
111
+ .metric-positive {
112
+ color: #10b981;
113
+ font-weight: bold;
114
+ }
115
+
116
+ .metric-negative {
117
+ color: #ef4444;
118
+ font-weight: bold;
119
+ }
120
+
121
+ .download-link {
122
+ color: #667eea;
123
+ text-decoration: none;
124
+ font-weight: 500;
125
+ padding: 4px 12px;
126
+ border-radius: 4px;
127
+ border: 1px solid #667eea;
128
+ transition: all 0.2s;
129
+ display: inline-block;
130
+ }
131
+
132
+ .download-link:hover {
133
+ background-color: #667eea;
134
+ color: white;
135
+ transform: translateY(-1px);
136
+ }
137
+
138
+ .benchmark-name {
139
+ font-weight: 500;
140
+ color: #1f2937;
141
+ }
142
+ """
143
+
144
+ # ======================== API配置 ========================
145
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
146
  DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
147
 
148
+ # ======================== NLP工具初始化 ========================
149
  try:
150
  nlp = spacy.load("en_core_web_sm")
151
  except OSError:
 
152
  import subprocess
153
  subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
154
  nlp = spacy.load("en_core_web_sm")
155
 
156
  spell = SpellChecker()
157
 
 
158
# Regexes that flag a sentence as having anomalous whitespace (used by WAR).
WHITESPACE_PATTERNS = [
    re.compile(r'[ \t]{2,}'),         # runs of two or more spaces/tabs
    re.compile(r'\u200B|\u2060'),     # zero-width space / word joiner
    re.compile(r'\s+([.,!?;:])'),     # whitespace immediately before punctuation
    re.compile(r'([.,!?;:])\s{2,}'),  # two or more spaces after punctuation
]
164
 
165
+ # ======================== Prompt模板 ========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  PROMPT_TEMPLATE = """## Positioning
167
  You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
168
  Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
 
183
  [input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
184
  [output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
185
 
 
 
 
 
186
  ## Task
187
  Next, please correct the following sentence according to the above requirements.
188
  **If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
 
190
 
191
  [input]: """
192
 
193
+ # ======================== 工具函数 ========================
194
def check_api_key():
    """Fail fast when no DeepSeek API key is configured.

    Raises:
        ValueError: if the module-level DEEPSEEK_API_KEY (read from the
            environment at import time) is empty.
    """
    if DEEPSEEK_API_KEY:
        return
    raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY!")
197
+
198
def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
    """Send a single-turn chat completion request to the DeepSeek endpoint.

    Args:
        prompt: full prompt text, sent as one user message.
        model: model identifier forwarded to the API.
        temperature: sampling temperature.
        stream: when True, accumulate streamed delta chunks; otherwise
            read the complete message from the first choice.

    Returns:
        The generated text as a single string.

    Raises:
        ValueError: via check_api_key() when no API key is configured.
    """
    check_api_key()
    api_client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
    completion = api_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=stream,
    )
    if not stream:
        return completion.choices[0].message.content
    # Streaming mode: collect the incremental delta fragments and join once.
    fragments = []
    for chunk in completion:
        if chunk.choices and chunk.choices[0].delta.content:
            fragments.append(chunk.choices[0].delta.content)
    return "".join(fragments)
216
+
217
def process_sentence(sentence):
    """Prepare one question line for the GEC prompt.

    Multi-line inputs collapse to their final non-empty line (earlier lines
    are re-attached after cleaning, elsewhere in the pipeline). If the target
    line does not already end with sentence punctuation, a " ___." placeholder
    is appended so the model treats it as intentionally incomplete.
    """
    sentence = sentence.strip()
    non_empty = [part.strip() for part in sentence.split('\n') if part.strip()]
    target_line = non_empty[-1] if len(non_empty) > 1 else sentence
    terminal = target_line[-1] if target_line else ''
    if terminal in {'.', '?', '!', ';', ','}:
        return target_line
    return target_line + " ___."
227
 
228
def is_valid_output(content_2, content_1, content_0):
    """Validate a model reply against the '[output]: ...' contract.

    Args:
        content_2: raw model reply.
        content_1: processed input text that was sent to the model.
        content_0: original unprocessed text.

    Returns:
        False when the reply lacks the '[output]:' prefix, spans multiple
        lines, drops a required '___' placeholder, or differs in length from
        the input by more than a factor of two; True otherwise.
    """
    well_formed = content_2.startswith('[output]:') and '\n' not in content_2
    if not well_formed:
        return False
    needs_placeholder = '___' in content_0 or '___' in content_1
    if needs_placeholder and '___' not in content_2:
        return False
    shorter, longer = sorted((len(content_1), len(content_2)))
    if longer > 2 * shorter:
        return False
    return True
236
 
237
  def extract_output_content(item):
 
238
  if item.startswith('[output]:'):
239
  output_content = item[len('[output]:'):].strip()
240
  if output_content and output_content[0] == '"' and output_content[-1] == '"':
 
249
  return None
250
 
251
def has_missing_spaces(sentence):
    """Heuristic: detect text that concatenates several words with no spaces.

    A sentence containing any space is never flagged. Otherwise the text is
    tokenized with the module-level spaCy pipeline `nlp`; two or more
    alphabetic tokens squeezed into one space-free string indicate missing
    word separators.
    """
    if ' ' in sentence:
        return False
    word_like = sum(1 for tok in nlp(sentence) if tok.is_alpha)
    return word_like >= 2
257
 
258
def calculate_whitespace_anomaly_rate(sentences):
    """Return the Whitespace Anomaly Rate (WAR) as a percentage in [0, 100].

    A sentence counts as anomalous when it has missing spaces
    (has_missing_spaces) or matches any pattern in WHITESPACE_PATTERNS.
    An empty input list yields 0.0.
    """
    if not sentences:
        return 0.0

    def _is_anomalous(text):
        # Missing-space detection takes priority; any regex hit also counts.
        if has_missing_spaces(text):
            return True
        return any(pattern.search(text) for pattern in WHITESPACE_PATTERNS)

    flagged = sum(1 for sent in sentences if _is_anomalous(sent))
    return flagged / len(sentences) * 100
269
 
270
def normalize_tokens(text):
    """Return lower-cased alphabetic tokens suitable for spell-checking.

    Filters out non-alphabetic tokens, very short tokens (<= 2 chars) and
    all-uppercase tokens (likely acronyms), using the module-level spaCy
    pipeline `nlp` for tokenization.
    """
    return [
        tok.text.lower()
        for tok in nlp(text)
        if tok.is_alpha and len(tok.text) > 2 and not tok.text.isupper()
    ]
278
 
279
def calculate_spelling_error_density(sentences):
    """Return the Spelling Error Density (SED) as a percentage in [0, 100].

    Sentences flagged by has_missing_spaces are skipped entirely so the
    missing-space anomaly (counted by WAR) is not double-counted here as a
    burst of spelling errors. Returns 0.0 when no checkable words remain.
    """
    word_count = 0
    error_count = 0
    for text in sentences:
        if has_missing_spaces(text):
            continue
        words = normalize_tokens(text)
        if not words:
            continue
        error_count += len(spell.unknown(words))
        word_count += len(words)
    return error_count / word_count * 100 if word_count else 0.0
294
 
295
+ # ======================== Leaderboard数据处理 ========================
296
def load_leaderboard_data():
    """Load leaderboard.json into a DataFrame, tagging each row with a Type.

    Classification is keyword-based on the lower-cased benchmark name, first
    match wins: A = mmlu/arc, B = gsm/math, C = med/bio, D = code/human,
    E = everything else.

    Returns:
        A pandas DataFrame of the leaderboard rows, or an empty DataFrame
        when the file is missing or malformed (the error is printed).
    """
    categories = [
        (('mmlu', 'arc'), 'A'),
        (('gsm', 'math'), 'B'),
        (('med', 'bio'), 'C'),
        (('code', 'human'), 'D'),
    ]
    try:
        with open("leaderboard.json", 'r', encoding='utf-8') as fh:
            rows = json.load(fh)
        for row in rows:
            name = row['Benchmark'].lower()
            row['Type'] = next(
                (label for keywords, label in categories
                 if any(kw in name for kw in keywords)),
                'E',
            )
        return pd.DataFrame(rows)
    except Exception as exc:
        print(f"Error loading leaderboard: {exc}")
        return pd.DataFrame()
321
+
322
def make_clickable_download(download_text):
    """Convert a Markdown '[下载](url)' link into a styled HTML anchor.

    Uses a regex so the URL is captured exactly up to its closing
    parenthesis; the previous split('(')[1].rstrip(')') approach broke when
    the cell contained trailing text after the link. Cells without a
    well-formed link are returned unchanged.

    Args:
        download_text: leaderboard cell value, possibly a Markdown link.

    Returns:
        An HTML <a> tag string, or the input unchanged when no link matches.
    """
    match = re.search(r'\[下载\]\(([^)]*)\)', download_text)
    if match:
        url = match.group(1)
        return f'<a href="{url}" class="download-link" target="_blank">下载</a>'
    return download_text
328
+
329
def filter_leaderboard(df, query):
    """Return rows whose Type equals *query*; "all" returns the frame as-is."""
    return df if query == "all" else df[df['Type'] == query]
335
+
336
def search_leaderboard(df, query):
    """Case-insensitive substring search over the Benchmark column.

    An empty (falsy) query returns the frame unchanged; NaN benchmark
    names never match.
    """
    if query:
        return df[df['Benchmark'].str.contains(query, case=False, na=False)]
    return df
341
+
342
+ # ======================== 数据清洗函数 ========================
343
  def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
 
344
  try:
 
345
  try:
346
  check_api_key()
347
  except ValueError as e:
348
  return str(e), None, None
349
 
 
350
  progress(0.05, desc="📁 读取数据文件...")
351
  df = pd.read_parquet(file_path)
352
 
 
353
  if question_column not in df.columns:
354
  available_columns = ", ".join(df.columns.tolist())
355
  return f"❌ 列名 '{question_column}' 不存在!\n可用列名: {available_columns}", None, None
356
 
 
357
  data_ori = df[question_column].tolist()[:int(max_samples)]
358
  total = len(data_ori)
359
 
 
360
  progress(0.08, desc="📊 计算原始指标...")
361
  original_sentences = [str(item) for item in data_ori]
362
  war_original = calculate_whitespace_anomaly_rate(original_sentences)
 
364
 
365
  progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
366
 
 
367
  data_corrupt = [process_sentence(str(item)) for item in data_ori]
 
 
368
  results = []
369
  max_retries = 5
 
370
  log_text = f"🚀 开始处理 {total} 个样本...\n\n"
371
 
372
  for idx in range(total):
 
379
 
380
  while retry_count < max_retries:
381
  try:
 
382
  response_content = call_deepseek_api(
383
  PROMPT_TEMPLATE + original_text,
384
  model=model_choice,
385
  temperature=float(temperature)
386
  )
387
 
 
388
  if is_valid_output(response_content, original_text, unprocess_text):
389
  results.append(response_content)
390
  break
 
395
  retry_count += 1
396
  log_text += f"⚠️ 样本 {idx+1} API错误,重试 {retry_count}/{max_retries}: {str(e)}\n"
397
  else:
 
398
  results.append(f"[ERROR] Failed to process: {original_text}")
399
  log_text += f"❌ 样本 {idx+1} 处理失败\n"
400
 
401
  progress(0.85, desc="📊 后处理中...")
402
 
 
403
  lst_extracted = []
404
  error_count = 0
405
  unknown_count = 0
 
414
  if item.startswith('[ERROR]'):
415
  error_count += 1
416
 
 
417
  lst_final = []
418
  for i in range(len(data_ori)):
419
  item = str(data_ori[i])
 
424
  else:
425
  lst_final.append(lst_extracted[i])
426
 
 
427
  progress(0.90, desc="📊 计算清洗后指标...")
428
  cleaned_sentences = [str(item) for item in lst_final]
429
  war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
430
  sed_cleaned = calculate_spelling_error_density(cleaned_sentences)
431
 
 
432
  delta_war = war_cleaned - war_original
433
  delta_sed = sed_cleaned - sed_original
434
 
435
  progress(0.95, desc="💾 保存结果...")
436
 
 
437
  df_cleaned = df.copy()
438
  df_cleaned[question_column + '_cleaned'] = lst_final[:len(df)]
439
 
 
440
  original_filename = os.path.basename(file_path)
441
  base_name = original_filename.replace('.parquet', '')
442
  output_filename = f"{base_name}-Denoising.parquet"
443
  output_path = os.path.join(tempfile.gettempdir(), output_filename)
444
 
 
445
  df_cleaned.to_parquet(output_path, index=False)
446
 
 
447
  log_text += f"\n\n📊 处理完成!\n"
448
  log_text += f"{'='*50}\n"
449
  log_text += f"【基础统计】\n"
 
463
  log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
464
  log_text += f"{'='*50}\n"
465
 
 
466
  preview_df = pd.DataFrame({
467
  '原始问题': [str(x)[:100] for x in data_ori[:5]],
468
  '清洗后问题': [str(x)[:100] for x in lst_final[:5]]
 
477
  error_detail = traceback.format_exc()
478
  return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
479
 
480
# ======================== 文本内容 ========================
# Static markdown copy rendered in the "About" and "Submit Results" tabs.
# NOTE: fixed two textual defects in SUBMISSION_TEXT — the truncated heading
# "联系方" (now "联系方式") and mojibake replacement characters in
# "请通过以下方���联系" (now "请通过以下方式联系").

ABOUT_TEXT = """
## 清洗流程说明

### 核心算法

1. **预处理 (process_sentence)**
   - 检测句子完整性
   - 为不完整的句子添加标记 `___`
   - 保留多行文本格式

2. **LLM清洗**
   - 使用 DeepSeek API 进行语法、拼写、空格错误修正
   - 重试机制:最多重试5次
   - 稳定的 REST API 调用

3. **格式验证 (is_valid_output)**
   - 验证输出格式正确性
   - 检查是否保留了 `___` 标记
   - 长度合理性检查

4. **后处理**
   - 提取清洗后的内容
   - 恢复原始多行格式
   - 生成 `XXX-Denoising.parquet` 文件

### 支持的数据集

- **MMLU**: 57个学科的多选题
- **GSM8K**: 数学推理题
- **ARC-Challenge**: 科学问答
- **MedMCQA**: 医学选择题
- **CoQA**: 对话问答
- 以及更多...

### 技术栈

- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
- **前端**: Gradio 4.16.0
- **数据处理**: Pandas + PyArrow (Parquet)
- **API调用**: OpenAI SDK
- **部署**: Hugging Face Spaces

### 质量指标

- **WAR (Whitespace Anomaly Rate)**: 空白符异常率
- **SED (Spelling Error Density)**: 拼写错误密度

### 使用说明

1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
2. **上传数据集**: 选择 `.parquet` 文件
3. **指定列名**: 输入包含问题的列名(通常是 `question`)
4. **调整参数**: 选择模型、temperature等
5. **开始清洗**: 点击按钮开始处理
6. **下载结果**: 下载 `XXX-Denoising.parquet` 文件

⚠️ **重要提示**:
- Demo版本限制最多处理100个样本
- 完整版本可处理数万样本
- 建议 temperature=0.1 以获得稳定输出

---

**研究生毕业论文成果展示** | Powered by DeepSeek API
"""

SUBMISSION_TEXT = """
## 提交说明

### 如何提交新的去噪结果

1. **准备数据**: 使用本系统对benchmark数据集进行去噪
2. **记录指标**: 记录ΔWAR和ΔSED指标
3. **提交PR**: 在GitHub上提交Pull Request
4. **审核**: 等待维护者审核

### 数据格式要求

提交的数据需要包含以下字段:
- Benchmark名称
- 去噪方法
- ΔWAR (%)
- ΔSED
- 下载链接

### 联系方式

如有问题,请通过以下方式联系:
- GitHub Issues
- Email: your-email@example.com
"""
572
+
573
+ # ======================== Gradio界面 ========================
574
+ demo = gr.Blocks(title="数据集清洗框架展示系统", css=custom_css)
575
+
576
+ with demo:
577
+ gr.Markdown(
578
+ """<div style="text-align: center;"><h1>⭐ 基于基准去噪框架的 <span style='color: #e6b800;'>去噪工厂</span> 展示系统</h1></div>
579
+ <br>
580
+ <p>本系统展示了基于DeepSeek-R1和WAC-GEC两种方法对主流benchmark数据集的去噪效果。通过WAR(空白符异常率)和SED(拼写错误密度)两个指标评估去噪质量。</p>
581
+ """,
582
+ elem_classes="markdown-text"
583
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
 
585
+ # 加载leaderboard数据
586
+ leaderboard_data = load_leaderboard_data()
 
587
 
588
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
589
+ # ==================== Tab 1: Evaluation Table ====================
590
+ with gr.TabItem("📊 Evaluation Table", id=0):
591
+ with gr.Column():
592
+ gr.Markdown("### 清洗效果排行榜")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
 
594
+ with gr.Row():
595
+ search_bar = gr.Textbox(
596
+ placeholder="🔍 搜索Benchmark名称并按ENTER...",
597
+ show_label=False,
598
+ elem_id="search-bar",
599
  )
600
+ filter_types = gr.Radio(
601
+ label="⏚ 筛选Benchmark类型",
602
+ choices=["all", "A", "B", "C", "D", "E"],
603
+ value="all",
604
+ elem_id="filter-columns",
605
+ )
606
+
607
+ leaderboard_table = gr.Dataframe(
608
+ value=leaderboard_data[['ID', 'Benchmark', 'ΔWAR', 'ΔSED', 'Download']],
609
+ headers=['ID', 'Benchmark', 'ΔWAR (%)', 'ΔSED', '下载'],
610
+ datatype=['number', 'str', 'number', 'number', 'markdown'],
611
+ elem_id="leaderboard-table",
612
+ interactive=False,
613
+ )
614
+
615
+ hidden_leaderboard = gr.Dataframe(
616
+ value=leaderboard_data,
617
+ visible=False
618
+ )
619
+
620
+ # 绑定搜索和筛选
621
+ search_bar.submit(
622
+ lambda df, query: search_leaderboard(df, query)[['ID', 'Benchmark', 'ΔWAR', 'ΔSED', 'Download']],
623
+ [hidden_leaderboard, search_bar],
624
+ leaderboard_table
625
+ )
626
+
627
+ filter_types.change(
628
+ lambda df, query: filter_leaderboard(df, query)[['ID', 'Benchmark', 'ΔWAR', 'ΔSED', 'Download']],
629
+ [hidden_leaderboard, filter_types],
630
+ leaderboard_table
631
+ )
632
+
633
+ gr.Markdown("""
634
+ **说明:**
635
+ - ΔWAR: 空白符异常率变化 (正值表示改善)
636
+ - ΔSED: 拼写错误密度变化 (负值表示改善)
637
+ - 绿色: 正向提升 | 红色: 负向影响
638
+ - 类型分类: A=知识问答, B=数学推理, C=医学领域, D=代码生成, E=其他
639
+ """, elem_classes="markdown-text")
640
+
641
+ # ==================== Tab 2: Performance Plot ====================
642
+ with gr.TabItem("📈 Performance Plot", id=1):
643
+ gr.Markdown("### 性能可视化分析")
644
+ gr.Markdown("**注意**: 性能图表功能开发中,敬请期待。")
645
 
646
+ # 这里可以添加性能图表
647
+ # 例如: WAR和SED的对比图、不同方法的效果对比
648
+
649
+ # ==================== Tab 3: About ====================
650
+ with gr.TabItem("📝 About", id=2):
651
+ gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
652
+
653
+ # ==================== Tab 4: Submit Results ====================
654
+ with gr.TabItem("🚀 Submit Results", id=3):
655
+ gr.Markdown("## 提交去噪结果")
656
 
657
  with gr.Row():
658
  with gr.Column():
 
689
  label="📊 处理样本数 (Demo限制)"
690
  )
691
 
692
+ clean_btn = gr.Button("🚀 开始去噪", variant="primary", size="lg")
693
 
694
  with gr.Column():
695
  output_text = gr.Textbox(
 
710
  inputs=[file_input, question_column, model_choice, temperature, max_samples],
711
  outputs=[output_text, download_file, preview_df]
712
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
 
 
 
 
 
 
714
  gr.Markdown("""
715
+ ### WAC-GEC方法 (开发中)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716
 
717
+ WAC-GEC (Whitespace Anomaly Correction - Grammar Error Correction) 方法结合了:
718
+ - 空白符异常检测与修正
719
+ - 语法错误检测与修正
720
 
721
+ 该功能即将上线,敬请期待!
722
+ """, elem_classes="markdown-text")
723
 
 
724
if __name__ == "__main__":
    # Bind to 0.0.0.0 so the server is reachable from outside the
    # container; 7860 is the default port Hugging Face Spaces routes to.
    demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)