lllouo commited on
Commit
27ccef7
·
1 Parent(s): 3fe659b
Files changed (2) hide show
  1. app.py +29 -32
  2. leaderboard.json +40 -12
app.py CHANGED
@@ -164,26 +164,13 @@ def calculate_spelling_error_density(sentences):
164
 
165
  # ======================== Leaderboard数据处理 ========================
166
  def load_leaderboard_data():
167
- """从JSON加载Leaderboard数据并添加类型分类"""
168
  json_path = "leaderboard.json"
169
  try:
170
  with open(json_path, 'r', encoding='utf-8') as f:
171
  data = json.load(f)
172
 
173
- # 类型分类 (示例分类规则)
174
- for item in data:
175
- benchmark_name = item['Benchmark'].lower()
176
- if 'mmlu' in benchmark_name or 'arc' in benchmark_name:
177
- item['Type'] = 'A'
178
- elif 'gsm' in benchmark_name or 'math' in benchmark_name:
179
- item['Type'] = 'B'
180
- elif 'med' in benchmark_name or 'bio' in benchmark_name:
181
- item['Type'] = 'C'
182
- elif 'code' in benchmark_name or 'human' in benchmark_name:
183
- item['Type'] = 'D'
184
- else:
185
- item['Type'] = 'E'
186
-
187
  return pd.DataFrame(data)
188
  except Exception as e:
189
  print(f"Error loading leaderboard: {e}")
@@ -197,11 +184,11 @@ def make_clickable_download(download_text):
197
  return download_text
198
 
199
  def filter_leaderboard(df, query):
200
- """根据类型筛选Leaderboard"""
201
  if query == "all":
202
  return df
203
  else:
204
- return df[df['Type'] == query]
205
 
206
  def search_leaderboard(df, query):
207
  """搜索Leaderboard"""
@@ -395,6 +382,15 @@ ABOUT_TEXT = """
395
  - **WAR (Whitespace Anomaly Rate)**: 空白符异常率
396
  - **SED (Spelling Error Density)**: 拼写错误密度
397
 
 
 
 
 
 
 
 
 
 
398
  ### 使用说明
399
 
400
  1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
@@ -427,11 +423,12 @@ SUBMISSION_TEXT = """
427
  ### 数据格式要求
428
 
429
  提交的数据需要包含以下字段:
 
 
430
  - Benchmark名称
431
- - 去噪方法
432
  - WAR (%)
433
  - SED
434
- - 下载链接
435
 
436
  ### 联系方式
437
 
@@ -467,17 +464,17 @@ with demo:
467
  show_label=False,
468
  elem_id="search-bar",
469
  )
470
- filter_types = gr.Radio(
471
- label=" 筛选Benchmark类型",
472
- choices=["all", "A", "B", "C", "D", "E"],
473
  value="all",
474
  elem_id="filter-columns",
475
  )
476
 
477
  leaderboard_table = gr.Dataframe(
478
- value=leaderboard_data[['ID', 'Benchmark', 'WAR', 'SED', 'Download']],
479
- headers=['ID', 'Benchmark', 'WAR (%)', 'SED', '下载'],
480
- datatype=['number', 'str', 'number', 'number', 'markdown'],
481
  elem_id="leaderboard-table",
482
  interactive=False,
483
  )
@@ -489,23 +486,23 @@ with demo:
489
 
490
  # 绑定搜索和筛选
491
  search_bar.submit(
492
- lambda df, query: search_leaderboard(df, query)[['ID', 'Benchmark', 'WAR', 'SED', 'Download']],
493
  [hidden_leaderboard, search_bar],
494
  leaderboard_table
495
  )
496
 
497
- filter_types.change(
498
- lambda df, query: filter_leaderboard(df, query)[['ID', 'Benchmark', 'WAR', 'SED', 'Download']],
499
- [hidden_leaderboard, filter_types],
500
  leaderboard_table
501
  )
502
 
503
  gr.Markdown("""
504
  **说明:**
505
- - WAR: 空白符异常率变化 (正值表示改善)
506
- - SED: 拼写错误密度变化 (负值表示改善)
 
507
  - 绿色: 正向提升 | 红色: 负向影响
508
- - 类型分类: A=知识问答, B=数学推理, C=医学领域, D=代码生成, E=其他
509
  """, elem_classes="markdown-text")
510
 
511
  # ==================== Tab 2: Performance Plot ====================
 
164
 
165
  # ======================== Leaderboard数据处理 ========================
166
  def load_leaderboard_data():
167
+ """从JSON加载Leaderboard数据(现在包含Category字段)"""
168
  json_path = "leaderboard.json"
169
  try:
170
  with open(json_path, 'r', encoding='utf-8') as f:
171
  data = json.load(f)
172
 
173
+ # Category已经在JSON中定义,直接加载即可
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  return pd.DataFrame(data)
175
  except Exception as e:
176
  print(f"Error loading leaderboard: {e}")
 
184
  return download_text
185
 
186
  def filter_leaderboard(df, query):
187
+ """根据Category筛选Leaderboard"""
188
  if query == "all":
189
  return df
190
  else:
191
+ return df[df['Category'] == query]
192
 
193
  def search_leaderboard(df, query):
194
  """搜索Leaderboard"""
 
382
  - **WAR (Whitespace Anomaly Rate)**: 空白符异常率
383
  - **SED (Spelling Error Density)**: 拼写错误密度
384
 
385
+ ### 数据集分类
386
+
387
+ - **BT (Basic Tasks)**: 基础任务 - MRPC, RTE, SST2
388
+ - **RA (Reasoning Abilities)**: 推理能力 - ARC, GSM8K, MMLU
389
+ - **TG (Text Generation)**: 文本生成 - CoQA, DROP, Truthful_QA
390
+ - **SU (Speech Understanding)**: 语音理解 - WNLI, Natural_questions
391
+ - **ME (Medical)**: 医学领域 - MedMCQA, MedQA, PubMedQA
392
+ - **GR (Grammatical)**: 语法领域 - BEA-2019, CoNLL-2014
393
+
394
  ### 使用说明
395
 
396
  1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
 
423
  ### 数据格式要求
424
 
425
  提交的数据需要包含以下字段:
426
+ - ID: 序号
427
+ - Category: 类别 (BT/RA/TG/SU/ME/GR)
428
  - Benchmark名称
 
429
  - WAR (%)
430
  - SED
431
+ - Download: 下载链接
432
 
433
  ### 联系方式
434
 
 
464
  show_label=False,
465
  elem_id="search-bar",
466
  )
467
+ filter_categories = gr.Radio(
468
+ label="📂 筛选Benchmark类别",
469
+ choices=["all", "BT", "RA", "TG", "SU", "ME", "GR"],
470
  value="all",
471
  elem_id="filter-columns",
472
  )
473
 
474
  leaderboard_table = gr.Dataframe(
475
+ value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
476
+ headers=['ID', 'Category', 'Benchmark', 'WAR (%)', 'SED', '下载'],
477
+ datatype=['number', 'str', 'str', 'number', 'number', 'markdown'],
478
  elem_id="leaderboard-table",
479
  interactive=False,
480
  )
 
486
 
487
  # 绑定搜索和筛选
488
  search_bar.submit(
489
+ lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
490
  [hidden_leaderboard, search_bar],
491
  leaderboard_table
492
  )
493
 
494
+ filter_categories.change(
495
+ lambda df, query: filter_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
496
+ [hidden_leaderboard, filter_categories],
497
  leaderboard_table
498
  )
499
 
500
  gr.Markdown("""
501
  **说明:**
502
+ - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
503
+ - **WAR**: 空白符异常率变化 (正值表示改善)
504
+ - **SED**: 拼写错误密度变化 (负值表示改善)
505
  - 绿色: 正向提升 | 红色: 负向影响
 
506
  """, elem_classes="markdown-text")
507
 
508
  # ==================== Tab 2: Performance Plot ====================
leaderboard.json CHANGED
@@ -1,6 +1,7 @@
1
  [
2
  {
3
  "ID": 1,
 
4
  "Benchmark": "ARC_deepseek_r1_denoising",
5
  "WAR": 0.00,
6
  "SED": 0.67,
@@ -8,6 +9,7 @@
8
  },
9
  {
10
  "ID": 2,
 
11
  "Benchmark": "ARC_wac_gec",
12
  "WAR": 0.00,
13
  "SED": 0.66,
@@ -15,6 +17,7 @@
15
  },
16
  {
17
  "ID": 3,
 
18
  "Benchmark": "COQA_deepseek_r1_denoising",
19
  "WAR": 4.18,
20
  "SED": 2.57,
@@ -22,6 +25,7 @@
22
  },
23
  {
24
  "ID": 4,
 
25
  "Benchmark": "COQA_wac_gec",
26
  "WAR": 4.70,
27
  "SED": 2.56,
@@ -29,6 +33,7 @@
29
  },
30
  {
31
  "ID": 5,
 
32
  "Benchmark": "DROP_deepseek_r1_denoising",
33
  "WAR": 0.02,
34
  "SED": 3.24,
@@ -36,6 +41,7 @@
36
  },
37
  {
38
  "ID": 6,
 
39
  "Benchmark": "DROP_wac_gec",
40
  "WAR": 0.64,
41
  "SED": 3.25,
@@ -43,27 +49,31 @@
43
  },
44
  {
45
  "ID": 7,
 
46
  "Benchmark": "MRPC_deepseek_r1_denoising",
47
  "WAR": 3.80,
48
  "SED": 4.70,
49
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/mrpc)"
50
  },
51
- {
52
  "ID": 8,
 
53
  "Benchmark": "MRPC_wac_gec",
54
  "WAR": 1.84,
55
  "SED": 4.50,
56
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/mrpc)"
57
  },
58
- {
59
  "ID": 9,
 
60
  "Benchmark": "RTE_deepseek_r1_denoising",
61
  "WAR": 0.36,
62
  "SED": 4.50,
63
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/rte)"
64
  },
65
- {
66
  "ID": 10,
 
67
  "Benchmark": "RTE_wac_gec",
68
  "WAR": 0.72,
69
  "SED": 4.43,
@@ -71,13 +81,15 @@
71
  },
72
  {
73
  "ID": 11,
 
74
  "Benchmark": "SST2_deepseek_r1_denoising",
75
  "WAR": 7.22,
76
  "SED": 3.66,
77
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/sst2)"
78
  },
79
- {
80
  "ID": 12,
 
81
  "Benchmark": "SST2_wac_gec",
82
  "WAR": 5.39,
83
  "SED": 3.52,
@@ -85,13 +97,15 @@
85
  },
86
  {
87
  "ID": 13,
 
88
  "Benchmark": "WNLI_deepseek_r1_denoising",
89
  "WAR": 0.00,
90
  "SED": 0.59,
91
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/wnli)"
92
  },
93
- {
94
  "ID": 14,
 
95
  "Benchmark": "WNLI_wac_gec",
96
  "WAR": 0.00,
97
  "SED": 0.64,
@@ -99,13 +113,15 @@
99
  },
100
  {
101
  "ID": 15,
 
102
  "Benchmark": "GSM8K_deepseek_r1_denoising",
103
  "WAR": 0.30,
104
  "SED": 1.13,
105
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k_deepseek_r1_denoising)"
106
  },
107
- {
108
  "ID": 16,
 
109
  "Benchmark": "GSM8K_wac_gec",
110
  "WAR": 1.97,
111
  "SED": 1.11,
@@ -113,13 +129,15 @@
113
  },
114
  {
115
  "ID": 17,
 
116
  "Benchmark": "MMLU_deepseek_r1_denoising",
117
  "WAR": 6.56,
118
  "SED": 2.15,
119
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu_deepseek_r1_denoising)"
120
  },
121
- {
122
  "ID": 18,
 
123
  "Benchmark": "MMLU_wac_gec",
124
  "WAR": 2.98,
125
  "SED": 2.08,
@@ -127,13 +145,15 @@
127
  },
128
  {
129
  "ID": 19,
 
130
  "Benchmark": "MedMCQA_deepseek_r1_denoising",
131
  "WAR": 3.44,
132
  "SED": 5.70,
133
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa_deepseek_r1_denoising)"
134
  },
135
- {
136
  "ID": 20,
 
137
  "Benchmark": "MedMCQA_wac_gec",
138
  "WAR": 2.44,
139
  "SED": 5.91,
@@ -141,13 +161,15 @@
141
  },
142
  {
143
  "ID": 21,
 
144
  "Benchmark": "MedQA_deepseek_r1_denoising",
145
  "WAR": 16.26,
146
  "SED": 6.49,
147
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA_deepseek_r1_denoising)"
148
  },
149
- {
150
  "ID": 22,
 
151
  "Benchmark": "MedQA_wac_gec",
152
  "WAR": 0.79,
153
  "SED": 6.51,
@@ -155,13 +177,15 @@
155
  },
156
  {
157
  "ID": 23,
 
158
  "Benchmark": "Natural_questions_deepseek_r1_denoising",
159
  "WAR": 0.06,
160
  "SED": 3.06,
161
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open_deepseek_r1_denoising)"
162
  },
163
- {
164
  "ID": 24,
 
165
  "Benchmark": "Natural_questions_wac_gec",
166
  "WAR": 0.28,
167
  "SED": 2.93,
@@ -169,13 +193,15 @@
169
  },
170
  {
171
  "ID": 25,
 
172
  "Benchmark": "PubMedQA_deepseek_r1_denoising",
173
  "WAR": 0.20,
174
  "SED": 8.19,
175
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa_deepseek_r1_denoising)"
176
  },
177
- {
178
  "ID": 26,
 
179
  "Benchmark": "PubMedQA_wac_gec",
180
  "WAR": 0.00,
181
  "SED": 8.10,
@@ -183,13 +209,15 @@
183
  },
184
  {
185
  "ID": 27,
 
186
  "Benchmark": "Truthful_QA_deepseek_r1_denoising",
187
  "WAR": 0.00,
188
  "SED": 1.73,
189
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa_deepseek_r1_denoising)"
190
  },
191
- {
192
  "ID": 28,
 
193
  "Benchmark": "Truthful_QA_wac_gec",
194
  "WAR": 0.00,
195
  "SED": 1.53,
 
1
  [
2
  {
3
  "ID": 1,
4
+ "Category": "RA",
5
  "Benchmark": "ARC_deepseek_r1_denoising",
6
  "WAR": 0.00,
7
  "SED": 0.67,
 
9
  },
10
  {
11
  "ID": 2,
12
+ "Category": "RA",
13
  "Benchmark": "ARC_wac_gec",
14
  "WAR": 0.00,
15
  "SED": 0.66,
 
17
  },
18
  {
19
  "ID": 3,
20
+ "Category": "TG",
21
  "Benchmark": "COQA_deepseek_r1_denoising",
22
  "WAR": 4.18,
23
  "SED": 2.57,
 
25
  },
26
  {
27
  "ID": 4,
28
+ "Category": "TG",
29
  "Benchmark": "COQA_wac_gec",
30
  "WAR": 4.70,
31
  "SED": 2.56,
 
33
  },
34
  {
35
  "ID": 5,
36
+ "Category": "TG",
37
  "Benchmark": "DROP_deepseek_r1_denoising",
38
  "WAR": 0.02,
39
  "SED": 3.24,
 
41
  },
42
  {
43
  "ID": 6,
44
+ "Category": "TG",
45
  "Benchmark": "DROP_wac_gec",
46
  "WAR": 0.64,
47
  "SED": 3.25,
 
49
  },
50
  {
51
  "ID": 7,
52
+ "Category": "BT",
53
  "Benchmark": "MRPC_deepseek_r1_denoising",
54
  "WAR": 3.80,
55
  "SED": 4.70,
56
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/mrpc)"
57
  },
58
+ {
59
  "ID": 8,
60
+ "Category": "BT",
61
  "Benchmark": "MRPC_wac_gec",
62
  "WAR": 1.84,
63
  "SED": 4.50,
64
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_wac_gec/mrpc)"
65
  },
66
+ {
67
  "ID": 9,
68
+ "Category": "BT",
69
  "Benchmark": "RTE_deepseek_r1_denoising",
70
  "WAR": 0.36,
71
  "SED": 4.50,
72
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/rte)"
73
  },
74
+ {
75
  "ID": 10,
76
+ "Category": "BT",
77
  "Benchmark": "RTE_wac_gec",
78
  "WAR": 0.72,
79
  "SED": 4.43,
 
81
  },
82
  {
83
  "ID": 11,
84
+ "Category": "BT",
85
  "Benchmark": "SST2_deepseek_r1_denoising",
86
  "WAR": 7.22,
87
  "SED": 3.66,
88
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/sst2)"
89
  },
90
+ {
91
  "ID": 12,
92
+ "Category": "BT",
93
  "Benchmark": "SST2_wac_gec",
94
  "WAR": 5.39,
95
  "SED": 3.52,
 
97
  },
98
  {
99
  "ID": 13,
100
+ "Category": "SU",
101
  "Benchmark": "WNLI_deepseek_r1_denoising",
102
  "WAR": 0.00,
103
  "SED": 0.59,
104
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue_deepseek_r1_denoising/wnli)"
105
  },
106
+ {
107
  "ID": 14,
108
+ "Category": "SU",
109
  "Benchmark": "WNLI_wac_gec",
110
  "WAR": 0.00,
111
  "SED": 0.64,
 
113
  },
114
  {
115
  "ID": 15,
116
+ "Category": "RA",
117
  "Benchmark": "GSM8K_deepseek_r1_denoising",
118
  "WAR": 0.30,
119
  "SED": 1.13,
120
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k_deepseek_r1_denoising)"
121
  },
122
+ {
123
  "ID": 16,
124
+ "Category": "RA",
125
  "Benchmark": "GSM8K_wac_gec",
126
  "WAR": 1.97,
127
  "SED": 1.11,
 
129
  },
130
  {
131
  "ID": 17,
132
+ "Category": "RA",
133
  "Benchmark": "MMLU_deepseek_r1_denoising",
134
  "WAR": 6.56,
135
  "SED": 2.15,
136
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu_deepseek_r1_denoising)"
137
  },
138
+ {
139
  "ID": 18,
140
+ "Category": "RA",
141
  "Benchmark": "MMLU_wac_gec",
142
  "WAR": 2.98,
143
  "SED": 2.08,
 
145
  },
146
  {
147
  "ID": 19,
148
+ "Category": "ME",
149
  "Benchmark": "MedMCQA_deepseek_r1_denoising",
150
  "WAR": 3.44,
151
  "SED": 5.70,
152
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa_deepseek_r1_denoising)"
153
  },
154
+ {
155
  "ID": 20,
156
+ "Category": "ME",
157
  "Benchmark": "MedMCQA_wac_gec",
158
  "WAR": 2.44,
159
  "SED": 5.91,
 
161
  },
162
  {
163
  "ID": 21,
164
+ "Category": "ME",
165
  "Benchmark": "MedQA_deepseek_r1_denoising",
166
  "WAR": 16.26,
167
  "SED": 6.49,
168
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA_deepseek_r1_denoising)"
169
  },
170
+ {
171
  "ID": 22,
172
+ "Category": "ME",
173
  "Benchmark": "MedQA_wac_gec",
174
  "WAR": 0.79,
175
  "SED": 6.51,
 
177
  },
178
  {
179
  "ID": 23,
180
+ "Category": "SU",
181
  "Benchmark": "Natural_questions_deepseek_r1_denoising",
182
  "WAR": 0.06,
183
  "SED": 3.06,
184
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open_deepseek_r1_denoising)"
185
  },
186
+ {
187
  "ID": 24,
188
+ "Category": "SU",
189
  "Benchmark": "Natural_questions_wac_gec",
190
  "WAR": 0.28,
191
  "SED": 2.93,
 
193
  },
194
  {
195
  "ID": 25,
196
+ "Category": "ME",
197
  "Benchmark": "PubMedQA_deepseek_r1_denoising",
198
  "WAR": 0.20,
199
  "SED": 8.19,
200
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa_deepseek_r1_denoising)"
201
  },
202
+ {
203
  "ID": 26,
204
+ "Category": "ME",
205
  "Benchmark": "PubMedQA_wac_gec",
206
  "WAR": 0.00,
207
  "SED": 8.10,
 
209
  },
210
  {
211
  "ID": 27,
212
+ "Category": "TG",
213
  "Benchmark": "Truthful_QA_deepseek_r1_denoising",
214
  "WAR": 0.00,
215
  "SED": 1.73,
216
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa_deepseek_r1_denoising)"
217
  },
218
+ {
219
  "ID": 28,
220
+ "Category": "TG",
221
  "Benchmark": "Truthful_QA_wac_gec",
222
  "WAR": 0.00,
223
  "SED": 1.53,