lllouo committed on
Commit
ba32277
·
1 Parent(s): cce4575

update filter_categories

Browse files
Files changed (2) hide show
  1. app.py +50 -10
  2. leaderboard.json +14 -14
app.py CHANGED
@@ -436,11 +436,26 @@ def load_leaderboard_data():
436
  print(f"Error loading leaderboard: {e}")
437
  return pd.DataFrame()
438
 
439
- def filter_leaderboard(df, query):
440
- if query == "all":
441
- return df
442
- else:
443
- return df[df['Category'] == query]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
  def search_leaderboard(df, query):
446
  if not query:
@@ -738,6 +753,17 @@ with demo:
738
  value="all",
739
  elem_id="filter-columns",
740
  )
 
 
 
 
 
 
 
 
 
 
 
741
 
742
  leaderboard_table = gr.Dataframe(
743
  value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
@@ -752,23 +778,37 @@ with demo:
752
  visible=False
753
  )
754
 
 
755
  search_bar.submit(
756
  lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
757
  [hidden_leaderboard, search_bar],
758
  leaderboard_table
759
  )
760
 
 
 
 
 
 
761
  filter_categories.change(
762
- lambda df, query: filter_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
763
- [hidden_leaderboard, filter_categories],
 
 
 
 
 
 
 
764
  leaderboard_table
765
  )
766
 
767
  gr.Markdown("""
768
  **说明:**
769
  - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
770
- - **WAR**: 空白符异常率
771
- - **SED**: 拼写错误密度
 
772
  """, elem_classes="markdown-text")
773
 
774
  with gr.TabItem("📈 Performance Plot", id=1):
@@ -867,7 +907,7 @@ with demo:
867
 
868
  if __name__ == "__main__":
869
  # 可选:预加载模型(会增加启动时间)
870
- # 如果想要预加载取消下面两行的注释
871
  print("🚀 预加载WAC-GEC模型...")
872
  initialize_wac_gec()
873
 
 
436
  print(f"Error loading leaderboard: {e}")
437
  return pd.DataFrame()
438
 
439
def filter_leaderboard(df, category_query, version_query):
    """Filter the leaderboard by category and by dataset version at once.

    Args:
        df: Leaderboard DataFrame. When non-empty it must provide the
            ``Category`` and ``Benchmark`` columns.
        category_query: Category code to keep, or ``"all"`` to skip the
            category filter.
        version_query: ``"original"``, ``"deepseek"`` or ``"wac_gec"`` to keep
            only benchmarks whose name contains the matching marker
            (case-insensitive). ``"all"`` or any unrecognised value applies
            no version filter.

    Returns:
        A new DataFrame holding the matching rows; ``df`` itself is never
        modified.
    """
    # An empty frame may have no columns at all, in which case the column
    # lookups below would raise KeyError. There is nothing to filter anyway.
    if df.empty:
        return df.copy()

    # Substring that identifies each dataset version inside the benchmark name.
    version_markers = {
        "original": "_original",
        "deepseek": "deepseek_r1_denoising",
        "wac_gec": "wac_gec",
    }

    result = df.copy()

    if category_query != "all":
        result = result[result["Category"] == category_query]

    marker = version_markers.get(version_query)
    if marker is not None:
        # regex=False: the markers are literal text, not patterns.
        # na=False: rows with a missing benchmark name never match.
        matches = result["Benchmark"].str.contains(
            marker, case=False, na=False, regex=False
        )
        result = result[matches]

    return result
459
 
460
  def search_leaderboard(df, query):
461
  if not query:
 
753
  value="all",
754
  elem_id="filter-columns",
755
  )
756
+ filter_versions = gr.Radio(
757
+ label="🔖 筛选数据集版本",
758
+ choices=[
759
+ ("全部版本", "all"),
760
+ ("原始数据集", "original"),
761
+ ("DeepSeek-R1去噪", "deepseek"),
762
+ ("WAC-GEC去噪", "wac_gec")
763
+ ],
764
+ value="all",
765
+ elem_id="filter-versions",
766
+ )
767
 
768
  leaderboard_table = gr.Dataframe(
769
  value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
 
778
  visible=False
779
  )
780
 
781
+ # 搜索功能
782
  search_bar.submit(
783
  lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
784
  [hidden_leaderboard, search_bar],
785
  leaderboard_table
786
  )
787
 
788
+ # 类别筛选功能(需要考虑版本筛选)
789
def combined_filter(df, category, version):
    """Apply the category and version filters, then keep only the columns shown in the table."""
    visible_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']
    return filter_leaderboard(df, category, version)[visible_columns]
792
+
793
  filter_categories.change(
794
+ combined_filter,
795
+ [hidden_leaderboard, filter_categories, filter_versions],
796
+ leaderboard_table
797
+ )
798
+
799
+ # 版本筛选功能(需要考虑类别筛选)
800
+ filter_versions.change(
801
+ combined_filter,
802
+ [hidden_leaderboard, filter_categories, filter_versions],
803
  leaderboard_table
804
  )
805
 
806
  gr.Markdown("""
807
  **说明:**
808
  - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
809
+ - **Version**: 原始=未处理数据集, DeepSeek-R1=DeepSeek去噪版本, WAC-GEC=WAC-GEC去噪版本
810
+ - **WAR**: 空白符异常率(越低越好)
811
+ - **SED**: 拼写错误密度(越低越好)
812
  """, elem_classes="markdown-text")
813
 
814
  with gr.TabItem("📈 Performance Plot", id=1):
 
907
 
908
  if __name__ == "__main__":
909
  # 可选:预加载模型(会增加启动时间)
910
+ # 预加载当前已启用;如果不需要预加载,请注释掉下面两行
911
  print("🚀 预加载WAC-GEC模型...")
912
  initialize_wac_gec()
913
 
leaderboard.json CHANGED
@@ -2,7 +2,7 @@
2
  {
3
  "ID": 1,
4
  "Category": "RA",
5
- "Benchmark": "ARC",
6
  "WAR": 0.11,
7
  "SED": 0.67,
8
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
@@ -26,7 +26,7 @@
26
  {
27
  "ID": 4,
28
  "Category": "TG",
29
- "Benchmark": "COQA",
30
  "WAR": 6.79,
31
  "SED": 2.74,
32
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
@@ -50,7 +50,7 @@
50
  {
51
  "ID": 7,
52
  "Category": "TG",
53
- "Benchmark": "DROP",
54
  "WAR": 1.50,
55
  "SED": 3.38,
56
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
@@ -74,7 +74,7 @@
74
  {
75
  "ID": 10,
76
  "Category": "BT",
77
- "Benchmark": "MRPC",
78
  "WAR": 100.00,
79
  "SED": 5.65,
80
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
@@ -98,7 +98,7 @@
98
  {
99
  "ID": 13,
100
  "Category": "BT",
101
- "Benchmark": "RTE",
102
  "WAR": 2.17,
103
  "SED": 4.47,
104
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
@@ -122,7 +122,7 @@
122
  {
123
  "ID": 16,
124
  "Category": "BT",
125
- "Benchmark": "SST2",
126
  "WAR": 98.97,
127
  "SED": 5.42,
128
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
@@ -146,7 +146,7 @@
146
  {
147
  "ID": 19,
148
  "Category": "SU",
149
- "Benchmark": "WNLI",
150
  "WAR": 0.70,
151
  "SED": 0.64,
152
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
@@ -170,7 +170,7 @@
170
  {
171
  "ID": 22,
172
  "Category": "RA",
173
- "Benchmark": "GSM8K",
174
  "WAR": 25.70,
175
  "SED": 1.11,
176
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
@@ -194,7 +194,7 @@
194
  {
195
  "ID": 25,
196
  "Category": "RA",
197
- "Benchmark": "MMLU",
198
  "WAR": 10.06,
199
  "SED": 2.21,
200
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
@@ -218,7 +218,7 @@
218
  {
219
  "ID": 28,
220
  "Category": "ME",
221
- "Benchmark": "MedMCQA",
222
  "WAR": 6.31,
223
  "SED": 6.18,
224
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
@@ -242,7 +242,7 @@
242
  {
243
  "ID": 31,
244
  "Category": "ME",
245
- "Benchmark": "MedQA",
246
  "WAR": 16.97,
247
  "SED": 6.49,
248
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
@@ -266,7 +266,7 @@
266
  {
267
  "ID": 34,
268
  "Category": "SU",
269
- "Benchmark": "Natural_questions",
270
  "WAR": 0.17,
271
  "SED": 2.90,
272
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
@@ -290,7 +290,7 @@
290
  {
291
  "ID": 37,
292
  "Category": "ME",
293
- "Benchmark": "PubMedQA",
294
  "WAR": 0.60,
295
  "SED": 8.15,
296
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
@@ -314,7 +314,7 @@
314
  {
315
  "ID": 40,
316
  "Category": "TG",
317
- "Benchmark": "Truthful_QA",
318
  "WAR": 0.00,
319
  "SED": 1.75,
320
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"
 
2
  {
3
  "ID": 1,
4
  "Category": "RA",
5
+ "Benchmark": "ARC_original",
6
  "WAR": 0.11,
7
  "SED": 0.67,
8
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
 
26
  {
27
  "ID": 4,
28
  "Category": "TG",
29
+ "Benchmark": "COQA_original",
30
  "WAR": 6.79,
31
  "SED": 2.74,
32
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
 
50
  {
51
  "ID": 7,
52
  "Category": "TG",
53
+ "Benchmark": "DROP_original",
54
  "WAR": 1.50,
55
  "SED": 3.38,
56
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
 
74
  {
75
  "ID": 10,
76
  "Category": "BT",
77
+ "Benchmark": "MRPC_original",
78
  "WAR": 100.00,
79
  "SED": 5.65,
80
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
 
98
  {
99
  "ID": 13,
100
  "Category": "BT",
101
+ "Benchmark": "RTE_original",
102
  "WAR": 2.17,
103
  "SED": 4.47,
104
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
 
122
  {
123
  "ID": 16,
124
  "Category": "BT",
125
+ "Benchmark": "SST2_original",
126
  "WAR": 98.97,
127
  "SED": 5.42,
128
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
 
146
  {
147
  "ID": 19,
148
  "Category": "SU",
149
+ "Benchmark": "WNLI_original",
150
  "WAR": 0.70,
151
  "SED": 0.64,
152
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
 
170
  {
171
  "ID": 22,
172
  "Category": "RA",
173
+ "Benchmark": "GSM8K_original",
174
  "WAR": 25.70,
175
  "SED": 1.11,
176
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
 
194
  {
195
  "ID": 25,
196
  "Category": "RA",
197
+ "Benchmark": "MMLU_original",
198
  "WAR": 10.06,
199
  "SED": 2.21,
200
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
 
218
  {
219
  "ID": 28,
220
  "Category": "ME",
221
+ "Benchmark": "MedMCQA_original",
222
  "WAR": 6.31,
223
  "SED": 6.18,
224
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
 
242
  {
243
  "ID": 31,
244
  "Category": "ME",
245
+ "Benchmark": "MedQA_original",
246
  "WAR": 16.97,
247
  "SED": 6.49,
248
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
 
266
  {
267
  "ID": 34,
268
  "Category": "SU",
269
+ "Benchmark": "Natural_questions_original",
270
  "WAR": 0.17,
271
  "SED": 2.90,
272
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
 
290
  {
291
  "ID": 37,
292
  "Category": "ME",
293
+ "Benchmark": "PubMedQA_original",
294
  "WAR": 0.60,
295
  "SED": 8.15,
296
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
 
314
  {
315
  "ID": 40,
316
  "Category": "TG",
317
+ "Benchmark": "Truthful_QA_original",
318
  "WAR": 0.00,
319
  "SED": 1.75,
320
  "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"