Spaces:
Sleeping
Sleeping
update filter_categories
Browse files- app.py +50 -10
- leaderboard.json +14 -14
app.py
CHANGED
|
@@ -436,11 +436,26 @@ def load_leaderboard_data():
|
|
| 436 |
print(f"Error loading leaderboard: {e}")
|
| 437 |
return pd.DataFrame()
|
| 438 |
|
| 439 |
-
def filter_leaderboard(df,
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
def search_leaderboard(df, query):
|
| 446 |
if not query:
|
|
@@ -738,6 +753,17 @@ with demo:
|
|
| 738 |
value="all",
|
| 739 |
elem_id="filter-columns",
|
| 740 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
|
| 742 |
leaderboard_table = gr.Dataframe(
|
| 743 |
value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
|
|
@@ -752,23 +778,37 @@ with demo:
|
|
| 752 |
visible=False
|
| 753 |
)
|
| 754 |
|
|
|
|
| 755 |
search_bar.submit(
|
| 756 |
lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
|
| 757 |
[hidden_leaderboard, search_bar],
|
| 758 |
leaderboard_table
|
| 759 |
)
|
| 760 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
filter_categories.change(
|
| 762 |
-
|
| 763 |
-
[hidden_leaderboard, filter_categories],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
leaderboard_table
|
| 765 |
)
|
| 766 |
|
| 767 |
gr.Markdown("""
|
| 768 |
**说明:**
|
| 769 |
- **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
|
| 770 |
-
- **
|
| 771 |
-
- **
|
|
|
|
| 772 |
""", elem_classes="markdown-text")
|
| 773 |
|
| 774 |
with gr.TabItem("📈 Performance Plot", id=1):
|
|
@@ -867,7 +907,7 @@ with demo:
|
|
| 867 |
|
| 868 |
if __name__ == "__main__":
|
| 869 |
# 可选:预加载模型(会增加启动时间)
|
| 870 |
-
# 如果想要预加载
|
| 871 |
print("🚀 预加载WAC-GEC模型...")
|
| 872 |
initialize_wac_gec()
|
| 873 |
|
|
|
|
| 436 |
print(f"Error loading leaderboard: {e}")
|
| 437 |
return pd.DataFrame()
|
| 438 |
|
| 439 |
+
def filter_leaderboard(df, category_query, version_query):
    """Filter the leaderboard by category and by dataset version at once.

    Args:
        df: Leaderboard DataFrame. Rows are matched on its ``Category`` and
            ``Benchmark`` columns.
        category_query: Exact ``Category`` value to keep, or ``"all"`` to
            skip category filtering.
        version_query: One of ``"all"``, ``"original"``, ``"deepseek"`` or
            ``"wac_gec"``. Any other value leaves the version unfiltered.

    Returns:
        A new DataFrame holding the matching rows; ``df`` itself is never
        modified. If a requested filter needs a column that ``df`` does not
        have (e.g. the empty frame produced when loading failed), an empty
        frame is returned instead of raising ``KeyError``.
    """
    # Literal substring of ``Benchmark`` that identifies each dataset version.
    version_markers = {
        "original": "_original",
        "deepseek": "deepseek_r1_denoising",
        "wac_gec": "wac_gec",
    }

    result = df.copy()

    # Category filter: exact match on the ``Category`` column.
    if category_query != "all":
        if "Category" not in result.columns:
            # Nothing can match a column that does not exist.
            return result.iloc[0:0]
        result = result[result["Category"] == category_query]

    # Version filter: case-insensitive substring match on ``Benchmark``.
    # "all" and unknown values have no marker and therefore skip this step.
    marker = version_markers.get(version_query)
    if marker is not None:
        if "Benchmark" not in result.columns:
            return result.iloc[0:0]
        # regex=False: the markers are plain text, not patterns.
        matches = result["Benchmark"].str.contains(
            marker, case=False, na=False, regex=False
        )
        result = result[matches]

    return result
|
| 459 |
|
| 460 |
def search_leaderboard(df, query):
|
| 461 |
if not query:
|
|
|
|
| 753 |
value="all",
|
| 754 |
elem_id="filter-columns",
|
| 755 |
)
|
| 756 |
+
filter_versions = gr.Radio(
|
| 757 |
+
label="🔖 筛选数据集版本",
|
| 758 |
+
choices=[
|
| 759 |
+
("全部版本", "all"),
|
| 760 |
+
("原始数据集", "original"),
|
| 761 |
+
("DeepSeek-R1去噪", "deepseek"),
|
| 762 |
+
("WAC-GEC去噪", "wac_gec")
|
| 763 |
+
],
|
| 764 |
+
value="all",
|
| 765 |
+
elem_id="filter-versions",
|
| 766 |
+
)
|
| 767 |
|
| 768 |
leaderboard_table = gr.Dataframe(
|
| 769 |
value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
|
|
|
|
| 778 |
visible=False
|
| 779 |
)
|
| 780 |
|
| 781 |
+
# 搜索功能
|
| 782 |
search_bar.submit(
|
| 783 |
lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
|
| 784 |
[hidden_leaderboard, search_bar],
|
| 785 |
leaderboard_table
|
| 786 |
)
|
| 787 |
|
| 788 |
+
# 类别筛选功能(需要考虑版本筛选)
|
| 789 |
+
def combined_filter(df, category, version):
    """Apply the category and version filters, then keep only the columns shown in the table."""
    display_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']
    return filter_leaderboard(df, category, version)[display_columns]
|
| 792 |
+
|
| 793 |
filter_categories.change(
|
| 794 |
+
combined_filter,
|
| 795 |
+
[hidden_leaderboard, filter_categories, filter_versions],
|
| 796 |
+
leaderboard_table
|
| 797 |
+
)
|
| 798 |
+
|
| 799 |
+
# 版本筛选功能(需要考虑类别筛选)
|
| 800 |
+
filter_versions.change(
|
| 801 |
+
combined_filter,
|
| 802 |
+
[hidden_leaderboard, filter_categories, filter_versions],
|
| 803 |
leaderboard_table
|
| 804 |
)
|
| 805 |
|
| 806 |
gr.Markdown("""
|
| 807 |
**说明:**
|
| 808 |
- **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
|
| 809 |
+
- **Version**: 原始=未处理数据集, DeepSeek-R1=DeepSeek去噪版本, WAC-GEC=WAC-GEC去噪版本
|
| 810 |
+
- **WAR**: 空白符异常率(越低越好)
|
| 811 |
+
- **SED**: 拼写错误密度(越低越好)
|
| 812 |
""", elem_classes="markdown-text")
|
| 813 |
|
| 814 |
with gr.TabItem("📈 Performance Plot", id=1):
|
|
|
|
| 907 |
|
| 908 |
if __name__ == "__main__":
|
| 909 |
# 可选:预加载模型(会增加启动时间)
|
| 910 |
+
# 如果不想预加载,请注释掉下面两行
|
| 911 |
print("🚀 预加载WAC-GEC模型...")
|
| 912 |
initialize_wac_gec()
|
| 913 |
|
leaderboard.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
{
|
| 3 |
"ID": 1,
|
| 4 |
"Category": "RA",
|
| 5 |
-
"Benchmark": "
|
| 6 |
"WAR": 0.11,
|
| 7 |
"SED": 0.67,
|
| 8 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
|
|
@@ -26,7 +26,7 @@
|
|
| 26 |
{
|
| 27 |
"ID": 4,
|
| 28 |
"Category": "TG",
|
| 29 |
-
"Benchmark": "
|
| 30 |
"WAR": 6.79,
|
| 31 |
"SED": 2.74,
|
| 32 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
|
|
@@ -50,7 +50,7 @@
|
|
| 50 |
{
|
| 51 |
"ID": 7,
|
| 52 |
"Category": "TG",
|
| 53 |
-
"Benchmark": "
|
| 54 |
"WAR": 1.50,
|
| 55 |
"SED": 3.38,
|
| 56 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
|
|
@@ -74,7 +74,7 @@
|
|
| 74 |
{
|
| 75 |
"ID": 10,
|
| 76 |
"Category": "BT",
|
| 77 |
-
"Benchmark": "
|
| 78 |
"WAR": 100.00,
|
| 79 |
"SED": 5.65,
|
| 80 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
|
|
@@ -98,7 +98,7 @@
|
|
| 98 |
{
|
| 99 |
"ID": 13,
|
| 100 |
"Category": "BT",
|
| 101 |
-
"Benchmark": "
|
| 102 |
"WAR": 2.17,
|
| 103 |
"SED": 4.47,
|
| 104 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
|
|
@@ -122,7 +122,7 @@
|
|
| 122 |
{
|
| 123 |
"ID": 16,
|
| 124 |
"Category": "BT",
|
| 125 |
-
"Benchmark": "
|
| 126 |
"WAR": 98.97,
|
| 127 |
"SED": 5.42,
|
| 128 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
|
|
@@ -146,7 +146,7 @@
|
|
| 146 |
{
|
| 147 |
"ID": 19,
|
| 148 |
"Category": "SU",
|
| 149 |
-
"Benchmark": "
|
| 150 |
"WAR": 0.70,
|
| 151 |
"SED": 0.64,
|
| 152 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
|
|
@@ -170,7 +170,7 @@
|
|
| 170 |
{
|
| 171 |
"ID": 22,
|
| 172 |
"Category": "RA",
|
| 173 |
-
"Benchmark": "
|
| 174 |
"WAR": 25.70,
|
| 175 |
"SED": 1.11,
|
| 176 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
|
|
@@ -194,7 +194,7 @@
|
|
| 194 |
{
|
| 195 |
"ID": 25,
|
| 196 |
"Category": "RA",
|
| 197 |
-
"Benchmark": "
|
| 198 |
"WAR": 10.06,
|
| 199 |
"SED": 2.21,
|
| 200 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
|
|
@@ -218,7 +218,7 @@
|
|
| 218 |
{
|
| 219 |
"ID": 28,
|
| 220 |
"Category": "ME",
|
| 221 |
-
"Benchmark": "
|
| 222 |
"WAR": 6.31,
|
| 223 |
"SED": 6.18,
|
| 224 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
|
|
@@ -242,7 +242,7 @@
|
|
| 242 |
{
|
| 243 |
"ID": 31,
|
| 244 |
"Category": "ME",
|
| 245 |
-
"Benchmark": "
|
| 246 |
"WAR": 16.97,
|
| 247 |
"SED": 6.49,
|
| 248 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
|
|
@@ -266,7 +266,7 @@
|
|
| 266 |
{
|
| 267 |
"ID": 34,
|
| 268 |
"Category": "SU",
|
| 269 |
-
"Benchmark": "
|
| 270 |
"WAR": 0.17,
|
| 271 |
"SED": 2.90,
|
| 272 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
|
|
@@ -290,7 +290,7 @@
|
|
| 290 |
{
|
| 291 |
"ID": 37,
|
| 292 |
"Category": "ME",
|
| 293 |
-
"Benchmark": "
|
| 294 |
"WAR": 0.60,
|
| 295 |
"SED": 8.15,
|
| 296 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
|
|
@@ -314,7 +314,7 @@
|
|
| 314 |
{
|
| 315 |
"ID": 40,
|
| 316 |
"Category": "TG",
|
| 317 |
-
"Benchmark": "
|
| 318 |
"WAR": 0.00,
|
| 319 |
"SED": 1.75,
|
| 320 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"
|
|
|
|
| 2 |
{
|
| 3 |
"ID": 1,
|
| 4 |
"Category": "RA",
|
| 5 |
+
"Benchmark": "ARC_original",
|
| 6 |
"WAR": 0.11,
|
| 7 |
"SED": 0.67,
|
| 8 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
|
|
|
|
| 26 |
{
|
| 27 |
"ID": 4,
|
| 28 |
"Category": "TG",
|
| 29 |
+
"Benchmark": "COQA_original",
|
| 30 |
"WAR": 6.79,
|
| 31 |
"SED": 2.74,
|
| 32 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
|
|
|
|
| 50 |
{
|
| 51 |
"ID": 7,
|
| 52 |
"Category": "TG",
|
| 53 |
+
"Benchmark": "DROP_original",
|
| 54 |
"WAR": 1.50,
|
| 55 |
"SED": 3.38,
|
| 56 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
|
|
|
|
| 74 |
{
|
| 75 |
"ID": 10,
|
| 76 |
"Category": "BT",
|
| 77 |
+
"Benchmark": "MRPC_original",
|
| 78 |
"WAR": 100.00,
|
| 79 |
"SED": 5.65,
|
| 80 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
|
|
|
|
| 98 |
{
|
| 99 |
"ID": 13,
|
| 100 |
"Category": "BT",
|
| 101 |
+
"Benchmark": "RTE_original",
|
| 102 |
"WAR": 2.17,
|
| 103 |
"SED": 4.47,
|
| 104 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
|
|
|
|
| 122 |
{
|
| 123 |
"ID": 16,
|
| 124 |
"Category": "BT",
|
| 125 |
+
"Benchmark": "SST2_original",
|
| 126 |
"WAR": 98.97,
|
| 127 |
"SED": 5.42,
|
| 128 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
|
|
|
|
| 146 |
{
|
| 147 |
"ID": 19,
|
| 148 |
"Category": "SU",
|
| 149 |
+
"Benchmark": "WNLI_original",
|
| 150 |
"WAR": 0.70,
|
| 151 |
"SED": 0.64,
|
| 152 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
|
|
|
|
| 170 |
{
|
| 171 |
"ID": 22,
|
| 172 |
"Category": "RA",
|
| 173 |
+
"Benchmark": "GSM8K_original",
|
| 174 |
"WAR": 25.70,
|
| 175 |
"SED": 1.11,
|
| 176 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
|
|
|
|
| 194 |
{
|
| 195 |
"ID": 25,
|
| 196 |
"Category": "RA",
|
| 197 |
+
"Benchmark": "MMLU_original",
|
| 198 |
"WAR": 10.06,
|
| 199 |
"SED": 2.21,
|
| 200 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
|
|
|
|
| 218 |
{
|
| 219 |
"ID": 28,
|
| 220 |
"Category": "ME",
|
| 221 |
+
"Benchmark": "MedMCQA_original",
|
| 222 |
"WAR": 6.31,
|
| 223 |
"SED": 6.18,
|
| 224 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
|
|
|
|
| 242 |
{
|
| 243 |
"ID": 31,
|
| 244 |
"Category": "ME",
|
| 245 |
+
"Benchmark": "MedQA_original",
|
| 246 |
"WAR": 16.97,
|
| 247 |
"SED": 6.49,
|
| 248 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
|
|
|
|
| 266 |
{
|
| 267 |
"ID": 34,
|
| 268 |
"Category": "SU",
|
| 269 |
+
"Benchmark": "Natural_questions_original",
|
| 270 |
"WAR": 0.17,
|
| 271 |
"SED": 2.90,
|
| 272 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
|
|
|
|
| 290 |
{
|
| 291 |
"ID": 37,
|
| 292 |
"Category": "ME",
|
| 293 |
+
"Benchmark": "PubMedQA_original",
|
| 294 |
"WAR": 0.60,
|
| 295 |
"SED": 8.15,
|
| 296 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
|
|
|
|
| 314 |
{
|
| 315 |
"ID": 40,
|
| 316 |
"Category": "TG",
|
| 317 |
+
"Benchmark": "Truthful_QA_original",
|
| 318 |
"WAR": 0.00,
|
| 319 |
"SED": 1.75,
|
| 320 |
"Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"
|