Spaces:

lllouo
/

BD_framework_test

Sleeping

App Files Files Community

lllouo committed on Feb 2

Commit

ba32277

1 Parent(s): cce4575

update filter_categories

Browse files

Files changed (2) hide show

app.py +50 -10
leaderboard.json +14 -14

app.py CHANGED Viewed

@@ -436,11 +436,26 @@ def load_leaderboard_data():
         print(f"Error loading leaderboard: {e}")
         return pd.DataFrame()
-def filter_leaderboard(df, query):
-    if query == "all":
-        return df
-    else:
-        return df[df['Category'] == query]
 def search_leaderboard(df, query):
     if not query:
@@ -738,6 +753,17 @@ with demo:
                         value="all",
                         elem_id="filter-columns",
                     )
                 leaderboard_table = gr.Dataframe(
                     value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
@@ -752,23 +778,37 @@ with demo:
                     visible=False
                 )
                 search_bar.submit(
                     lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
                     [hidden_leaderboard, search_bar],
                     leaderboard_table
                 )
                 filter_categories.change(
-                    lambda df, query: filter_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
-                    [hidden_leaderboard, filter_categories],
                     leaderboard_table
                 )
                 gr.Markdown("""
                 **说明:**
                 - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
-                - **WAR**: 空白符异常率
-                - **SED**: 拼写错误密度
                 """, elem_classes="markdown-text")
         with gr.TabItem("📈 Performance Plot", id=1):
@@ -867,7 +907,7 @@ with demo:
 if __name__ == "__main__":
     # 可选：预加载模型（会增加启动时间）
-    # 如果想要预加载，取消下面两行的注释
     print("🚀 预加载WAC-GEC模型...")
     initialize_wac_gec()

         print(f"Error loading leaderboard: {e}")
         return pd.DataFrame()
def filter_leaderboard(df, category_query, version_query):
    """Filter the leaderboard by category and by dataset version at once.

    Args:
        df: Leaderboard table. When non-empty it must provide a ``Category``
            column and a ``Benchmark`` column of strings; the version is
            recognised from a marker substring inside the benchmark name.
        category_query: Exact ``Category`` value to keep, or ``"all"`` to
            skip category filtering.
        version_query: ``"original"``, ``"deepseek"`` or ``"wac_gec"`` to keep
            only benchmarks of that version, or ``"all"`` to skip version
            filtering. Any other value leaves the rows unfiltered by version.

    Returns:
        A new DataFrame holding the matching rows; ``df`` itself is never
        modified.
    """
    # Substring that identifies each selectable version in a benchmark name.
    version_markers = {
        "original": "_original",
        "deepseek": "deepseek_r1_denoising",
        "wac_gec": "wac_gec",
    }

    result = df.copy()

    # A frame with no rows has nothing to filter. Returning early also keeps
    # a column-less frame such as ``pd.DataFrame()`` from raising KeyError on
    # the column lookups below.
    if result.empty:
        return result

    # Category filter: exact match.
    if category_query != "all":
        result = result[result['Category'] == category_query]

    # Version filter: case-insensitive literal substring match. ``regex=False``
    # keeps the marker from being interpreted as a regular expression, and
    # ``na=False`` drops rows whose benchmark name is missing.
    marker = version_markers.get(version_query)
    if marker is not None:
        matches = result['Benchmark'].str.contains(
            marker, case=False, na=False, regex=False
        )
        result = result[matches]

    return result
 def search_leaderboard(df, query):
     if not query:
                         value="all",
                         elem_id="filter-columns",
                     )
+                    filter_versions = gr.Radio(
+                        label="🔖 筛选数据集版本",
+                        choices=[
+                            ("全部版本", "all"),
+                            ("原始数据集", "original"),
+                            ("DeepSeek-R1去噪", "deepseek"),
+                            ("WAC-GEC去噪", "wac_gec")
+                        ],
+                        value="all",
+                        elem_id="filter-versions",
+                    )
                 leaderboard_table = gr.Dataframe(
                     value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
                     visible=False
                 )
+                # 搜索功能
                 search_bar.submit(
                     lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
                     [hidden_leaderboard, search_bar],
                     leaderboard_table
                 )
+                # 类别筛选功能（需要考虑版本筛选）
+                def combined_filter(df, category, version):
+                    filtered = filter_leaderboard(df, category, version)
+                    return filtered[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']]
                 filter_categories.change(
+                    combined_filter,
+                    [hidden_leaderboard, filter_categories, filter_versions],
+                    leaderboard_table
+                )
+                # 版本筛选功能（需要考虑类别筛选）
+                filter_versions.change(
+                    combined_filter,
+                    [hidden_leaderboard, filter_categories, filter_versions],
                     leaderboard_table
                 )
                 gr.Markdown("""
                 **说明:**
                 - **Category**: BT=基础任务, RA=推理能力, TG=文本生成, SU=语音理解, ME=医学领域, GR=语法领域
+                - **Version**: 原始=未处理数据集, DeepSeek-R1=DeepSeek去噪版本, WAC-GEC=WAC-GEC去噪版本
+                - **WAR**: 空白符异常率（越低越好）
+                - **SED**: 拼写错误密度（越低越好）
                 """, elem_classes="markdown-text")
         with gr.TabItem("📈 Performance Plot", id=1):
 if __name__ == "__main__":
     # 可选：预加载模型（会增加启动时间）
+    # 如果想要预加载,取消下面两行的注释
     print("🚀 预加载WAC-GEC模型...")
     initialize_wac_gec()

leaderboard.json CHANGED Viewed

@@ -2,7 +2,7 @@
     {
         "ID": 1,
         "Category": "RA",
-        "Benchmark": "ARC",
         "WAR": 0.11,
         "SED": 0.67,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
@@ -26,7 +26,7 @@
     {
         "ID": 4,
         "Category": "TG",
-        "Benchmark": "COQA",
         "WAR": 6.79,
         "SED": 2.74,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
@@ -50,7 +50,7 @@
     {
         "ID": 7,
         "Category": "TG",
-        "Benchmark": "DROP",
         "WAR": 1.50,
         "SED": 3.38,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
@@ -74,7 +74,7 @@
     {
         "ID": 10,
         "Category": "BT",
-        "Benchmark": "MRPC",
         "WAR": 100.00,
         "SED": 5.65,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
@@ -98,7 +98,7 @@
     {
         "ID": 13,
         "Category": "BT",
-        "Benchmark": "RTE",
         "WAR": 2.17,
         "SED": 4.47,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
@@ -122,7 +122,7 @@
     {
         "ID": 16,
         "Category": "BT",
-        "Benchmark": "SST2",
         "WAR": 98.97,
         "SED": 5.42,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
@@ -146,7 +146,7 @@
     {
         "ID": 19,
         "Category": "SU",
-        "Benchmark": "WNLI",
         "WAR": 0.70,
         "SED": 0.64,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
@@ -170,7 +170,7 @@
     {
         "ID": 22,
         "Category": "RA",
-        "Benchmark": "GSM8K",
         "WAR": 25.70,
         "SED": 1.11,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
@@ -194,7 +194,7 @@
     {
         "ID": 25,
         "Category": "RA",
-        "Benchmark": "MMLU",
         "WAR": 10.06,
         "SED": 2.21,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
@@ -218,7 +218,7 @@
     {
         "ID": 28,
         "Category": "ME",
-        "Benchmark": "MedMCQA",
         "WAR": 6.31,
         "SED": 6.18,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
@@ -242,7 +242,7 @@
     {
         "ID": 31,
         "Category": "ME",
-        "Benchmark": "MedQA",
         "WAR": 16.97,
         "SED": 6.49,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
@@ -266,7 +266,7 @@
     {
         "ID": 34,
         "Category": "SU",
-        "Benchmark": "Natural_questions",
         "WAR": 0.17,
         "SED": 2.90,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
@@ -290,7 +290,7 @@
     {
         "ID": 37,
         "Category": "ME",
-        "Benchmark": "PubMedQA",
         "WAR": 0.60,
         "SED": 8.15,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
@@ -314,7 +314,7 @@
     {
         "ID": 40,
         "Category": "TG",
-        "Benchmark": "Truthful_QA",
         "WAR": 0.00,
         "SED": 1.75,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"

     {
         "ID": 1,
         "Category": "RA",
+        "Benchmark": "ARC_original",
         "WAR": 0.11,
         "SED": 0.67,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/ARC/arc)"
     {
         "ID": 4,
         "Category": "TG",
+        "Benchmark": "COQA_original",
         "WAR": 6.79,
         "SED": 2.74,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/COQA/coqa)"
     {
         "ID": 7,
         "Category": "TG",
+        "Benchmark": "DROP_original",
         "WAR": 1.50,
         "SED": 3.38,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/DROP/drop)"
     {
         "ID": 10,
         "Category": "BT",
+        "Benchmark": "MRPC_original",
         "WAR": 100.00,
         "SED": 5.65,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/mrpc)"
     {
         "ID": 13,
         "Category": "BT",
+        "Benchmark": "RTE_original",
         "WAR": 2.17,
         "SED": 4.47,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/rte)"
     {
         "ID": 16,
         "Category": "BT",
+        "Benchmark": "SST2_original",
         "WAR": 98.97,
         "SED": 5.42,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/sst2)"
     {
         "ID": 19,
         "Category": "SU",
+        "Benchmark": "WNLI_original",
         "WAR": 0.70,
         "SED": 0.64,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GLUE/glue/wnli)"
     {
         "ID": 22,
         "Category": "RA",
+        "Benchmark": "GSM8K_original",
         "WAR": 25.70,
         "SED": 1.11,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/GSM8K/gsm8k)"
     {
         "ID": 25,
         "Category": "RA",
+        "Benchmark": "MMLU_original",
         "WAR": 10.06,
         "SED": 2.21,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MMLU/mmlu)"
     {
         "ID": 28,
         "Category": "ME",
+        "Benchmark": "MedMCQA_original",
         "WAR": 6.31,
         "SED": 6.18,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedMCQA/medmcqa)"
     {
         "ID": 31,
         "Category": "ME",
+        "Benchmark": "MedQA_original",
         "WAR": 16.97,
         "SED": 6.49,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/MedQA/MedQA-USMLE-4-options)"
     {
         "ID": 34,
         "Category": "SU",
+        "Benchmark": "Natural_questions_original",
         "WAR": 0.17,
         "SED": 2.90,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Natural_questions/nq_open)"
     {
         "ID": 37,
         "Category": "ME",
+        "Benchmark": "PubMedQA_original",
         "WAR": 0.60,
         "SED": 8.15,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/PubMedQA/pubmed_qa)"
     {
         "ID": 40,
         "Category": "TG",
+        "Benchmark": "Truthful_QA_original",
         "WAR": 0.00,
         "SED": 1.75,
         "Download": "[下载](https://huggingface.co/datasets/lllouo/BD-benchmarks/tree/main/Truthful_QA/truthful_qa)"