Spaces:

DeepSynthesisTeam
/

deepsynth-leaderboard

Running

App Files Community

debjitpaul committed on Apr 19

Commit

a380ad5

1 Parent(s): 1f21b1a

update app.py and submissions

Browse files

Files changed (13) hide show

app.py +42 -12
submissions/2026-02-15-huawei-deepseek-chat.json +22 -0
submissions/2026-02-15-huawei-deepseek-reasoner.json +22 -0
submissions/2026-02-15-huawei-gemini-pro-2-5 (1).json +22 -0
submissions/2026-02-15-huawei-gemini-pro-3.json +22 -0
submissions/2026-02-15-huawei-gpt-4-1 (1).json +22 -0
submissions/2026-02-15-huawei-gpt-5-1 (1).json +22 -0
submissions/2026-02-15-huawei-gpt-5-2.json +22 -0
submissions/2026-02-15-huawei-o3 (1).json +22 -0
submissions/2026-02-15-huawei-o3-deep-research (1).json +25 -0
submissions/2026-02-15-huawei-o4-mini (1).json +22 -0
submissions/2026-02-15-huawei-owl-gpt4-1 (1).json +25 -0
submissions/2026-02-15-huawei-smolagent-gpt4-1 (1).json +25 -0

app.py CHANGED Viewed

@@ -22,6 +22,7 @@ import pandas as pd
 # ---------------------------------------------------------------------------
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
 QUEUE_DIR.mkdir(exist_ok=True, parents=True)
@@ -59,7 +60,11 @@ CUSTOM_CSS = """
     letter-spacing: -0.02em !important;
 }
-.gradio-container h1 { font-size: 2.2rem !important; margin-bottom: 0.25rem !important; }
 .gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
 .gradio-container code, .gradio-container pre {
@@ -72,6 +77,7 @@ CUSTOM_CSS = """
     font-size: 1rem;
     margin-bottom: 1.5rem;
     font-weight: 500;
 }
 .gradio-container .table-wrap table {
@@ -96,6 +102,10 @@ CUSTOM_CSS = """
     border-bottom: 1px solid #e5e7eb;
 }
 .link-row a {
     display: inline-block;
     padding: 0.25rem 0.75rem;
@@ -172,9 +182,9 @@ def leaderboard_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
     df = pd.DataFrame(rows)
-    # Drop efficiency columns if every row is None (keeps the table narrow
-    # when no submitters have supplied them yet).
-    for col in ("Avg Cost ($)", "Avg Latency (s)"):
         if col in df.columns and df[col].isna().all():
             df = df.drop(columns=[col])
@@ -276,7 +286,8 @@ def submit_predictions(
 # ---------------------------------------------------------------------------
 def build_app() -> gr.Blocks:
-    df = leaderboard_dataframe(load_submissions(RESULTS_DIR))
     with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
         gr.Markdown(f"# {TITLE}")
@@ -294,18 +305,37 @@ def build_app() -> gr.Blocks:
             # -------------------------------------------------------------
             with gr.Tab("🏆 Leaderboard"):
                 gr.Markdown(
-                    "Results on the **DeepSynth test set** (80 tasks, Pass@1), "
-                    "ranked by **F1** score (LLM Judge used as tiebreaker). "
                     "F1 / Precision / Recall measure prediction quality against gold "
                     "answers; **LLM Judge** reports average precision under semantic "
                     "matching. 🔒 = closed model, 🔓 = open-weights.",
                     elem_classes=["section-header"],
                 )
-                gr.Dataframe(
-                    value=df,
-                    interactive=False,
-                    wrap=True,
-                )
             # -------------------------------------------------------------
             with gr.Tab("📤 Submit"):

 # ---------------------------------------------------------------------------
 RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
+DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
 QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
 QUEUE_DIR.mkdir(exist_ok=True, parents=True)
     letter-spacing: -0.02em !important;
 }
+.gradio-container h1 {
+    font-size: 2.2rem !important;
+    margin-bottom: 0.25rem !important;
+    text-align: center !important;
+}
 .gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
 .gradio-container code, .gradio-container pre {
     font-size: 1rem;
     margin-bottom: 1.5rem;
     font-weight: 500;
+    text-align: center;
 }
 .gradio-container .table-wrap table {
     border-bottom: 1px solid #e5e7eb;
 }
+.link-row {
+    text-align: center;
+    margin-bottom: 1.5rem;
+}
 .link-row a {
     display: inline-block;
     padding: 0.25rem 0.75rem;
     df = pd.DataFrame(rows)
+    # Drop optional metric columns if every row is None — keeps the table
+    # clean when a split (e.g. dev) only reports a subset of metrics.
+    for col in ("Avg Cost ($)", "Avg Latency (s)", "Precision", "Recall", "EM"):
         if col in df.columns and df[col].isna().all():
             df = df.drop(columns=[col])
 # ---------------------------------------------------------------------------
 def build_app() -> gr.Blocks:
+    df_test = leaderboard_dataframe(load_submissions(RESULTS_DIR))
+    df_dev  = leaderboard_dataframe(load_submissions(DEV_RESULTS_DIR))
     with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
         gr.Markdown(f"# {TITLE}")
             # -------------------------------------------------------------
             with gr.Tab("🏆 Leaderboard"):
                 gr.Markdown(
+                    "Results ranked by **F1** score (LLM Judge used as tiebreaker). "
                     "F1 / Precision / Recall measure prediction quality against gold "
                     "answers; **LLM Judge** reports average precision under semantic "
                     "matching. 🔒 = closed model, 🔓 = open-weights.",
                     elem_classes=["section-header"],
                 )
+                with gr.Tabs():
+                    with gr.Tab("Dev (40 tasks · public)"):
+                        gr.Markdown(
+                            "Self-reported numbers on the **public dev set** (40 tasks, "
+                            "Pass@1). Useful for prototyping and comparing methods during "
+                            "development. Anyone can score themselves locally on this split.",
+                        )
+                        gr.Dataframe(
+                            value=df_dev,
+                            interactive=False,
+                            wrap=True,
+                        )
+                    with gr.Tab("Test (80 tasks · held-out)"):
+                        gr.Markdown(
+                            "Official numbers on the **held-out test set** (80 tasks, "
+                            "Pass@1). Gold answers are private; submissions are scored "
+                            "by the maintainers.",
+                        )
+                        gr.Dataframe(
+                            value=df_test,
+                            interactive=False,
+                            wrap=True,
+                        )
             # -------------------------------------------------------------
             with gr.Tab("📤 Submit"):

submissions/2026-02-15-huawei-deepseek-chat.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "DeepSeek-Chat",
+    "base_model": "deepseek-chat (2025-08)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "open",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 2.1,
+      "llm_judge": 5.0
+    }
+  }
+}

submissions/2026-02-15-huawei-deepseek-reasoner.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "DeepSeek-Reasoner",
+    "base_model": "deepseek-r1 (2026-02)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "open",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 5.0,
+      "llm_judge": 7.5
+    }
+  }
+}

submissions/2026-02-15-huawei-gemini-pro-2-5 (1).json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "Gemini-Pro-2.5",
+    "base_model": "gemini-pro-2.5 (2025-08)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 5.9,
+      "llm_judge": 5.0
+    }
+  }
+}

submissions/2026-02-15-huawei-gemini-pro-3.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "Gemini-Pro-3",
+    "base_model": "gemini-pro-3 (2026-02)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 8.6,
+      "llm_judge": 15.0
+    }
+  }
+}

submissions/2026-02-15-huawei-gpt-4-1 (1).json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "GPT-4.1",
+    "base_model": "gpt-4.1 (2025-08)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 1.8,
+      "llm_judge": 7.5
+    }
+  }
+}

submissions/2026-02-15-huawei-gpt-5-1 (1).json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "GPT-5.1",
+    "base_model": "gpt-5.1 (2025-08)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 6.2,
+      "llm_judge": 12.5
+    }
+  }
+}

submissions/2026-02-15-huawei-gpt-5-2.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "GPT-5.2",
+    "base_model": "gpt-5.2-pro (2026-02)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 15.6,
+      "llm_judge": 5.0
+    }
+  }
+}

submissions/2026-02-15-huawei-o3 (1).json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "o3",
+    "base_model": "o3 (2025-08)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 6.3,
+      "llm_judge": 10.0
+    }
+  }
+}

submissions/2026-02-15-huawei-o3-deep-research (1).json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "metadata": {
+    "agent_name": "o3-deep-research",
+    "base_model": "o3-deep-research (2025-08)",
+    "scaffold": "Custom",
+    "category": "Agent Framework",
+    "access": "closed",
+    "tools_used": [
+      "web_search",
+      "python_interpreter"
+    ],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 9.9,
+      "llm_judge": 20.0
+    }
+  }
+}

submissions/2026-02-15-huawei-o4-mini (1).json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "metadata": {
+    "agent_name": "o4-mini",
+    "base_model": "o4-mini (2025-08)",
+    "scaffold": "none",
+    "category": "LLM Baseline",
+    "access": "closed",
+    "tools_used": [],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 3.3,
+      "llm_judge": 2.5
+    }
+  }
+}

submissions/2026-02-15-huawei-owl-gpt4-1 (1).json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "metadata": {
+    "agent_name": "OWL-GPT4.1",
+    "base_model": "gpt-4.1",
+    "scaffold": "Custom",
+    "category": "Agent Framework",
+    "access": "open",
+    "tools_used": [
+      "web_search",
+      "python_interpreter"
+    ],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 4.1,
+      "llm_judge": 12.5
+    }
+  }
+}

submissions/2026-02-15-huawei-smolagent-gpt4-1 (1).json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "metadata": {
+    "agent_name": "Smolagent-GPT4.1",
+    "base_model": "gpt-4.1",
+    "scaffold": "CodeAct",
+    "category": "Agent Framework",
+    "access": "open",
+    "tools_used": [
+      "web_search",
+      "python_interpreter"
+    ],
+    "organization": "Huawei (paper baseline)",
+    "paper_url": "https://arxiv.org/abs/2602.21143",
+    "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
+    "submission_date": "2026-02-15",
+    "split": "dev",
+    "num_seeds": 1
+  },
+  "scores": {
+    "overall": {
+      "f1": 6.3,
+      "llm_judge": 7.5
+    }
+  }
+}