debjitpaul committed on
Commit ·
63fb47d
1
Parent(s): 369761f
Overhaul: Inter/JetBrains Mono typography, paper baselines, submission form
Browse files
app.py
CHANGED
|
@@ -1,16 +1,15 @@
|
|
| 1 |
"""
|
| 2 |
DeepSynth Leaderboard β Hugging Face Space (Gradio)
|
| 3 |
|
| 4 |
-
|
| 5 |
-
filterable leaderboard with per-domain breakdowns and efficiency metrics.
|
| 6 |
-
|
| 7 |
-
Deploy by pushing this file + requirements.txt to a HF Space with SDK=gradio.
|
| 8 |
"""
|
| 9 |
|
| 10 |
from __future__ import annotations
|
| 11 |
|
|
|
|
| 12 |
import json
|
| 13 |
import os
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
from typing import Any
|
| 16 |
|
|
@@ -22,27 +21,158 @@ import pandas as pd
|
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
|
| 24 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
|
|
|
|
|
|
|
|
|
| 25 |
TITLE = "π DeepSynth Leaderboard"
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
|
| 32 |
PAPER_URL = "https://arxiv.org/abs/2602.21143"
|
| 33 |
DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
]
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
# ---------------------------------------------------------------------------
|
| 41 |
-
#
|
| 42 |
# ---------------------------------------------------------------------------
|
| 43 |
|
| 44 |
def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
|
| 45 |
-
"""Load every *.json file under results_dir as a submission result row."""
|
| 46 |
if not results_dir.exists():
|
| 47 |
return []
|
| 48 |
rows = []
|
|
@@ -55,10 +185,12 @@ def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
|
|
| 55 |
return rows
|
| 56 |
|
| 57 |
|
| 58 |
-
def
|
| 59 |
-
"""Flatten submission result records into a leaderboard DataFrame."""
|
| 60 |
if not submissions:
|
| 61 |
-
return pd.DataFrame(
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
records = []
|
| 64 |
for s in submissions:
|
|
@@ -66,153 +198,262 @@ def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
|
|
| 66 |
scores = s.get("scores", {})
|
| 67 |
efficiency = s.get("efficiency", {})
|
| 68 |
|
| 69 |
-
row = {
|
| 70 |
-
"Agent": meta.get("agent_name", "β"),
|
| 71 |
-
"Base Model": meta.get("base_model", "β"),
|
| 72 |
-
"Scaffold": meta.get("scaffold", "β"),
|
| 73 |
-
"Overall EM": scores.get("overall", {}).get("exact_match"),
|
| 74 |
-
"Overall F1": scores.get("overall", {}).get("f1"),
|
| 75 |
-
"LLM Judge": scores.get("overall", {}).get("llm_judge"),
|
| 76 |
-
}
|
| 77 |
-
#for domain in DOMAINS:
|
| 78 |
-
# row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
|
| 79 |
-
|
| 80 |
-
row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
|
| 81 |
-
row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
|
| 82 |
-
row["Avg Tool Calls"] = efficiency.get("avg_num_tool_calls")
|
| 83 |
-
|
| 84 |
-
row["Split"] = meta.get("split", "β")
|
| 85 |
-
row["Org"] = meta.get("organization", "β")
|
| 86 |
-
row["Date"] = meta.get("submission_date", "β")
|
| 87 |
-
|
| 88 |
paper = meta.get("paper_url")
|
| 89 |
code = meta.get("code_url")
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
df = pd.DataFrame(records)
|
| 96 |
-
if "LLM Judge" in df.columns:
|
| 97 |
df = df.sort_values("LLM Judge", ascending=False, na_position="last").reset_index(drop=True)
|
| 98 |
return df
|
| 99 |
|
| 100 |
|
| 101 |
# ---------------------------------------------------------------------------
|
| 102 |
-
#
|
| 103 |
# ---------------------------------------------------------------------------
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
|
| 126 |
-
|
|
|
|
|
|
|
| 127 |
|
| 128 |
-
with gr.Blocks(title=
|
| 129 |
gr.Markdown(f"# {TITLE}")
|
| 130 |
-
gr.
|
| 131 |
-
gr.
|
| 132 |
-
|
| 133 |
-
f"
|
| 134 |
-
f"
|
| 135 |
-
f"
|
|
|
|
|
|
|
| 136 |
)
|
| 137 |
|
| 138 |
with gr.Tabs():
|
|
|
|
| 139 |
with gr.Tab("π Leaderboard"):
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
scaffold_filter = gr.CheckboxGroup(
|
| 147 |
-
choices=all_scaffolds,
|
| 148 |
-
value=all_scaffolds,
|
| 149 |
-
label="Scaffold",
|
| 150 |
-
)
|
| 151 |
-
min_seeds = gr.Slider(
|
| 152 |
-
minimum=1, maximum=5, value=1, step=1, label="Min seeds",
|
| 153 |
-
)
|
| 154 |
|
| 155 |
-
|
| 156 |
-
|
|
|
|
| 157 |
interactive=False,
|
| 158 |
wrap=True,
|
| 159 |
-
datatype=["str"
|
|
|
|
| 160 |
)
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
| 167 |
)
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
##
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
5.
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
|
|
|
|
| 202 |
with gr.Tab("π Citation"):
|
| 203 |
-
gr.Markdown(""
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
return app
|
| 215 |
|
| 216 |
|
| 217 |
if __name__ == "__main__":
|
| 218 |
-
build_app().launch()
|
|
|
|
| 1 |
"""
|
| 2 |
DeepSynth Leaderboard — Hugging Face Space (Gradio)
|
| 3 |
|
| 4 |
+
GAIA-style leaderboard with tabs, custom typography, and a submission upload form.
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
import datetime
|
| 10 |
import json
|
| 11 |
import os
|
| 12 |
+
import re
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import Any
|
| 15 |
|
|
|
|
| 21 |
# ---------------------------------------------------------------------------
|
| 22 |
|
| 23 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
| 24 |
+
QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
|
| 25 |
+
QUEUE_DIR.mkdir(exist_ok=True, parents=True)
|
| 26 |
+
|
| 27 |
TITLE = "π DeepSynth Leaderboard"
|
| 28 |
+
TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"
|
| 29 |
+
ABOUT_BLURB = (
|
| 30 |
+
"Large language model (LLM)-based agents are increasingly used to solve complex tasks "
|
| 31 |
+
"involving tool use β web browsing, code execution, data analysis. Current benchmarks "
|
| 32 |
+
"do not adequately assess their ability to solve real-world tasks requiring synthesis "
|
| 33 |
+
"across multiple sources and inference beyond simple fact retrieval.\n\n"
|
| 34 |
+
"**DeepSynth** introduces 120 tasks across 7 domains and 67 countries, designed to evaluate "
|
| 35 |
+
"agents on realistic, time-consuming problems that combine information gathering, synthesis, "
|
| 36 |
+
"and structured reasoning."
|
| 37 |
)
|
| 38 |
REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
|
| 39 |
PAPER_URL = "https://arxiv.org/abs/2602.21143"
|
| 40 |
DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
|
| 41 |
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# Styling — GAIA-inspired: mono/sans headers, compact table, subtle colors
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
CUSTOM_CSS = """
|
| 47 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
|
| 48 |
+
|
| 49 |
+
.gradio-container {
|
| 50 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
|
| 51 |
+
max-width: 1400px !important;
|
| 52 |
+
margin: 0 auto !important;
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
.gradio-container h1, .gradio-container h2, .gradio-container h3 {
|
| 56 |
+
font-family: 'Inter', sans-serif !important;
|
| 57 |
+
font-weight: 700 !important;
|
| 58 |
+
letter-spacing: -0.02em !important;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
.gradio-container h1 { font-size: 2.2rem !important; margin-bottom: 0.25rem !important; }
|
| 62 |
+
.gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
|
| 63 |
+
|
| 64 |
+
.gradio-container code, .gradio-container pre {
|
| 65 |
+
font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
|
| 66 |
+
font-size: 0.9em !important;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
/* Tagline under title */
|
| 70 |
+
.tagline {
|
| 71 |
+
color: #6b7280;
|
| 72 |
+
font-size: 1rem;
|
| 73 |
+
margin-bottom: 1.5rem;
|
| 74 |
+
font-weight: 500;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
/* Leaderboard table β tighter, more readable */
|
| 78 |
+
.gradio-container .table-wrap table {
|
| 79 |
+
font-family: 'Inter', sans-serif !important;
|
| 80 |
+
font-size: 0.92rem !important;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
.gradio-container .table-wrap th {
|
| 84 |
+
font-weight: 600 !important;
|
| 85 |
+
background: #f9fafb !important;
|
| 86 |
+
border-bottom: 2px solid #e5e7eb !important;
|
| 87 |
+
text-align: left !important;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.gradio-container .table-wrap td {
|
| 91 |
+
padding: 0.55rem 0.75rem !important;
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
/* Section headers (like GAIA's "Test set" / "Validation set") */
|
| 95 |
+
.section-header {
|
| 96 |
+
font-size: 1.1rem;
|
| 97 |
+
font-weight: 700;
|
| 98 |
+
margin: 1rem 0 0.5rem 0;
|
| 99 |
+
padding-bottom: 0.4rem;
|
| 100 |
+
border-bottom: 1px solid #e5e7eb;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
/* Badge-style links under the title */
|
| 104 |
+
.link-row a {
|
| 105 |
+
display: inline-block;
|
| 106 |
+
padding: 0.25rem 0.75rem;
|
| 107 |
+
margin-right: 0.5rem;
|
| 108 |
+
border-radius: 6px;
|
| 109 |
+
background: #f3f4f6;
|
| 110 |
+
color: #374151 !important;
|
| 111 |
+
text-decoration: none !important;
|
| 112 |
+
font-size: 0.9rem;
|
| 113 |
+
font-weight: 500;
|
| 114 |
+
}
|
| 115 |
+
.link-row a:hover { background: #e5e7eb; }
|
| 116 |
+
|
| 117 |
+
/* Tabs β cleaner look */
|
| 118 |
+
.gradio-container button.tab-nav {
|
| 119 |
+
font-weight: 600 !important;
|
| 120 |
+
font-size: 1rem !important;
|
| 121 |
+
}
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
# Paper baselines — pre-populated from Table 1 of the DeepSynth paper
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
|
| 128 |
+
PAPER_BASELINES: list[dict[str, Any]] = [
|
| 129 |
+
# LLM Baselines (no agent scaffold, no tool use)
|
| 130 |
+
{"category": "LLM Baseline", "agent": "o4-mini", "model": "o4-mini (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.05, "precision": 2.33, "recall": 4.39, "em": 0.00, "llm_judge": 0.00},
|
| 131 |
+
{"category": "LLM Baseline", "agent": "GPT-4.1", "model": "gpt-4.1 (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.46, "precision": 2.86, "recall": 4.39, "em": 0.00, "llm_judge": 0.00},
|
| 132 |
+
{"category": "LLM Baseline", "agent": "o3", "model": "o3 (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.29, "precision": 2.85, "recall": 3.90, "em": 0.00, "llm_judge": 0.00},
|
| 133 |
+
{"category": "LLM Baseline", "agent": "GPT-5.1", "model": "gpt-5.1 (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.83, "precision": 2.98, "recall": 5.37, "em": 0.00, "llm_judge": 0.00},
|
| 134 |
+
{"category": "LLM Baseline", "agent": "Gemini-Pro-2.5", "model": "gemini-pro-2.5 (2025-08)", "access": "closed", "scaffold": "none", "f1": 6.25, "precision": 4.71, "recall": 9.27, "em": 0.00, "llm_judge": 5.00},
|
| 135 |
+
{"category": "LLM Baseline", "agent": "GPT-5.2-Pro", "model": "gpt-5.2-pro (2026-02)", "access": "closed", "scaffold": "none", "f1": 8.70, "precision": 8.45, "recall": 8.96, "em": 6.25, "llm_judge": 6.67},
|
| 136 |
+
{"category": "LLM Baseline", "agent": "DeepSeek-R1-Chat", "model": "deepseek-r1-chat (2025-08)","access": "open", "scaffold": "none", "f1": 3.23, "precision": 2.75, "recall": 3.90, "em": 1.67, "llm_judge": 2.50},
|
| 137 |
+
{"category": "LLM Baseline", "agent": "DeepSeek-R1-Reasoner", "model": "deepseek-r1 (2026-02)", "access": "open", "scaffold": "none", "f1": 2.80, "precision": 2.73, "recall": 2.87, "em": 2.50, "llm_judge": 6.67},
|
| 138 |
+
# Framework-based Agents
|
| 139 |
+
{"category": "Agent Framework", "agent": "o3-deep-research", "model": "o3-deep-research (2025-08)","access": "closed", "scaffold": "Custom", "f1": 8.97, "precision": 7.73, "recall": 10.69, "em": 2.50, "llm_judge": 17.50},
|
| 140 |
+
{"category": "Agent Framework", "agent": "Smolagent", "model": "gpt-4.1", "access": "open", "scaffold": "CodeAct", "f1": 3.75, "precision": 3.27, "recall": 4.39, "em": 2.50, "llm_judge": 7.50},
|
| 141 |
+
{"category": "Agent Framework", "agent": "Smolagent", "model": "gpt-5", "access": "open", "scaffold": "CodeAct", "f1": 6.42, "precision": 6.34, "recall": 6.50, "em": 1.67, "llm_judge": 2.50},
|
| 142 |
+
{"category": "Agent Framework", "agent": "OWL", "model": "gpt-4.1", "access": "open", "scaffold": "Custom", "f1": 5.41, "precision": 4.62, "recall": 6.52, "em": 1.67, "llm_judge": 12.50},
|
| 143 |
]
|
| 144 |
|
| 145 |
+
|
| 146 |
+
def baselines_dataframe() -> pd.DataFrame:
    """Build the paper-baselines DataFrame in display order.

    Each entry of ``PAPER_BASELINES`` becomes one row. Rows are grouped with
    "LLM Baseline" entries before "Agent Framework" entries and, inside each
    group, sorted by LLM Judge and then F1, both descending.

    Returns:
        A DataFrame with the columns Category, Agent, Model, Access,
        Scaffold, F1, Precision, Recall, EM and LLM Judge.
    """
    rows = []
    for b in PAPER_BASELINES:
        # Closed and open access must render with different icons; the two
        # branches previously produced the same glyph.
        lock = "🔒" if b["access"] == "closed" else "🔓"
        rows.append({
            "Category": b["category"],
            "Agent": b["agent"],
            "Model": b["model"],
            "Access": f"{lock} {b['access']}",
            "Scaffold": b["scaffold"],
            "F1": b["f1"],
            "Precision": b["precision"],
            "Recall": b["recall"],
            "EM": b["em"],
            "LLM Judge": b["llm_judge"],
        })
    df = pd.DataFrame(rows)
    # Temporary helper column that fixes the group order before sorting by score.
    df["__cat_order"] = df["Category"].map({"LLM Baseline": 0, "Agent Framework": 1})
    df = df.sort_values(by=["__cat_order", "LLM Judge", "F1"], ascending=[True, False, False])
    df = df.drop(columns=["__cat_order"]).reset_index(drop=True)
    return df
|
| 169 |
+
|
| 170 |
+
|
| 171 |
# ---------------------------------------------------------------------------
|
| 172 |
+
# Community submissions — loaded from the submissions/ directory
|
| 173 |
# ---------------------------------------------------------------------------
|
| 174 |
|
| 175 |
def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
|
|
|
|
| 176 |
if not results_dir.exists():
|
| 177 |
return []
|
| 178 |
rows = []
|
|
|
|
| 185 |
return rows
|
| 186 |
|
| 187 |
|
| 188 |
+
def community_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
|
|
|
|
| 189 |
if not submissions:
|
| 190 |
+
return pd.DataFrame(columns=[
|
| 191 |
+
"Agent", "Model", "Scaffold", "Split", "F1", "EM", "LLM Judge",
|
| 192 |
+
"Avg Cost ($)", "Avg Latency (s)", "Org", "Date", "Links"
|
| 193 |
+
])
|
| 194 |
|
| 195 |
records = []
|
| 196 |
for s in submissions:
|
|
|
|
| 198 |
scores = s.get("scores", {})
|
| 199 |
efficiency = s.get("efficiency", {})
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
paper = meta.get("paper_url")
|
| 202 |
code = meta.get("code_url")
|
| 203 |
+
links = " Β· ".join(filter(None, [
|
| 204 |
+
f"[paper]({paper})" if paper else None,
|
| 205 |
+
f"[code]({code})" if code else None,
|
| 206 |
+
]))
|
| 207 |
+
|
| 208 |
+
records.append({
|
| 209 |
+
"Agent": meta.get("agent_name", "β"),
|
| 210 |
+
"Model": meta.get("base_model", "β"),
|
| 211 |
+
"Scaffold": meta.get("scaffold", "β"),
|
| 212 |
+
"Split": meta.get("split", "β"),
|
| 213 |
+
"F1": scores.get("overall", {}).get("f1"),
|
| 214 |
+
"EM": scores.get("overall", {}).get("exact_match"),
|
| 215 |
+
"LLM Judge": scores.get("overall", {}).get("llm_judge"),
|
| 216 |
+
"Avg Cost ($)": efficiency.get("avg_cost_usd"),
|
| 217 |
+
"Avg Latency (s)": efficiency.get("avg_latency_s"),
|
| 218 |
+
"Org": meta.get("organization", "β"),
|
| 219 |
+
"Date": meta.get("submission_date", "β"),
|
| 220 |
+
"Links": links,
|
| 221 |
+
})
|
| 222 |
|
| 223 |
df = pd.DataFrame(records)
|
| 224 |
+
if "LLM Judge" in df.columns and not df["LLM Judge"].isna().all():
|
| 225 |
df = df.sort_values("LLM Judge", ascending=False, na_position="last").reset_index(drop=True)
|
| 226 |
return df
|
| 227 |
|
| 228 |
|
| 229 |
# ---------------------------------------------------------------------------
|
| 230 |
+
# Submission upload handler — saves file to queue for maintainer review
|
| 231 |
# ---------------------------------------------------------------------------
|
| 232 |
|
| 233 |
+
SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9._-]+")
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def _safe_slug(text: str, maxlen: int = 40) -> str:
|
| 237 |
+
slug = SAFE_NAME_RE.sub("-", (text or "unnamed").strip()).strip("-").lower()
|
| 238 |
+
return slug[:maxlen] or "unnamed"
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def submit_predictions(
    file_obj,
    agent_name: str,
    base_model: str,
    scaffold: str,
    organization: str,
    contact_email: str,
    code_url: str,
    split: str,
) -> str:
    """Validate a submission and save it to the maintainer review queue.

    Args:
        file_obj: Uploaded predictions file, either an object exposing a
            ``.name`` path attribute or a path-like value. ``None`` means
            nothing was attached.
        agent_name: Required display name of the agent.
        base_model: Required name of the underlying model.
        scaffold: Scaffold label, stored as given.
        organization: Required submitting organization.
        contact_email: Required; must contain ``@``.
        code_url: Required; must start with ``http://`` or ``https://``.
        split: Evaluated split, stored as given.

    Returns:
        A Markdown status message: an error description when validation or
        file I/O fails, otherwise a confirmation naming the queued file.
    """
    if file_obj is None:
        return "❌ **Missing file.** Please attach a predictions JSON."

    # Normalize once so validation and the stored bundle see the same values
    # (a URL pasted with leading whitespace used to be rejected).
    agent_name = (agent_name or "").strip()
    base_model = (base_model or "").strip()
    organization = (organization or "").strip()
    contact_email = (contact_email or "").strip()
    code_url = (code_url or "").strip()

    if not agent_name:
        return "❌ **Missing agent name.**"
    if not base_model:
        return "❌ **Missing base model.**"
    if not organization:
        return "❌ **Missing organization.**"
    if not contact_email or "@" not in contact_email:
        return "❌ **Invalid contact email.**"
    if not code_url.startswith(("http://", "https://")):
        return "❌ **Missing or invalid code URL.** A public `code_url` that reproduces your results is required."

    # Parse the upload as JSON to catch corruption early.
    src_path = Path(getattr(file_obj, "name", file_obj))
    try:
        with src_path.open("r", encoding="utf-8") as f:
            predictions = json.load(f)
    except json.JSONDecodeError as e:
        return f"❌ **Invalid JSON in uploaded file:** {e}"
    except UnicodeDecodeError as e:
        # A binary or differently-encoded upload fails while decoding, before
        # the JSON parser ever sees it.
        return f"❌ **Uploaded file is not valid UTF-8 text:** {e}"
    except OSError as e:
        return f"❌ **Could not read uploaded file:** {e}"

    if not isinstance(predictions, dict) or not predictions:
        return "❌ **Predictions file must be a non-empty JSON object mapping task IDs to answers.**"

    # One UTC timestamp feeds every date field, so they cannot disagree
    # around midnight or across time zones.
    now = datetime.datetime.now(datetime.timezone.utc)
    today = now.date().isoformat()

    # Bundle the metadata with the raw predictions for the maintainer to review.
    bundle = {
        "received_at": now.replace(tzinfo=None).isoformat() + "Z",
        "metadata": {
            "agent_name": agent_name,
            "base_model": base_model,
            "scaffold": scaffold,
            "organization": organization,
            "contact_email": contact_email,
            "code_url": code_url,
            "split": split,
            "submission_date": today,
        },
        "predictions": predictions,
    }

    try:
        # Encode before touching the disk so a bad payload never leaves an
        # empty file behind in the queue.
        payload = json.dumps(bundle, indent=2, ensure_ascii=False).encode("utf-8")
    except UnicodeEncodeError as e:
        return f"❌ **Could not save submission:** {e}"

    stem = f"{today}-{_safe_slug(organization)}-{_safe_slug(agent_name)}"
    # Exclusive-create ("x") refuses to overwrite an earlier submission that
    # used the same organization and agent name; later ones get a numeric suffix.
    for attempt in range(1, 101):
        candidate = f"{stem}.json" if attempt == 1 else f"{stem}-{attempt}.json"
        try:
            with (QUEUE_DIR / candidate).open("xb") as f:
                f.write(payload)
        except FileExistsError:
            continue
        except OSError as e:
            return f"❌ **Could not save submission:** {e}"
        fname = candidate
        break
    else:
        return "❌ **Could not save submission:** too many submissions with this name today."

    return (
        f"✅ **Submission received.** Your file has been queued for review as `{fname}`.\n\n"
        f"A maintainer will score it against the private test-set answers and merge it to the "
        f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
        f"reproducibility via your `code_url`.\n\n"
        f"**Note:** submissions in this Space's queue are held temporarily — for a permanent "
        f"record, please also open a PR to the "
        f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
    )
|
| 309 |
|
| 310 |
|
| 311 |
+
# ---------------------------------------------------------------------------
|
| 312 |
+
# UI
|
| 313 |
+
# ---------------------------------------------------------------------------
|
| 314 |
|
| 315 |
+
def build_app() -> gr.Blocks:
    """Assemble the Gradio UI: Leaderboard, Submit, About and Citation tabs.

    Both tables are computed once, when the app is built: the paper baselines
    from ``baselines_dataframe()`` and the community rows from the JSON files
    under ``RESULTS_DIR``. The Submit tab wires its form fields to
    ``submit_predictions``.

    Returns:
        The constructed ``gr.Blocks`` app, not yet launched.
    """
    # NOTE(review): several labels below contain glyphs that look mis-encoded
    # (e.g. the tab titles and link badges). They are reproduced verbatim
    # here; confirm the intended emoji against the original file.
    df_baselines = baselines_dataframe()
    df_community = community_dataframe(load_submissions(RESULTS_DIR))

    with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
        gr.Markdown(f"# {TITLE}")
        gr.HTML(f"<div class='tagline'>{TAGLINE}</div>")
        gr.HTML(
            "<div class='link-row'>"
            f"<a href='{PAPER_URL}' target='_blank'>π Paper</a>"
            f"<a href='{REPO_URL}' target='_blank'>π» Code</a>"
            f"<a href='{DATASET_URL}' target='_blank'>π€ Dataset</a>"
            f"<a href='{REPO_URL}#submitting-to-the-leaderboard' target='_blank'>π₯ How to submit</a>"
            "</div>"
        )

        with gr.Tabs():
            # -------------------------------------------------------------
            with gr.Tab("π Leaderboard"):
                gr.Markdown(
                    "Results on the **full DeepSynth test set** (120 tasks, Pass@1). "
                    "F1 / Precision / Recall measure prediction quality against gold answers; "
                    "LLM Judge reports average precision under semantic matching.",
                    elem_classes=["section-header"],
                )

                gr.Markdown("### Paper baselines")
                gr.Dataframe(
                    value=df_baselines,
                    interactive=False,
                    wrap=True,
                    datatype=["str", "str", "str", "str", "str",
                              "number", "number", "number", "number", "number"],
                )

                if not df_community.empty:
                    gr.Markdown("### Community submissions")
                    gr.Dataframe(
                        value=df_community,
                        interactive=False,
                        wrap=True,
                        # "Links" holds Markdown such as [paper](url); without
                        # the markdown datatype it is shown as raw text.
                        datatype=[
                            "markdown" if column == "Links" else "str"
                            for column in df_community.columns
                        ],
                    )

            # -------------------------------------------------------------
            with gr.Tab("π€ Submit"):
                gr.Markdown("## Submit your agent's predictions")
                gr.Markdown(
                    "Upload a JSON file of predictions on the DeepSynth **test set**. "
                    "We'll score it against the private gold answers and add your row to the "
                    "community leaderboard.\n\n"
                    "**Format:** a JSON object mapping task IDs (`\"001\"` β¦ `\"120\"`) to your "
                    "agent's answer. See "
                    f"[`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json) "
                    "for the full spec."
                )

                with gr.Row():
                    with gr.Column():
                        agent_name_in = gr.Textbox(label="Agent name", placeholder="e.g. ReAct-GPT5")
                        base_model_in = gr.Textbox(label="Base model", placeholder="e.g. gpt-5.2-pro (2026-02)")
                        scaffold_in = gr.Dropdown(
                            choices=["none", "ReAct", "CodeAct", "Plan-and-Execute", "Reflexion", "MCTS", "Custom"],
                            label="Scaffold",
                            value="ReAct",
                        )
                        split_in = gr.Dropdown(
                            choices=["test", "full"],
                            label="Split evaluated",
                            value="test",
                        )
                    with gr.Column():
                        organization_in = gr.Textbox(label="Organization", placeholder="e.g. MSR India")
                        contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
                        code_url_in = gr.Textbox(
                            label="Code URL (required)",
                            placeholder="https://github.com/you/your-agent",
                        )

                predictions_in = gr.File(
                    label="Predictions JSON file",
                    file_types=[".json"],
                )
                submit_btn = gr.Button("Submit for review", variant="primary")
                submit_status = gr.Markdown()

                # Input order must match the submit_predictions signature.
                submit_btn.click(
                    fn=submit_predictions,
                    inputs=[
                        predictions_in, agent_name_in, base_model_in, scaffold_in,
                        organization_in, contact_email_in, code_url_in, split_in,
                    ],
                    outputs=submit_status,
                )

                gr.Markdown(
                    "---\n"
                    "**What happens next?** Submissions are queued for maintainer review. "
                    "We verify metadata honesty and spot-check reproducibility via your "
                    "`code_url` before computing scores and merging to the leaderboard.\n\n"
                    f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
                    "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
                )

            # -------------------------------------------------------------
            with gr.Tab("π About"):
                gr.Markdown(ABOUT_BLURB)
                gr.Markdown(
                    "## Metrics\n"
                    "- **F1 / Precision / Recall** β token-level overlap between predicted "
                    "and gold answers, averaged over all tasks.\n"
                    "- **Exact Match (EM)** β fraction of tasks where the predicted answer "
                    "exactly equals the gold answer (strict).\n"
                    "- **LLM Judge** β semantic-equivalence scoring with small numerical "
                    "tolerance (1β5.5%), evaluated by a strong frozen judge model.\n\n"
                    "## Dataset\n"
                    f"DeepSynth is hosted on π€ [`DeepSynthesisTeam/deepsynth-bench`]({DATASET_URL}). "
                    "The dev set (40 tasks) ships with gold answers for prototyping; the test "
                    "set (120 tasks) is released questions-only to prevent contamination."
                )

            # -------------------------------------------------------------
            with gr.Tab("π Citation"):
                gr.Markdown("### Please cite:")
                gr.Code(
                    value=(
                        "@inproceedings{deepsynth2026,\n"
                        "  title     = {A Benchmark for Deep Information Synthesis},\n"
                        "  author    = {Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald\n"
                        "               and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim\n"
                        "               and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex\n"
                        "               and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru\n"
                        "               and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos},\n"
                        "  booktitle = {International Conference on Learning Representations (ICLR)},\n"
                        "  year      = {2026},\n"
                        "  url       = {" + PAPER_URL + "}\n"
                        "}"
                    ),
                    language="latex",
                )

    return app
|
| 456 |
|
| 457 |
|
| 458 |
if __name__ == "__main__":
|
| 459 |
+
build_app().launch()
|