debjitpaul committed on
Commit
63fb47d
·
1 Parent(s): 369761f

Overhaul: Inter/JetBrains Mono typography, paper baselines, submission form

Browse files
Files changed (1) hide show
  1. app.py +370 -129
app.py CHANGED
@@ -1,16 +1,15 @@
1
  """
2
  DeepSynth Leaderboard — Hugging Face Space (Gradio)
3
 
4
- Reads validated submissions from a results repo and renders a sortable,
5
- filterable leaderboard with per-domain breakdowns and efficiency metrics.
6
-
7
- Deploy by pushing this file + requirements.txt to a HF Space with SDK=gradio.
8
  """
9
 
10
  from __future__ import annotations
11
 
 
12
  import json
13
  import os
 
14
  from pathlib import Path
15
  from typing import Any
16
 
@@ -22,27 +21,158 @@ import pandas as pd
22
  # ---------------------------------------------------------------------------
23
 
24
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 
 
 
25
  TITLE = "πŸ™ DeepSynth Leaderboard"
26
- SUBTITLE = (
27
- "Large language model (LLM)-based agents are increasingly used to solve complex tasks involving tool use, such as web browsing, code execution, and data analysis. However, current evaluation benchmarks do not adequately assess their ability to solve real-world tasks that require synthesizing information from multiple sources and inferring insights beyond simple fact retrieval."
28
- "We introduce DEEPSYNTH, a novel benchmark of 120 tasks across 7 domains and 67 countries, designed to evaluate agents on realistic, time-consuming problems that combine information gathering, synthesis, and structured reasoning. "
29
- "across 7 domains and 67 countries. ICLR 2026."
 
 
 
 
 
30
  )
31
  REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
32
  PAPER_URL = "https://arxiv.org/abs/2602.21143"
33
  DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
34
 
35
- DOMAINS = [
36
- "science", "geography", "economics", "history",
37
- "culture", "politics", "technology",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  ]
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # ---------------------------------------------------------------------------
41
- # Data loading
42
  # ---------------------------------------------------------------------------
43
 
44
  def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
45
- """Load every *.json file under results_dir as a submission result row."""
46
  if not results_dir.exists():
47
  return []
48
  rows = []
@@ -55,10 +185,12 @@ def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
55
  return rows
56
 
57
 
58
- def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
59
- """Flatten submission result records into a leaderboard DataFrame."""
60
  if not submissions:
61
- return pd.DataFrame()
 
 
 
62
 
63
  records = []
64
  for s in submissions:
@@ -66,153 +198,262 @@ def build_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
66
  scores = s.get("scores", {})
67
  efficiency = s.get("efficiency", {})
68
 
69
- row = {
70
- "Agent": meta.get("agent_name", "β€”"),
71
- "Base Model": meta.get("base_model", "β€”"),
72
- "Scaffold": meta.get("scaffold", "β€”"),
73
- "Overall EM": scores.get("overall", {}).get("exact_match"),
74
- "Overall F1": scores.get("overall", {}).get("f1"),
75
- "LLM Judge": scores.get("overall", {}).get("llm_judge"),
76
- }
77
- #for domain in DOMAINS:
78
- # row[f"{domain.title()}"] = scores.get("per_domain", {}).get(domain, {}).get("f1")
79
-
80
- row["Avg Cost ($)"] = efficiency.get("avg_cost_usd")
81
- row["Avg Latency (s)"] = efficiency.get("avg_latency_s")
82
- row["Avg Tool Calls"] = efficiency.get("avg_num_tool_calls")
83
-
84
- row["Split"] = meta.get("split", "β€”")
85
- row["Org"] = meta.get("organization", "β€”")
86
- row["Date"] = meta.get("submission_date", "β€”")
87
-
88
  paper = meta.get("paper_url")
89
  code = meta.get("code_url")
90
- row["Paper"] = f"[link]({paper})" if paper else ""
91
- row["Code"] = f"[link]({code})" if code else ""
92
-
93
- records.append(row)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  df = pd.DataFrame(records)
96
- if "LLM Judge" in df.columns:
97
  df = df.sort_values("LLM Judge", ascending=False, na_position="last").reset_index(drop=True)
98
  return df
99
 
100
 
101
  # ---------------------------------------------------------------------------
102
- # UI
103
  # ---------------------------------------------------------------------------
104
 
105
- def filter_df(
106
- df: pd.DataFrame,
107
- split_filter: str,
108
- scaffolds: list[str],
109
- min_seeds: int,
110
- ) -> pd.DataFrame:
111
- if df.empty:
112
- return df
113
- out = df.copy()
114
- if split_filter != "all":
115
- out = out[out["Split"] == split_filter]
116
- if scaffolds:
117
- out = out[out["Scaffold"].isin(scaffolds)]
118
- # min_seeds filter omitted from display columns for clarity β€” expose in detail view later.
119
- return out.reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
 
122
- def build_app() -> gr.Blocks:
123
- submissions = load_submissions(RESULTS_DIR)
124
- df_full = build_dataframe(submissions)
125
 
126
- all_scaffolds = sorted(df_full["Scaffold"].unique().tolist()) if not df_full.empty else []
 
 
127
 
128
- with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as app:
129
  gr.Markdown(f"# {TITLE}")
130
- gr.Markdown(SUBTITLE)
131
- gr.Markdown(
132
- f"πŸ“„ [Paper]({PAPER_URL}) Β· "
133
- f"πŸ’» [Code]({REPO_URL}) Β· "
134
- f"πŸ€— [Dataset]({DATASET_URL}) Β· "
135
- f"πŸ“₯ [How to submit]({REPO_URL}#submitting-to-the-leaderboard)"
 
 
136
  )
137
 
138
  with gr.Tabs():
 
139
  with gr.Tab("πŸ† Leaderboard"):
140
- with gr.Row():
141
- split_filter = gr.Radio(
142
- choices=["all", "test", "full", "dev"],
143
- value="all",
144
- label="Split",
145
- )
146
- scaffold_filter = gr.CheckboxGroup(
147
- choices=all_scaffolds,
148
- value=all_scaffolds,
149
- label="Scaffold",
150
- )
151
- min_seeds = gr.Slider(
152
- minimum=1, maximum=5, value=1, step=1, label="Min seeds",
153
- )
154
 
155
- table = gr.Dataframe(
156
- value=df_full,
 
157
  interactive=False,
158
  wrap=True,
159
- datatype=["str"] * len(df_full.columns) if not df_full.empty else None,
 
160
  )
161
 
162
- for ctrl in (split_filter, scaffold_filter, min_seeds):
163
- ctrl.change(
164
- fn=lambda sp, sc, ms: filter_df(df_full, sp, sc, ms),
165
- inputs=[split_filter, scaffold_filter, min_seeds],
166
- outputs=table,
 
167
  )
168
 
169
- with gr.Tab("πŸ“– About"):
170
- gr.Markdown(f"""
171
- ## About DeepSynth
172
-
173
- DeepSynth evaluates LLM agents on **multi-step web information synthesis** β€”
174
- tasks where the correct answer requires combining many pieces of evidence
175
- retrieved across the open web, not single-hop lookup.
 
 
 
 
 
176
 
177
- **Scoring:** submissions are evaluated with three metrics β€” Exact Match (EM),
178
- F1 on key-value pairs, and LLM-Judge with small numerical tolerance. The
179
- leaderboard sorts by LLM-Judge by default; click any column header to re-sort.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
- **Efficiency columns** (cost, latency, tool calls) are optional and populated
182
- when submitters provide per-task instrumentation.
183
- """)
 
 
 
 
 
184
 
185
- with gr.Tab("πŸ“€ Submit"):
186
- gr.Markdown(f"""
187
- ## How to submit
188
-
189
- 1. Run your agent on the DeepSynth **test set** (120 tasks, questions-only
190
- file on the HF dataset).
191
- 2. Produce a submission JSON conforming to [`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json).
192
- 3. Validate locally: `python scripts/evaluation/validate_submission.py my_submission.json`.
193
- 4. Open a PR to the [leaderboard repo]({REPO_URL}) adding your file under
194
- `submissions/YYYY-MM-DD-agent-name.json`.
195
- 5. CI will run schema validation and score computation; on merge, your row
196
- appears here automatically.
197
-
198
- **Required:** a public code URL that reproduces your results. We may ask for
199
- a run trace for spot-check verification.
200
- """)
201
 
 
202
  with gr.Tab("πŸ“œ Citation"):
203
- gr.Markdown("""
204
- ```bibtex
205
- @inproceedings{deepsynth2026,
206
- title = {A Benchmark for Deep Information Synthesis},
207
- author = {{Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos}}},
208
- booktitle = {International Conference on Learning Representations (ICLR)},
209
- year = {2026}
210
- }
211
- ```
212
- """)
 
 
 
 
 
 
 
213
 
214
  return app
215
 
216
 
217
  if __name__ == "__main__":
218
- build_app().launch()
 
1
  """
2
  DeepSynth Leaderboard — Hugging Face Space (Gradio)
3
 
4
+ GAIA-style leaderboard with tabs, custom typography, and a submission upload form.
 
 
 
5
  """
6
 
7
  from __future__ import annotations
8
 
9
+ import datetime
10
  import json
11
  import os
12
+ import re
13
  from pathlib import Path
14
  from typing import Any
15
 
 
21
  # ---------------------------------------------------------------------------
22
 
23
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
24
+ QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
25
+ QUEUE_DIR.mkdir(exist_ok=True, parents=True)
26
+
27
  TITLE = "πŸ™ DeepSynth Leaderboard"
28
+ TAGLINE = "A Benchmark for Deep Information Synthesis Β· ICLR 2026"
29
+ ABOUT_BLURB = (
30
+ "Large language model (LLM)-based agents are increasingly used to solve complex tasks "
31
+ "involving tool use β€” web browsing, code execution, data analysis. Current benchmarks "
32
+ "do not adequately assess their ability to solve real-world tasks requiring synthesis "
33
+ "across multiple sources and inference beyond simple fact retrieval.\n\n"
34
+ "**DeepSynth** introduces 120 tasks across 7 domains and 67 countries, designed to evaluate "
35
+ "agents on realistic, time-consuming problems that combine information gathering, synthesis, "
36
+ "and structured reasoning."
37
  )
38
  REPO_URL = "https://github.com/agentdeepsynthesis/deepsynth-bench"
39
  PAPER_URL = "https://arxiv.org/abs/2602.21143"
40
  DATASET_URL = "https://huggingface.co/datasets/DeepSynthesisTeam/deepsynth-bench"
41
 
42
+ # ---------------------------------------------------------------------------
43
+ # Styling — GAIA-inspired: mono/sans headers, compact table, subtle colors
44
+ # ---------------------------------------------------------------------------
45
+
46
+ CUSTOM_CSS = """
47
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
48
+
49
+ .gradio-container {
50
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
51
+ max-width: 1400px !important;
52
+ margin: 0 auto !important;
53
+ }
54
+
55
+ .gradio-container h1, .gradio-container h2, .gradio-container h3 {
56
+ font-family: 'Inter', sans-serif !important;
57
+ font-weight: 700 !important;
58
+ letter-spacing: -0.02em !important;
59
+ }
60
+
61
+ .gradio-container h1 { font-size: 2.2rem !important; margin-bottom: 0.25rem !important; }
62
+ .gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
63
+
64
+ .gradio-container code, .gradio-container pre {
65
+ font-family: 'JetBrains Mono', 'Fira Code', monospace !important;
66
+ font-size: 0.9em !important;
67
+ }
68
+
69
+ /* Tagline under title */
70
+ .tagline {
71
+ color: #6b7280;
72
+ font-size: 1rem;
73
+ margin-bottom: 1.5rem;
74
+ font-weight: 500;
75
+ }
76
+
77
+ /* Leaderboard table β€” tighter, more readable */
78
+ .gradio-container .table-wrap table {
79
+ font-family: 'Inter', sans-serif !important;
80
+ font-size: 0.92rem !important;
81
+ }
82
+
83
+ .gradio-container .table-wrap th {
84
+ font-weight: 600 !important;
85
+ background: #f9fafb !important;
86
+ border-bottom: 2px solid #e5e7eb !important;
87
+ text-align: left !important;
88
+ }
89
+
90
+ .gradio-container .table-wrap td {
91
+ padding: 0.55rem 0.75rem !important;
92
+ }
93
+
94
+ /* Section headers (like GAIA's "Test set" / "Validation set") */
95
+ .section-header {
96
+ font-size: 1.1rem;
97
+ font-weight: 700;
98
+ margin: 1rem 0 0.5rem 0;
99
+ padding-bottom: 0.4rem;
100
+ border-bottom: 1px solid #e5e7eb;
101
+ }
102
+
103
+ /* Badge-style links under the title */
104
+ .link-row a {
105
+ display: inline-block;
106
+ padding: 0.25rem 0.75rem;
107
+ margin-right: 0.5rem;
108
+ border-radius: 6px;
109
+ background: #f3f4f6;
110
+ color: #374151 !important;
111
+ text-decoration: none !important;
112
+ font-size: 0.9rem;
113
+ font-weight: 500;
114
+ }
115
+ .link-row a:hover { background: #e5e7eb; }
116
+
117
+ /* Tabs β€” cleaner look */
118
+ .gradio-container button.tab-nav {
119
+ font-weight: 600 !important;
120
+ font-size: 1rem !important;
121
+ }
122
+ """
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Paper baselines — pre-populated from Table 1 of the DeepSynth paper
126
+ # ---------------------------------------------------------------------------
127
+
128
+ PAPER_BASELINES: list[dict[str, Any]] = [
129
+ # LLM Baselines (no agent scaffold, no tool use)
130
+ {"category": "LLM Baseline", "agent": "o4-mini", "model": "o4-mini (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.05, "precision": 2.33, "recall": 4.39, "em": 0.00, "llm_judge": 0.00},
131
+ {"category": "LLM Baseline", "agent": "GPT-4.1", "model": "gpt-4.1 (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.46, "precision": 2.86, "recall": 4.39, "em": 0.00, "llm_judge": 0.00},
132
+ {"category": "LLM Baseline", "agent": "o3", "model": "o3 (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.29, "precision": 2.85, "recall": 3.90, "em": 0.00, "llm_judge": 0.00},
133
+ {"category": "LLM Baseline", "agent": "GPT-5.1", "model": "gpt-5.1 (2025-08)", "access": "closed", "scaffold": "none", "f1": 3.83, "precision": 2.98, "recall": 5.37, "em": 0.00, "llm_judge": 0.00},
134
+ {"category": "LLM Baseline", "agent": "Gemini-Pro-2.5", "model": "gemini-pro-2.5 (2025-08)", "access": "closed", "scaffold": "none", "f1": 6.25, "precision": 4.71, "recall": 9.27, "em": 0.00, "llm_judge": 5.00},
135
+ {"category": "LLM Baseline", "agent": "GPT-5.2-Pro", "model": "gpt-5.2-pro (2026-02)", "access": "closed", "scaffold": "none", "f1": 8.70, "precision": 8.45, "recall": 8.96, "em": 6.25, "llm_judge": 6.67},
136
+ {"category": "LLM Baseline", "agent": "DeepSeek-R1-Chat", "model": "deepseek-r1-chat (2025-08)","access": "open", "scaffold": "none", "f1": 3.23, "precision": 2.75, "recall": 3.90, "em": 1.67, "llm_judge": 2.50},
137
+ {"category": "LLM Baseline", "agent": "DeepSeek-R1-Reasoner", "model": "deepseek-r1 (2026-02)", "access": "open", "scaffold": "none", "f1": 2.80, "precision": 2.73, "recall": 2.87, "em": 2.50, "llm_judge": 6.67},
138
+ # Framework-based Agents
139
+ {"category": "Agent Framework", "agent": "o3-deep-research", "model": "o3-deep-research (2025-08)","access": "closed", "scaffold": "Custom", "f1": 8.97, "precision": 7.73, "recall": 10.69, "em": 2.50, "llm_judge": 17.50},
140
+ {"category": "Agent Framework", "agent": "Smolagent", "model": "gpt-4.1", "access": "open", "scaffold": "CodeAct", "f1": 3.75, "precision": 3.27, "recall": 4.39, "em": 2.50, "llm_judge": 7.50},
141
+ {"category": "Agent Framework", "agent": "Smolagent", "model": "gpt-5", "access": "open", "scaffold": "CodeAct", "f1": 6.42, "precision": 6.34, "recall": 6.50, "em": 1.67, "llm_judge": 2.50},
142
+ {"category": "Agent Framework", "agent": "OWL", "model": "gpt-4.1", "access": "open", "scaffold": "Custom", "f1": 5.41, "precision": 4.62, "recall": 6.52, "em": 1.67, "llm_judge": 12.50},
143
  ]
144
 
145
+
146
+ def baselines_dataframe() -> pd.DataFrame:
147
+ """Build the paper-baselines DataFrame in display order."""
148
+ rows = []
149
+ for b in PAPER_BASELINES:
150
+ lock = "πŸ”’" if b["access"] == "closed" else "πŸ”“"
151
+ rows.append({
152
+ "Category": b["category"],
153
+ "Agent": b["agent"],
154
+ "Model": b["model"],
155
+ "Access": f"{lock} {b['access']}",
156
+ "Scaffold": b["scaffold"],
157
+ "F1": b["f1"],
158
+ "Precision": b["precision"],
159
+ "Recall": b["recall"],
160
+ "EM": b["em"],
161
+ "LLM Judge": b["llm_judge"],
162
+ })
163
+ df = pd.DataFrame(rows)
164
+ # Sort by LLM Judge within each category, preserving LLM Baseline first
165
+ df["__cat_order"] = df["Category"].map({"LLM Baseline": 0, "Agent Framework": 1})
166
+ df = df.sort_values(by=["__cat_order", "LLM Judge", "F1"], ascending=[True, False, False])
167
+ df = df.drop(columns=["__cat_order"]).reset_index(drop=True)
168
+ return df
169
+
170
+
171
  # ---------------------------------------------------------------------------
172
+ # Community submissions — loaded from the submissions/ directory
173
  # ---------------------------------------------------------------------------
174
 
175
  def load_submissions(results_dir: Path) -> list[dict[str, Any]]:
 
176
  if not results_dir.exists():
177
  return []
178
  rows = []
 
185
  return rows
186
 
187
 
188
+ def community_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
 
189
  if not submissions:
190
+ return pd.DataFrame(columns=[
191
+ "Agent", "Model", "Scaffold", "Split", "F1", "EM", "LLM Judge",
192
+ "Avg Cost ($)", "Avg Latency (s)", "Org", "Date", "Links"
193
+ ])
194
 
195
  records = []
196
  for s in submissions:
 
198
  scores = s.get("scores", {})
199
  efficiency = s.get("efficiency", {})
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  paper = meta.get("paper_url")
202
  code = meta.get("code_url")
203
+ links = " Β· ".join(filter(None, [
204
+ f"[paper]({paper})" if paper else None,
205
+ f"[code]({code})" if code else None,
206
+ ]))
207
+
208
+ records.append({
209
+ "Agent": meta.get("agent_name", "β€”"),
210
+ "Model": meta.get("base_model", "β€”"),
211
+ "Scaffold": meta.get("scaffold", "β€”"),
212
+ "Split": meta.get("split", "β€”"),
213
+ "F1": scores.get("overall", {}).get("f1"),
214
+ "EM": scores.get("overall", {}).get("exact_match"),
215
+ "LLM Judge": scores.get("overall", {}).get("llm_judge"),
216
+ "Avg Cost ($)": efficiency.get("avg_cost_usd"),
217
+ "Avg Latency (s)": efficiency.get("avg_latency_s"),
218
+ "Org": meta.get("organization", "β€”"),
219
+ "Date": meta.get("submission_date", "β€”"),
220
+ "Links": links,
221
+ })
222
 
223
  df = pd.DataFrame(records)
224
+ if "LLM Judge" in df.columns and not df["LLM Judge"].isna().all():
225
  df = df.sort_values("LLM Judge", ascending=False, na_position="last").reset_index(drop=True)
226
  return df
227
 
228
 
229
  # ---------------------------------------------------------------------------
230
+ # Submission upload handler — saves file to queue for maintainer review
231
  # ---------------------------------------------------------------------------
232
 
233
+ SAFE_NAME_RE = re.compile(r"[^a-zA-Z0-9._-]+")
234
+
235
+
236
+ def _safe_slug(text: str, maxlen: int = 40) -> str:
237
+ slug = SAFE_NAME_RE.sub("-", (text or "unnamed").strip()).strip("-").lower()
238
+ return slug[:maxlen] or "unnamed"
239
+
240
+
241
def submit_predictions(
    file_obj,
    agent_name: str,
    base_model: str,
    scaffold: str,
    organization: str,
    contact_email: str,
    code_url: str,
    split: str,
) -> str:
    """Validate basic fields, save the uploaded predictions file to the review queue.

    Args:
        file_obj: The uploaded file: either a path-like value or an object
            exposing the path as ``.name``. ``None`` means nothing was attached.
        agent_name: Display name of the agent (required).
        base_model: Underlying model identifier (required).
        scaffold: Agent scaffold label; stored as given.
        organization: Submitting organization (required).
        contact_email: Maintainer contact address (required; must look like
            ``local@domain`` with no whitespace).
        code_url: Public http(s) URL of the reproducing code (required).
        split: Evaluated split label; stored as given.

    Returns:
        A Markdown status message. Every validation, read or write problem is
        reported through this message rather than raised, so the UI always
        has something to show.
    """
    # Normalise once so validation, the stored bundle and the reply all use
    # the same values. `or ""` tolerates None from programmatic API callers.
    agent_name = (agent_name or "").strip()
    base_model = (base_model or "").strip()
    organization = (organization or "").strip()
    contact_email = (contact_email or "").strip()
    code_url = (code_url or "").strip()

    if file_obj is None:
        return "❌ **Missing file.** Please attach a predictions JSON."
    if not agent_name:
        return "❌ **Missing agent name.**"
    if not base_model:
        return "❌ **Missing base model.**"
    if not organization:
        return "❌ **Missing organization.**"

    # Deliberately shallow check: exactly one "@" with text on both sides and
    # no whitespace. Real verification happens when a maintainer emails it.
    local_part, at_sign, domain = contact_email.partition("@")
    email_ok = (
        bool(at_sign and local_part and domain)
        and "@" not in domain
        and not any(ch.isspace() for ch in contact_email)
    )
    if not email_ok:
        return "❌ **Invalid contact email.**"
    if not code_url.startswith(("http://", "https://")):
        return "❌ **Missing or invalid code URL.** A public `code_url` that reproduces your results is required."

    # Try to parse the uploaded file as JSON to catch corruption early.
    try:
        src_path = Path(file_obj.name if hasattr(file_obj, "name") else file_obj)
        # utf-8-sig reads plain UTF-8 unchanged and also accepts a leading BOM.
        with src_path.open("r", encoding="utf-8-sig") as f:
            predictions = json.load(f)
    except json.JSONDecodeError as e:
        return f"❌ **Invalid JSON in uploaded file:** {e}"
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # named explicitly or a non-UTF-8 upload would crash the handler.
        return f"❌ **Could not read uploaded file:** {e}"

    if not isinstance(predictions, dict) or not predictions:
        return "❌ **Predictions file must be a non-empty JSON object mapping task IDs to answers.**"

    # One UTC timestamp drives both the receipt time and the date, so the
    # bundle and the file name can never disagree around midnight.
    now = datetime.datetime.now(datetime.timezone.utc)
    today = now.date().isoformat()

    # Build a bundle with metadata + raw predictions for the maintainer to review.
    bundle = {
        "received_at": now.replace(tzinfo=None).isoformat() + "Z",
        "metadata": {
            "agent_name": agent_name,
            "base_model": base_model,
            "scaffold": scaffold,
            "organization": organization,
            "contact_email": contact_email,
            "code_url": code_url,
            "split": split,
            "submission_date": today,
        },
        "predictions": predictions,
    }

    stem = f"{today}-{_safe_slug(organization)}-{_safe_slug(agent_name)}"
    fname = f"{stem}.json"
    try:
        # Encode before touching the disk so an unencodable string cannot
        # leave a half-written file in the queue.
        payload = json.dumps(bundle, indent=2, ensure_ascii=False).encode("utf-8")
        QUEUE_DIR.mkdir(exist_ok=True, parents=True)
        for attempt in range(1, 100):
            if attempt > 1:
                fname = f"{stem}-{attempt}.json"
            try:
                # "x" fails if the file exists, so a same-day resubmission
                # gets a numbered sibling instead of overwriting the first.
                with (QUEUE_DIR / fname).open("xb") as f:
                    f.write(payload)
                break
            except FileExistsError:
                continue
        else:
            return "❌ **Could not save submission:** too many submissions with this name today."
    except (OSError, UnicodeEncodeError) as e:
        return f"❌ **Could not save submission:** {e}"

    return (
        f"✅ **Submission received.** Your file has been queued for review as `{fname}`.\n\n"
        f"A maintainer will score it against the private test-set answers and merge it to the "
        f"leaderboard within ~1 week. We may email `{contact_email}` if we need to verify "
        f"reproducibility via your `code_url`.\n\n"
        f"**Note:** submissions in this Space's queue are held temporarily — for a permanent "
        f"record, please also open a PR to the "
        f"[benchmark repo]({REPO_URL}) with your predictions file under `submissions/`."
    )
309
 
310
 
311
+ # ---------------------------------------------------------------------------
312
+ # UI
313
+ # ---------------------------------------------------------------------------
314
 
315
def build_app() -> gr.Blocks:
    """Assemble the Gradio Blocks UI for the leaderboard Space.

    The app has four tabs: the leaderboard (paper baselines plus community
    submissions when there are any), the submission form wired to
    ``submit_predictions``, an About page, and the BibTeX citation.

    Both tables are computed once, when this function runs; submissions added
    to ``RESULTS_DIR`` afterwards only show up after the app is rebuilt.

    Returns:
        The constructed ``gr.Blocks`` app, not yet launched.
    """
    df_baselines = baselines_dataframe()
    df_community = community_dataframe(load_submissions(RESULTS_DIR))

    with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
        gr.Markdown(f"# {TITLE}")
        gr.HTML(f"<div class='tagline'>{TAGLINE}</div>")
        gr.HTML(
            "<div class='link-row'>"
            f"<a href='{PAPER_URL}' target='_blank'>📄 Paper</a>"
            f"<a href='{REPO_URL}' target='_blank'>💻 Code</a>"
            f"<a href='{DATASET_URL}' target='_blank'>🤗 Dataset</a>"
            f"<a href='{REPO_URL}#submitting-to-the-leaderboard' target='_blank'>📥 How to submit</a>"
            "</div>"
        )

        with gr.Tabs():
            # -------------------------------------------------------------
            with gr.Tab("🏆 Leaderboard"):
                gr.Markdown(
                    "Results on the **full DeepSynth test set** (120 tasks, Pass@1). "
                    "F1 / Precision / Recall measure prediction quality against gold answers; "
                    "LLM Judge reports average precision under semantic matching.",
                    elem_classes=["section-header"],
                )

                gr.Markdown("### Paper baselines")
                gr.Dataframe(
                    value=df_baselines,
                    interactive=False,
                    wrap=True,
                    datatype=["str", "str", "str", "str", "str",
                              "number", "number", "number", "number", "number"],
                )

                if not df_community.empty:
                    # "Links" holds Markdown such as "[paper](url)"; without an
                    # explicit "markdown" datatype the table shows it as raw text.
                    community_types = [
                        "markdown" if col == "Links"
                        else "number" if pd.api.types.is_numeric_dtype(df_community[col])
                        else "str"
                        for col in df_community.columns
                    ]
                    gr.Markdown("### Community submissions")
                    gr.Dataframe(
                        value=df_community,
                        interactive=False,
                        wrap=True,
                        datatype=community_types,
                    )

            # -------------------------------------------------------------
            with gr.Tab("📤 Submit"):
                gr.Markdown("## Submit your agent's predictions")
                gr.Markdown(
                    "Upload a JSON file of predictions on the DeepSynth **test set**. "
                    "We'll score it against the private gold answers and add your row to the "
                    "community leaderboard.\n\n"
                    "**Format:** a JSON object mapping task IDs (`\"001\"` … `\"120\"`) to your "
                    "agent's answer. See "
                    f"[`submission_schema.json`]({REPO_URL}/blob/main/scripts/evaluation/submission_schema.json) "
                    "for the full spec."
                )

                with gr.Row():
                    with gr.Column():
                        agent_name_in = gr.Textbox(label="Agent name", placeholder="e.g. ReAct-GPT5")
                        base_model_in = gr.Textbox(label="Base model", placeholder="e.g. gpt-5.2-pro (2026-02)")
                        scaffold_in = gr.Dropdown(
                            choices=["none", "ReAct", "CodeAct", "Plan-and-Execute", "Reflexion", "MCTS", "Custom"],
                            label="Scaffold",
                            value="ReAct",
                        )
                        split_in = gr.Dropdown(
                            choices=["test", "full"],
                            label="Split evaluated",
                            value="test",
                        )
                    with gr.Column():
                        organization_in = gr.Textbox(label="Organization", placeholder="e.g. MSR India")
                        contact_email_in = gr.Textbox(label="Contact email", placeholder="you@org.edu")
                        code_url_in = gr.Textbox(
                            label="Code URL (required)",
                            placeholder="https://github.com/you/your-agent",
                        )

                predictions_in = gr.File(
                    label="Predictions JSON file",
                    file_types=[".json"],
                )
                submit_btn = gr.Button("Submit for review", variant="primary")
                submit_status = gr.Markdown()

                # Input order must match submit_predictions' positional parameters.
                submit_btn.click(
                    fn=submit_predictions,
                    inputs=[
                        predictions_in, agent_name_in, base_model_in, scaffold_in,
                        organization_in, contact_email_in, code_url_in, split_in,
                    ],
                    outputs=submit_status,
                )

                gr.Markdown(
                    "---\n"
                    "**What happens next?** Submissions are queued for maintainer review. "
                    "We verify metadata honesty and spot-check reproducibility via your "
                    "`code_url` before computing scores and merging to the leaderboard.\n\n"
                    f"**Prefer Git?** Open a PR to [{REPO_URL.split('//')[1]}]({REPO_URL}) "
                    "adding your file under `submissions/YYYY-MM-DD-org-agentname.json`."
                )

            # -------------------------------------------------------------
            with gr.Tab("📖 About"):
                gr.Markdown(ABOUT_BLURB)
                gr.Markdown(
                    "## Metrics\n"
                    "- **F1 / Precision / Recall** — token-level overlap between predicted "
                    "and gold answers, averaged over all tasks.\n"
                    "- **Exact Match (EM)** — fraction of tasks where the predicted answer "
                    "exactly equals the gold answer (strict).\n"
                    "- **LLM Judge** — semantic-equivalence scoring with small numerical "
                    "tolerance (1–5.5%), evaluated by a strong frozen judge model.\n\n"
                    "## Dataset\n"
                    f"DeepSynth is hosted on 🤗 [`DeepSynthesisTeam/deepsynth-bench`]({DATASET_URL}). "
                    "The dev set (40 tasks) ships with gold answers for prototyping; the test "
                    "set (120 tasks) is released questions-only to prevent contamination."
                )

            # -------------------------------------------------------------
            with gr.Tab("📜 Citation"):
                gr.Markdown("### Please cite:")
                gr.Code(
                    value=(
                        "@inproceedings{deepsynth2026,\n"
                        "  title     = {A Benchmark for Deep Information Synthesis},\n"
                        "  author    = {Paul, Debjit and Murphy, Daniel and Gritta, Milan and Cardenas, Ronald\n"
                        "               and Prokhorov, Victor and Bolliger, Lena Sophia and Toker, Aysim\n"
                        "               and Miles, Roy and Oncescu, Andreea-Maria and Sivakumar, Jasivan Alex\n"
                        "               and Borchert, Philipp and Elezi, Ismail and Zhang, Meiru\n"
                        "               and Lee, Ka Yiu and Zhang, Guchun and Wang, Jun and Lampouras, Gerasimos},\n"
                        "  booktitle = {International Conference on Learning Representations (ICLR)},\n"
                        "  year      = {2026},\n"
                        "  url       = {" + PAPER_URL + "}\n"
                        "}"
                    ),
                    language="latex",
                )

    return app
456
 
457
 
458
  if __name__ == "__main__":
459
+ build_app().launch()