debjitpaul committed on
Commit
a380ad5
·
1 Parent(s): 1f21b1a

update app.py and submissions

Browse files
app.py CHANGED
@@ -22,6 +22,7 @@ import pandas as pd
22
  # ---------------------------------------------------------------------------
23
 
24
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
 
25
  QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
26
  QUEUE_DIR.mkdir(exist_ok=True, parents=True)
27
 
@@ -59,7 +60,11 @@ CUSTOM_CSS = """
59
  letter-spacing: -0.02em !important;
60
  }
61
 
62
- .gradio-container h1 { font-size: 2.2rem !important; margin-bottom: 0.25rem !important; }
 
 
 
 
63
  .gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
64
 
65
  .gradio-container code, .gradio-container pre {
@@ -72,6 +77,7 @@ CUSTOM_CSS = """
72
  font-size: 1rem;
73
  margin-bottom: 1.5rem;
74
  font-weight: 500;
 
75
  }
76
 
77
  .gradio-container .table-wrap table {
@@ -96,6 +102,10 @@ CUSTOM_CSS = """
96
  border-bottom: 1px solid #e5e7eb;
97
  }
98
 
 
 
 
 
99
  .link-row a {
100
  display: inline-block;
101
  padding: 0.25rem 0.75rem;
@@ -172,9 +182,9 @@ def leaderboard_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
172
 
173
  df = pd.DataFrame(rows)
174
 
175
- # Drop efficiency columns if every row is None (keeps the table narrow
176
- # when no submitters have supplied them yet).
177
- for col in ("Avg Cost ($)", "Avg Latency (s)"):
178
  if col in df.columns and df[col].isna().all():
179
  df = df.drop(columns=[col])
180
 
@@ -276,7 +286,8 @@ def submit_predictions(
276
  # ---------------------------------------------------------------------------
277
 
278
  def build_app() -> gr.Blocks:
279
- df = leaderboard_dataframe(load_submissions(RESULTS_DIR))
 
280
 
281
  with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
282
  gr.Markdown(f"# {TITLE}")
@@ -294,18 +305,37 @@ def build_app() -> gr.Blocks:
294
  # -------------------------------------------------------------
295
  with gr.Tab("🏆 Leaderboard"):
296
  gr.Markdown(
297
- "Results on the **DeepSynth test set** (80 tasks, Pass@1), "
298
- "ranked by **F1** score (LLM Judge used as tiebreaker). "
299
  "F1 / Precision / Recall measure prediction quality against gold "
300
  "answers; **LLM Judge** reports average precision under semantic "
301
  "matching. 🔒 = closed model, 🔓 = open-weights.",
302
  elem_classes=["section-header"],
303
  )
304
- gr.Dataframe(
305
- value=df,
306
- interactive=False,
307
- wrap=True,
308
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  # -------------------------------------------------------------
311
  with gr.Tab("📤 Submit"):
 
22
  # ---------------------------------------------------------------------------
23
 
24
  RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
25
+ DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
26
  QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
27
  QUEUE_DIR.mkdir(exist_ok=True, parents=True)
28
 
 
60
  letter-spacing: -0.02em !important;
61
  }
62
 
63
+ .gradio-container h1 {
64
+ font-size: 2.2rem !important;
65
+ margin-bottom: 0.25rem !important;
66
+ text-align: center !important;
67
+ }
68
  .gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
69
 
70
  .gradio-container code, .gradio-container pre {
 
77
  font-size: 1rem;
78
  margin-bottom: 1.5rem;
79
  font-weight: 500;
80
+ text-align: center;
81
  }
82
 
83
  .gradio-container .table-wrap table {
 
102
  border-bottom: 1px solid #e5e7eb;
103
  }
104
 
105
+ .link-row {
106
+ text-align: center;
107
+ margin-bottom: 1.5rem;
108
+ }
109
  .link-row a {
110
  display: inline-block;
111
  padding: 0.25rem 0.75rem;
 
182
 
183
  df = pd.DataFrame(rows)
184
 
185
+ # Drop optional metric columns if every row is None; this keeps the table
186
+ # clean when a split (e.g. dev) only reports a subset of metrics.
187
+ for col in ("Avg Cost ($)", "Avg Latency (s)", "Precision", "Recall", "EM"):
188
  if col in df.columns and df[col].isna().all():
189
  df = df.drop(columns=[col])
190
 
 
286
  # ---------------------------------------------------------------------------
287
 
288
  def build_app() -> gr.Blocks:
289
+ df_test = leaderboard_dataframe(load_submissions(RESULTS_DIR))
290
+ df_dev = leaderboard_dataframe(load_submissions(DEV_RESULTS_DIR))
291
 
292
  with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
293
  gr.Markdown(f"# {TITLE}")
 
305
  # -------------------------------------------------------------
306
  with gr.Tab("🏆 Leaderboard"):
307
  gr.Markdown(
308
+ "Results ranked by **F1** score (LLM Judge used as tiebreaker). "
 
309
  "F1 / Precision / Recall measure prediction quality against gold "
310
  "answers; **LLM Judge** reports average precision under semantic "
311
  "matching. 🔒 = closed model, 🔓 = open-weights.",
312
  elem_classes=["section-header"],
313
  )
314
+
315
+ with gr.Tabs():
316
+ with gr.Tab("Dev (40 tasks · public)"):
317
+ gr.Markdown(
318
+ "Self-reported numbers on the **public dev set** (40 tasks, "
319
+ "Pass@1). Useful for prototyping and comparing methods during "
320
+ "development. Anyone can score themselves locally on this split.",
321
+ )
322
+ gr.Dataframe(
323
+ value=df_dev,
324
+ interactive=False,
325
+ wrap=True,
326
+ )
327
+
328
+ with gr.Tab("Test (80 tasks · held-out)"):
329
+ gr.Markdown(
330
+ "Official numbers on the **held-out test set** (80 tasks, "
331
+ "Pass@1). Gold answers are private; submissions are scored "
332
+ "by the maintainers.",
333
+ )
334
+ gr.Dataframe(
335
+ value=df_test,
336
+ interactive=False,
337
+ wrap=True,
338
+ )
339
 
340
  # -------------------------------------------------------------
341
  with gr.Tab("📤 Submit"):
submissions/2026-02-15-huawei-deepseek-chat.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "DeepSeek-Chat",
4
+ "base_model": "deepseek-chat (2025-08)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "open",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 2.1,
19
+ "llm_judge": 5.0
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-deepseek-reasoner.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "DeepSeek-Reasoner",
4
+ "base_model": "deepseek-r1 (2026-02)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "open",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 5.0,
19
+ "llm_judge": 7.5
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-gemini-pro-2-5 (1).json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "Gemini-Pro-2.5",
4
+ "base_model": "gemini-pro-2.5 (2025-08)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 5.9,
19
+ "llm_judge": 5.0
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-gemini-pro-3.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "Gemini-Pro-3",
4
+ "base_model": "gemini-pro-3 (2026-02)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 8.6,
19
+ "llm_judge": 15.0
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-gpt-4-1 (1).json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "GPT-4.1",
4
+ "base_model": "gpt-4.1 (2025-08)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 1.8,
19
+ "llm_judge": 7.5
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-gpt-5-1 (1).json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "GPT-5.1",
4
+ "base_model": "gpt-5.1 (2025-08)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 6.2,
19
+ "llm_judge": 12.5
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-gpt-5-2.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "GPT-5.2",
4
+ "base_model": "gpt-5.2-pro (2026-02)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 15.6,
19
+ "llm_judge": 5.0
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-o3 (1).json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "o3",
4
+ "base_model": "o3 (2025-08)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 6.3,
19
+ "llm_judge": 10.0
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-o3-deep-research (1).json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "o3-deep-research",
4
+ "base_model": "o3-deep-research (2025-08)",
5
+ "scaffold": "Custom",
6
+ "category": "Agent Framework",
7
+ "access": "closed",
8
+ "tools_used": [
9
+ "web_search",
10
+ "python_interpreter"
11
+ ],
12
+ "organization": "Huawei (paper baseline)",
13
+ "paper_url": "https://arxiv.org/abs/2602.21143",
14
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
15
+ "submission_date": "2026-02-15",
16
+ "split": "dev",
17
+ "num_seeds": 1
18
+ },
19
+ "scores": {
20
+ "overall": {
21
+ "f1": 9.9,
22
+ "llm_judge": 20.0
23
+ }
24
+ }
25
+ }
submissions/2026-02-15-huawei-o4-mini (1).json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "o4-mini",
4
+ "base_model": "o4-mini (2025-08)",
5
+ "scaffold": "none",
6
+ "category": "LLM Baseline",
7
+ "access": "closed",
8
+ "tools_used": [],
9
+ "organization": "Huawei (paper baseline)",
10
+ "paper_url": "https://arxiv.org/abs/2602.21143",
11
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
12
+ "submission_date": "2026-02-15",
13
+ "split": "dev",
14
+ "num_seeds": 1
15
+ },
16
+ "scores": {
17
+ "overall": {
18
+ "f1": 3.3,
19
+ "llm_judge": 2.5
20
+ }
21
+ }
22
+ }
submissions/2026-02-15-huawei-owl-gpt4-1 (1).json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "OWL-GPT4.1",
4
+ "base_model": "gpt-4.1",
5
+ "scaffold": "Custom",
6
+ "category": "Agent Framework",
7
+ "access": "open",
8
+ "tools_used": [
9
+ "web_search",
10
+ "python_interpreter"
11
+ ],
12
+ "organization": "Huawei (paper baseline)",
13
+ "paper_url": "https://arxiv.org/abs/2602.21143",
14
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
15
+ "submission_date": "2026-02-15",
16
+ "split": "dev",
17
+ "num_seeds": 1
18
+ },
19
+ "scores": {
20
+ "overall": {
21
+ "f1": 4.1,
22
+ "llm_judge": 12.5
23
+ }
24
+ }
25
+ }
submissions/2026-02-15-huawei-smolagent-gpt4-1 (1).json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "Smolagent-GPT4.1",
4
+ "base_model": "gpt-4.1",
5
+ "scaffold": "CodeAct",
6
+ "category": "Agent Framework",
7
+ "access": "open",
8
+ "tools_used": [
9
+ "web_search",
10
+ "python_interpreter"
11
+ ],
12
+ "organization": "Huawei (paper baseline)",
13
+ "paper_url": "https://arxiv.org/abs/2602.21143",
14
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
15
+ "submission_date": "2026-02-15",
16
+ "split": "dev",
17
+ "num_seeds": 1
18
+ },
19
+ "scores": {
20
+ "overall": {
21
+ "f1": 6.3,
22
+ "llm_judge": 7.5
23
+ }
24
+ }
25
+ }