debjitpaul committed on
Commit ·
a380ad5
1
Parent(s): 1f21b1a
update app.py and submissions
Browse files- app.py +42 -12
- submissions/2026-02-15-huawei-deepseek-chat.json +22 -0
- submissions/2026-02-15-huawei-deepseek-reasoner.json +22 -0
- submissions/2026-02-15-huawei-gemini-pro-2-5 (1).json +22 -0
- submissions/2026-02-15-huawei-gemini-pro-3.json +22 -0
- submissions/2026-02-15-huawei-gpt-4-1 (1).json +22 -0
- submissions/2026-02-15-huawei-gpt-5-1 (1).json +22 -0
- submissions/2026-02-15-huawei-gpt-5-2.json +22 -0
- submissions/2026-02-15-huawei-o3 (1).json +22 -0
- submissions/2026-02-15-huawei-o3-deep-research (1).json +25 -0
- submissions/2026-02-15-huawei-o4-mini (1).json +22 -0
- submissions/2026-02-15-huawei-owl-gpt4-1 (1).json +25 -0
- submissions/2026-02-15-huawei-smolagent-gpt4-1 (1).json +25 -0
app.py
CHANGED
|
@@ -22,6 +22,7 @@ import pandas as pd
|
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
|
| 24 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
|
|
|
| 25 |
QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
|
| 26 |
QUEUE_DIR.mkdir(exist_ok=True, parents=True)
|
| 27 |
|
|
@@ -59,7 +60,11 @@ CUSTOM_CSS = """
|
|
| 59 |
letter-spacing: -0.02em !important;
|
| 60 |
}
|
| 61 |
|
| 62 |
-
.gradio-container h1 {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
.gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
|
| 64 |
|
| 65 |
.gradio-container code, .gradio-container pre {
|
|
@@ -72,6 +77,7 @@ CUSTOM_CSS = """
|
|
| 72 |
font-size: 1rem;
|
| 73 |
margin-bottom: 1.5rem;
|
| 74 |
font-weight: 500;
|
|
|
|
| 75 |
}
|
| 76 |
|
| 77 |
.gradio-container .table-wrap table {
|
|
@@ -96,6 +102,10 @@ CUSTOM_CSS = """
|
|
| 96 |
border-bottom: 1px solid #e5e7eb;
|
| 97 |
}
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
.link-row a {
|
| 100 |
display: inline-block;
|
| 101 |
padding: 0.25rem 0.75rem;
|
|
@@ -172,9 +182,9 @@ def leaderboard_dataframe(submissions: list[dict[str, Any]]) -> pd.DataFrame:
|
|
| 172 |
|
| 173 |
df = pd.DataFrame(rows)
|
| 174 |
|
| 175 |
-
# Drop
|
| 176 |
-
# when
|
| 177 |
-
for col in ("Avg Cost ($)", "Avg Latency (s)"):
|
| 178 |
if col in df.columns and df[col].isna().all():
|
| 179 |
df = df.drop(columns=[col])
|
| 180 |
|
|
@@ -276,7 +286,8 @@ def submit_predictions(
|
|
| 276 |
# ---------------------------------------------------------------------------
|
| 277 |
|
| 278 |
def build_app() -> gr.Blocks:
|
| 279 |
-
|
|
|
|
| 280 |
|
| 281 |
with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
|
| 282 |
gr.Markdown(f"# {TITLE}")
|
|
@@ -294,18 +305,37 @@ def build_app() -> gr.Blocks:
|
|
| 294 |
# -------------------------------------------------------------
|
| 295 |
with gr.Tab("🏆 Leaderboard"):
|
| 296 |
gr.Markdown(
|
| 297 |
-
"Results
|
| 298 |
-
"ranked by **F1** score (LLM Judge used as tiebreaker). "
|
| 299 |
"F1 / Precision / Recall measure prediction quality against gold "
|
| 300 |
"answers; **LLM Judge** reports average precision under semantic "
|
| 301 |
"matching. 🔒 = closed model, 🔓 = open-weights.",
|
| 302 |
elem_classes=["section-header"],
|
| 303 |
)
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
# -------------------------------------------------------------
|
| 311 |
with gr.Tab("📤 Submit"):
|
|
|
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
|
| 24 |
RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_RESULTS_DIR", "submissions"))
|
| 25 |
+
DEV_RESULTS_DIR = Path(os.environ.get("DEEPSYNTH_DEV_RESULTS_DIR", "dev_submissions"))
|
| 26 |
QUEUE_DIR = Path(os.environ.get("DEEPSYNTH_QUEUE_DIR", "submissions_queue"))
|
| 27 |
QUEUE_DIR.mkdir(exist_ok=True, parents=True)
|
| 28 |
|
|
|
|
| 60 |
letter-spacing: -0.02em !important;
|
| 61 |
}
|
| 62 |
|
| 63 |
+
.gradio-container h1 {
|
| 64 |
+
font-size: 2.2rem !important;
|
| 65 |
+
margin-bottom: 0.25rem !important;
|
| 66 |
+
text-align: center !important;
|
| 67 |
+
}
|
| 68 |
.gradio-container h2 { font-size: 1.4rem !important; margin-top: 1.5rem !important; }
|
| 69 |
|
| 70 |
.gradio-container code, .gradio-container pre {
|
|
|
|
| 77 |
font-size: 1rem;
|
| 78 |
margin-bottom: 1.5rem;
|
| 79 |
font-weight: 500;
|
| 80 |
+
text-align: center;
|
| 81 |
}
|
| 82 |
|
| 83 |
.gradio-container .table-wrap table {
|
|
|
|
| 102 |
border-bottom: 1px solid #e5e7eb;
|
| 103 |
}
|
| 104 |
|
| 105 |
+
.link-row {
|
| 106 |
+
text-align: center;
|
| 107 |
+
margin-bottom: 1.5rem;
|
| 108 |
+
}
|
| 109 |
.link-row a {
|
| 110 |
display: inline-block;
|
| 111 |
padding: 0.25rem 0.75rem;
|
|
|
|
| 182 |
|
| 183 |
df = pd.DataFrame(rows)
|
| 184 |
|
| 185 |
+
# Drop optional metric columns if every row is None — keeps the table
|
| 186 |
+
# clean when a split (e.g. dev) only reports a subset of metrics.
|
| 187 |
+
for col in ("Avg Cost ($)", "Avg Latency (s)", "Precision", "Recall", "EM"):
|
| 188 |
if col in df.columns and df[col].isna().all():
|
| 189 |
df = df.drop(columns=[col])
|
| 190 |
|
|
|
|
| 286 |
# ---------------------------------------------------------------------------
|
| 287 |
|
| 288 |
def build_app() -> gr.Blocks:
|
| 289 |
+
df_test = leaderboard_dataframe(load_submissions(RESULTS_DIR))
|
| 290 |
+
df_dev = leaderboard_dataframe(load_submissions(DEV_RESULTS_DIR))
|
| 291 |
|
| 292 |
with gr.Blocks(title="DeepSynth Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Default()) as app:
|
| 293 |
gr.Markdown(f"# {TITLE}")
|
|
|
|
| 305 |
# -------------------------------------------------------------
|
| 306 |
with gr.Tab("🏆 Leaderboard"):
|
| 307 |
gr.Markdown(
|
| 308 |
+
"Results ranked by **F1** score (LLM Judge used as tiebreaker). "
|
|
|
|
| 309 |
"F1 / Precision / Recall measure prediction quality against gold "
|
| 310 |
"answers; **LLM Judge** reports average precision under semantic "
|
| 311 |
"matching. 🔒 = closed model, 🔓 = open-weights.",
|
| 312 |
elem_classes=["section-header"],
|
| 313 |
)
|
| 314 |
+
|
| 315 |
+
with gr.Tabs():
|
| 316 |
+
with gr.Tab("Dev (40 tasks · public)"):
|
| 317 |
+
gr.Markdown(
|
| 318 |
+
"Self-reported numbers on the **public dev set** (40 tasks, "
|
| 319 |
+
"Pass@1). Useful for prototyping and comparing methods during "
|
| 320 |
+
"development. Anyone can score themselves locally on this split.",
|
| 321 |
+
)
|
| 322 |
+
gr.Dataframe(
|
| 323 |
+
value=df_dev,
|
| 324 |
+
interactive=False,
|
| 325 |
+
wrap=True,
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
with gr.Tab("Test (80 tasks · held-out)"):
|
| 329 |
+
gr.Markdown(
|
| 330 |
+
"Official numbers on the **held-out test set** (80 tasks, "
|
| 331 |
+
"Pass@1). Gold answers are private; submissions are scored "
|
| 332 |
+
"by the maintainers.",
|
| 333 |
+
)
|
| 334 |
+
gr.Dataframe(
|
| 335 |
+
value=df_test,
|
| 336 |
+
interactive=False,
|
| 337 |
+
wrap=True,
|
| 338 |
+
)
|
| 339 |
|
| 340 |
# -------------------------------------------------------------
|
| 341 |
with gr.Tab("📤 Submit"):
|
submissions/2026-02-15-huawei-deepseek-chat.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "DeepSeek-Chat",
|
| 4 |
+
"base_model": "deepseek-chat (2025-08)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "open",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 2.1,
|
| 19 |
+
"llm_judge": 5.0
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-deepseek-reasoner.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "DeepSeek-Reasoner",
|
| 4 |
+
"base_model": "deepseek-r1 (2026-02)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "open",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 5.0,
|
| 19 |
+
"llm_judge": 7.5
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-gemini-pro-2-5 (1).json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "Gemini-Pro-2.5",
|
| 4 |
+
"base_model": "gemini-pro-2.5 (2025-08)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 5.9,
|
| 19 |
+
"llm_judge": 5.0
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-gemini-pro-3.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "Gemini-Pro-3",
|
| 4 |
+
"base_model": "gemini-pro-3 (2026-02)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 8.6,
|
| 19 |
+
"llm_judge": 15.0
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-gpt-4-1 (1).json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "GPT-4.1",
|
| 4 |
+
"base_model": "gpt-4.1 (2025-08)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 1.8,
|
| 19 |
+
"llm_judge": 7.5
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-gpt-5-1 (1).json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "GPT-5.1",
|
| 4 |
+
"base_model": "gpt-5.1 (2025-08)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 6.2,
|
| 19 |
+
"llm_judge": 12.5
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-gpt-5-2.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "GPT-5.2",
|
| 4 |
+
"base_model": "gpt-5.2-pro (2026-02)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 15.6,
|
| 19 |
+
"llm_judge": 5.0
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-o3 (1).json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "o3",
|
| 4 |
+
"base_model": "o3 (2025-08)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 6.3,
|
| 19 |
+
"llm_judge": 10.0
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-o3-deep-research (1).json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "o3-deep-research",
|
| 4 |
+
"base_model": "o3-deep-research (2025-08)",
|
| 5 |
+
"scaffold": "Custom",
|
| 6 |
+
"category": "Agent Framework",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [
|
| 9 |
+
"web_search",
|
| 10 |
+
"python_interpreter"
|
| 11 |
+
],
|
| 12 |
+
"organization": "Huawei (paper baseline)",
|
| 13 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 14 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 15 |
+
"submission_date": "2026-02-15",
|
| 16 |
+
"split": "dev",
|
| 17 |
+
"num_seeds": 1
|
| 18 |
+
},
|
| 19 |
+
"scores": {
|
| 20 |
+
"overall": {
|
| 21 |
+
"f1": 9.9,
|
| 22 |
+
"llm_judge": 20.0
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
}
|
submissions/2026-02-15-huawei-o4-mini (1).json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "o4-mini",
|
| 4 |
+
"base_model": "o4-mini (2025-08)",
|
| 5 |
+
"scaffold": "none",
|
| 6 |
+
"category": "LLM Baseline",
|
| 7 |
+
"access": "closed",
|
| 8 |
+
"tools_used": [],
|
| 9 |
+
"organization": "Huawei (paper baseline)",
|
| 10 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 11 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 12 |
+
"submission_date": "2026-02-15",
|
| 13 |
+
"split": "dev",
|
| 14 |
+
"num_seeds": 1
|
| 15 |
+
},
|
| 16 |
+
"scores": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"f1": 3.3,
|
| 19 |
+
"llm_judge": 2.5
|
| 20 |
+
}
|
| 21 |
+
}
|
| 22 |
+
}
|
submissions/2026-02-15-huawei-owl-gpt4-1 (1).json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "OWL-GPT4.1",
|
| 4 |
+
"base_model": "gpt-4.1",
|
| 5 |
+
"scaffold": "Custom",
|
| 6 |
+
"category": "Agent Framework",
|
| 7 |
+
"access": "open",
|
| 8 |
+
"tools_used": [
|
| 9 |
+
"web_search",
|
| 10 |
+
"python_interpreter"
|
| 11 |
+
],
|
| 12 |
+
"organization": "Huawei (paper baseline)",
|
| 13 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 14 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 15 |
+
"submission_date": "2026-02-15",
|
| 16 |
+
"split": "dev",
|
| 17 |
+
"num_seeds": 1
|
| 18 |
+
},
|
| 19 |
+
"scores": {
|
| 20 |
+
"overall": {
|
| 21 |
+
"f1": 4.1,
|
| 22 |
+
"llm_judge": 12.5
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
}
|
submissions/2026-02-15-huawei-smolagent-gpt4-1 (1).json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "Smolagent-GPT4.1",
|
| 4 |
+
"base_model": "gpt-4.1",
|
| 5 |
+
"scaffold": "CodeAct",
|
| 6 |
+
"category": "Agent Framework",
|
| 7 |
+
"access": "open",
|
| 8 |
+
"tools_used": [
|
| 9 |
+
"web_search",
|
| 10 |
+
"python_interpreter"
|
| 11 |
+
],
|
| 12 |
+
"organization": "Huawei (paper baseline)",
|
| 13 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 14 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 15 |
+
"submission_date": "2026-02-15",
|
| 16 |
+
"split": "dev",
|
| 17 |
+
"num_seeds": 1
|
| 18 |
+
},
|
| 19 |
+
"scores": {
|
| 20 |
+
"overall": {
|
| 21 |
+
"f1": 6.3,
|
| 22 |
+
"llm_judge": 7.5
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
}
|