debjitpaul committed on
Commit ·
0b6132e
1
Parent(s): 634f1e8
Add paper baselines to leaderboard
Browse files
submissions/2026-04-10-anthropic-claude-opus-react.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"agent_name": "ReAct-Claude-Opus",
|
| 4 |
+
"base_model": "claude-opus-4-6",
|
| 5 |
+
"scaffold": "ReAct",
|
| 6 |
+
"tools_used": ["web_search", "python_interpreter"],
|
| 7 |
+
"organization": "DeepSynth authors (paper baseline)",
|
| 8 |
+
"paper_url": "https://arxiv.org/abs/2602.21143",
|
| 9 |
+
"code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
|
| 10 |
+
"submission_date": "2026-04-10",
|
| 11 |
+
"split": "test",
|
| 12 |
+
"num_seeds": 3
|
| 13 |
+
},
|
| 14 |
+
"scores": {
|
| 15 |
+
"overall": {"exact_match": 0.22, "f1": 0.38, "llm_judge": 0.47},
|
| 16 |
+
"per_domain": {
|
| 17 |
+
"science": {"f1": null},
|
| 18 |
+
"geography": {"f1": null},
|
| 19 |
+
"economics": {"f1": null},
|
| 20 |
+
"history": {"f1": null},
|
| 21 |
+
"culture": {"f1": null},
|
| 22 |
+
"politics": {"f1": null},
|
| 23 |
+
"technology": {"f1": null}
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
"efficiency": {
|
| 27 |
+
"avg_cost_usd": 0.44,
|
| 28 |
+
"avg_latency_s": 15.2,
|
| 29 |
+
"avg_num_tool_calls": 5.8
|
| 30 |
+
}
|
| 31 |
+
}
|