debjitpaul committed on
Commit
0b6132e
·
1 Parent(s): 634f1e8

Add paper baselines to leaderboard

Browse files
submissions/2026-04-10-anthropic-claude-opus-react.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "agent_name": "ReAct-Claude-Opus",
4
+ "base_model": "claude-opus-4-6",
5
+ "scaffold": "ReAct",
6
+ "tools_used": ["web_search", "python_interpreter"],
7
+ "organization": "DeepSynth authors (paper baseline)",
8
+ "paper_url": "https://arxiv.org/abs/2602.21143",
9
+ "code_url": "https://github.com/agentdeepsynthesis/deepsynth-bench",
10
+ "submission_date": "2026-04-10",
11
+ "split": "test",
12
+ "num_seeds": 3
13
+ },
14
+ "scores": {
15
+ "overall": {"exact_match": 0.22, "f1": 0.38, "llm_judge": 0.47},
16
+ "per_domain": {
17
+ "science": {"f1": null},
18
+ "geography": {"f1": null},
19
+ "economics": {"f1": null},
20
+ "history": {"f1": null},
21
+ "culture": {"f1": null},
22
+ "politics": {"f1": null},
23
+ "technology": {"f1": null}
24
+ }
25
+ },
26
+ "efficiency": {
27
+ "avg_cost_usd": 0.44,
28
+ "avg_latency_s": 15.2,
29
+ "avg_num_tool_calls": 5.8
30
+ }
31
+ }