openhands committed on
Commit
a04df0a
·
1 Parent(s): 742f074

Add validation.jsonl data and validation split to agenteval.json

Browse files
mock_results/1.0.0-dev1/agenteval.json CHANGED
@@ -49,6 +49,53 @@
49
  ]
50
  }
51
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
  ]
54
  }
 
49
  ]
50
  }
51
  ]
52
+ },
53
+ {
54
+ "name": "validation",
55
+ "tasks": [
56
+ {
57
+ "name": "swe-bench",
58
+ "tags": [
59
+ "Overall",
60
+ "swe-bench"
61
+ ]
62
+ },
63
+ {
64
+ "name": "multi-swe-bench",
65
+ "tags": [
66
+ "Overall",
67
+ "multi-swe-bench"
68
+ ]
69
+ },
70
+ {
71
+ "name": "swe-bench-multimodal",
72
+ "tags": [
73
+ "Overall",
74
+ "swe-bench-multimodal"
75
+ ]
76
+ },
77
+ {
78
+ "name": "swt-bench",
79
+ "tags": [
80
+ "Overall",
81
+ "swt-bench"
82
+ ]
83
+ },
84
+ {
85
+ "name": "commit0",
86
+ "tags": [
87
+ "Overall",
88
+ "commit0"
89
+ ]
90
+ },
91
+ {
92
+ "name": "gaia",
93
+ "tags": [
94
+ "Overall",
95
+ "gaia"
96
+ ]
97
+ }
98
+ ]
99
  }
100
  ]
101
  }
mock_results/1.0.0-dev1/validation.jsonl ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
6
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
7
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
8
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
9
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
10
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
11
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
12
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
13
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
14
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
15
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
16
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
17
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
18
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
19
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
20
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
21
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
22
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
23
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
24
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
25
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
26
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
27
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
28
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
29
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
30
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}