openhands committed on
Commit
ab22529
·
1 Parent(s): cfd4f2a

Fix category names and benchmark assignments

Browse files

- Use preferred category names: Issue Resolution, Frontend, Greenfield, Testing, Information Gathering
- Move swe-bench-multimodal to Frontend category (was incorrectly in Issue Resolution)
- Remove multi-swe-bench (not used)
- Update about page to reflect correct benchmark-category mapping

about.py CHANGED
@@ -20,9 +20,9 @@ def build_page():
20
  <h2>Benchmarks</h2>
21
  <p>We evaluate agents across five categories:</p>
22
  <ul class="info-list">
23
- <li><strong>Issue Resolution:</strong> <a href="https://www.swebench.com/" target="_blank">SWE-bench</a>, <a href="https://github.com/OpenHands/SWE-bench-multimodal" target="_blank">SWE-bench Multimodal</a></li>
 
24
  <li><strong>Greenfield:</strong> <a href="https://github.com/commit-0/commit0" target="_blank">Commit0</a></li>
25
- <li><strong>Frontend:</strong> <a href="https://github.com/pwnslinger/multi-swe-bench" target="_blank">Multi-SWE-bench</a></li>
26
  <li><strong>Testing:</strong> <a href="https://github.com/logic-star-ai/swt-bench" target="_blank">SWT-bench</a></li>
27
  <li><strong>Information Gathering:</strong> <a href="https://huggingface.co/gaia-benchmark" target="_blank">GAIA</a></li>
28
  </ul>
 
20
  <h2>Benchmarks</h2>
21
  <p>We evaluate agents across five categories:</p>
22
  <ul class="info-list">
23
+ <li><strong>Issue Resolution:</strong> <a href="https://www.swebench.com/" target="_blank">SWE-bench</a></li>
24
+ <li><strong>Frontend:</strong> <a href="https://github.com/OpenHands/SWE-bench-multimodal" target="_blank">SWE-bench Multimodal</a></li>
25
  <li><strong>Greenfield:</strong> <a href="https://github.com/commit-0/commit0" target="_blank">Commit0</a></li>
 
26
  <li><strong>Testing:</strong> <a href="https://github.com/logic-star-ai/swt-bench" target="_blank">SWT-bench</a></li>
27
  <li><strong>Information Gathering:</strong> <a href="https://huggingface.co/gaia-benchmark" target="_blank">GAIA</a></li>
28
  </ul>
data/1.0.0-dev1/agenteval.json CHANGED
@@ -10,7 +10,7 @@
10
  "name": "swe-bench",
11
  "tags": [
12
  "Overall",
13
- "Bug Fixing",
14
  "swe-bench"
15
  ]
16
  },
@@ -18,7 +18,7 @@
18
  "name": "swe-bench-multimodal",
19
  "tags": [
20
  "Overall",
21
- "Bug Fixing",
22
  "swe-bench-multimodal"
23
  ]
24
  },
@@ -26,23 +26,15 @@
26
  "name": "commit0",
27
  "tags": [
28
  "Overall",
29
- "App Creation",
30
  "commit0"
31
  ]
32
  },
33
- {
34
- "name": "multi-swe-bench",
35
- "tags": [
36
- "Overall",
37
- "Frontend Development",
38
- "multi-swe-bench"
39
- ]
40
- },
41
  {
42
  "name": "swt-bench",
43
  "tags": [
44
  "Overall",
45
- "Test Generation",
46
  "swt-bench"
47
  ]
48
  },
@@ -63,7 +55,7 @@
63
  "name": "swe-bench",
64
  "tags": [
65
  "Overall",
66
- "Bug Fixing",
67
  "swe-bench"
68
  ]
69
  },
@@ -71,7 +63,7 @@
71
  "name": "swe-bench-multimodal",
72
  "tags": [
73
  "Overall",
74
- "Bug Fixing",
75
  "swe-bench-multimodal"
76
  ]
77
  },
@@ -79,23 +71,15 @@
79
  "name": "commit0",
80
  "tags": [
81
  "Overall",
82
- "App Creation",
83
  "commit0"
84
  ]
85
  },
86
- {
87
- "name": "multi-swe-bench",
88
- "tags": [
89
- "Overall",
90
- "Frontend Development",
91
- "multi-swe-bench"
92
- ]
93
- },
94
  {
95
  "name": "swt-bench",
96
  "tags": [
97
  "Overall",
98
- "Test Generation",
99
  "swt-bench"
100
  ]
101
  },
@@ -111,4 +95,4 @@
111
  }
112
  ]
113
  }
114
- }
 
10
  "name": "swe-bench",
11
  "tags": [
12
  "Overall",
13
+ "Issue Resolution",
14
  "swe-bench"
15
  ]
16
  },
 
18
  "name": "swe-bench-multimodal",
19
  "tags": [
20
  "Overall",
21
+ "Frontend",
22
  "swe-bench-multimodal"
23
  ]
24
  },
 
26
  "name": "commit0",
27
  "tags": [
28
  "Overall",
29
+ "Greenfield",
30
  "commit0"
31
  ]
32
  },
 
 
 
 
 
 
 
 
33
  {
34
  "name": "swt-bench",
35
  "tags": [
36
  "Overall",
37
+ "Testing",
38
  "swt-bench"
39
  ]
40
  },
 
55
  "name": "swe-bench",
56
  "tags": [
57
  "Overall",
58
+ "Issue Resolution",
59
  "swe-bench"
60
  ]
61
  },
 
63
  "name": "swe-bench-multimodal",
64
  "tags": [
65
  "Overall",
66
+ "Frontend",
67
  "swe-bench-multimodal"
68
  ]
69
  },
 
71
  "name": "commit0",
72
  "tags": [
73
  "Overall",
74
+ "Greenfield",
75
  "commit0"
76
  ]
77
  },
 
 
 
 
 
 
 
 
78
  {
79
  "name": "swt-bench",
80
  "tags": [
81
  "Overall",
82
+ "Testing",
83
  "swt-bench"
84
  ]
85
  },
 
95
  }
96
  ]
97
  }
98
+ }
data/1.0.0-dev1/multi-swe-bench.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
2
- {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
3
- {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
4
- {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
5
- {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}