openhands committed on
Commit
b978a6b
·
1 Parent(s): 974f31f

Remove multi-swe-bench from OpenHands Index

Browse files

- Remove multi-swe-bench from benchmark mappings in simple_data_loader.py
- Remove multi-swe-bench from generate_mock_jsonl.py
- Update FRONTEND_DEVELOPMENT_DESCRIPTION to reference SWE-bench Multimodal (not Multi-SWE-bench)
- Remove multi-swe-bench from about.py benchmark lists and acknowledgements
- Update DATA_STRUCTURE.md to remove multi-swe-bench references
- Keep 5 categories (Frontend uses swe-bench-multimodal, not multi-swe-bench)

DATA_STRUCTURE.md CHANGED
@@ -12,7 +12,6 @@ openhands-index-results/
12
  │ ├── test.jsonl # Test split results
13
  │ ├── validation.jsonl # Validation split results
14
  │ ├── swe-bench.jsonl # Individual benchmark results
15
- │ ├── multi-swe-bench.jsonl
16
  │ ├── swe-bench-multimodal.jsonl
17
  │ ├── swt-bench.jsonl
18
  │ ├── commit0.jsonl
@@ -58,10 +57,6 @@ The configuration file defines the benchmark structure:
58
  "name": "swe-bench",
59
  "tags": ["swe-bench"]
60
  },
61
- {
62
- "name": "multi-swe-bench",
63
- "tags": ["multi-swe-bench"]
64
- },
65
  {
66
  "name": "swe-bench-multimodal",
67
  "tags": ["swe-bench-multimodal"]
@@ -87,10 +82,6 @@ The configuration file defines the benchmark structure:
87
  "name": "swe-bench",
88
  "tags": ["swe-bench"]
89
  },
90
- {
91
- "name": "multi-swe-bench",
92
- "tags": ["multi-swe-bench"]
93
- },
94
  {
95
  "name": "swe-bench-multimodal",
96
  "tags": ["swe-bench-multimodal"]
 
12
  │ ├── test.jsonl # Test split results
13
  │ ├── validation.jsonl # Validation split results
14
  │ ├── swe-bench.jsonl # Individual benchmark results

15
  │ ├── swe-bench-multimodal.jsonl
16
  │ ├── swt-bench.jsonl
17
  │ ├── commit0.jsonl
 
57
  "name": "swe-bench",
58
  "tags": ["swe-bench"]
59
  },
 
 
 
 
60
  {
61
  "name": "swe-bench-multimodal",
62
  "tags": ["swe-bench-multimodal"]
 
82
  "name": "swe-bench",
83
  "tags": ["swe-bench"]
84
  },
 
 
 
 
85
  {
86
  "name": "swe-bench-multimodal",
87
  "tags": ["swe-bench-multimodal"]
about.py CHANGED
@@ -34,11 +34,10 @@ def build_page():
34
  """
35
  <h2>What Does OpenHands Index Include?</h2>
36
  <p>
37
- OpenHands Index aggregates results from 6 key benchmarks for evaluating AI coding agents:
38
  </p>
39
  <ul class="info-list">
40
  <li><strong>SWE-bench</strong>: Repository-level bug fixing from real GitHub issues</li>
41
- <li><strong>Multi-SWE-bench</strong>: Multi-repository software engineering tasks</li>
42
  <li><strong>SWE-bench Multimodal</strong>: Bug fixing with visual context</li>
43
  <li><strong>SWT-bench</strong>: Web development and testing tasks</li>
44
  <li><strong>Commit0</strong>: Commit message generation and code understanding</li>
@@ -116,7 +115,7 @@ def build_page():
116
  To submit your agent's evaluation results to the OpenHands Index:
117
  </p>
118
  <ol class="info-list">
119
- <li>Run your agent on the supported benchmarks (SWE-bench, Multi-SWE-bench, SWE-bench Multimodal, SWT-bench, Commit0, GAIA)</li>
120
  <li>Format your results according to the data structure documented in the repository</li>
121
  <li>Submit a pull request to <a href="https://github.com/OpenHands/openhands-index-results" target="_blank" class="primary-link-button">github.com/OpenHands/openhands-index-results</a></li>
122
  <li>Your submission should include:
@@ -164,7 +163,6 @@ def build_page():
164
  </p>
165
  <ul class="info-list">
166
  <li><a href="https://www.swebench.com/" target="_blank">SWE-bench</a> - Princeton NLP Group</li>
167
- <li><a href="https://github.com/multi-swe-bench/multi-swe-bench" target="_blank">Multi-SWE-bench</a> - Multi-SWE-bench Team</li>
168
  <li><a href="https://github.com/OpenHands/SWE-bench-multimodal" target="_blank">SWE-bench Multimodal</a> - OpenHands Team</li>
169
  <li><a href="https://github.com/logic-star-ai/swt-bench" target="_blank">SWT-bench</a> - Logic Star AI</li>
170
  <li><a href="https://github.com/commit-0/commit0" target="_blank">Commit0</a> - Commit0 Team</li>
 
34
  """
35
  <h2>What Does OpenHands Index Include?</h2>
36
  <p>
37
+ OpenHands Index aggregates results from 5 key benchmarks for evaluating AI coding agents:
38
  </p>
39
  <ul class="info-list">
40
  <li><strong>SWE-bench</strong>: Repository-level bug fixing from real GitHub issues</li>
 
41
  <li><strong>SWE-bench Multimodal</strong>: Bug fixing with visual context</li>
42
  <li><strong>SWT-bench</strong>: Web development and testing tasks</li>
43
  <li><strong>Commit0</strong>: Commit message generation and code understanding</li>
 
115
  To submit your agent's evaluation results to the OpenHands Index:
116
  </p>
117
  <ol class="info-list">
118
+ <li>Run your agent on the supported benchmarks (SWE-bench, SWE-bench Multimodal, SWT-bench, Commit0, GAIA)</li>
119
  <li>Format your results according to the data structure documented in the repository</li>
120
  <li>Submit a pull request to <a href="https://github.com/OpenHands/openhands-index-results" target="_blank" class="primary-link-button">github.com/OpenHands/openhands-index-results</a></li>
121
  <li>Your submission should include:
 
163
  </p>
164
  <ul class="info-list">
165
  <li><a href="https://www.swebench.com/" target="_blank">SWE-bench</a> - Princeton NLP Group</li>
 
166
  <li><a href="https://github.com/OpenHands/SWE-bench-multimodal" target="_blank">SWE-bench Multimodal</a> - OpenHands Team</li>
167
  <li><a href="https://github.com/logic-star-ai/swt-bench" target="_blank">SWT-bench</a> - Logic Star AI</li>
168
  <li><a href="https://github.com/commit-0/commit0" target="_blank">Commit0</a> - Commit0 Team</li>
content.py CHANGED
@@ -57,7 +57,7 @@ For detailed results, use the links above to explore individual benchmark pages.
57
  FRONTEND_DEVELOPMENT_DESCRIPTION = """
58
  The **Frontend** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
59
  <br><br>
60
- This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
61
  <br>
62
  """
63
  TEST_GENERATION_DESCRIPTION = """
 
57
  FRONTEND_DEVELOPMENT_DESCRIPTION = """
58
  The **Frontend** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
59
  <br><br>
60
+ This category includes SWE-bench Multimodal, which challenges agents to solve GitHub issues that include visual context like screenshots and diagrams.
61
  <br>
62
  """
63
  TEST_GENERATION_DESCRIPTION = """
generate_mock_jsonl.py CHANGED
@@ -4,18 +4,13 @@ import os
4
  from pathlib import Path
5
  from datetime import datetime
6
 
7
- # Define the 6 benchmarks
8
  BENCHMARKS = {
9
  "swe-bench": {
10
  "tags": ["swe-bench"],
11
  "metric": "resolve_rate",
12
  "metric_display": "Resolve Rate (%)"
13
  },
14
- "multi-swe-bench": {
15
- "tags": ["multi-swe-bench"],
16
- "metric": "resolve_rate",
17
- "metric_display": "Resolve Rate (%)"
18
- },
19
  "swe-bench-multimodal": {
20
  "tags": ["swe-bench-multimodal"],
21
  "metric": "resolve_rate",
@@ -46,7 +41,6 @@ MOCK_AGENTS = [
46
  "openness": "closed",
47
  "scores": {
48
  "swe-bench": 48.3,
49
- "multi-swe-bench": 35.2,
50
  "swe-bench-multimodal": 42.1,
51
  "swt-bench": 65.4,
52
  "commit0": 71.2,
@@ -59,7 +53,6 @@ MOCK_AGENTS = [
59
  "openness": "closed",
60
  "scores": {
61
  "swe-bench": 45.1,
62
- "multi-swe-bench": 32.8,
63
  "swe-bench-multimodal": 39.5,
64
  "swt-bench": 62.3,
65
  "commit0": 68.9,
@@ -72,7 +65,6 @@ MOCK_AGENTS = [
72
  "openness": "closed",
73
  "scores": {
74
  "swe-bench": 38.7,
75
- "multi-swe-bench": 28.4,
76
  "swe-bench-multimodal": 34.2,
77
  "swt-bench": 54.1,
78
  "commit0": 61.5,
@@ -85,7 +77,6 @@ MOCK_AGENTS = [
85
  "openness": "closed",
86
  "scores": {
87
  "swe-bench": 32.5,
88
- "multi-swe-bench": 24.1,
89
  "swe-bench-multimodal": 28.9,
90
  "swt-bench": 47.8,
91
  "commit0": 55.3,
@@ -98,7 +89,6 @@ MOCK_AGENTS = [
98
  "openness": "closed",
99
  "scores": {
100
  "swe-bench": 29.8,
101
- "multi-swe-bench": 21.5,
102
  "swe-bench-multimodal": 25.7,
103
  "swt-bench": 44.2,
104
  "commit0": 52.1,
 
4
  from pathlib import Path
5
  from datetime import datetime
6
 
7
+ # Define the 5 benchmarks
8
  BENCHMARKS = {
9
  "swe-bench": {
10
  "tags": ["swe-bench"],
11
  "metric": "resolve_rate",
12
  "metric_display": "Resolve Rate (%)"
13
  },
 
 
 
 
 
14
  "swe-bench-multimodal": {
15
  "tags": ["swe-bench-multimodal"],
16
  "metric": "resolve_rate",
 
41
  "openness": "closed",
42
  "scores": {
43
  "swe-bench": 48.3,
 
44
  "swe-bench-multimodal": 42.1,
45
  "swt-bench": 65.4,
46
  "commit0": 71.2,
 
53
  "openness": "closed",
54
  "scores": {
55
  "swe-bench": 45.1,
 
56
  "swe-bench-multimodal": 39.5,
57
  "swt-bench": 62.3,
58
  "commit0": 68.9,
 
65
  "openness": "closed",
66
  "scores": {
67
  "swe-bench": 38.7,
 
68
  "swe-bench-multimodal": 34.2,
69
  "swt-bench": 54.1,
70
  "commit0": 61.5,
 
77
  "openness": "closed",
78
  "scores": {
79
  "swe-bench": 32.5,
 
80
  "swe-bench-multimodal": 28.9,
81
  "swt-bench": 47.8,
82
  "commit0": 55.3,
 
89
  "openness": "closed",
90
  "scores": {
91
  "swe-bench": 29.8,
 
92
  "swe-bench-multimodal": 25.7,
93
  "swt-bench": 44.2,
94
  "commit0": 52.1,
simple_data_loader.py CHANGED
@@ -147,7 +147,6 @@ class SimpleLeaderboardViewer:
147
  'swe-bench': ['Issue Resolution'],
148
  'swe-bench-multimodal': ['Frontend'],
149
  'commit0': ['Greenfield'],
150
- 'multi-swe-bench': ['Issue Resolution'],
151
  'swt-bench': ['Testing'],
152
  'gaia': ['Information Gathering'],
153
  }
 
147
  'swe-bench': ['Issue Resolution'],
148
  'swe-bench-multimodal': ['Frontend'],
149
  'commit0': ['Greenfield'],
 
150
  'swt-bench': ['Testing'],
151
  'gaia': ['Information Gathering'],
152
  }