openhands committed on
Commit
24ccc42
·
1 Parent(s): ab22529

Remove agenteval.json, use hardcoded mappings as single source of truth

Browse files

Simplify the data loader by removing the agenteval.json config file and
its loading logic. The benchmark-to-category mappings are now defined
directly in the code, eliminating redundancy and potential inconsistencies.

data/1.0.0-dev1/agenteval.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "suite_config": {
3
- "name": "openhands-index",
4
- "version": "1.0.0-dev1",
5
- "splits": [
6
- {
7
- "name": "test",
8
- "tasks": [
9
- {
10
- "name": "swe-bench",
11
- "tags": [
12
- "Overall",
13
- "Issue Resolution",
14
- "swe-bench"
15
- ]
16
- },
17
- {
18
- "name": "swe-bench-multimodal",
19
- "tags": [
20
- "Overall",
21
- "Frontend",
22
- "swe-bench-multimodal"
23
- ]
24
- },
25
- {
26
- "name": "commit0",
27
- "tags": [
28
- "Overall",
29
- "Greenfield",
30
- "commit0"
31
- ]
32
- },
33
- {
34
- "name": "swt-bench",
35
- "tags": [
36
- "Overall",
37
- "Testing",
38
- "swt-bench"
39
- ]
40
- },
41
- {
42
- "name": "gaia",
43
- "tags": [
44
- "Overall",
45
- "Information Gathering",
46
- "gaia"
47
- ]
48
- }
49
- ]
50
- },
51
- {
52
- "name": "validation",
53
- "tasks": [
54
- {
55
- "name": "swe-bench",
56
- "tags": [
57
- "Overall",
58
- "Issue Resolution",
59
- "swe-bench"
60
- ]
61
- },
62
- {
63
- "name": "swe-bench-multimodal",
64
- "tags": [
65
- "Overall",
66
- "Frontend",
67
- "swe-bench-multimodal"
68
- ]
69
- },
70
- {
71
- "name": "commit0",
72
- "tags": [
73
- "Overall",
74
- "Greenfield",
75
- "commit0"
76
- ]
77
- },
78
- {
79
- "name": "swt-bench",
80
- "tags": [
81
- "Overall",
82
- "Testing",
83
- "swt-bench"
84
- ]
85
- },
86
- {
87
- "name": "gaia",
88
- "tags": [
89
- "Overall",
90
- "Information Gathering",
91
- "gaia"
92
- ]
93
- }
94
- ]
95
- }
96
- ]
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
simple_data_loader.py CHANGED
@@ -109,59 +109,23 @@ class SimpleLeaderboardViewer:
109
  self.split = split
110
  self.config_path = self.data_dir / config
111
 
112
- # Load suite configuration
113
- config_file = self.config_path / "agenteval.json"
114
- if config_file.exists():
115
- with open(config_file) as f:
116
- suite_config = json.load(f)
117
- self.suite_config = suite_config["suite_config"]
118
- else:
119
- self.suite_config = {
120
- "name": "openhands-index",
121
- "version": config,
122
- "splits": []
123
- }
124
 
125
- # Build tag map from config - organize benchmarks by category
126
  self.tag_map = {}
127
- self.benchmark_to_categories = {} # Maps benchmark name to its categories
128
-
129
- # Try loading from config first
130
- config_has_mappings = False
131
- for split_config in self.suite_config.get("splits", []):
132
- if split_config["name"] == split:
133
- for task in split_config.get("tasks", []):
134
- task_name = task["name"]
135
- # Store which categories this benchmark belongs to
136
- self.benchmark_to_categories[task_name] = []
137
- for tag in task.get("tags", []):
138
- # Skip "Overall" and the benchmark's own name
139
- if tag != "Overall" and tag != task_name:
140
- # This is a category tag
141
- if tag not in self.tag_map:
142
- self.tag_map[tag] = []
143
- if task_name not in self.tag_map[tag]:
144
- self.tag_map[tag].append(task_name)
145
- self.benchmark_to_categories[task_name].append(tag)
146
- config_has_mappings = True
147
-
148
- # FALLBACK: If no mappings loaded from config, use hard-coded category mappings
149
- if not config_has_mappings:
150
- print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
151
- fallback_mappings = {
152
- 'swe-bench': ['Issue Resolution'],
153
- 'swe-bench-multimodal': ['Frontend'],
154
- 'commit0': ['Greenfield'],
155
- 'swt-bench': ['Testing'],
156
- 'gaia': ['Information Gathering'],
157
- }
158
- for benchmark, categories in fallback_mappings.items():
159
- self.benchmark_to_categories[benchmark] = categories
160
- for category in categories:
161
- if category not in self.tag_map:
162
- self.tag_map[category] = []
163
- if benchmark not in self.tag_map[category]:
164
- self.tag_map[category].append(benchmark)
165
 
166
  def _load_from_agent_dirs(self):
167
  """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
 
109
  self.split = split
110
  self.config_path = self.data_dir / config
111
 
112
+ # Benchmark to category mappings (single source of truth)
113
+ self.benchmark_to_categories = {
114
+ 'swe-bench': ['Issue Resolution'],
115
+ 'swe-bench-multimodal': ['Frontend'],
116
+ 'commit0': ['Greenfield'],
117
+ 'swt-bench': ['Testing'],
118
+ 'gaia': ['Information Gathering'],
119
+ }
 
 
 
 
120
 
121
+ # Build tag map (category -> benchmarks)
122
  self.tag_map = {}
123
+ for benchmark, categories in self.benchmark_to_categories.items():
124
+ for category in categories:
125
+ if category not in self.tag_map:
126
+ self.tag_map[category] = []
127
+ if benchmark not in self.tag_map[category]:
128
+ self.tag_map[category].append(benchmark)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  def _load_from_agent_dirs(self):
131
  """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""