Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
b4ac443
1
Parent(s):
044cdf4
CRITICAL FIX: Add fallback category mappings for data without agenteval.json
Browse files
ROOT CAUSE: The app was calculating 0.0 scores because:
1. No agenteval.json file exists in the data directory
2. Without it, benchmark_to_categories dict was empty
3. Category aggregation was skipped (lines 162-167 in simple_data_loader.py)
4. The overall score calculation received an empty list, which resulted in None/0.0
SOLUTION: Add hard-coded fallback mappings when agenteval.json is missing:
- swe-bench, swe-bench-multimodal, commit0, multi-swe-bench → Bug Fixing
- swt-bench → Frontend Development
- gaia → Information Gathering
This ensures category aggregation works even without a config file.
Tested locally: ✅ Shows real scores (57.767, 54.692, etc.)
Co-authored-by: openhands <openhands@all-hands.dev>
- simple_data_loader.py +23 -0
simple_data_loader.py
CHANGED
|
@@ -39,6 +39,9 @@ class SimpleLeaderboardViewer:
|
|
| 39 |
# Build tag map from config - organize benchmarks by category
|
| 40 |
self.tag_map = {}
|
| 41 |
self.benchmark_to_categories = {} # Maps benchmark name to its categories
|
|
|
|
|
|
|
|
|
|
| 42 |
for split_config in self.suite_config.get("splits", []):
|
| 43 |
if split_config["name"] == split:
|
| 44 |
for task in split_config.get("tasks", []):
|
|
@@ -54,6 +57,26 @@ class SimpleLeaderboardViewer:
|
|
| 54 |
if task_name not in self.tag_map[tag]:
|
| 55 |
self.tag_map[tag].append(task_name)
|
| 56 |
self.benchmark_to_categories[task_name].append(tag)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def _load_from_agent_dirs(self):
|
| 59 |
"""Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
|
|
|
|
| 39 |
# Build tag map from config - organize benchmarks by category
|
| 40 |
self.tag_map = {}
|
| 41 |
self.benchmark_to_categories = {} # Maps benchmark name to its categories
|
| 42 |
+
|
| 43 |
+
# Try loading from config first
|
| 44 |
+
config_has_mappings = False
|
| 45 |
for split_config in self.suite_config.get("splits", []):
|
| 46 |
if split_config["name"] == split:
|
| 47 |
for task in split_config.get("tasks", []):
|
|
|
|
| 57 |
if task_name not in self.tag_map[tag]:
|
| 58 |
self.tag_map[tag].append(task_name)
|
| 59 |
self.benchmark_to_categories[task_name].append(tag)
|
| 60 |
+
config_has_mappings = True
|
| 61 |
+
|
| 62 |
+
# FALLBACK: If no mappings loaded from config, use hard-coded category mappings
|
| 63 |
+
if not config_has_mappings:
|
| 64 |
+
print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
|
| 65 |
+
fallback_mappings = {
|
| 66 |
+
'swe-bench': ['Bug Fixing'],
|
| 67 |
+
'swe-bench-multimodal': ['Bug Fixing'],
|
| 68 |
+
'commit0': ['Bug Fixing'],
|
| 69 |
+
'multi-swe-bench': ['Bug Fixing'],
|
| 70 |
+
'swt-bench': ['Frontend Development'],
|
| 71 |
+
'gaia': ['Information Gathering'],
|
| 72 |
+
}
|
| 73 |
+
for benchmark, categories in fallback_mappings.items():
|
| 74 |
+
self.benchmark_to_categories[benchmark] = categories
|
| 75 |
+
for category in categories:
|
| 76 |
+
if category not in self.tag_map:
|
| 77 |
+
self.tag_map[category] = []
|
| 78 |
+
if benchmark not in self.tag_map[category]:
|
| 79 |
+
self.tag_map[category].append(benchmark)
|
| 80 |
|
| 81 |
def _load_from_agent_dirs(self):
|
| 82 |
"""Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
|