Spaces:

OpenHands
/

openhands-index

Running

openhands committed on Jan 27

Commit

24ccc42

1 Parent(s): ab22529

Remove agenteval.json, use hardcoded mappings as single source of truth

Simplify the data loader by removing the agenteval.json config file and
its loading logic. The benchmark-to-category mappings are now defined
directly in the code, eliminating redundancy and potential inconsistencies.

Files changed (2) hide show

data/1.0.0-dev1/agenteval.json +0 -98
simple_data_loader.py +15 -51

data/1.0.0-dev1/agenteval.json DELETED Viewed

@@ -1,98 +0,0 @@
-{
-  "suite_config": {
-    "name": "openhands-index",
-    "version": "1.0.0-dev1",
-    "splits": [
-      {
-        "name": "test",
-        "tasks": [
-          {
-            "name": "swe-bench",
-            "tags": [
-              "Overall",
-              "Issue Resolution",
-              "swe-bench"
-            ]
-          },
-          {
-            "name": "swe-bench-multimodal",
-            "tags": [
-              "Overall",
-              "Frontend",
-              "swe-bench-multimodal"
-            ]
-          },
-          {
-            "name": "commit0",
-            "tags": [
-              "Overall",
-              "Greenfield",
-              "commit0"
-            ]
-          },
-          {
-            "name": "swt-bench",
-            "tags": [
-              "Overall",
-              "Testing",
-              "swt-bench"
-            ]
-          },
-          {
-            "name": "gaia",
-            "tags": [
-              "Overall",
-              "Information Gathering",
-              "gaia"
-            ]
-          }
-        ]
-      },
-      {
-        "name": "validation",
-        "tasks": [
-          {
-            "name": "swe-bench",
-            "tags": [
-              "Overall",
-              "Issue Resolution",
-              "swe-bench"
-            ]
-          },
-          {
-            "name": "swe-bench-multimodal",
-            "tags": [
-              "Overall",
-              "Frontend",
-              "swe-bench-multimodal"
-            ]
-          },
-          {
-            "name": "commit0",
-            "tags": [
-              "Overall",
-              "Greenfield",
-              "commit0"
-            ]
-          },
-          {
-            "name": "swt-bench",
-            "tags": [
-              "Overall",
-              "Testing",
-              "swt-bench"
-            ]
-          },
-          {
-            "name": "gaia",
-            "tags": [
-              "Overall",
-              "Information Gathering",
-              "gaia"
-            ]
-          }
-        ]
-      }
-    ]
-  }
-}

simple_data_loader.py CHANGED Viewed

@@ -109,59 +109,23 @@ class SimpleLeaderboardViewer:
         self.split = split
         self.config_path = self.data_dir / config
-        # Load suite configuration
-        config_file = self.config_path / "agenteval.json"
-        if config_file.exists():
-            with open(config_file) as f:
-                suite_config = json.load(f)
-                self.suite_config = suite_config["suite_config"]
-        else:
-            self.suite_config = {
-                "name": "openhands-index",
-                "version": config,
-                "splits": []
-            }
-        # Build tag map from config - organize benchmarks by category
         self.tag_map = {}
-        self.benchmark_to_categories = {}  # Maps benchmark name to its categories
-        # Try loading from config first
-        config_has_mappings = False
-        for split_config in self.suite_config.get("splits", []):
-            if split_config["name"] == split:
-                for task in split_config.get("tasks", []):
-                    task_name = task["name"]
-                    # Store which categories this benchmark belongs to
-                    self.benchmark_to_categories[task_name] = []
-                    for tag in task.get("tags", []):
-                        # Skip "Overall" and the benchmark's own name
-                        if tag != "Overall" and tag != task_name:
-                            # This is a category tag
-                            if tag not in self.tag_map:
-                                self.tag_map[tag] = []
-                            if task_name not in self.tag_map[tag]:
-                                self.tag_map[tag].append(task_name)
-                            self.benchmark_to_categories[task_name].append(tag)
-                            config_has_mappings = True
-        # FALLBACK: If no mappings loaded from config, use hard-coded category mappings
-        if not config_has_mappings:
-            print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
-            fallback_mappings = {
-                'swe-bench': ['Issue Resolution'],
-                'swe-bench-multimodal': ['Frontend'],
-                'commit0': ['Greenfield'],
-                'swt-bench': ['Testing'],
-                'gaia': ['Information Gathering'],
-            }
-            for benchmark, categories in fallback_mappings.items():
-                self.benchmark_to_categories[benchmark] = categories
-                for category in categories:
-                    if category not in self.tag_map:
-                        self.tag_map[category] = []
-                    if benchmark not in self.tag_map[category]:
-                        self.tag_map[category].append(benchmark)
     def _load_from_agent_dirs(self):
         """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""

         self.split = split
         self.config_path = self.data_dir / config
+        # Benchmark to category mappings (single source of truth)
+        self.benchmark_to_categories = {
+            'swe-bench': ['Issue Resolution'],
+            'swe-bench-multimodal': ['Frontend'],
+            'commit0': ['Greenfield'],
+            'swt-bench': ['Testing'],
+            'gaia': ['Information Gathering'],
+        }
+        # Build tag map (category -> benchmarks)
         self.tag_map = {}
+        for benchmark, categories in self.benchmark_to_categories.items():
+            for category in categories:
+                if category not in self.tag_map:
+                    self.tag_map[category] = []
+                if benchmark not in self.tag_map[category]:
+                    self.tag_map[category].append(benchmark)
     def _load_from_agent_dirs(self):
         """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""