Spaces:

OpenHands
/

openhands-index

Running

App Files Community

simonrosenberg1 committed on Apr 8

Commit

d7034b8

verified ·

1 Parent(s): bef7ade

Move alternative agents to a dedicated page

Browse files

Address review feedback from the original commit:

- Mixing default OpenHands runs and alternative-agent runs (Claude Code,
Codex, Gemini CLI, OpenHands Sub-agents) in the same leaderboard table
and the same scatter plot is misleading. Their cost/runtime numbers
aren't apples-to-apples across harnesses, and conflating them with the
OpenHands ranking on Home obscures what the index is actually
measuring.
- Move alternative-agent rows to a dedicated /alternative-agents page.
Home and the per-category subpages stay exactly as before
(default-OpenHands only).

Changes:

- simple_data_loader.SimpleLeaderboardViewer now takes an `agent_filter`
argument, defaulting to "openhands". When it is "openhands", the loader
walks results/{model}/; when it is "alternative", it walks
alternative_agents/{type}/{model}/.
- ui_components.get_leaderboard_viewer_instance / get_full_leaderboard_data
thread `agent_filter` through and key the in-memory cache on
(split, agent_filter) so the two pages don't fight over a single slot.
- New alternative_agents_page.build_page renders the same
create_leaderboard_display UI but with agent_filter="alternative".
Single Overall view, no per-category subpages (the dataset is small
and the goal is "show me all the alternatives at a glance").
- app.py wires `with demo.route('Alternative Agents', '/alternative-agents'):`
between Information Gathering and About in the top nav.
- leaderboard_transformer.view_table only inserts the "Agent" column
when the dataframe has more than one distinct agent_name. The
OpenHands pages now hide it (every row would say "OpenHands"); the
Alternative Agents page shows it because rows differ.

Local verification:

agent_filter=openhands -> 22 rows, all agent_name=OpenHands
agent_filter=alternative -> 7 rows: 2 Claude Code, 1 Codex, 2 OpenHands Sub-agents,
2 mislabelled Gemini entries (data fix needed
in openhands-index-results)

Files changed (5) hide show

alternative_agents_page.py +65 -0
app.py +4 -0
leaderboard_transformer.py +15 -5
simple_data_loader.py +71 -39
ui_components.py +30 -17

alternative_agents_page.py ADDED Viewed

	@@ -0,0 +1,65 @@

+"""Alternative Agents leaderboard page.
+The canonical OpenHands Index leaderboard (Home + the per-category pages)
+ranks default OpenHands agent runs from ``results/{model}/`` in the
+openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
+Gemini CLI, OpenHands Sub-agents, ...) live under
+``alternative_agents/{type}/{model}/`` and aren't directly comparable to
+default OpenHands runs (different scaffolds, different cost/runtime
+characteristics), so they get their own standalone page instead of being
+mixed into the same ranking.
+This page is intentionally a single Overall view (no per-category
+subpages) — the alternative-agents dataset is small (one row per
+harness × model) and the goal is "show me all the alternatives at a
+glance", not "drill into Issue Resolution for Codex".
+"""
+import matplotlib
+matplotlib.use('Agg')
+import gradio as gr
+from simple_data_loader import SimpleLeaderboardViewer
+from ui_components import (
+    create_leaderboard_display,
+    get_full_leaderboard_data,
+)
+ALTERNATIVE_AGENTS_INTRO = """
+<div id="alternative-agents-intro">
+  <h2>Alternative Agents</h2>
+  <p>
+    Third-party agent harnesses running the OpenHands Index benchmarks.
+    These rows aren't part of the OpenHands ranking on the
+    <a href="/home">Home</a> page — they're tracked here as a comparison
+    point. Cost and runtime numbers come from each harness's own
+    instrumentation and aren't directly comparable across harnesses.
+  </p>
+</div>
+"""
+def build_page():
+    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
+    gr.Markdown("---")
+    test_df, test_tag_map = get_full_leaderboard_data(
+        "test",
+        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
+    )
+    if test_df.empty:
+        gr.Markdown(
+            "No alternative agent submissions yet. New runs land in "
+            "`alternative_agents/{type}/{model}/` in "
+            "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
+        )
+        return
+    create_leaderboard_display(
+        full_df=test_df,
+        tag_map=test_tag_map,
+        category_name="Overall",
+        split_name="test",
+    )

app.py CHANGED Viewed

@@ -35,6 +35,7 @@ from app_creation import build_page as build_app_creation_page
 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
 from about import build_page as build_about_page
 logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
@@ -373,6 +374,9 @@ with demo.route("Testing", "/testing"):
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
 with demo.route("About", "/about"):
     build_about_page()

 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
+from alternative_agents_page import build_page as build_alternative_agents_page
 from about import build_page as build_about_page
 logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
+with demo.route("Alternative Agents", "/alternative-agents"):
+    build_alternative_agents_page()
 with demo.route("About", "/about"):
     build_about_page()

leaderboard_transformer.py CHANGED Viewed

@@ -816,11 +816,21 @@ class DataTransformer:
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
-        # "Agent" sits between id and Language Model so OpenHands vs
-        # alternative agents (Claude Code / Codex / Gemini CLI) are visible
-        # at a glance, and so the same model with two different agents
-        # doesn't look like a duplicate row.
-        base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs", "Visualization"]

         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
+        # Only include the "Agent" column when the dataframe actually has
+        # more than one distinct agent. On the canonical OpenHands pages
+        # every row says "OpenHands", so adding the column is just noise;
+        # on the Alternative Agents page rows differ (Claude Code / Codex
+        # / Gemini CLI / OpenHands Sub-agents), so the column carries
+        # signal and disambiguates same-model rows from different
+        # harnesses.
+        has_mixed_agents = (
+            "Agent" in df_view.columns
+            and df_view["Agent"].dropna().nunique() > 1
+        )
+        if has_mixed_agents:
+            base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
+        else:
+            base_cols = ["id", "Language Model", "SDK Version", "Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs", "Visualization"]

simple_data_loader.py CHANGED Viewed

@@ -96,17 +96,43 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
 class SimpleLeaderboardViewer:
     """Simple replacement for agent-eval's LeaderboardViewer."""
-    def __init__(self, data_dir: str, config: str, split: str):
         """
         Args:
             data_dir: Path to data directory
             config: Config name (e.g., "1.0.0-dev1")
             split: Split name (e.g., "validation" or "test")
         """
         self.data_dir = Path(data_dir)
         self.config = config
         self.split = split
         self.config_path = self.data_dir / config
         # Benchmark to category mappings (single source of truth)
@@ -183,52 +209,58 @@ class SimpleLeaderboardViewer:
         return records, errors
     def _load_from_agent_dirs(self):
-        """Load default-agent results plus any alternative_agents/ entries.
-        Reads ``{config}/results/{model}/`` for default OpenHands runs and
-        ``{config}/alternative_agents/{type}/{model}/`` for ACP agent runs
-        (acp-claude, acp-codex, acp-gemini, ...) so they all surface in the
-        same leaderboard. Returns ``None`` if neither directory yields any
-        records (which makes the caller render an empty-state placeholder).
         """
         all_records = []
         all_validation_errors = []
-        # 1. Default OpenHands agent results
-        results_dir = self.config_path / "results"
-        if results_dir.exists():
-            for agent_dir in results_dir.iterdir():
-                if not agent_dir.is_dir():
-                    continue
-                records, errors = self._records_from_agent_dir(agent_dir)
-                all_records.extend(records)
-                all_validation_errors.extend(errors)
-        # 2. Alternative agents (one subdirectory per agent_type, then per model)
-        # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE map
-        # in OpenHands/evaluation push_to_index_from_archive.py — keeping it
-        # in sync ensures rows are labelled the same way the index repo
-        # records them.
-        agent_type_default_name = {
-            'acp-claude': 'Claude Code',
-            'acp-codex': 'Codex',
-            'acp-gemini': 'Gemini CLI',
-            'openhands_subagents': 'OpenHands Sub-agents',
-        }
-        alt_dir = self.config_path / "alternative_agents"
-        if alt_dir.exists():
-            for type_dir in alt_dir.iterdir():
-                if not type_dir.is_dir():
-                    continue
-                default_name = agent_type_default_name.get(type_dir.name)
-                for agent_dir in type_dir.iterdir():
                     if not agent_dir.is_dir():
                         continue
-                    records, errors = self._records_from_agent_dir(
-                        agent_dir, default_agent_name=default_name
-                    )
                     all_records.extend(records)
                     all_validation_errors.extend(errors)
         # Log validation errors if any
         if all_validation_errors:

 class SimpleLeaderboardViewer:
     """Simple replacement for agent-eval's LeaderboardViewer."""
+    AGENT_FILTER_OPENHANDS = "openhands"
+    AGENT_FILTER_ALTERNATIVE = "alternative"
+    def __init__(
+        self,
+        data_dir: str,
+        config: str,
+        split: str,
+        agent_filter: str = AGENT_FILTER_OPENHANDS,
+    ):
         """
         Args:
             data_dir: Path to data directory
             config: Config name (e.g., "1.0.0-dev1")
             split: Split name (e.g., "validation" or "test")
+            agent_filter: Which submissions to include.
+                ``"openhands"`` (default) loads only the default OpenHands
+                agent runs from ``results/{model}/`` — the canonical
+                leaderboard. ``"alternative"`` loads only third-party
+                harnesses (Claude Code / Codex / Gemini CLI / OpenHands
+                Sub-agents) from ``alternative_agents/{type}/{model}/``,
+                which power the standalone Alternative Agents page.
+                The two are kept on separate pages because their
+                cost/runtime numbers aren't apples-to-apples and mixing
+                them in one ranking would be misleading.
         """
+        if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
+            raise ValueError(
+                f"agent_filter must be one of "
+                f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
+                f"got {agent_filter!r}"
+            )
         self.data_dir = Path(data_dir)
         self.config = config
         self.split = split
+        self.agent_filter = agent_filter
         self.config_path = self.data_dir / config
         # Benchmark to category mappings (single source of truth)
         return records, errors
     def _load_from_agent_dirs(self):
+        """Load agent records based on ``self.agent_filter``.
+        - ``"openhands"`` (default): only ``{config}/results/{model}/``,
+          which is the canonical OpenHands leaderboard. The Home page and
+          the per-category subpages use this.
+        - ``"alternative"``: only
+          ``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
+          acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
+          Alternative Agents page uses this.
+        Returns ``None`` if no records were found (which makes the caller
+        render an empty-state placeholder).
         """
         all_records = []
         all_validation_errors = []
+        if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
+            # Default OpenHands agent results
+            results_dir = self.config_path / "results"
+            if results_dir.exists():
+                for agent_dir in results_dir.iterdir():
                     if not agent_dir.is_dir():
                         continue
+                    records, errors = self._records_from_agent_dir(agent_dir)
                     all_records.extend(records)
                     all_validation_errors.extend(errors)
+        else:
+            # Alternative agents (one subdirectory per agent_type, then per model)
+            # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
+            # map in OpenHands/evaluation push_to_index_from_archive.py — keeping
+            # it in sync ensures rows are labelled the same way the index repo
+            # records them.
+            agent_type_default_name = {
+                'acp-claude': 'Claude Code',
+                'acp-codex': 'Codex',
+                'acp-gemini': 'Gemini CLI',
+                'openhands_subagents': 'OpenHands Sub-agents',
+            }
+            alt_dir = self.config_path / "alternative_agents"
+            if alt_dir.exists():
+                for type_dir in alt_dir.iterdir():
+                    if not type_dir.is_dir():
+                        continue
+                    default_name = agent_type_default_name.get(type_dir.name)
+                    for agent_dir in type_dir.iterdir():
+                        if not agent_dir.is_dir():
+                            continue
+                        records, errors = self._records_from_agent_dir(
+                            agent_dir, default_agent_name=default_name
+                        )
+                        all_records.extend(records)
+                        all_validation_errors.extend(errors)
         # Log validation errors if any
         if all_validation_errors:

ui_components.py CHANGED Viewed

@@ -508,28 +508,36 @@ class DummyViewer:
         # The _load method returns the error DataFrame and an empty tag map
         return self._error_df, {}
-def get_leaderboard_viewer_instance(split: str):
     """
-    Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
-    re-downloading data. On error, returns a stable DummyViewer object.
     """
     global CACHED_VIEWERS, CACHED_TAG_MAPS
     with _cache_lock:
-        if split in CACHED_VIEWERS:
             # Cache hit: return the cached viewer and tag map
-            return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
     # --- Cache miss: try to load data from the source ---
     try:
         # First try to load from extracted data directory (local mock data)
         data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
-        print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
         viewer = SimpleLeaderboardViewer(
             data_dir=data_dir,
             config=CONFIG_NAME,
-            split=split
         )
         # Simplify tag map creation
@@ -537,14 +545,14 @@ def get_leaderboard_viewer_instance(split: str):
         # Cache the results for next time (thread-safe)
         with _cache_lock:
-            CACHED_VIEWERS[split] = viewer
-            CACHED_TAG_MAPS[split] = pretty_tag_map  # Cache the pretty map directly
         return viewer, pretty_tag_map
     except Exception as e:
         # On ANY error, create a consistent error message and cache a DummyViewer
-        error_message = f"Error loading data for split '{split}': {e}"
         print(format_error(error_message))
         dummy_df = pd.DataFrame({"Message": [error_message]})
@@ -553,8 +561,8 @@ def get_leaderboard_viewer_instance(split: str):
         # Cache the dummy objects so we don't try to fetch again on this run
         with _cache_lock:
-            CACHED_VIEWERS[split] = dummy_viewer
-            CACHED_TAG_MAPS[split] = dummy_tag_map
         return dummy_viewer, dummy_tag_map
@@ -1268,12 +1276,17 @@ def create_benchmark_details_display(
             legend_markdown = create_legend_markdown(benchmark_name)
             gr.HTML(value=legend_markdown, elem_id="legend-markdown")
-def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     """
-    Loads and transforms the complete dataset for a given split.
-    This function handles caching and returns the final "pretty" DataFrame and tag map.
     """
-    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
     if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
         raw_df, _ = viewer_or_data._load()

         # The _load method returns the error DataFrame and an empty tag map
         return self._error_df, {}
+def get_leaderboard_viewer_instance(
+    split: str,
+    agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
+):
     """
+    Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
+    thread-safe cache to avoid re-downloading data. The cache is keyed on
+    both axes so the OpenHands and Alternative Agents pages don't fight
+    over a single slot. On error, returns a stable DummyViewer.
     """
     global CACHED_VIEWERS, CACHED_TAG_MAPS
+    cache_key = (split, agent_filter)
     with _cache_lock:
+        if cache_key in CACHED_VIEWERS:
             # Cache hit: return the cached viewer and tag map
+            return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
     # --- Cache miss: try to load data from the source ---
     try:
         # First try to load from extracted data directory (local mock data)
         data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
+        print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
         viewer = SimpleLeaderboardViewer(
             data_dir=data_dir,
             config=CONFIG_NAME,
+            split=split,
+            agent_filter=agent_filter,
         )
         # Simplify tag map creation
         # Cache the results for next time (thread-safe)
         with _cache_lock:
+            CACHED_VIEWERS[cache_key] = viewer
+            CACHED_TAG_MAPS[cache_key] = pretty_tag_map  # Cache the pretty map directly
         return viewer, pretty_tag_map
     except Exception as e:
         # On ANY error, create a consistent error message and cache a DummyViewer
+        error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
         print(format_error(error_message))
         dummy_df = pd.DataFrame({"Message": [error_message]})
         # Cache the dummy objects so we don't try to fetch again on this run
         with _cache_lock:
+            CACHED_VIEWERS[cache_key] = dummy_viewer
+            CACHED_TAG_MAPS[cache_key] = dummy_tag_map
         return dummy_viewer, dummy_tag_map
             legend_markdown = create_legend_markdown(benchmark_name)
             gr.HTML(value=legend_markdown, elem_id="legend-markdown")
+def get_full_leaderboard_data(
+    split: str,
+    agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
+) -> tuple[pd.DataFrame, dict]:
     """
+    Loads and transforms the complete dataset for a (split, agent_filter)
+    pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
+    that don't pass it stay on the canonical leaderboard. The Alternative
+    Agents page passes ``"alternative"`` to get the third-party harnesses.
     """
+    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
     if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
         raw_df, _ = viewer_or_data._load()