Spaces:

OpenHands
/

openhands-index

Running

+"""Alternative Agents leaderboard page.
+The canonical OpenHands Index leaderboard (Home + the per-category pages)
+ranks default OpenHands agent runs from ``results/{model}/`` in the
+openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
+Gemini CLI, OpenHands Sub-agents, ...) live under
+``alternative_agents/{type}/{model}/`` and aren't directly comparable to
+default OpenHands runs (different scaffolds, different cost/runtime
+characteristics), so they get their own standalone page instead of being
+mixed into the same ranking.
+This page is intentionally a single Overall view (no per-category
+subpages) — the alternative-agents dataset is small (one row per
+harness × model) and the goal is "show me all the alternatives at a
+glance", not "drill into Issue Resolution for Codex".
+"""
+import matplotlib
+matplotlib.use('Agg')
+import gradio as gr
+from simple_data_loader import SimpleLeaderboardViewer
+from ui_components import (
+    create_leaderboard_display,
+    get_full_leaderboard_data,
+)
+ALTERNATIVE_AGENTS_INTRO = """
+<div id="alternative-agents-intro">
+  <h2>Alternative Agents</h2>
+  <p>
+    Third-party agent harnesses running the OpenHands Index benchmarks.
+    These rows aren't part of the OpenHands ranking on the
+    <a href="/home">Home</a> page — they're tracked here as a comparison
+    point. Cost and runtime numbers come from each harness's own
+    instrumentation and aren't directly comparable across harnesses.
+  </p>
+</div>
+"""
+def build_page():
+    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
+    gr.Markdown("---")
+    test_df, test_tag_map = get_full_leaderboard_data(
+        "test",
+        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
+    )
+    if test_df.empty:
+        gr.Markdown(
+            "No alternative agent submissions yet. New runs land in "
+            "`alternative_agents/{type}/{model}/` in "
+            "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
+        )
+        return
+    create_leaderboard_display(
+        full_df=test_df,
+        tag_map=test_tag_map,
+        category_name="Overall",
+        split_name="test",
+    )

app.py CHANGED Viewed

@@ -35,6 +35,7 @@ from app_creation import build_page as build_app_creation_page
 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
 from about import build_page as build_about_page
 logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
@@ -373,6 +374,9 @@ with demo.route("Testing", "/testing"):
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
 with demo.route("About", "/about"):
     build_about_page()

 from frontend_development import build_page as build_frontend_page
 from test_generation import build_page as build_test_generation_page
 from information_gathering import build_page as build_information_gathering_page
+from alternative_agents_page import build_page as build_alternative_agents_page
 from about import build_page as build_about_page
 logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 with demo.route("Information Gathering", "/information-gathering"):
     build_information_gathering_page()
+with demo.route("Alternative Agents", "/alternative-agents"):
+    build_alternative_agents_page()
 with demo.route("About", "/about"):
     build_about_page()

assets/harnesses/README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+# Agent harness logos
+This folder holds the **bottom half** of the composite scatter markers used
+on the [Alternative Agents](../../alternative_agents_page.py) page. Each
+point on that scatter stacks two logos: the model provider on top (from
+`assets/logo-*.svg`) and the harness on the bottom (from this folder).
+## Expected filenames
+The scatter code looks up a logo by the exact `agent_name` string that the
+`push-to-index` workflow writes into the index repo's `metadata.json`, then
+maps it through `HARNESS_LOGO_STEMS` in `leaderboard_transformer.py`. Keep
+these filenames in sync with that map.
+| `agent_name` (in index repo) | File in this folder |
+| --- | --- |
+| `Claude Code`          | `claude-code.svg`  or `claude-code.png` |
+| `Codex`                | `codex-cli.svg`    or `codex-cli.png`   |
+| `Gemini CLI`           | `gemini-cli.svg`   or `gemini-cli.png`  |
+| `OpenHands`            | `openhands.svg`    or `openhands.png`   |
+| `OpenHands Sub-agents` | `openhands.svg`    or `openhands.png`   (shared with `OpenHands`) |
+Both `.svg` and `.png` are accepted — the resolver tries `.svg` first, then
+`.png`. **Prefer SVG when possible**: the HuggingFace Space rejects new
+binary files on plain `git push` and routes PNGs through Xet, so an SVG is
+one less thing to set up.
+## When a file is missing
+The scatter falls back to a single marker (just the model provider logo) —
+exactly the same rendering path the canonical OpenHands pages use. Nothing
+crashes and nothing prints a warning in normal operation. This means you
+can roll out logos one harness at a time without waiting for all four.
+## Sizing and shape
+- Square canvas. The composite marker is drawn at a fixed aspect ratio, so
+  a non-square logo will get squished.
+- Any SVG `viewBox` works — the renderer base64-encodes the file as-is and
+  Plotly scales it to the marker's `sizex` / `sizey`. Around `80×80` to
+  `256×256` is a good source size.
+- Leave some internal padding (≈10%) so the logo doesn't touch the marker
+  edge when two are stacked.
+- No background is required, but a rounded-square coloured tile reads well
+  at small sizes because it gives each harness a distinct silhouette even
+  when the inner glyph isn't fully legible. Look at the existing
+  `assets/logo-*.svg` files for the canonical model provider logos if you
+  want a visual reference for sizing.
+## Adding a new harness
+1. Decide on the exact `agent_name` that the push-to-index workflow writes
+   for the new harness (see `AGENT_NAME_BY_TYPE` in
+   `OpenHands/evaluation/push-to-index-job/scripts/push_to_index_from_archive.py`).
+2. Add an entry to `HARNESS_LOGO_STEMS` in
+   [`leaderboard_transformer.py`](../../leaderboard_transformer.py) that
+   maps the display name to a stem.
+3. Drop `{stem}.svg` (or `.png`) into this folder.
+4. Reload the app and look at `/alternative-agents`.

assets/harnesses/claude-code.svg ADDED Viewed

assets/harnesses/codex-cli.svg ADDED Viewed

assets/harnesses/gemini-cli.svg ADDED Viewed

assets/harnesses/openhands.svg ADDED Viewed

docs/screenshots/alternative-agents.png ADDED Viewed

Git LFS Details

SHA256: 99766c7d2c11a6f90f24a5f0effbae74a8aa33096b89ff1c4fcfb238fe06a2f5
Pointer size: 131 Bytes
Size of remote file: 104 kB

leaderboard_transformer.py CHANGED Viewed

@@ -228,17 +228,17 @@ def get_country_from_model(model_name: str) -> dict:
 def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
     """
     Gets the appropriate icon based on the mark_by selection.
     Args:
         model_name: The model name
         openness: The openness value (open/closed)
         mark_by: One of "Company", "Openness", or "Country"
     Returns:
         dict with 'path' and 'name' keys
     """
     from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
     if mark_by == MARK_BY_OPENNESS:
         return get_openness_icon(openness)
     elif mark_by == MARK_BY_COUNTRY:
@@ -247,6 +247,59 @@ def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
         return get_company_from_model(model_name)
 # Standard layout configuration for all charts
 STANDARD_LAYOUT = dict(
     template="plotly_white",
@@ -655,6 +708,7 @@ def _pretty_column_name(raw_col: str) -> str:
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
         'SDK version': 'SDK Version',
         'Openhands version': 'SDK Version',  # Legacy support
         'Language model': 'Language Model',
@@ -815,7 +869,21 @@ class DataTransformer:
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
-        base_cols = ["id","Language Model","SDK Version","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs", "Visualization"]
@@ -1018,13 +1086,18 @@ def _plot_scatter_plotly(
         """
         Builds the complete HTML string for the plot's hover tooltip.
         Format: {lm_name} (SDK {version})
                 Average Score: {score}
                 Average Cost/Runtime: {value}
                 Openness: {openness}
         """
         h_pad = "   "
         parts = ["<br>"]
         # Get and clean the language model name
         llm_base_value = row.get('Language Model', '')
         llm_base_value = clean_llm_base_list(llm_base_value)
@@ -1032,13 +1105,21 @@ def _plot_scatter_plotly(
             lm_name = llm_base_value[0]
         else:
             lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
         # Get SDK version
         sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
         # Title line: {lm_name} (SDK {version})
         parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
         # Average Score
         parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
@@ -1111,51 +1192,116 @@ def _plot_scatter_plotly(
     y_min = min_score - 5 if min_score > 5 else 0
     y_max = max_score + 5
     for _, row in data_plot.iterrows():
         model_name = row.get('Language Model', '')
         openness = row.get('Openness', '')
         marker_info = get_marker_icon(model_name, openness, mark_by)
-        logo_path = marker_info['path']
-        # Read the SVG file and encode as base64 data URI
-        if os.path.exists(logo_path):
-            try:
-                with open(logo_path, 'rb') as f:
-                    encoded_logo = base64.b64encode(f.read()).decode('utf-8')
-                    logo_uri = f"data:image/svg+xml;base64,{encoded_logo}"
-                    x_val = row[x_col_to_use]
-                    y_val = row[y_col_to_use]
-                    # Convert to domain coordinates (0-1 range)
-                    # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
-                    if x_val > 0:
-                        log_x = np.log10(x_val)
-                        domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
-                    else:
-                        domain_x = 0
-                    # For linear y: domain_y = (y - y_min) / (y_max - y_min)
-                    domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
-                    # Clamp to valid range
-                    domain_x = max(0, min(1, domain_x))
-                    domain_y = max(0, min(1, domain_y))
-                    layout_images.append(dict(
-                        source=logo_uri,
-                        xref="x domain",  # Use domain coordinates for log scale compatibility
-                        yref="y domain",
-                        x=domain_x,
-                        y=domain_y,
-                        sizex=0.04,  # Size as fraction of plot width
-                        sizey=0.06,  # Size as fraction of plot height
-                        xanchor="center",
-                        yanchor="middle",
-                        layer="above"
-                    ))
-            except Exception as e:
-                logger.warning(f"Could not load logo {logo_path}: {e}")
     # --- Section 7: Add Model Name Labels to Frontier Points ---
     if frontier_rows:

 def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
     """
     Gets the appropriate icon based on the mark_by selection.
     Args:
         model_name: The model name
         openness: The openness value (open/closed)
         mark_by: One of "Company", "Openness", or "Country"
     Returns:
         dict with 'path' and 'name' keys
     """
     from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
     if mark_by == MARK_BY_OPENNESS:
         return get_openness_icon(openness)
     elif mark_by == MARK_BY_COUNTRY:
         return get_company_from_model(model_name)
+# Map the agent_name stored in the index repo's metadata.json to a file stem
+# inside assets/harnesses/. Kept in sync with AGENT_NAME_BY_TYPE in
+# OpenHands/evaluation push_to_index_from_archive.py — if a new ACP harness
+# lands there, add the corresponding display name and a matching stem here.
+#
+# The scatter plot looks for {stem}.svg first, then {stem}.png in
+# assets/harnesses/ (SVG preferred, PNG works too via HF Xet). Logo files
+# dropped into that folder are picked up on the next app restart. If the
+# file for a stem is missing, get_harness_icon() returns None and the
+# scatter falls back to the single-marker path — same rendering the
+# canonical OpenHands pages use — so logos can be added one harness at a
+# time without breaking anything.
+HARNESS_LOGO_STEMS: dict[str, str] = {
+    "Claude Code":          "claude-code",
+    "Codex":                "codex-cli",
+    "Gemini CLI":           "gemini-cli",
+    "OpenHands":            "openhands",
+    "OpenHands Sub-agents": "openhands",
+}
+HARNESS_LOGO_DIR = "assets/harnesses"
+HARNESS_LOGO_EXTENSIONS = ("svg", "png")
+def get_harness_icon(agent_name: Optional[str]) -> Optional[dict]:
+    """Return {'path', 'name'} for the harness logo, or None if not usable.
+    Consumed by the Alternative Agents scatter plot to draw a composite
+    marker (model provider on top, harness on bottom). Returns None in any
+    of three cases, all of which make the caller skip the harness layer:
+    - ``agent_name`` is empty or missing from the dataframe row.
+    - ``agent_name`` isn't in ``HARNESS_LOGO_STEMS`` (new harness that
+      hasn't been registered yet — register it and drop in a logo).
+    - The logo file for that stem doesn't exist in ``assets/harnesses/``
+      yet (the repo ships only the README).
+    That third case is the important one: it lets the Alternative Agents
+    page work immediately after checkout even when the harness logo files
+    haven't been dropped in. The corresponding points just render like a
+    canonical-page marker (model logo only) until the file is added.
+    """
+    if not agent_name:
+        return None
+    stem = HARNESS_LOGO_STEMS.get(str(agent_name).strip())
+    if stem is None:
+        return None
+    for ext in HARNESS_LOGO_EXTENSIONS:
+        path = f"{HARNESS_LOGO_DIR}/{stem}.{ext}"
+        if os.path.exists(path):
+            return {"path": path, "name": agent_name}
+    return None
 # Standard layout configuration for all charts
 STANDARD_LAYOUT = dict(
     template="plotly_white",
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
+        'agent_name': 'Agent',
         'SDK version': 'SDK Version',
         'Openhands version': 'SDK Version',  # Legacy support
         'Language model': 'Language Model',
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
+        # Only include the "Agent" column when the dataframe actually has
+        # more than one distinct agent. On the canonical OpenHands pages
+        # every row says "OpenHands", so adding the column is just noise;
+        # on the Alternative Agents page rows differ (Claude Code / Codex
+        # / Gemini CLI / OpenHands Sub-agents), so the column carries
+        # signal and disambiguates same-model rows from different
+        # harnesses.
+        has_mixed_agents = (
+            "Agent" in df_view.columns
+            and df_view["Agent"].dropna().nunique() > 1
+        )
+        if has_mixed_agents:
+            base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
+        else:
+            base_cols = ["id", "Language Model", "SDK Version", "Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs", "Visualization"]
         """
         Builds the complete HTML string for the plot's hover tooltip.
         Format: {lm_name} (SDK {version})
+                Harness: {agent}        (only when the row carries an Agent —
+                                         Alternative Agents page only; the
+                                         canonical OpenHands pages drop the
+                                         Agent column in view() so this line
+                                         is skipped there)
                 Average Score: {score}
                 Average Cost/Runtime: {value}
                 Openness: {openness}
         """
         h_pad = "   "
         parts = ["<br>"]
         # Get and clean the language model name
         llm_base_value = row.get('Language Model', '')
         llm_base_value = clean_llm_base_list(llm_base_value)
             lm_name = llm_base_value[0]
         else:
             lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
         # Get SDK version
         sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
         # Title line: {lm_name} (SDK {version})
         parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
+        # Harness line — only on pages where the Agent column is present
+        # (Alternative Agents). Without this, two rows for the same LM run
+        # under different harnesses (e.g. Claude Code vs OpenHands Sub-agents
+        # on claude-sonnet-4-5) are indistinguishable on hover.
+        agent_value = row.get('Agent')
+        if agent_value is not None and pd.notna(agent_value) and str(agent_value).strip():
+            parts.append(f"{h_pad}Harness: <b>{agent_value}</b>{h_pad}<br>")
         # Average Score
         parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
     y_min = min_score - 5 if min_score > 5 else 0
     y_max = max_score + 5
+    # Cache base64-encoded logos across rows — every Claude Code row on the
+    # Alternative Agents page points at the same assets/harnesses/claude-code.svg,
+    # so encoding once per path is ~N× cheaper than once per point.
+    _logo_cache: dict[str, str] = {}
+    def _encode_logo(path: str) -> Optional[str]:
+        if path in _logo_cache:
+            return _logo_cache[path]
+        if not os.path.exists(path):
+            return None
+        try:
+            with open(path, "rb") as f:
+                encoded = base64.b64encode(f.read()).decode("utf-8")
+        except Exception as e:
+            logger.warning(f"Could not load logo {path}: {e}")
+            return None
+        mime = "svg+xml" if path.lower().endswith(".svg") else "png"
+        uri = f"data:image/{mime};base64,{encoded}"
+        _logo_cache[path] = uri
+        return uri
+    # Composite markers: on the Alternative Agents page the dataframe carries
+    # an "Agent" column (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents),
+    # so a point for claude-sonnet-4-5 under Claude Code and under OpenHands
+    # Sub-agents would otherwise share the exact same Anthropic logo marker
+    # and be visually indistinguishable. When Agent is present, we stack
+    # two logos at each point: model provider on top, harness on the bottom.
+    # Canonical OpenHands pages drop the Agent column in view() (via the
+    # has_mixed_agents check), so they fall through to the single-logo path
+    # and render exactly as before.
+    has_harness_column = (
+        "Agent" in data_plot.columns
+        and data_plot["Agent"].dropna().astype(str).str.strip().ne("").any()
+    )
+    # Marker sizes. The composite variant fits two logos inside roughly the
+    # same vertical footprint as a single marker, so each half is slightly
+    # smaller and the two halves are offset symmetrically around the point's
+    # true y-coordinate.
+    SINGLE_SIZE_X, SINGLE_SIZE_Y = 0.04, 0.06
+    STACKED_SIZE_X, STACKED_SIZE_Y = 0.035, 0.048
+    STACKED_Y_OFFSET = 0.028  # half-separation between model (top) and harness (bottom)
     for _, row in data_plot.iterrows():
         model_name = row.get('Language Model', '')
         openness = row.get('Openness', '')
         marker_info = get_marker_icon(model_name, openness, mark_by)
+        model_logo_uri = _encode_logo(marker_info['path'])
+        if model_logo_uri is None:
+            continue
+        # Harness (only meaningful when the dataframe carries an Agent column).
+        harness_uri = None
+        if has_harness_column:
+            harness_info = get_harness_icon(row.get("Agent"))
+            if harness_info is not None:
+                harness_uri = _encode_logo(harness_info["path"])
+        x_val = row[x_col_to_use]
+        y_val = row[y_col_to_use]
+        # Convert to domain coordinates (0-1 range)
+        # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
+        if x_val > 0:
+            log_x = np.log10(x_val)
+            domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
+        else:
+            domain_x = 0
+        # For linear y: domain_y = (y - y_min) / (y_max - y_min)
+        domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
+        # Clamp to valid range
+        domain_x = max(0, min(1, domain_x))
+        domain_y = max(0, min(1, domain_y))
+        if harness_uri is not None:
+            # Composite: stack model on top, harness on bottom, clamping
+            # each half to the plot area so markers near the edges don't
+            # drift off-canvas.
+            model_y = min(1, domain_y + STACKED_Y_OFFSET)
+            harness_y = max(0, domain_y - STACKED_Y_OFFSET)
+            layout_images.append(dict(
+                source=model_logo_uri,
+                xref="x domain", yref="y domain",
+                x=domain_x, y=model_y,
+                sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
+                xanchor="center", yanchor="middle",
+                layer="above",
+            ))
+            layout_images.append(dict(
+                source=harness_uri,
+                xref="x domain", yref="y domain",
+                x=domain_x, y=harness_y,
+                sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
+                xanchor="center", yanchor="middle",
+                layer="above",
+            ))
+        else:
+            # Single marker (canonical OpenHands pages, or Alternative Agents
+            # rows whose harness is unregistered or has no logo file yet — an
+            # unregistered name shouldn't happen in practice since
+            # HARNESS_LOGO_STEMS covers every agent_name the push-to-index
+            # script emits).
+            layout_images.append(dict(
+                source=model_logo_uri,
+                xref="x domain", yref="y domain",
+                x=domain_x, y=domain_y,
+                sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
+                xanchor="center", yanchor="middle",
+                layer="above",
+            ))
     # --- Section 7: Add Model Name Labels to Frontier Points ---
     if frontier_rows:

setup_data.py CHANGED Viewed

@@ -70,27 +70,39 @@ def fetch_data_from_github():
         # Look for data files in the cloned repository
         results_source = clone_dir / "results"
         if not results_source.exists():
             print(f"Results directory not found in repository")
             return False
         # Check if there are any agent result directories
         result_dirs = list(results_source.iterdir())
         if not result_dirs:
             print(f"No agent results found in {results_source}")
             return False
         print(f"Found {len(result_dirs)} agent result directories")
         # Create target directory and copy the results structure
         os.makedirs(target_dir.parent, exist_ok=True)
         if target_dir.exists():
             shutil.rmtree(target_dir)
         # Copy the entire results directory
         target_results = target_dir / "results"
         shutil.copytree(results_source, target_results)
         print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")

         # Look for data files in the cloned repository
         results_source = clone_dir / "results"
         if not results_source.exists():
             print(f"Results directory not found in repository")
             return False
         # Check if there are any agent result directories
         result_dirs = list(results_source.iterdir())
         if not result_dirs:
             print(f"No agent results found in {results_source}")
             return False
         print(f"Found {len(result_dirs)} agent result directories")
         # Create target directory and copy the results structure
         os.makedirs(target_dir.parent, exist_ok=True)
         if target_dir.exists():
             shutil.rmtree(target_dir)
         # Copy the entire results directory
         target_results = target_dir / "results"
         shutil.copytree(results_source, target_results)
+        # Also copy alternative_agents/ if present, so the loader can pick up
+        # ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
+        # alongside the default OpenHands agent results.
+        alt_source = clone_dir / "alternative_agents"
+        if alt_source.exists():
+            alt_target = target_dir / "alternative_agents"
+            shutil.copytree(alt_source, alt_target)
+            agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
+            print(f"Found alternative agent types: {agent_types}")
+        else:
+            print("No alternative_agents/ directory in repository (skipping)")
         print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")

simple_data_loader.py CHANGED Viewed

@@ -96,17 +96,43 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
 class SimpleLeaderboardViewer:
     """Simple replacement for agent-eval's LeaderboardViewer."""
-    def __init__(self, data_dir: str, config: str, split: str):
         """
         Args:
             data_dir: Path to data directory
             config: Config name (e.g., "1.0.0-dev1")
             split: Split name (e.g., "validation" or "test")
         """
         self.data_dir = Path(data_dir)
         self.config = config
         self.split = split
         self.config_path = self.data_dir / config
         # Benchmark to category mappings (single source of truth)
@@ -127,55 +153,115 @@ class SimpleLeaderboardViewer:
                 if benchmark not in self.tag_map[category]:
                     self.tag_map[category].append(benchmark)
     def _load_from_agent_dirs(self):
-        """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
-        results_dir = self.config_path / "results"
-        if not results_dir.exists():
-            return None  # Fall back to old format
         all_records = []
         all_validation_errors = []
-        # Iterate through each agent directory
-        for agent_dir in results_dir.iterdir():
-            if not agent_dir.is_dir():
-                continue
-            # Load and validate using pydantic models
-            metadata, scores, errors = load_and_validate_agent_data(agent_dir)
-            if errors:
-                all_validation_errors.extend(errors)
-            if metadata is None or scores is None:
-                continue
-            # Skip entries that are hidden from the leaderboard
-            if metadata.get('hide_from_leaderboard', False):
-                logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
-                continue
-            # Create one record per benchmark (mimicking old JSONL format)
-            for score_entry in scores:
-                record = {
-                    'agent_version': metadata.get('agent_version', 'Unknown'),
-                    'llm_base': metadata.get('model', 'unknown'),
-                    'openness': metadata.get('openness', 'unknown'),
-                    'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
-                    'release_date': metadata.get('release_date', ''),  # Model release date
-                    'parameter_count_b': metadata.get('parameter_count_b'),  # Total params in billions
-                    'active_parameter_count_b': metadata.get('active_parameter_count_b'),  # Active params for MoE
-                    'score': score_entry.get('score'),
-                    'metric': score_entry.get('metric', 'unknown'),
-                    'cost_per_instance': score_entry.get('cost_per_instance'),
-                    'average_runtime': score_entry.get('average_runtime'),
-                    'tags': [score_entry.get('benchmark')],
-                    'full_archive': score_entry.get('full_archive', ''),  # Download URL for trajectories
-                    'eval_visualization_page': score_entry.get('eval_visualization_page', ''),  # Laminar visualization URL
-                }
-                all_records.append(record)
         # Log validation errors if any
         if all_validation_errors:
             logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
@@ -183,10 +269,10 @@ class SimpleLeaderboardViewer:
                 logger.warning(f"  - {error}")
             if len(all_validation_errors) > 5:
                 logger.warning(f"  ... and {len(all_validation_errors) - 5} more")
         if not all_records:
-            return None  # Fall back to old format
         return pd.DataFrame(all_records)
     def _load(self):
@@ -206,26 +292,36 @@ class SimpleLeaderboardViewer:
             # Group by agent (version + model combination) to aggregate results across datasets
             transformed_records = []
-            # Create a unique identifier for each agent (version + model)
-            df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
             for agent_id in df['agent_id'].unique():
                 agent_records = df[df['agent_id'] == agent_id]
                 # Build a single record for this agent
                 first_record = agent_records.iloc[0]
                 agent_version = first_record['agent_version']
                 # Normalize openness to "open" or "closed"
                 from aliases import OPENNESS_MAPPING
                 raw_openness = first_record['openness']
                 normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
                 # All 5 categories for the leaderboard
                 ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
                 record = {
                     # Core agent info - use final display names
                     'SDK version': agent_version,  # Will become "SDK Version"
                     'Language model': first_record['llm_base'],  # Will become "Language Model"
                     'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
@@ -235,7 +331,7 @@ class SimpleLeaderboardViewer:
                     'parameter_count_b': first_record.get('parameter_count_b'),  # Total params in billions
                     'active_parameter_count_b': first_record.get('active_parameter_count_b'),  # Active params for MoE
                     # Additional columns expected by the transformer
-                    # Use agent_id (version_model) as unique identifier for Pareto frontier calculation
                     'id': agent_id,
                     'source': first_record.get('source', ''),  # Will become "Source"
                     'logs': first_record.get('logs', ''),  # Will become "Logs"

 class SimpleLeaderboardViewer:
     """Simple replacement for agent-eval's LeaderboardViewer."""
+    AGENT_FILTER_OPENHANDS = "openhands"
+    AGENT_FILTER_ALTERNATIVE = "alternative"
+    def __init__(
+        self,
+        data_dir: str,
+        config: str,
+        split: str,
+        agent_filter: str = AGENT_FILTER_OPENHANDS,
+    ):
         """
         Args:
             data_dir: Path to data directory
             config: Config name (e.g., "1.0.0-dev1")
             split: Split name (e.g., "validation" or "test")
+            agent_filter: Which submissions to include.
+                ``"openhands"`` (default) loads only the default OpenHands
+                agent runs from ``results/{model}/`` — the canonical
+                leaderboard. ``"alternative"`` loads only third-party
+                harnesses (Claude Code / Codex / Gemini CLI / OpenHands
+                Sub-agents) from ``alternative_agents/{type}/{model}/``,
+                which power the standalone Alternative Agents page.
+                The two are kept on separate pages because their
+                cost/runtime numbers aren't apples-to-apples and mixing
+                them in one ranking would be misleading.
         """
+        if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
+            raise ValueError(
+                f"agent_filter must be one of "
+                f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
+                f"got {agent_filter!r}"
+            )
         self.data_dir = Path(data_dir)
         self.config = config
         self.split = split
+        self.agent_filter = agent_filter
         self.config_path = self.data_dir / config
         # Benchmark to category mappings (single source of truth)
                 if benchmark not in self.tag_map[category]:
                     self.tag_map[category].append(benchmark)
+    # Default agent_name when metadata.json doesn't carry one. Matches the
+    # default-agent value used by push_to_index_from_archive.py so legacy
+    # entries (which omit the field) still group cleanly with new entries.
+    DEFAULT_AGENT_NAME = "OpenHands"
+    def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
+        """Build one flat record per score entry found in ``agent_dir``.
+        Called by ``_load_from_agent_dirs`` for both layouts it walks: the
+        default OpenHands ``results/{model}/`` directories and the
+        ``alternative_agents/{type}/{model}/`` directories (acp-claude,
+        acp-codex, ...).
+        Args:
+            agent_dir: Directory handed to ``load_and_validate_agent_data``.
+            default_agent_name: Display name used when the metadata has no
+                truthy ``agent_name``. When this is also falsy,
+                ``DEFAULT_AGENT_NAME`` is used instead.
+        Returns:
+            ``(records, validation_errors)``. ``records`` is empty when the
+            loader returns ``None`` for metadata or scores, when the
+            metadata sets ``hide_from_leaderboard``, or when there are no
+            score entries. ``validation_errors`` is the third value returned
+            by the loader and is passed back on every path, including the
+            early returns.
+        """
+        records: list[dict] = []
+        metadata, scores, errors = load_and_validate_agent_data(agent_dir)
+        # Nothing usable was loaded: still hand the validation errors back
+        # so the caller can log them.
+        if metadata is None or scores is None:
+            return records, errors
+        # Hidden submissions contribute no rows, but their validation errors
+        # are still reported to the caller.
+        if metadata.get('hide_from_leaderboard', False):
+            logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
+            return records, errors
+        # Resolve the agent display name. Prefer the value stamped into
+        # metadata.json by push-to-index; fall back to the directory's
+        # default (e.g. "Claude Code" for acp-claude/) and finally to
+        # "OpenHands" for legacy results/ entries that predate the field.
+        # The chain uses ``or``, so an empty-string name also falls through.
+        agent_name = (
+            metadata.get('agent_name')
+            or default_agent_name
+            or self.DEFAULT_AGENT_NAME
+        )
+        # One record per score entry: metadata-level fields are repeated on
+        # every record, score-level fields are read from the entry itself.
+        for score_entry in scores:
+            record = {
+                'agent_name': agent_name,
+                'agent_version': metadata.get('agent_version', 'Unknown'),
+                'llm_base': metadata.get('model', 'unknown'),
+                'openness': metadata.get('openness', 'unknown'),
+                # The entry's own submission_time wins; the metadata value
+                # is only the fallback.
+                'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
+                'release_date': metadata.get('release_date', ''),
+                'parameter_count_b': metadata.get('parameter_count_b'),
+                'active_parameter_count_b': metadata.get('active_parameter_count_b'),
+                'score': score_entry.get('score'),
+                'metric': score_entry.get('metric', 'unknown'),
+                'cost_per_instance': score_entry.get('cost_per_instance'),
+                'average_runtime': score_entry.get('average_runtime'),
+                # Single-element list holding the entry's benchmark name
+                # (``None`` when the entry has no 'benchmark' key).
+                'tags': [score_entry.get('benchmark')],
+                'full_archive': score_entry.get('full_archive', ''),
+                'eval_visualization_page': score_entry.get('eval_visualization_page', ''),
+            }
+            records.append(record)
+        return records, errors
     def _load_from_agent_dirs(self):
+        """Load agent records based on ``self.agent_filter``.
+        - ``"openhands"`` (default): only ``{config}/results/{model}/``,
+          which is the canonical OpenHands leaderboard. The Home page and
+          the per-category subpages use this.
+        - ``"alternative"``: only
+          ``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
+          acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
+          Alternative Agents page uses this.
+        Returns ``None`` if no records were found (which makes the caller
+        render an empty-state placeholder).
+        """
         all_records = []
         all_validation_errors = []
+        if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
+            # Default OpenHands agent results
+            results_dir = self.config_path / "results"
+            if results_dir.exists():
+                for agent_dir in results_dir.iterdir():
+                    if not agent_dir.is_dir():
+                        continue
+                    records, errors = self._records_from_agent_dir(agent_dir)
+                    all_records.extend(records)
+                    all_validation_errors.extend(errors)
+        else:
+            # Alternative agents (one subdirectory per agent_type, then per model)
+            # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
+            # map in OpenHands/evaluation push_to_index_from_archive.py — keeping
+            # it in sync ensures rows are labelled the same way the index repo
+            # records them.
+            agent_type_default_name = {
+                'acp-claude': 'Claude Code',
+                'acp-codex': 'Codex',
+                'acp-gemini': 'Gemini CLI',
+                'openhands_subagents': 'OpenHands Sub-agents',
+            }
+            alt_dir = self.config_path / "alternative_agents"
+            if alt_dir.exists():
+                for type_dir in alt_dir.iterdir():
+                    if not type_dir.is_dir():
+                        continue
+                    default_name = agent_type_default_name.get(type_dir.name)
+                    for agent_dir in type_dir.iterdir():
+                        if not agent_dir.is_dir():
+                            continue
+                        records, errors = self._records_from_agent_dir(
+                            agent_dir, default_agent_name=default_name
+                        )
+                        all_records.extend(records)
+                        all_validation_errors.extend(errors)
         # Log validation errors if any
         if all_validation_errors:
             logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
                 logger.warning(f"  - {error}")
             if len(all_validation_errors) > 5:
                 logger.warning(f"  ... and {len(all_validation_errors) - 5} more")
         if not all_records:
+            return None  # Caller will render empty-state placeholder
         return pd.DataFrame(all_records)
     def _load(self):
             # Group by agent (version + model combination) to aggregate results across datasets
             transformed_records = []
+            # Create a unique identifier per (agent_name, agent_version, model)
+            # tuple. Including agent_name keeps an OpenHands run and a Claude
+            # Code run on the same SDK version + model from collapsing into
+            # one row when both submit to the leaderboard.
+            df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
+            df['agent_id'] = (
+                df['agent_name'].astype(str)
+                + '_' + df['agent_version'].astype(str)
+                + '_' + df['llm_base'].astype(str)
+            )
             for agent_id in df['agent_id'].unique():
                 agent_records = df[df['agent_id'] == agent_id]
                 # Build a single record for this agent
                 first_record = agent_records.iloc[0]
                 agent_version = first_record['agent_version']
+                agent_name = first_record['agent_name']
                 # Normalize openness to "open" or "closed"
                 from aliases import OPENNESS_MAPPING
                 raw_openness = first_record['openness']
                 normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
                 # All 5 categories for the leaderboard
                 ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
                 record = {
                     # Core agent info - use final display names
+                    'agent_name': agent_name,  # Will become "Agent"
                     'SDK version': agent_version,  # Will become "SDK Version"
                     'Language model': first_record['llm_base'],  # Will become "Language Model"
                     'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                     'parameter_count_b': first_record.get('parameter_count_b'),  # Total params in billions
                     'active_parameter_count_b': first_record.get('active_parameter_count_b'),  # Active params for MoE
                     # Additional columns expected by the transformer
+                    # Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
                     'id': agent_id,
                     'source': first_record.get('source', ''),  # Will become "Source"
                     'logs': first_record.get('logs', ''),  # Will become "Logs"

ui_components.py CHANGED Viewed

@@ -508,28 +508,36 @@ class DummyViewer:
         # The _load method returns the error DataFrame and an empty tag map
         return self._error_df, {}
-def get_leaderboard_viewer_instance(split: str):
     """
-    Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
-    re-downloading data. On error, returns a stable DummyViewer object.
     """
     global CACHED_VIEWERS, CACHED_TAG_MAPS
     with _cache_lock:
-        if split in CACHED_VIEWERS:
             # Cache hit: return the cached viewer and tag map
-            return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
     # --- Cache miss: try to load data from the source ---
     try:
         # First try to load from extracted data directory (local mock data)
         data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
-        print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
         viewer = SimpleLeaderboardViewer(
             data_dir=data_dir,
             config=CONFIG_NAME,
-            split=split
         )
         # Simplify tag map creation
@@ -537,14 +545,14 @@ def get_leaderboard_viewer_instance(split: str):
         # Cache the results for next time (thread-safe)
         with _cache_lock:
-            CACHED_VIEWERS[split] = viewer
-            CACHED_TAG_MAPS[split] = pretty_tag_map  # Cache the pretty map directly
         return viewer, pretty_tag_map
     except Exception as e:
         # On ANY error, create a consistent error message and cache a DummyViewer
-        error_message = f"Error loading data for split '{split}': {e}"
         print(format_error(error_message))
         dummy_df = pd.DataFrame({"Message": [error_message]})
@@ -553,8 +561,8 @@ def get_leaderboard_viewer_instance(split: str):
         # Cache the dummy objects so we don't try to fetch again on this run
         with _cache_lock:
-            CACHED_VIEWERS[split] = dummy_viewer
-            CACHED_TAG_MAPS[split] = dummy_tag_map
         return dummy_viewer, dummy_tag_map
@@ -1268,12 +1276,17 @@ def create_benchmark_details_display(
             legend_markdown = create_legend_markdown(benchmark_name)
             gr.HTML(value=legend_markdown, elem_id="legend-markdown")
-def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
     """
-    Loads and transforms the complete dataset for a given split.
-    This function handles caching and returns the final "pretty" DataFrame and tag map.
     """
-    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
     if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
         raw_df, _ = viewer_or_data._load()

         # The _load method returns the error DataFrame and an empty tag map
         return self._error_df, {}
+def get_leaderboard_viewer_instance(
+    split: str,
+    agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
+):
     """
+    Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
+    thread-safe cache to avoid re-downloading data. The cache is keyed on
+    both axes so the OpenHands and Alternative Agents pages don't fight
+    over a single slot. On error, returns a stable DummyViewer.
     """
     global CACHED_VIEWERS, CACHED_TAG_MAPS
+    cache_key = (split, agent_filter)
     with _cache_lock:
+        if cache_key in CACHED_VIEWERS:
             # Cache hit: return the cached viewer and tag map
+            return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
     # --- Cache miss: try to load data from the source ---
     try:
         # First try to load from extracted data directory (local mock data)
         data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
+        print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
         viewer = SimpleLeaderboardViewer(
             data_dir=data_dir,
             config=CONFIG_NAME,
+            split=split,
+            agent_filter=agent_filter,
         )
         # Simplify tag map creation
         # Cache the results for next time (thread-safe)
         with _cache_lock:
+            CACHED_VIEWERS[cache_key] = viewer
+            CACHED_TAG_MAPS[cache_key] = pretty_tag_map  # Cache the pretty map directly
         return viewer, pretty_tag_map
     except Exception as e:
         # On ANY error, create a consistent error message and cache a DummyViewer
+        error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
         print(format_error(error_message))
         dummy_df = pd.DataFrame({"Message": [error_message]})
         # Cache the dummy objects so we don't try to fetch again on this run
         with _cache_lock:
+            CACHED_VIEWERS[cache_key] = dummy_viewer
+            CACHED_TAG_MAPS[cache_key] = dummy_tag_map
         return dummy_viewer, dummy_tag_map
             legend_markdown = create_legend_markdown(benchmark_name)
             gr.HTML(value=legend_markdown, elem_id="legend-markdown")
+def get_full_leaderboard_data(
+    split: str,
+    agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
+) -> tuple[pd.DataFrame, dict]:
     """
+    Loads and transforms the complete dataset for a (split, agent_filter)
+    pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
+    that don't pass it stay on the canonical leaderboard. The Alternative
+    Agents page passes ``"alternative"`` to get the third-party harnesses.
     """
+    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
     if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
         raw_df, _ = viewer_or_data._load()