Spaces:
Running
Running
| """Alternative Agents leaderboard page. | |
| The canonical OpenHands Index leaderboard (Home + the per-category pages) | |
| ranks default OpenHands agent runs from ``results/{model}/`` in the | |
| openhands-index-results repo. Third-party harnesses (Claude Code, Codex, | |
| Gemini CLI, OpenHands Sub-agents, ...) live under | |
| ``alternative_agents/{type}/{model}/`` and aren't directly comparable to | |
| default OpenHands runs (different scaffolds, different cost/runtime | |
| characteristics), so they get their own standalone page instead of being | |
| mixed into the same ranking. | |
| This page is intentionally a single Overall view (no per-category | |
| subpages) — the alternative-agents dataset is small (one row per | |
| harness × model) and the goal is "show me all the alternatives at a | |
| glance", not "drill into Issue Resolution for Codex". | |
| To make same-model comparisons easier, the page also appends canonical | |
| OpenHands rows for any language model that appears in the alternative | |
| agent dataset. The match is exact, so ``Gemini-3-Pro`` and | |
| ``Gemini-3.1-Pro`` remain distinct entries. | |
| """ | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import pandas as pd | |
| import gradio as gr | |
| from simple_data_loader import SimpleLeaderboardViewer | |
| from ui_components import ( | |
| create_leaderboard_display, | |
| get_full_leaderboard_data, | |
| ) | |
# Introductory HTML for the Alternative Agents page; build_page() passes this
# string to gr.HTML before anything else is rendered.
ALTERNATIVE_AGENTS_INTRO = """
<div id="alternative-agents-intro">
<h2>Alternative Agents</h2>
<p>
Third-party agent harnesses running the OpenHands Index benchmarks.
To make direct comparisons easier, this page also includes the
canonical OpenHands row whenever the exact same language model appears
under an alternative harness. Cost and runtime numbers still come from
each harness's own instrumentation and aren't directly comparable
across harnesses.
</p>
</div>
"""
| def _append_openhands_shared_models( | |
| alternative_df: pd.DataFrame, | |
| split: str, | |
| ) -> pd.DataFrame: | |
| if alternative_df.empty or "Language Model" not in alternative_df.columns: | |
| return alternative_df | |
| openhands_df, _ = get_full_leaderboard_data( | |
| split, | |
| agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS, | |
| ) | |
| if openhands_df.empty or "Language Model" not in openhands_df.columns: | |
| return alternative_df | |
| alternative_models = set( | |
| alternative_df["Language Model"].dropna().astype(str).str.strip() | |
| ) | |
| if not alternative_models: | |
| return alternative_df | |
| openhands_shared_df = openhands_df[ | |
| openhands_df["Language Model"].astype(str).str.strip().isin(alternative_models) | |
| ].copy() | |
| if openhands_shared_df.empty: | |
| return alternative_df | |
| return pd.concat([alternative_df, openhands_shared_df], ignore_index=True, sort=False) | |
def build_page():
    """Render the Alternative Agents leaderboard page.

    Emits the intro HTML and a horizontal rule, then loads the "test" split
    filtered to alternative agents. When that data is empty, a short notice
    pointing at the results repository is shown instead of a table.
    Otherwise the canonical OpenHands rows for shared models are appended and
    a single "Overall" leaderboard is displayed, using the tag map returned
    with the alternative-agent data.

    NOTE(review): presumably called inside an open Gradio layout context so
    the components attach to the page -- confirm against the caller.
    """
    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
    gr.Markdown("---")

    split = "test"
    alt_df, alt_tag_map = get_full_leaderboard_data(
        split,
        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
    )

    if not alt_df.empty:
        combined_df = _append_openhands_shared_models(alt_df, split=split)
        create_leaderboard_display(
            full_df=combined_df,
            tag_map=alt_tag_map,
            category_name="Overall",
            split_name=split,
        )
        return

    # Nothing submitted yet: tell the reader where new runs will appear.
    empty_notice = (
        "No alternative agent submissions yet. "
        "New runs land in `alternative_agents/{type}/{model}/` in "
        "[openhands-index-results]"
        "(https://github.com/OpenHands/openhands-index-results)."
    )
    gr.Markdown(empty_notice)