openhands-index / alternative_agents_page.py
openhands
Show OpenHands on alternate agents page for shared models
70749cd
raw
history blame
3.46 kB
"""Alternative Agents leaderboard page.
The canonical OpenHands Index leaderboard (Home + the per-category pages)
ranks default OpenHands agent runs from ``results/{model}/`` in the
openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
Gemini CLI, OpenHands Sub-agents, ...) live under
``alternative_agents/{type}/{model}/`` and aren't directly comparable to
default OpenHands runs (different scaffolds, different cost/runtime
characteristics), so they get their own standalone page instead of being
mixed into the same ranking.
This page is intentionally a single Overall view (no per-category
subpages) — the alternative-agents dataset is small (one row per
harness × model) and the goal is "show me all the alternatives at a
glance", not "drill into Issue Resolution for Codex".
To make same-model comparisons easier, the page also appends canonical
OpenHands rows for any language model that appears in the alternative
agent dataset. The match is exact, so ``Gemini-3-Pro`` and
``Gemini-3.1-Pro`` remain distinct entries.
"""
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import gradio as gr
from simple_data_loader import SimpleLeaderboardViewer
from ui_components import (
create_leaderboard_display,
get_full_leaderboard_data,
)
# HTML introduction for the Alternative Agents page; build_page() passes it
# to gr.HTML as the first element of the page.
ALTERNATIVE_AGENTS_INTRO = """
<div id="alternative-agents-intro">
<h2>Alternative Agents</h2>
<p>
Third-party agent harnesses running the OpenHands Index benchmarks.
To make direct comparisons easier, this page also includes the
canonical OpenHands row whenever the exact same language model appears
under an alternative harness. Cost and runtime numbers still come from
each harness's own instrumentation and aren't directly comparable
across harnesses.
</p>
</div>
"""
def _append_openhands_shared_models(
    alternative_df: pd.DataFrame,
    split: str,
) -> pd.DataFrame:
    """Append OpenHands rows for models shared with the alternative agents.

    Rows are loaded through ``get_full_leaderboard_data`` using the
    ``AGENT_FILTER_OPENHANDS`` filter for the given ``split``. A row is kept
    when its "Language Model" value, converted to ``str`` and stripped of
    surrounding whitespace, exactly equals a (likewise normalised, non-null)
    "Language Model" value found in ``alternative_df``.

    Args:
        alternative_df: Leaderboard rows for the alternative agents.
        split: Split name forwarded to ``get_full_leaderboard_data``.

    Returns:
        ``alternative_df`` itself (the same object) when either frame is
        empty or lacks a "Language Model" column, or when no model is shared;
        otherwise a new frame with the matching OpenHands rows appended after
        the alternative rows and a fresh integer index.
    """
    model_column = "Language Model"

    def _has_no_models(frame: pd.DataFrame) -> bool:
        # A frame is unusable when it has no rows or no model column.
        return frame.empty or model_column not in frame.columns

    if _has_no_models(alternative_df):
        return alternative_df

    loaded = get_full_leaderboard_data(
        split,
        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
    )
    openhands_df, _ = loaded
    if _has_no_models(openhands_df):
        return alternative_df

    # Normalised model names present on the alternative side (nulls ignored).
    alt_names = alternative_df[model_column].dropna()
    wanted_models = set(alt_names.astype(str).str.strip())
    if not wanted_models:
        return alternative_df

    # Same normalisation on the OpenHands side before the exact-match test.
    openhands_names = openhands_df[model_column].astype(str).str.strip()
    is_shared = openhands_names.isin(wanted_models)
    shared_rows = openhands_df[is_shared].copy()
    if shared_rows.empty:
        return alternative_df

    frames = [alternative_df, shared_rows]
    return pd.concat(frames, ignore_index=True, sort=False)
def build_page():
    """Render the Alternative Agents page into the current Gradio context.

    Emits the intro HTML and a horizontal rule, then loads the "test" split
    with the ``AGENT_FILTER_ALTERNATIVE`` filter. When that load is empty a
    placeholder message is shown; otherwise the rows are extended by
    ``_append_openhands_shared_models`` and displayed as a single "Overall"
    leaderboard.
    """
    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
    gr.Markdown("---")

    split = "test"
    alternative_rows, tag_map = get_full_leaderboard_data(
        split,
        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
    )

    if not alternative_rows.empty:
        # NOTE(review): tag_map comes from the alternative-agent load only and
        # is passed on unchanged after OpenHands rows are appended - confirm
        # the display does not need entries for the appended rows.
        combined_rows = _append_openhands_shared_models(
            alternative_rows, split=split
        )
        create_leaderboard_display(
            full_df=combined_rows,
            tag_map=tag_map,
            category_name="Overall",
            split_name=split,
        )
        return

    # Nothing to rank yet: point readers at where new submissions land.
    gr.Markdown(
        "No alternative agent submissions yet. New runs land in "
        "`alternative_agents/{type}/{model}/` in "
        "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
    )