File size: 3,464 Bytes
6d3b657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70749cd
 
 
 
 
6d3b657
 
 
70749cd
6d3b657
 
 
 
 
 
 
 
 
 
 
 
 
 
70749cd
 
 
 
 
6d3b657
 
 
 
 
70749cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d3b657
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70749cd
 
6d3b657
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""Alternative Agents leaderboard page.

The canonical OpenHands Index leaderboard (Home + the per-category pages)
ranks default OpenHands agent runs from ``results/{model}/`` in the
openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
Gemini CLI, OpenHands Sub-agents, ...) live under
``alternative_agents/{type}/{model}/`` and aren't directly comparable to
default OpenHands runs (different scaffolds, different cost/runtime
characteristics), so they get their own standalone page instead of being
mixed into the same ranking.

This page is intentionally a single Overall view (no per-category
subpages) — the alternative-agents dataset is small (one row per
harness × model) and the goal is "show me all the alternatives at a
glance", not "drill into Issue Resolution for Codex".

To make same-model comparisons easier, the page also appends canonical
OpenHands rows for any language model that appears in the alternative
agent dataset. The match is exact, so ``Gemini-3-Pro`` and
``Gemini-3.1-Pro`` remain distinct entries.
"""
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import gradio as gr

from simple_data_loader import SimpleLeaderboardViewer
from ui_components import (
    create_leaderboard_display,
    get_full_leaderboard_data,
)


# Static HTML intro block for the page: a heading plus one paragraph noting
# that canonical OpenHands rows are included for exact model matches and that
# cost/runtime numbers are not comparable across harnesses. It is passed to
# ``gr.HTML`` by ``build_page``.
ALTERNATIVE_AGENTS_INTRO = """
<div id="alternative-agents-intro">
  <h2>Alternative Agents</h2>
  <p>
    Third-party agent harnesses running the OpenHands Index benchmarks.
    To make direct comparisons easier, this page also includes the
    canonical OpenHands row whenever the exact same language model appears
    under an alternative harness. Cost and runtime numbers still come from
    each harness's own instrumentation and aren't directly comparable
    across harnesses.
  </p>
</div>
"""


def _append_openhands_shared_models(
    alternative_df: pd.DataFrame,
    split: str,
) -> pd.DataFrame:
    """Return ``alternative_df`` with matching canonical OpenHands rows appended.

    Canonical rows are fetched through ``get_full_leaderboard_data`` using
    ``SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS``. A canonical row is
    kept only when its ``"Language Model"`` value, compared as a
    whitespace-stripped string, exactly equals one of the (non-null) values
    found in ``alternative_df``.

    Args:
        alternative_df: Leaderboard rows for the alternative harnesses.
        split: Split name forwarded unchanged to ``get_full_leaderboard_data``.

    Returns:
        ``alternative_df`` itself (the same object) when either frame is
        empty or lacks a ``"Language Model"`` column, when the alternative
        frame has no non-null model names, or when no canonical row matches.
        Otherwise a new frame with a fresh integer index: the alternative
        rows followed by the matching canonical rows.
    """
    model_col = "Language Model"

    def _lacks_models(frame: pd.DataFrame) -> bool:
        # A frame is unusable for matching if it has no rows or no model column.
        return frame.empty or model_col not in frame.columns

    if _lacks_models(alternative_df):
        return alternative_df

    # Only the dataframe is needed here; the second returned value is ignored.
    canonical_df, _ = get_full_leaderboard_data(
        split,
        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
    )
    if _lacks_models(canonical_df):
        return alternative_df

    normalized_alt_names = alternative_df[model_col].dropna().astype(str).str.strip()
    wanted_models = set(normalized_alt_names)
    if not wanted_models:
        return alternative_df

    # Exact string match after stripping, so near-identical model names
    # stay distinct.
    normalized_canonical_names = canonical_df[model_col].astype(str).str.strip()
    is_shared = normalized_canonical_names.isin(wanted_models)
    shared_rows = canonical_df[is_shared].copy()
    if shared_rows.empty:
        return alternative_df

    return pd.concat(
        [alternative_df, shared_rows],
        ignore_index=True,
        sort=False,
    )


def build_page():
    """Build the Alternative Agents page components.

    Emits the intro HTML and a horizontal rule, then loads the ``"test"``
    split filtered by ``SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE``.
    If rows exist, they are passed through
    ``_append_openhands_shared_models`` and shown as one "Overall" view via
    ``create_leaderboard_display``. Otherwise a placeholder note is rendered.
    Returns ``None`` in both cases.
    """
    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
    gr.Markdown("---")

    split = "test"
    alternative_df, alternative_tag_map = get_full_leaderboard_data(
        split,
        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
    )

    if not alternative_df.empty:
        # The tag map comes from the alternative-agent load; only the
        # dataframe is extended with the canonical OpenHands rows.
        combined_df = _append_openhands_shared_models(alternative_df, split=split)
        create_leaderboard_display(
            full_df=combined_df,
            tag_map=alternative_tag_map,
            category_name="Overall",
            split_name=split,
        )
        return

    # Nothing to rank yet: show where new submissions are expected to land.
    gr.Markdown(
        "No alternative agent submissions yet. New runs land in "
        "`alternative_agents/{type}/{model}/` in "
        "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
    )