Spaces:
Running
Running
Show ACP agent results in the leaderboard
#11
by simonrosenberg1 - opened
- .gitattributes +1 -0
- alternative_agents_page.py +65 -0
- app.py +4 -0
- assets/harnesses/README.md +59 -0
- assets/harnesses/claude-code.svg +1 -0
- assets/harnesses/codex-cli.svg +1 -0
- assets/harnesses/gemini-cli.svg +1 -0
- assets/harnesses/openhands.svg +1 -0
- docs/screenshots/alternative-agents.png +3 -0
- leaderboard_transformer.py +195 -49
- setup_data.py +17 -5
- simple_data_loader.py +155 -59
- ui_components.py +30 -17
.gitattributes
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
|
|
|
|
|
|
| 1 |
|
| 2 |
+
docs/screenshots/alternative-agents.png filter=lfs diff=lfs merge=lfs -text
|
alternative_agents_page.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Alternative Agents leaderboard page.
|
| 2 |
+
|
| 3 |
+
The canonical OpenHands Index leaderboard (Home + the per-category pages)
|
| 4 |
+
ranks default OpenHands agent runs from ``results/{model}/`` in the
|
| 5 |
+
openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
|
| 6 |
+
Gemini CLI, OpenHands Sub-agents, ...) live under
|
| 7 |
+
``alternative_agents/{type}/{model}/`` and aren't directly comparable to
|
| 8 |
+
default OpenHands runs (different scaffolds, different cost/runtime
|
| 9 |
+
characteristics), so they get their own standalone page instead of being
|
| 10 |
+
mixed into the same ranking.
|
| 11 |
+
|
| 12 |
+
This page is intentionally a single Overall view (no per-category
|
| 13 |
+
subpages) — the alternative-agents dataset is small (one row per
|
| 14 |
+
harness × model) and the goal is "show me all the alternatives at a
|
| 15 |
+
glance", not "drill into Issue Resolution for Codex".
|
| 16 |
+
"""
|
| 17 |
+
import matplotlib
|
| 18 |
+
matplotlib.use('Agg')
|
| 19 |
+
import gradio as gr
|
| 20 |
+
|
| 21 |
+
from simple_data_loader import SimpleLeaderboardViewer
|
| 22 |
+
from ui_components import (
|
| 23 |
+
create_leaderboard_display,
|
| 24 |
+
get_full_leaderboard_data,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
ALTERNATIVE_AGENTS_INTRO = """
|
| 29 |
+
<div id="alternative-agents-intro">
|
| 30 |
+
<h2>Alternative Agents</h2>
|
| 31 |
+
<p>
|
| 32 |
+
Third-party agent harnesses running the OpenHands Index benchmarks.
|
| 33 |
+
These rows aren't part of the OpenHands ranking on the
|
| 34 |
+
<a href="/home">Home</a> page — they're tracked here as a comparison
|
| 35 |
+
point. Cost and runtime numbers come from each harness's own
|
| 36 |
+
instrumentation and aren't directly comparable across harnesses.
|
| 37 |
+
</p>
|
| 38 |
+
</div>
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def build_page():
|
| 43 |
+
gr.HTML(ALTERNATIVE_AGENTS_INTRO)
|
| 44 |
+
|
| 45 |
+
gr.Markdown("---")
|
| 46 |
+
|
| 47 |
+
test_df, test_tag_map = get_full_leaderboard_data(
|
| 48 |
+
"test",
|
| 49 |
+
agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
if test_df.empty:
|
| 53 |
+
gr.Markdown(
|
| 54 |
+
"No alternative agent submissions yet. New runs land in "
|
| 55 |
+
"`alternative_agents/{type}/{model}/` in "
|
| 56 |
+
"[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
|
| 57 |
+
)
|
| 58 |
+
return
|
| 59 |
+
|
| 60 |
+
create_leaderboard_display(
|
| 61 |
+
full_df=test_df,
|
| 62 |
+
tag_map=test_tag_map,
|
| 63 |
+
category_name="Overall",
|
| 64 |
+
split_name="test",
|
| 65 |
+
)
|
app.py
CHANGED
|
@@ -35,6 +35,7 @@ from app_creation import build_page as build_app_creation_page
|
|
| 35 |
from frontend_development import build_page as build_frontend_page
|
| 36 |
from test_generation import build_page as build_test_generation_page
|
| 37 |
from information_gathering import build_page as build_information_gathering_page
|
|
|
|
| 38 |
from about import build_page as build_about_page
|
| 39 |
|
| 40 |
logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
|
|
@@ -373,6 +374,9 @@ with demo.route("Testing", "/testing"):
|
|
| 373 |
with demo.route("Information Gathering", "/information-gathering"):
|
| 374 |
build_information_gathering_page()
|
| 375 |
|
|
|
|
|
|
|
|
|
|
| 376 |
with demo.route("About", "/about"):
|
| 377 |
build_about_page()
|
| 378 |
|
|
|
|
| 35 |
from frontend_development import build_page as build_frontend_page
|
| 36 |
from test_generation import build_page as build_test_generation_page
|
| 37 |
from information_gathering import build_page as build_information_gathering_page
|
| 38 |
+
from alternative_agents_page import build_page as build_alternative_agents_page
|
| 39 |
from about import build_page as build_about_page
|
| 40 |
|
| 41 |
logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
|
|
|
|
| 374 |
with demo.route("Information Gathering", "/information-gathering"):
|
| 375 |
build_information_gathering_page()
|
| 376 |
|
| 377 |
+
with demo.route("Alternative Agents", "/alternative-agents"):
|
| 378 |
+
build_alternative_agents_page()
|
| 379 |
+
|
| 380 |
with demo.route("About", "/about"):
|
| 381 |
build_about_page()
|
| 382 |
|
assets/harnesses/README.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agent harness logos
|
| 2 |
+
|
| 3 |
+
This folder holds the **bottom half** of the composite scatter markers used
|
| 4 |
+
on the [Alternative Agents](../../alternative_agents_page.py) page. Each
|
| 5 |
+
point on that scatter stacks two logos: the model provider on top (from
|
| 6 |
+
`assets/logo-*.svg`) and the harness on the bottom (from this folder).
|
| 7 |
+
|
| 8 |
+
## Expected filenames
|
| 9 |
+
|
| 10 |
+
The scatter code looks up a logo by the exact `agent_name` string that the
|
| 11 |
+
`push-to-index` workflow writes into the index repo's `metadata.json`, then
|
| 12 |
+
maps it through `HARNESS_LOGO_STEMS` in `leaderboard_transformer.py`. Keep
|
| 13 |
+
these filenames in sync with that map.
|
| 14 |
+
|
| 15 |
+
| `agent_name` (in index repo) | File in this folder |
|
| 16 |
+
| --- | --- |
|
| 17 |
+
| `Claude Code` | `claude-code.svg` or `claude-code.png` |
|
| 18 |
+
| `Codex` | `codex-cli.svg` or `codex-cli.png` |
|
| 19 |
+
| `Gemini CLI` | `gemini-cli.svg` or `gemini-cli.png` |
|
| 20 |
+
| `OpenHands` | `openhands.svg` or `openhands.png` |
|
| 21 |
+
| `OpenHands Sub-agents` | `openhands.svg` or `openhands.png` (shared with `OpenHands`) |
|
| 22 |
+
|
| 23 |
+
Both `.svg` and `.png` are accepted — the resolver tries `.svg` first, then
|
| 24 |
+
`.png`. **Prefer SVG when possible**: the HuggingFace Space rejects new
|
| 25 |
+
binary files on plain `git push` and routes PNGs through Xet, so an SVG is
|
| 26 |
+
one less thing to set up.
|
| 27 |
+
|
| 28 |
+
## When a file is missing
|
| 29 |
+
|
| 30 |
+
The scatter falls back to a single marker (just the model provider logo) —
|
| 31 |
+
exactly the same rendering path the canonical OpenHands pages use. Nothing
|
| 32 |
+
crashes and nothing prints a warning in normal operation. This means you
|
| 33 |
+
can roll out logos one harness at a time without waiting for all four.
|
| 34 |
+
|
| 35 |
+
## Sizing and shape
|
| 36 |
+
|
| 37 |
+
- Square canvas. The composite marker is drawn at a fixed aspect ratio, so
|
| 38 |
+
a non-square logo will get squished.
|
| 39 |
+
- Any SVG `viewBox` works — the renderer base64-encodes the file as-is and
|
| 40 |
+
Plotly scales it to the marker's `sizex` / `sizey`. Around `80×80` to
|
| 41 |
+
`256×256` is a good source size.
|
| 42 |
+
- Leave some internal padding (≈10%) so the logo doesn't touch the marker
|
| 43 |
+
edge when two are stacked.
|
| 44 |
+
- No background is required, but a rounded-square coloured tile reads well
|
| 45 |
+
at small sizes because it gives each harness a distinct silhouette even
|
| 46 |
+
when the inner glyph isn't fully legible. Look at the existing
|
| 47 |
+
`assets/logo-*.svg` files for the canonical model provider logos if you
|
| 48 |
+
want a visual reference for sizing.
|
| 49 |
+
|
| 50 |
+
## Adding a new harness
|
| 51 |
+
|
| 52 |
+
1. Decide on the exact `agent_name` that the push-to-index workflow writes
|
| 53 |
+
for the new harness (see `AGENT_NAME_BY_TYPE` in
|
| 54 |
+
`OpenHands/evaluation/push-to-index-job/scripts/push_to_index_from_archive.py`).
|
| 55 |
+
2. Add an entry to `HARNESS_LOGO_STEMS` in
|
| 56 |
+
[`leaderboard_transformer.py`](../../leaderboard_transformer.py) that
|
| 57 |
+
maps the display name to a stem.
|
| 58 |
+
3. Drop `{stem}.svg` (or `.png`) into this folder.
|
| 59 |
+
4. Reload the app and look at `/alternative-agents`.
|
assets/harnesses/claude-code.svg
ADDED
|
|
assets/harnesses/codex-cli.svg
ADDED
|
|
assets/harnesses/gemini-cli.svg
ADDED
|
|
assets/harnesses/openhands.svg
ADDED
|
|
docs/screenshots/alternative-agents.png
ADDED
|
Git LFS Details
|
leaderboard_transformer.py
CHANGED
|
@@ -228,17 +228,17 @@ def get_country_from_model(model_name: str) -> dict:
|
|
| 228 |
def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
|
| 229 |
"""
|
| 230 |
Gets the appropriate icon based on the mark_by selection.
|
| 231 |
-
|
| 232 |
Args:
|
| 233 |
model_name: The model name
|
| 234 |
openness: The openness value (open/closed)
|
| 235 |
mark_by: One of "Company", "Openness", or "Country"
|
| 236 |
-
|
| 237 |
Returns:
|
| 238 |
dict with 'path' and 'name' keys
|
| 239 |
"""
|
| 240 |
from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
|
| 241 |
-
|
| 242 |
if mark_by == MARK_BY_OPENNESS:
|
| 243 |
return get_openness_icon(openness)
|
| 244 |
elif mark_by == MARK_BY_COUNTRY:
|
|
@@ -247,6 +247,59 @@ def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
|
|
| 247 |
return get_company_from_model(model_name)
|
| 248 |
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
# Standard layout configuration for all charts
|
| 251 |
STANDARD_LAYOUT = dict(
|
| 252 |
template="plotly_white",
|
|
@@ -655,6 +708,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 655 |
# Case 1: Handle fixed, special-case mappings first.
|
| 656 |
fixed_mappings = {
|
| 657 |
'id': 'id',
|
|
|
|
| 658 |
'SDK version': 'SDK Version',
|
| 659 |
'Openhands version': 'SDK Version', # Legacy support
|
| 660 |
'Language model': 'Language Model',
|
|
@@ -815,7 +869,21 @@ class DataTransformer:
|
|
| 815 |
df_view = df_sorted.copy()
|
| 816 |
|
| 817 |
# --- 3. Add Columns for Agent Openness ---
|
| 818 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 819 |
new_cols = ["Openness"]
|
| 820 |
ending_cols = ["Date", "Logs", "Visualization"]
|
| 821 |
|
|
@@ -1018,13 +1086,18 @@ def _plot_scatter_plotly(
|
|
| 1018 |
"""
|
| 1019 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 1020 |
Format: {lm_name} (SDK {version})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
Average Score: {score}
|
| 1022 |
Average Cost/Runtime: {value}
|
| 1023 |
Openness: {openness}
|
| 1024 |
"""
|
| 1025 |
h_pad = " "
|
| 1026 |
parts = ["<br>"]
|
| 1027 |
-
|
| 1028 |
# Get and clean the language model name
|
| 1029 |
llm_base_value = row.get('Language Model', '')
|
| 1030 |
llm_base_value = clean_llm_base_list(llm_base_value)
|
|
@@ -1032,13 +1105,21 @@ def _plot_scatter_plotly(
|
|
| 1032 |
lm_name = llm_base_value[0]
|
| 1033 |
else:
|
| 1034 |
lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
|
| 1035 |
-
|
| 1036 |
# Get SDK version
|
| 1037 |
sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
|
| 1038 |
-
|
| 1039 |
# Title line: {lm_name} (SDK {version})
|
| 1040 |
parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
|
| 1041 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1042 |
# Average Score
|
| 1043 |
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 1044 |
|
|
@@ -1111,51 +1192,116 @@ def _plot_scatter_plotly(
|
|
| 1111 |
y_min = min_score - 5 if min_score > 5 else 0
|
| 1112 |
y_max = max_score + 5
|
| 1113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1114 |
for _, row in data_plot.iterrows():
|
| 1115 |
model_name = row.get('Language Model', '')
|
| 1116 |
openness = row.get('Openness', '')
|
| 1117 |
marker_info = get_marker_icon(model_name, openness, mark_by)
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
|
| 1137 |
-
|
| 1138 |
-
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
-
|
| 1157 |
-
|
| 1158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1159 |
|
| 1160 |
# --- Section 7: Add Model Name Labels to Frontier Points ---
|
| 1161 |
if frontier_rows:
|
|
|
|
| 228 |
def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
|
| 229 |
"""
|
| 230 |
Gets the appropriate icon based on the mark_by selection.
|
| 231 |
+
|
| 232 |
Args:
|
| 233 |
model_name: The model name
|
| 234 |
openness: The openness value (open/closed)
|
| 235 |
mark_by: One of "Company", "Openness", or "Country"
|
| 236 |
+
|
| 237 |
Returns:
|
| 238 |
dict with 'path' and 'name' keys
|
| 239 |
"""
|
| 240 |
from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
|
| 241 |
+
|
| 242 |
if mark_by == MARK_BY_OPENNESS:
|
| 243 |
return get_openness_icon(openness)
|
| 244 |
elif mark_by == MARK_BY_COUNTRY:
|
|
|
|
| 247 |
return get_company_from_model(model_name)
|
| 248 |
|
| 249 |
|
| 250 |
+
# Map the agent_name stored in the index repo's metadata.json to a file stem
|
| 251 |
+
# inside assets/harnesses/. Kept in sync with AGENT_NAME_BY_TYPE in
|
| 252 |
+
# OpenHands/evaluation push_to_index_from_archive.py — if a new ACP harness
|
| 253 |
+
# lands there, add the corresponding display name and a matching stem here.
|
| 254 |
+
#
|
| 255 |
+
# The scatter plot looks for {stem}.svg first, then {stem}.png in
|
| 256 |
+
# assets/harnesses/. This repo intentionally ships only a README in that
|
| 257 |
+
# folder: drop the logo files in by hand (SVG preferred, PNG works too via
|
| 258 |
+
# HF Xet) and they'll be picked up on the next app restart. If the file is
|
| 259 |
+
# missing, get_harness_icon() returns None and the scatter falls back to the
|
| 260 |
+
# single-marker path — same rendering the canonical OpenHands pages use —
|
| 261 |
+
# so logos can be added one harness at a time without breaking anything.
|
| 262 |
+
HARNESS_LOGO_STEMS: dict[str, str] = {
|
| 263 |
+
"Claude Code": "claude-code",
|
| 264 |
+
"Codex": "codex-cli",
|
| 265 |
+
"Gemini CLI": "gemini-cli",
|
| 266 |
+
"OpenHands": "openhands",
|
| 267 |
+
"OpenHands Sub-agents": "openhands",
|
| 268 |
+
}
|
| 269 |
+
HARNESS_LOGO_DIR = "assets/harnesses"
|
| 270 |
+
HARNESS_LOGO_EXTENSIONS = ("svg", "png")
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def get_harness_icon(agent_name: Optional[str]) -> Optional[dict]:
|
| 274 |
+
"""Return {'path', 'name'} for the harness logo, or None if not usable.
|
| 275 |
+
|
| 276 |
+
Consumed by the Alternative Agents scatter plot to draw a composite
|
| 277 |
+
marker (model provider on top, harness on bottom). Returns None in any
|
| 278 |
+
of three cases, all of which make the caller skip the harness layer:
|
| 279 |
+
|
| 280 |
+
- ``agent_name`` is empty or missing from the dataframe row.
|
| 281 |
+
- ``agent_name`` isn't in ``HARNESS_LOGO_STEMS`` (new harness that
|
| 282 |
+
hasn't been registered yet — register it and drop in a logo).
|
| 283 |
+
- The logo file for that stem doesn't exist in ``assets/harnesses/``
|
| 284 |
+
yet (the repo ships only the README).
|
| 285 |
+
|
| 286 |
+
That third case is the important one: it lets the Alternative Agents
|
| 287 |
+
page work immediately after checkout even when the harness logo files
|
| 288 |
+
haven't been dropped in. The corresponding points just render like a
|
| 289 |
+
canonical-page marker (model logo only) until the file is added.
|
| 290 |
+
"""
|
| 291 |
+
if not agent_name:
|
| 292 |
+
return None
|
| 293 |
+
stem = HARNESS_LOGO_STEMS.get(str(agent_name).strip())
|
| 294 |
+
if stem is None:
|
| 295 |
+
return None
|
| 296 |
+
for ext in HARNESS_LOGO_EXTENSIONS:
|
| 297 |
+
path = f"{HARNESS_LOGO_DIR}/{stem}.{ext}"
|
| 298 |
+
if os.path.exists(path):
|
| 299 |
+
return {"path": path, "name": agent_name}
|
| 300 |
+
return None
|
| 301 |
+
|
| 302 |
+
|
| 303 |
# Standard layout configuration for all charts
|
| 304 |
STANDARD_LAYOUT = dict(
|
| 305 |
template="plotly_white",
|
|
|
|
| 708 |
# Case 1: Handle fixed, special-case mappings first.
|
| 709 |
fixed_mappings = {
|
| 710 |
'id': 'id',
|
| 711 |
+
'agent_name': 'Agent',
|
| 712 |
'SDK version': 'SDK Version',
|
| 713 |
'Openhands version': 'SDK Version', # Legacy support
|
| 714 |
'Language model': 'Language Model',
|
|
|
|
| 869 |
df_view = df_sorted.copy()
|
| 870 |
|
| 871 |
# --- 3. Add Columns for Agent Openness ---
|
| 872 |
+
# Only include the "Agent" column when the dataframe actually has
|
| 873 |
+
# more than one distinct agent. On the canonical OpenHands pages
|
| 874 |
+
# every row says "OpenHands", so adding the column is just noise;
|
| 875 |
+
# on the Alternative Agents page rows differ (Claude Code / Codex
|
| 876 |
+
# / Gemini CLI / OpenHands Sub-agents), so the column carries
|
| 877 |
+
# signal and disambiguates same-model rows from different
|
| 878 |
+
# harnesses.
|
| 879 |
+
has_mixed_agents = (
|
| 880 |
+
"Agent" in df_view.columns
|
| 881 |
+
and df_view["Agent"].dropna().nunique() > 1
|
| 882 |
+
)
|
| 883 |
+
if has_mixed_agents:
|
| 884 |
+
base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
|
| 885 |
+
else:
|
| 886 |
+
base_cols = ["id", "Language Model", "SDK Version", "Source"]
|
| 887 |
new_cols = ["Openness"]
|
| 888 |
ending_cols = ["Date", "Logs", "Visualization"]
|
| 889 |
|
|
|
|
| 1086 |
"""
|
| 1087 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 1088 |
Format: {lm_name} (SDK {version})
|
| 1089 |
+
Harness: {agent} (only when the row carries an Agent —
|
| 1090 |
+
Alternative Agents page only; the
|
| 1091 |
+
canonical OpenHands pages drop the
|
| 1092 |
+
Agent column in view() so this line
|
| 1093 |
+
is skipped there)
|
| 1094 |
Average Score: {score}
|
| 1095 |
Average Cost/Runtime: {value}
|
| 1096 |
Openness: {openness}
|
| 1097 |
"""
|
| 1098 |
h_pad = " "
|
| 1099 |
parts = ["<br>"]
|
| 1100 |
+
|
| 1101 |
# Get and clean the language model name
|
| 1102 |
llm_base_value = row.get('Language Model', '')
|
| 1103 |
llm_base_value = clean_llm_base_list(llm_base_value)
|
|
|
|
| 1105 |
lm_name = llm_base_value[0]
|
| 1106 |
else:
|
| 1107 |
lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
|
| 1108 |
+
|
| 1109 |
# Get SDK version
|
| 1110 |
sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
|
| 1111 |
+
|
| 1112 |
# Title line: {lm_name} (SDK {version})
|
| 1113 |
parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
|
| 1114 |
+
|
| 1115 |
+
# Harness line — only on pages where the Agent column is present
|
| 1116 |
+
# (Alternative Agents). Without this, two rows for the same LM run
|
| 1117 |
+
# under different harnesses (e.g. Claude Code vs OpenHands Sub-agents
|
| 1118 |
+
# on claude-sonnet-4-5) are indistinguishable on hover.
|
| 1119 |
+
agent_value = row.get('Agent')
|
| 1120 |
+
if agent_value is not None and pd.notna(agent_value) and str(agent_value).strip():
|
| 1121 |
+
parts.append(f"{h_pad}Harness: <b>{agent_value}</b>{h_pad}<br>")
|
| 1122 |
+
|
| 1123 |
# Average Score
|
| 1124 |
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 1125 |
|
|
|
|
| 1192 |
y_min = min_score - 5 if min_score > 5 else 0
|
| 1193 |
y_max = max_score + 5
|
| 1194 |
|
| 1195 |
+
# Cache base64-encoded logos across rows — every Claude model on the
|
| 1196 |
+
# Alternative Agents page points at the same assets/harnesses/claude-code.svg,
|
| 1197 |
+
# so encoding once per path is ~N× cheaper than once per point.
|
| 1198 |
+
_logo_cache: dict[str, str] = {}
|
| 1199 |
+
def _encode_logo(path: str) -> Optional[str]:
|
| 1200 |
+
if path in _logo_cache:
|
| 1201 |
+
return _logo_cache[path]
|
| 1202 |
+
if not os.path.exists(path):
|
| 1203 |
+
return None
|
| 1204 |
+
try:
|
| 1205 |
+
with open(path, "rb") as f:
|
| 1206 |
+
encoded = base64.b64encode(f.read()).decode("utf-8")
|
| 1207 |
+
except Exception as e:
|
| 1208 |
+
logger.warning(f"Could not load logo {path}: {e}")
|
| 1209 |
+
return None
|
| 1210 |
+
mime = "svg+xml" if path.lower().endswith(".svg") else "png"
|
| 1211 |
+
uri = f"data:image/{mime};base64,{encoded}"
|
| 1212 |
+
_logo_cache[path] = uri
|
| 1213 |
+
return uri
|
| 1214 |
+
|
| 1215 |
+
# Composite markers: on the Alternative Agents page the dataframe carries
|
| 1216 |
+
# an "Agent" column (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents),
|
| 1217 |
+
# so a point for claude-sonnet-4-5 under Claude Code and under OpenHands
|
| 1218 |
+
# Sub-agents would otherwise share the exact same Anthropic logo marker
|
| 1219 |
+
# and be visually indistinguishable. When Agent is present, we stack
|
| 1220 |
+
# two logos at each point: model provider on top, harness on the bottom.
|
| 1221 |
+
# Canonical OpenHands pages drop the Agent column in view() (via the
|
| 1222 |
+
# has_mixed_agents check), so they fall through to the single-logo path
|
| 1223 |
+
# and render exactly as before.
|
| 1224 |
+
has_harness_column = (
|
| 1225 |
+
"Agent" in data_plot.columns
|
| 1226 |
+
and data_plot["Agent"].dropna().astype(str).str.strip().ne("").any()
|
| 1227 |
+
)
|
| 1228 |
+
|
| 1229 |
+
# Marker sizes. The composite variant fits two logos inside roughly the
|
| 1230 |
+
# same vertical footprint as a single marker, so each half is slightly
|
| 1231 |
+
# smaller and the two halves are offset symmetrically around the point's
|
| 1232 |
+
# true y-coordinate.
|
| 1233 |
+
SINGLE_SIZE_X, SINGLE_SIZE_Y = 0.04, 0.06
|
| 1234 |
+
STACKED_SIZE_X, STACKED_SIZE_Y = 0.035, 0.048
|
| 1235 |
+
STACKED_Y_OFFSET = 0.028 # half-separation between model (top) and harness (bottom)
|
| 1236 |
+
|
| 1237 |
for _, row in data_plot.iterrows():
|
| 1238 |
model_name = row.get('Language Model', '')
|
| 1239 |
openness = row.get('Openness', '')
|
| 1240 |
marker_info = get_marker_icon(model_name, openness, mark_by)
|
| 1241 |
+
model_logo_uri = _encode_logo(marker_info['path'])
|
| 1242 |
+
if model_logo_uri is None:
|
| 1243 |
+
continue
|
| 1244 |
+
|
| 1245 |
+
# Harness (only meaningful when the dataframe carries an Agent column).
|
| 1246 |
+
harness_uri = None
|
| 1247 |
+
if has_harness_column:
|
| 1248 |
+
harness_info = get_harness_icon(row.get("Agent"))
|
| 1249 |
+
if harness_info is not None:
|
| 1250 |
+
harness_uri = _encode_logo(harness_info["path"])
|
| 1251 |
+
|
| 1252 |
+
x_val = row[x_col_to_use]
|
| 1253 |
+
y_val = row[y_col_to_use]
|
| 1254 |
+
|
| 1255 |
+
# Convert to domain coordinates (0-1 range)
|
| 1256 |
+
# For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
|
| 1257 |
+
if x_val > 0:
|
| 1258 |
+
log_x = np.log10(x_val)
|
| 1259 |
+
domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
|
| 1260 |
+
else:
|
| 1261 |
+
domain_x = 0
|
| 1262 |
+
|
| 1263 |
+
# For linear y: domain_y = (y - y_min) / (y_max - y_min)
|
| 1264 |
+
domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
|
| 1265 |
+
|
| 1266 |
+
# Clamp to valid range
|
| 1267 |
+
domain_x = max(0, min(1, domain_x))
|
| 1268 |
+
domain_y = max(0, min(1, domain_y))
|
| 1269 |
+
|
| 1270 |
+
if harness_uri is not None:
|
| 1271 |
+
# Composite: stack model on top, harness on bottom, clamping
|
| 1272 |
+
# each half to the plot area so markers near the edges don't
|
| 1273 |
+
# drift off-canvas.
|
| 1274 |
+
model_y = min(1, domain_y + STACKED_Y_OFFSET)
|
| 1275 |
+
harness_y = max(0, domain_y - STACKED_Y_OFFSET)
|
| 1276 |
+
layout_images.append(dict(
|
| 1277 |
+
source=model_logo_uri,
|
| 1278 |
+
xref="x domain", yref="y domain",
|
| 1279 |
+
x=domain_x, y=model_y,
|
| 1280 |
+
sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
|
| 1281 |
+
xanchor="center", yanchor="middle",
|
| 1282 |
+
layer="above",
|
| 1283 |
+
))
|
| 1284 |
+
layout_images.append(dict(
|
| 1285 |
+
source=harness_uri,
|
| 1286 |
+
xref="x domain", yref="y domain",
|
| 1287 |
+
x=domain_x, y=harness_y,
|
| 1288 |
+
sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
|
| 1289 |
+
xanchor="center", yanchor="middle",
|
| 1290 |
+
layer="above",
|
| 1291 |
+
))
|
| 1292 |
+
else:
|
| 1293 |
+
# Single marker (canonical OpenHands pages, or Alternative Agents
|
| 1294 |
+
# rows with an unknown harness name — the latter shouldn't happen
|
| 1295 |
+
# in practice since HARNESS_LOGO_STEMS covers every agent_name the
|
| 1296 |
+
# push-to-index script emits).
|
| 1297 |
+
layout_images.append(dict(
|
| 1298 |
+
source=model_logo_uri,
|
| 1299 |
+
xref="x domain", yref="y domain",
|
| 1300 |
+
x=domain_x, y=domain_y,
|
| 1301 |
+
sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
|
| 1302 |
+
xanchor="center", yanchor="middle",
|
| 1303 |
+
layer="above",
|
| 1304 |
+
))
|
| 1305 |
|
| 1306 |
# --- Section 7: Add Model Name Labels to Frontier Points ---
|
| 1307 |
if frontier_rows:
|
setup_data.py
CHANGED
|
@@ -70,27 +70,39 @@ def fetch_data_from_github():
|
|
| 70 |
|
| 71 |
# Look for data files in the cloned repository
|
| 72 |
results_source = clone_dir / "results"
|
| 73 |
-
|
| 74 |
if not results_source.exists():
|
| 75 |
print(f"Results directory not found in repository")
|
| 76 |
return False
|
| 77 |
-
|
| 78 |
# Check if there are any agent result directories
|
| 79 |
result_dirs = list(results_source.iterdir())
|
| 80 |
if not result_dirs:
|
| 81 |
print(f"No agent results found in {results_source}")
|
| 82 |
return False
|
| 83 |
-
|
| 84 |
print(f"Found {len(result_dirs)} agent result directories")
|
| 85 |
-
|
| 86 |
# Create target directory and copy the results structure
|
| 87 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 88 |
if target_dir.exists():
|
| 89 |
shutil.rmtree(target_dir)
|
| 90 |
-
|
| 91 |
# Copy the entire results directory
|
| 92 |
target_results = target_dir / "results"
|
| 93 |
shutil.copytree(results_source, target_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 96 |
|
|
|
|
| 70 |
|
| 71 |
# Look for data files in the cloned repository
|
| 72 |
results_source = clone_dir / "results"
|
| 73 |
+
|
| 74 |
if not results_source.exists():
|
| 75 |
print(f"Results directory not found in repository")
|
| 76 |
return False
|
| 77 |
+
|
| 78 |
# Check if there are any agent result directories
|
| 79 |
result_dirs = list(results_source.iterdir())
|
| 80 |
if not result_dirs:
|
| 81 |
print(f"No agent results found in {results_source}")
|
| 82 |
return False
|
| 83 |
+
|
| 84 |
print(f"Found {len(result_dirs)} agent result directories")
|
| 85 |
+
|
| 86 |
# Create target directory and copy the results structure
|
| 87 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 88 |
if target_dir.exists():
|
| 89 |
shutil.rmtree(target_dir)
|
| 90 |
+
|
| 91 |
# Copy the entire results directory
|
| 92 |
target_results = target_dir / "results"
|
| 93 |
shutil.copytree(results_source, target_results)
|
| 94 |
+
|
| 95 |
+
# Also copy alternative_agents/ if present, so the loader can pick up
|
| 96 |
+
# ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
|
| 97 |
+
# alongside the default OpenHands agent results.
|
| 98 |
+
alt_source = clone_dir / "alternative_agents"
|
| 99 |
+
if alt_source.exists():
|
| 100 |
+
alt_target = target_dir / "alternative_agents"
|
| 101 |
+
shutil.copytree(alt_source, alt_target)
|
| 102 |
+
agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
|
| 103 |
+
print(f"Found alternative agent types: {agent_types}")
|
| 104 |
+
else:
|
| 105 |
+
print("No alternative_agents/ directory in repository (skipping)")
|
| 106 |
|
| 107 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 108 |
|
simple_data_loader.py
CHANGED
|
@@ -96,17 +96,43 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
|
|
| 96 |
|
| 97 |
class SimpleLeaderboardViewer:
|
| 98 |
"""Simple replacement for agent-eval's LeaderboardViewer."""
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
"""
|
| 102 |
Args:
|
| 103 |
data_dir: Path to data directory
|
| 104 |
config: Config name (e.g., "1.0.0-dev1")
|
| 105 |
split: Split name (e.g., "validation" or "test")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
self.data_dir = Path(data_dir)
|
| 108 |
self.config = config
|
| 109 |
self.split = split
|
|
|
|
| 110 |
self.config_path = self.data_dir / config
|
| 111 |
|
| 112 |
# Benchmark to category mappings (single source of truth)
|
|
@@ -127,55 +153,115 @@ class SimpleLeaderboardViewer:
|
|
| 127 |
if benchmark not in self.tag_map[category]:
|
| 128 |
self.tag_map[category].append(benchmark)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
def _load_from_agent_dirs(self):
|
| 131 |
-
"""Load
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
all_records = []
|
| 138 |
all_validation_errors = []
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
#
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
all_records.append(record)
|
| 178 |
-
|
| 179 |
# Log validation errors if any
|
| 180 |
if all_validation_errors:
|
| 181 |
logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
|
|
@@ -183,10 +269,10 @@ class SimpleLeaderboardViewer:
|
|
| 183 |
logger.warning(f" - {error}")
|
| 184 |
if len(all_validation_errors) > 5:
|
| 185 |
logger.warning(f" ... and {len(all_validation_errors) - 5} more")
|
| 186 |
-
|
| 187 |
if not all_records:
|
| 188 |
-
return None #
|
| 189 |
-
|
| 190 |
return pd.DataFrame(all_records)
|
| 191 |
|
| 192 |
def _load(self):
|
|
@@ -206,26 +292,36 @@ class SimpleLeaderboardViewer:
|
|
| 206 |
# Group by agent (version + model combination) to aggregate results across datasets
|
| 207 |
transformed_records = []
|
| 208 |
|
| 209 |
-
# Create a unique identifier
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
for agent_id in df['agent_id'].unique():
|
| 213 |
agent_records = df[df['agent_id'] == agent_id]
|
| 214 |
-
|
| 215 |
# Build a single record for this agent
|
| 216 |
first_record = agent_records.iloc[0]
|
| 217 |
agent_version = first_record['agent_version']
|
| 218 |
-
|
|
|
|
| 219 |
# Normalize openness to "open" or "closed"
|
| 220 |
from aliases import OPENNESS_MAPPING
|
| 221 |
raw_openness = first_record['openness']
|
| 222 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 223 |
-
|
| 224 |
# All 5 categories for the leaderboard
|
| 225 |
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 226 |
-
|
| 227 |
record = {
|
| 228 |
# Core agent info - use final display names
|
|
|
|
| 229 |
'SDK version': agent_version, # Will become "SDK Version"
|
| 230 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 231 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
@@ -235,7 +331,7 @@ class SimpleLeaderboardViewer:
|
|
| 235 |
'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
|
| 236 |
'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
|
| 237 |
# Additional columns expected by the transformer
|
| 238 |
-
# Use agent_id (
|
| 239 |
'id': agent_id,
|
| 240 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 241 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
|
|
|
| 96 |
|
| 97 |
class SimpleLeaderboardViewer:
|
| 98 |
"""Simple replacement for agent-eval's LeaderboardViewer."""
|
| 99 |
+
|
| 100 |
+
AGENT_FILTER_OPENHANDS = "openhands"
|
| 101 |
+
AGENT_FILTER_ALTERNATIVE = "alternative"
|
| 102 |
+
|
| 103 |
+
def __init__(
|
| 104 |
+
self,
|
| 105 |
+
data_dir: str,
|
| 106 |
+
config: str,
|
| 107 |
+
split: str,
|
| 108 |
+
agent_filter: str = AGENT_FILTER_OPENHANDS,
|
| 109 |
+
):
|
| 110 |
"""
|
| 111 |
Args:
|
| 112 |
data_dir: Path to data directory
|
| 113 |
config: Config name (e.g., "1.0.0-dev1")
|
| 114 |
split: Split name (e.g., "validation" or "test")
|
| 115 |
+
agent_filter: Which submissions to include.
|
| 116 |
+
``"openhands"`` (default) loads only the default OpenHands
|
| 117 |
+
agent runs from ``results/{model}/`` — the canonical
|
| 118 |
+
leaderboard. ``"alternative"`` loads only third-party
|
| 119 |
+
harnesses (Claude Code / Codex / Gemini CLI / OpenHands
|
| 120 |
+
Sub-agents) from ``alternative_agents/{type}/{model}/``,
|
| 121 |
+
which power the standalone Alternative Agents page.
|
| 122 |
+
The two are kept on separate pages because their
|
| 123 |
+
cost/runtime numbers aren't apples-to-apples and mixing
|
| 124 |
+
them in one ranking would be misleading.
|
| 125 |
"""
|
| 126 |
+
if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
|
| 127 |
+
raise ValueError(
|
| 128 |
+
f"agent_filter must be one of "
|
| 129 |
+
f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
|
| 130 |
+
f"got {agent_filter!r}"
|
| 131 |
+
)
|
| 132 |
self.data_dir = Path(data_dir)
|
| 133 |
self.config = config
|
| 134 |
self.split = split
|
| 135 |
+
self.agent_filter = agent_filter
|
| 136 |
self.config_path = self.data_dir / config
|
| 137 |
|
| 138 |
# Benchmark to category mappings (single source of truth)
|
|
|
|
| 153 |
if benchmark not in self.tag_map[category]:
|
| 154 |
self.tag_map[category].append(benchmark)
|
| 155 |
|
| 156 |
+
# Default agent_name when metadata.json doesn't carry one. Matches the
|
| 157 |
+
# default-agent value used by push_to_index_from_archive.py so legacy
|
| 158 |
+
# entries (which omit the field) still group cleanly with new entries.
|
| 159 |
+
DEFAULT_AGENT_NAME = "OpenHands"
|
| 160 |
+
|
| 161 |
+
def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
    """Turn one agent directory into a list of per-benchmark records.

    Called from both branches of ``_load_from_agent_dirs``: the default
    OpenHands results and the alternative-agent directories (acp-claude /
    acp-codex / etc.).

    Args:
        agent_dir: Directory passed to ``load_and_validate_agent_data``.
        default_agent_name: Display name used when the metadata carries no
            truthy ``agent_name``. If this is falsy as well,
            ``self.DEFAULT_AGENT_NAME`` is used.

    Returns:
        ``(records, validation_errors)``. ``records`` is empty when the
        loader returns no metadata or no scores, or when the metadata sets
        ``hide_from_leaderboard``.
    """
    metadata, scores, errors = load_and_validate_agent_data(agent_dir)

    # Nothing usable was loaded: surface the validation errors only.
    if metadata is None or scores is None:
        return [], errors

    # Entries explicitly hidden from the leaderboard contribute no rows.
    if metadata.get('hide_from_leaderboard', False):
        logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
        return [], errors

    # Name resolution order: metadata value, then the caller-supplied
    # default, then the class-wide default.
    agent_name = metadata.get('agent_name') or default_agent_name or self.DEFAULT_AGENT_NAME

    # Fields that come from metadata are identical for every score entry,
    # so read them once up front.
    agent_version = metadata.get('agent_version', 'Unknown')
    llm_base = metadata.get('model', 'unknown')
    openness = metadata.get('openness', 'unknown')
    fallback_submission_time = metadata.get('submission_time', '')
    release_date = metadata.get('release_date', '')
    parameter_count_b = metadata.get('parameter_count_b')
    active_parameter_count_b = metadata.get('active_parameter_count_b')

    # One record per score entry; key order is kept as-is because it
    # determines the column order of the DataFrame built from these dicts.
    rows = [
        {
            'agent_name': agent_name,
            'agent_version': agent_version,
            'llm_base': llm_base,
            'openness': openness,
            'submission_time': entry.get('submission_time', fallback_submission_time),
            'release_date': release_date,
            'parameter_count_b': parameter_count_b,
            'active_parameter_count_b': active_parameter_count_b,
            'score': entry.get('score'),
            'metric': entry.get('metric', 'unknown'),
            'cost_per_instance': entry.get('cost_per_instance'),
            'average_runtime': entry.get('average_runtime'),
            'tags': [entry.get('benchmark')],
            'full_archive': entry.get('full_archive', ''),
            'eval_visualization_page': entry.get('eval_visualization_page', ''),
        }
        for entry in scores
    ]
    return rows, errors
|
| 210 |
+
|
| 211 |
def _load_from_agent_dirs(self):
|
| 212 |
+
"""Load agent records based on ``self.agent_filter``.
|
| 213 |
+
|
| 214 |
+
- ``"openhands"`` (default): only ``{config}/results/{model}/``,
|
| 215 |
+
which is the canonical OpenHands leaderboard. The Home page and
|
| 216 |
+
the per-category subpages use this.
|
| 217 |
+
- ``"alternative"``: only
|
| 218 |
+
``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
|
| 219 |
+
acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
|
| 220 |
+
Alternative Agents page uses this.
|
| 221 |
+
|
| 222 |
+
Returns ``None`` if no records were found (which makes the caller
|
| 223 |
+
render an empty-state placeholder).
|
| 224 |
+
"""
|
| 225 |
all_records = []
|
| 226 |
all_validation_errors = []
|
| 227 |
+
|
| 228 |
+
if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
|
| 229 |
+
# Default OpenHands agent results
|
| 230 |
+
results_dir = self.config_path / "results"
|
| 231 |
+
if results_dir.exists():
|
| 232 |
+
for agent_dir in results_dir.iterdir():
|
| 233 |
+
if not agent_dir.is_dir():
|
| 234 |
+
continue
|
| 235 |
+
records, errors = self._records_from_agent_dir(agent_dir)
|
| 236 |
+
all_records.extend(records)
|
| 237 |
+
all_validation_errors.extend(errors)
|
| 238 |
+
else:
|
| 239 |
+
# Alternative agents (one subdirectory per agent_type, then per model)
|
| 240 |
+
# Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
|
| 241 |
+
# map in OpenHands/evaluation push_to_index_from_archive.py — keeping
|
| 242 |
+
# it in sync ensures rows are labelled the same way the index repo
|
| 243 |
+
# records them.
|
| 244 |
+
agent_type_default_name = {
|
| 245 |
+
'acp-claude': 'Claude Code',
|
| 246 |
+
'acp-codex': 'Codex',
|
| 247 |
+
'acp-gemini': 'Gemini CLI',
|
| 248 |
+
'openhands_subagents': 'OpenHands Sub-agents',
|
| 249 |
+
}
|
| 250 |
+
alt_dir = self.config_path / "alternative_agents"
|
| 251 |
+
if alt_dir.exists():
|
| 252 |
+
for type_dir in alt_dir.iterdir():
|
| 253 |
+
if not type_dir.is_dir():
|
| 254 |
+
continue
|
| 255 |
+
default_name = agent_type_default_name.get(type_dir.name)
|
| 256 |
+
for agent_dir in type_dir.iterdir():
|
| 257 |
+
if not agent_dir.is_dir():
|
| 258 |
+
continue
|
| 259 |
+
records, errors = self._records_from_agent_dir(
|
| 260 |
+
agent_dir, default_agent_name=default_name
|
| 261 |
+
)
|
| 262 |
+
all_records.extend(records)
|
| 263 |
+
all_validation_errors.extend(errors)
|
| 264 |
+
|
|
|
|
|
|
|
| 265 |
# Log validation errors if any
|
| 266 |
if all_validation_errors:
|
| 267 |
logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
|
|
|
|
| 269 |
logger.warning(f" - {error}")
|
| 270 |
if len(all_validation_errors) > 5:
|
| 271 |
logger.warning(f" ... and {len(all_validation_errors) - 5} more")
|
| 272 |
+
|
| 273 |
if not all_records:
|
| 274 |
+
return None # Caller will render empty-state placeholder
|
| 275 |
+
|
| 276 |
return pd.DataFrame(all_records)
|
| 277 |
|
| 278 |
def _load(self):
|
|
|
|
| 292 |
# Group by agent (name + version + model combination) to aggregate results across datasets
|
| 293 |
transformed_records = []
|
| 294 |
|
| 295 |
+
# Create a unique identifier per (agent_name, agent_version, model)
|
| 296 |
+
# tuple. Including agent_name keeps an OpenHands run and a Claude
|
| 297 |
+
# Code run on the same SDK version + model from collapsing into
|
| 298 |
+
# one row when both submit to the leaderboard.
|
| 299 |
+
df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
|
| 300 |
+
df['agent_id'] = (
|
| 301 |
+
df['agent_name'].astype(str)
|
| 302 |
+
+ '_' + df['agent_version'].astype(str)
|
| 303 |
+
+ '_' + df['llm_base'].astype(str)
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
for agent_id in df['agent_id'].unique():
|
| 307 |
agent_records = df[df['agent_id'] == agent_id]
|
| 308 |
+
|
| 309 |
# Build a single record for this agent
|
| 310 |
first_record = agent_records.iloc[0]
|
| 311 |
agent_version = first_record['agent_version']
|
| 312 |
+
agent_name = first_record['agent_name']
|
| 313 |
+
|
| 314 |
# Normalize openness to "open" or "closed"
|
| 315 |
from aliases import OPENNESS_MAPPING
|
| 316 |
raw_openness = first_record['openness']
|
| 317 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 318 |
+
|
| 319 |
# All 5 categories for the leaderboard
|
| 320 |
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 321 |
+
|
| 322 |
record = {
|
| 323 |
# Core agent info - use final display names
|
| 324 |
+
'agent_name': agent_name, # Will become "Agent"
|
| 325 |
'SDK version': agent_version, # Will become "SDK Version"
|
| 326 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 327 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
|
|
| 331 |
'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
|
| 332 |
'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
|
| 333 |
# Additional columns expected by the transformer
|
| 334 |
+
# Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
|
| 335 |
'id': agent_id,
|
| 336 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 337 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
ui_components.py
CHANGED
|
@@ -508,28 +508,36 @@ class DummyViewer:
|
|
| 508 |
# The _load method returns the error DataFrame and an empty tag map
|
| 509 |
return self._error_df, {}
|
| 510 |
|
| 511 |
-
def get_leaderboard_viewer_instance(
|
|
|
|
|
|
|
|
|
|
| 512 |
"""
|
| 513 |
-
Fetches the LeaderboardViewer for a split,
|
| 514 |
-
re-downloading data.
|
|
|
|
|
|
|
| 515 |
"""
|
| 516 |
global CACHED_VIEWERS, CACHED_TAG_MAPS
|
| 517 |
|
|
|
|
|
|
|
| 518 |
with _cache_lock:
|
| 519 |
-
if
|
| 520 |
# Cache hit: return the cached viewer and tag map
|
| 521 |
-
return CACHED_VIEWERS[
|
| 522 |
|
| 523 |
# --- Cache miss: try to load data from the source ---
|
| 524 |
try:
|
| 525 |
# First try to load from extracted data directory (local mock data)
|
| 526 |
data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
|
| 527 |
-
|
| 528 |
-
print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
|
| 529 |
viewer = SimpleLeaderboardViewer(
|
| 530 |
data_dir=data_dir,
|
| 531 |
config=CONFIG_NAME,
|
| 532 |
-
split=split
|
|
|
|
| 533 |
)
|
| 534 |
|
| 535 |
# Simplify tag map creation
|
|
@@ -537,14 +545,14 @@ def get_leaderboard_viewer_instance(split: str):
|
|
| 537 |
|
| 538 |
# Cache the results for next time (thread-safe)
|
| 539 |
with _cache_lock:
|
| 540 |
-
CACHED_VIEWERS[
|
| 541 |
-
CACHED_TAG_MAPS[
|
| 542 |
|
| 543 |
return viewer, pretty_tag_map
|
| 544 |
|
| 545 |
except Exception as e:
|
| 546 |
# On ANY error, create a consistent error message and cache a DummyViewer
|
| 547 |
-
error_message = f"Error loading data for split '{split}': {e}"
|
| 548 |
print(format_error(error_message))
|
| 549 |
|
| 550 |
dummy_df = pd.DataFrame({"Message": [error_message]})
|
|
@@ -553,8 +561,8 @@ def get_leaderboard_viewer_instance(split: str):
|
|
| 553 |
|
| 554 |
# Cache the dummy objects so we don't try to fetch again on this run
|
| 555 |
with _cache_lock:
|
| 556 |
-
CACHED_VIEWERS[
|
| 557 |
-
CACHED_TAG_MAPS[
|
| 558 |
|
| 559 |
return dummy_viewer, dummy_tag_map
|
| 560 |
|
|
@@ -1268,12 +1276,17 @@ def create_benchmark_details_display(
|
|
| 1268 |
legend_markdown = create_legend_markdown(benchmark_name)
|
| 1269 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 1270 |
|
| 1271 |
-
def get_full_leaderboard_data(
|
|
|
|
|
|
|
|
|
|
| 1272 |
"""
|
| 1273 |
-
Loads and transforms the complete dataset for a
|
| 1274 |
-
|
|
|
|
|
|
|
| 1275 |
"""
|
| 1276 |
-
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
|
| 1277 |
|
| 1278 |
if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
|
| 1279 |
raw_df, _ = viewer_or_data._load()
|
|
|
|
| 508 |
# The _load method returns the error DataFrame and an empty tag map
|
| 509 |
return self._error_df, {}
|
| 510 |
|
| 511 |
+
def get_leaderboard_viewer_instance(
|
| 512 |
+
split: str,
|
| 513 |
+
agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
|
| 514 |
+
):
|
| 515 |
"""
|
| 516 |
+
Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
|
| 517 |
+
thread-safe cache to avoid re-downloading data. The cache is keyed on
|
| 518 |
+
both axes so the OpenHands and Alternative Agents pages don't fight
|
| 519 |
+
over a single slot. On error, returns a stable DummyViewer.
|
| 520 |
"""
|
| 521 |
global CACHED_VIEWERS, CACHED_TAG_MAPS
|
| 522 |
|
| 523 |
+
cache_key = (split, agent_filter)
|
| 524 |
+
|
| 525 |
with _cache_lock:
|
| 526 |
+
if cache_key in CACHED_VIEWERS:
|
| 527 |
# Cache hit: return the cached viewer and tag map
|
| 528 |
+
return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
|
| 529 |
|
| 530 |
# --- Cache miss: try to load data from the source ---
|
| 531 |
try:
|
| 532 |
# First try to load from extracted data directory (local mock data)
|
| 533 |
data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
|
| 534 |
+
|
| 535 |
+
print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
|
| 536 |
viewer = SimpleLeaderboardViewer(
|
| 537 |
data_dir=data_dir,
|
| 538 |
config=CONFIG_NAME,
|
| 539 |
+
split=split,
|
| 540 |
+
agent_filter=agent_filter,
|
| 541 |
)
|
| 542 |
|
| 543 |
# Simplify tag map creation
|
|
|
|
| 545 |
|
| 546 |
# Cache the results for next time (thread-safe)
|
| 547 |
with _cache_lock:
|
| 548 |
+
CACHED_VIEWERS[cache_key] = viewer
|
| 549 |
+
CACHED_TAG_MAPS[cache_key] = pretty_tag_map # Cache the pretty map directly
|
| 550 |
|
| 551 |
return viewer, pretty_tag_map
|
| 552 |
|
| 553 |
except Exception as e:
|
| 554 |
# On ANY error, create a consistent error message and cache a DummyViewer
|
| 555 |
+
error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
|
| 556 |
print(format_error(error_message))
|
| 557 |
|
| 558 |
dummy_df = pd.DataFrame({"Message": [error_message]})
|
|
|
|
| 561 |
|
| 562 |
# Cache the dummy objects so we don't try to fetch again on this run
|
| 563 |
with _cache_lock:
|
| 564 |
+
CACHED_VIEWERS[cache_key] = dummy_viewer
|
| 565 |
+
CACHED_TAG_MAPS[cache_key] = dummy_tag_map
|
| 566 |
|
| 567 |
return dummy_viewer, dummy_tag_map
|
| 568 |
|
|
|
|
| 1276 |
legend_markdown = create_legend_markdown(benchmark_name)
|
| 1277 |
gr.HTML(value=legend_markdown, elem_id="legend-markdown")
|
| 1278 |
|
| 1279 |
+
def get_full_leaderboard_data(
|
| 1280 |
+
split: str,
|
| 1281 |
+
agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
|
| 1282 |
+
) -> tuple[pd.DataFrame, dict]:
|
| 1283 |
"""
|
| 1284 |
+
Loads and transforms the complete dataset for a (split, agent_filter)
|
| 1285 |
+
pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
|
| 1286 |
+
that don't pass it stay on the canonical leaderboard. The Alternative
|
| 1287 |
+
Agents page passes ``"alternative"`` to get the third-party harnesses.
|
| 1288 |
"""
|
| 1289 |
+
viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
|
| 1290 |
|
| 1291 |
if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
|
| 1292 |
raw_df, _ = viewer_or_data._load()
|