Spaces:

OpenHands
/

openhands-index

Running

openhands-index / alternative_agents_page.py

openhands

Show OpenHands on alternate agents page for shared models

70749cd about 1 month ago

3.46 kB

	"""Alternative Agents leaderboard page.

	The canonical OpenHands Index leaderboard (Home + the per-category pages)
	ranks default OpenHands agent runs from ``results/{model}/`` in the
	openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
	Gemini CLI, OpenHands Sub-agents, ...) live under
	``alternative_agents/{type}/{model}/`` and aren't directly comparable to
	default OpenHands runs (different scaffolds, different cost/runtime
	characteristics), so they get their own standalone page instead of being
	mixed into the same ranking.

	This page is intentionally a single Overall view (no per-category
	subpages) — the alternative-agents dataset is small (one row per
	harness × model) and the goal is "show me all the alternatives at a
	glance", not "drill into Issue Resolution for Codex".

	To make same-model comparisons easier, the page also appends canonical
	OpenHands rows for any language model that appears in the alternative
	agent dataset. The match is exact, so ``Gemini-3-Pro`` and
	``Gemini-3.1-Pro`` remain distinct entries.
	"""
	import matplotlib
	matplotlib.use('Agg')
	import pandas as pd
	import gradio as gr

	from simple_data_loader import SimpleLeaderboardViewer
	from ui_components import (
	create_leaderboard_display,
	get_full_leaderboard_data,
	)


	# HTML blurb shown at the top of the page; build_page() passes it to
	# gr.HTML() unchanged. The wrapper <div> carries the id
	# "alternative-agents-intro" (presumably a styling hook — confirm
	# against the app's CSS).
	ALTERNATIVE_AGENTS_INTRO = """
	<div id="alternative-agents-intro">
	<h2>Alternative Agents</h2>
	<p>
	Third-party agent harnesses running the OpenHands Index benchmarks.
	To make direct comparisons easier, this page also includes the
	canonical OpenHands row whenever the exact same language model appears
	under an alternative harness. Cost and runtime numbers still come from
	each harness's own instrumentation and aren't directly comparable
	across harnesses.
	</p>
	</div>
	"""


	def _append_openhands_shared_models(
	alternative_df: pd.DataFrame,
	split: str,
	) -> pd.DataFrame:
	if alternative_df.empty or "Language Model" not in alternative_df.columns:
	return alternative_df

	openhands_df, _ = get_full_leaderboard_data(
	split,
	agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
	)
	if openhands_df.empty or "Language Model" not in openhands_df.columns:
	return alternative_df

	alternative_models = set(
	alternative_df["Language Model"].dropna().astype(str).str.strip()
	)
	if not alternative_models:
	return alternative_df

	openhands_shared_df = openhands_df[
	openhands_df["Language Model"].astype(str).str.strip().isin(alternative_models)
	].copy()
	if openhands_shared_df.empty:
	return alternative_df

	return pd.concat([alternative_df, openhands_shared_df], ignore_index=True, sort=False)


	def build_page():
	    """Render the Alternative Agents page.

	    Emits the intro HTML and a ``---`` Markdown separator, then loads the
	    alternative-agent rows for the ``"test"`` split. When rows exist, the
	    matching OpenHands rows are appended and a single "Overall" leaderboard
	    is displayed; otherwise a short Markdown note explains where new runs
	    land. Returns ``None`` in both cases.

	    NOTE(review): presumably this must be called inside an active Gradio
	    layout context — confirm against the caller.
	    """
	    split = "test"

	    gr.HTML(ALTERNATIVE_AGENTS_INTRO)
	    gr.Markdown("---")

	    alt_df, alt_tag_map = get_full_leaderboard_data(
	        split,
	        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
	    )

	    if not alt_df.empty:
	        # Add OpenHands rows for shared models before rendering. The tag
	        # map is the one returned by the alternative-agent query.
	        combined_df = _append_openhands_shared_models(alt_df, split=split)
	        create_leaderboard_display(
	            full_df=combined_df,
	            tag_map=alt_tag_map,
	            category_name="Overall",
	            split_name=split,
	        )
	        return

	    # No alternative submissions: show a placeholder instead of a table.
	    gr.Markdown(
	        "No alternative agent submissions yet. New runs land in "
	        "`alternative_agents/{type}/{model}/` in "
	        "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
	    )