Show ACP agent results in the leaderboard

#11
.gitattributes CHANGED
@@ -1 +1,2 @@
1
 
 
 
1
 
2
+ docs/screenshots/alternative-agents.png filter=lfs diff=lfs merge=lfs -text
alternative_agents_page.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Alternative Agents leaderboard page.
2
+
3
+ The canonical OpenHands Index leaderboard (Home + the per-category pages)
4
+ ranks default OpenHands agent runs from ``results/{model}/`` in the
5
+ openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
6
+ Gemini CLI, OpenHands Sub-agents, ...) live under
7
+ ``alternative_agents/{type}/{model}/`` and aren't directly comparable to
8
+ default OpenHands runs (different scaffolds, different cost/runtime
9
+ characteristics), so they get their own standalone page instead of being
10
+ mixed into the same ranking.
11
+
12
+ This page is intentionally a single Overall view (no per-category
13
+ subpages) β€” the alternative-agents dataset is small (one row per
14
+ harness Γ— model) and the goal is "show me all the alternatives at a
15
+ glance", not "drill into Issue Resolution for Codex".
16
+ """
17
+ import matplotlib
18
+ matplotlib.use('Agg')
19
+ import gradio as gr
20
+
21
+ from simple_data_loader import SimpleLeaderboardViewer
22
+ from ui_components import (
23
+ create_leaderboard_display,
24
+ get_full_leaderboard_data,
25
+ )
26
+
27
+
28
+ ALTERNATIVE_AGENTS_INTRO = """
29
+ <div id="alternative-agents-intro">
30
+ <h2>Alternative Agents</h2>
31
+ <p>
32
+ Third-party agent harnesses running the OpenHands Index benchmarks.
33
+ These rows aren't part of the OpenHands ranking on the
34
+ <a href="/home">Home</a> page β€” they're tracked here as a comparison
35
+ point. Cost and runtime numbers come from each harness's own
36
+ instrumentation and aren't directly comparable across harnesses.
37
+ </p>
38
+ </div>
39
+ """
40
+
41
+
42
+ def build_page():
43
+ gr.HTML(ALTERNATIVE_AGENTS_INTRO)
44
+
45
+ gr.Markdown("---")
46
+
47
+ test_df, test_tag_map = get_full_leaderboard_data(
48
+ "test",
49
+ agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
50
+ )
51
+
52
+ if test_df.empty:
53
+ gr.Markdown(
54
+ "No alternative agent submissions yet. New runs land in "
55
+ "`alternative_agents/{type}/{model}/` in "
56
+ "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
57
+ )
58
+ return
59
+
60
+ create_leaderboard_display(
61
+ full_df=test_df,
62
+ tag_map=test_tag_map,
63
+ category_name="Overall",
64
+ split_name="test",
65
+ )
app.py CHANGED
@@ -35,6 +35,7 @@ from app_creation import build_page as build_app_creation_page
35
  from frontend_development import build_page as build_frontend_page
36
  from test_generation import build_page as build_test_generation_page
37
  from information_gathering import build_page as build_information_gathering_page
 
38
  from about import build_page as build_about_page
39
 
40
  logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
@@ -373,6 +374,9 @@ with demo.route("Testing", "/testing"):
373
  with demo.route("Information Gathering", "/information-gathering"):
374
  build_information_gathering_page()
375
 
 
 
 
376
  with demo.route("About", "/about"):
377
  build_about_page()
378
 
 
35
  from frontend_development import build_page as build_frontend_page
36
  from test_generation import build_page as build_test_generation_page
37
  from information_gathering import build_page as build_information_gathering_page
38
+ from alternative_agents_page import build_page as build_alternative_agents_page
39
  from about import build_page as build_about_page
40
 
41
  logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 
374
  with demo.route("Information Gathering", "/information-gathering"):
375
  build_information_gathering_page()
376
 
377
+ with demo.route("Alternative Agents", "/alternative-agents"):
378
+ build_alternative_agents_page()
379
+
380
  with demo.route("About", "/about"):
381
  build_about_page()
382
 
assets/harnesses/README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Agent harness logos
2
+
3
+ This folder holds the **bottom half** of the composite scatter markers used
4
+ on the [Alternative Agents](../../alternative_agents_page.py) page. Each
5
+ point on that scatter stacks two logos: the model provider on top (from
6
+ `assets/logo-*.svg`) and the harness on the bottom (from this folder).
7
+
8
+ ## Expected filenames
9
+
10
+ The scatter code looks up a logo by the exact `agent_name` string that the
11
+ `push-to-index` workflow writes into the index repo's `metadata.json`, then
12
+ maps it through `HARNESS_LOGO_STEMS` in `leaderboard_transformer.py`. Keep
13
+ these filenames in sync with that map.
14
+
15
+ | `agent_name` (in index repo) | File in this folder |
16
+ | --- | --- |
17
+ | `Claude Code` | `claude-code.svg` or `claude-code.png` |
18
+ | `Codex` | `codex-cli.svg` or `codex-cli.png` |
19
+ | `Gemini CLI` | `gemini-cli.svg` or `gemini-cli.png` |
20
+ | `OpenHands` | `openhands.svg` or `openhands.png` |
21
+ | `OpenHands Sub-agents` | `openhands.svg` or `openhands.png` (shared with `OpenHands`) |
22
+
23
+ Both `.svg` and `.png` are accepted β€” the resolver tries `.svg` first, then
24
+ `.png`. **Prefer SVG when possible**: the HuggingFace Space rejects new
25
+ binary files on plain `git push` and routes PNGs through Xet, so an SVG is
26
+ one less thing to set up.
27
+
28
+ ## When a file is missing
29
+
30
+ The scatter falls back to a single marker (just the model provider logo) β€”
31
+ exactly the same rendering path the canonical OpenHands pages use. Nothing
32
+ crashes and nothing prints a warning in normal operation. This means you
33
+ can roll out logos one harness at a time without waiting for all four.
34
+
35
+ ## Sizing and shape
36
+
37
+ - Square canvas. The composite marker is drawn at a fixed aspect ratio, so
38
+ a non-square logo will get squished.
39
+ - Any SVG `viewBox` works β€” the renderer base64-encodes the file as-is and
40
+ Plotly scales it to the marker's `sizex` / `sizey`. Around `80Γ—80` to
41
+ `256Γ—256` is a good source size.
42
+ - Leave some internal padding (β‰ˆ10%) so the logo doesn't touch the marker
43
+ edge when two are stacked.
44
+ - No background is required, but a rounded-square coloured tile reads well
45
+ at small sizes because it gives each harness a distinct silhouette even
46
+ when the inner glyph isn't fully legible. Look at the existing
47
+ `assets/logo-*.svg` files for the canonical model provider logos if you
48
+ want a visual reference for sizing.
49
+
50
+ ## Adding a new harness
51
+
52
+ 1. Decide on the exact `agent_name` that the push-to-index workflow writes
53
+ for the new harness (see `AGENT_NAME_BY_TYPE` in
54
+ `OpenHands/evaluation/push-to-index-job/scripts/push_to_index_from_archive.py`).
55
+ 2. Add an entry to `HARNESS_LOGO_STEMS` in
56
+ [`leaderboard_transformer.py`](../../leaderboard_transformer.py) that
57
+ maps the display name to a stem.
58
+ 3. Drop `{stem}.svg` (or `.png`) into this folder.
59
+ 4. Reload the app and look at `/alternative-agents`.
assets/harnesses/claude-code.svg ADDED
assets/harnesses/codex-cli.svg ADDED
assets/harnesses/gemini-cli.svg ADDED
assets/harnesses/openhands.svg ADDED
docs/screenshots/alternative-agents.png ADDED

Git LFS Details

  • SHA256: 99766c7d2c11a6f90f24a5f0effbae74a8aa33096b89ff1c4fcfb238fe06a2f5
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
leaderboard_transformer.py CHANGED
@@ -228,17 +228,17 @@ def get_country_from_model(model_name: str) -> dict:
228
  def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
229
  """
230
  Gets the appropriate icon based on the mark_by selection.
231
-
232
  Args:
233
  model_name: The model name
234
  openness: The openness value (open/closed)
235
  mark_by: One of "Company", "Openness", or "Country"
236
-
237
  Returns:
238
  dict with 'path' and 'name' keys
239
  """
240
  from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
241
-
242
  if mark_by == MARK_BY_OPENNESS:
243
  return get_openness_icon(openness)
244
  elif mark_by == MARK_BY_COUNTRY:
@@ -247,6 +247,59 @@ def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
247
  return get_company_from_model(model_name)
248
 
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  # Standard layout configuration for all charts
251
  STANDARD_LAYOUT = dict(
252
  template="plotly_white",
@@ -655,6 +708,7 @@ def _pretty_column_name(raw_col: str) -> str:
655
  # Case 1: Handle fixed, special-case mappings first.
656
  fixed_mappings = {
657
  'id': 'id',
 
658
  'SDK version': 'SDK Version',
659
  'Openhands version': 'SDK Version', # Legacy support
660
  'Language model': 'Language Model',
@@ -815,7 +869,21 @@ class DataTransformer:
815
  df_view = df_sorted.copy()
816
 
817
  # --- 3. Add Columns for Agent Openness ---
818
- base_cols = ["id","Language Model","SDK Version","Source"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
819
  new_cols = ["Openness"]
820
  ending_cols = ["Date", "Logs", "Visualization"]
821
 
@@ -1018,13 +1086,18 @@ def _plot_scatter_plotly(
1018
  """
1019
  Builds the complete HTML string for the plot's hover tooltip.
1020
  Format: {lm_name} (SDK {version})
 
 
 
 
 
1021
  Average Score: {score}
1022
  Average Cost/Runtime: {value}
1023
  Openness: {openness}
1024
  """
1025
  h_pad = " "
1026
  parts = ["<br>"]
1027
-
1028
  # Get and clean the language model name
1029
  llm_base_value = row.get('Language Model', '')
1030
  llm_base_value = clean_llm_base_list(llm_base_value)
@@ -1032,13 +1105,21 @@ def _plot_scatter_plotly(
1032
  lm_name = llm_base_value[0]
1033
  else:
1034
  lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
1035
-
1036
  # Get SDK version
1037
  sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
1038
-
1039
  # Title line: {lm_name} (SDK {version})
1040
  parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
1041
-
 
 
 
 
 
 
 
 
1042
  # Average Score
1043
  parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
1044
 
@@ -1111,51 +1192,116 @@ def _plot_scatter_plotly(
1111
  y_min = min_score - 5 if min_score > 5 else 0
1112
  y_max = max_score + 5
1113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1114
  for _, row in data_plot.iterrows():
1115
  model_name = row.get('Language Model', '')
1116
  openness = row.get('Openness', '')
1117
  marker_info = get_marker_icon(model_name, openness, mark_by)
1118
- logo_path = marker_info['path']
1119
-
1120
- # Read the SVG file and encode as base64 data URI
1121
- if os.path.exists(logo_path):
1122
- try:
1123
- with open(logo_path, 'rb') as f:
1124
- encoded_logo = base64.b64encode(f.read()).decode('utf-8')
1125
- logo_uri = f"data:image/svg+xml;base64,{encoded_logo}"
1126
-
1127
- x_val = row[x_col_to_use]
1128
- y_val = row[y_col_to_use]
1129
-
1130
- # Convert to domain coordinates (0-1 range)
1131
- # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
1132
- if x_val > 0:
1133
- log_x = np.log10(x_val)
1134
- domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
1135
- else:
1136
- domain_x = 0
1137
-
1138
- # For linear y: domain_y = (y - y_min) / (y_max - y_min)
1139
- domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
1140
-
1141
- # Clamp to valid range
1142
- domain_x = max(0, min(1, domain_x))
1143
- domain_y = max(0, min(1, domain_y))
1144
-
1145
- layout_images.append(dict(
1146
- source=logo_uri,
1147
- xref="x domain", # Use domain coordinates for log scale compatibility
1148
- yref="y domain",
1149
- x=domain_x,
1150
- y=domain_y,
1151
- sizex=0.04, # Size as fraction of plot width
1152
- sizey=0.06, # Size as fraction of plot height
1153
- xanchor="center",
1154
- yanchor="middle",
1155
- layer="above"
1156
- ))
1157
- except Exception as e:
1158
- logger.warning(f"Could not load logo {logo_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
 
1160
  # --- Section 7: Add Model Name Labels to Frontier Points ---
1161
  if frontier_rows:
 
228
  def get_marker_icon(model_name: str, openness: str, mark_by: str) -> dict:
229
  """
230
  Gets the appropriate icon based on the mark_by selection.
231
+
232
  Args:
233
  model_name: The model name
234
  openness: The openness value (open/closed)
235
  mark_by: One of "Company", "Openness", or "Country"
236
+
237
  Returns:
238
  dict with 'path' and 'name' keys
239
  """
240
  from constants import MARK_BY_COMPANY, MARK_BY_OPENNESS, MARK_BY_COUNTRY
241
+
242
  if mark_by == MARK_BY_OPENNESS:
243
  return get_openness_icon(openness)
244
  elif mark_by == MARK_BY_COUNTRY:
 
247
  return get_company_from_model(model_name)
248
 
249
 
250
+ # Map the agent_name stored in the index repo's metadata.json to a file stem
251
+ # inside assets/harnesses/. Kept in sync with AGENT_NAME_BY_TYPE in
252
+ # OpenHands/evaluation push_to_index_from_archive.py β€” if a new ACP harness
253
+ # lands there, add the corresponding display name and a matching stem here.
254
+ #
255
+ # The scatter plot looks for {stem}.svg first, then {stem}.png in
256
+ # assets/harnesses/. This repo intentionally ships only a README in that
257
+ # folder: drop the logo files in by hand (SVG preferred, PNG works too via
258
+ # HF Xet) and they'll be picked up on the next app restart. If the file is
259
+ # missing, get_harness_icon() returns None and the scatter falls back to the
260
+ # single-marker path β€” same rendering the canonical OpenHands pages use β€”
261
+ # so logos can be added one harness at a time without breaking anything.
262
+ HARNESS_LOGO_STEMS: dict[str, str] = {
263
+ "Claude Code": "claude-code",
264
+ "Codex": "codex-cli",
265
+ "Gemini CLI": "gemini-cli",
266
+ "OpenHands": "openhands",
267
+ "OpenHands Sub-agents": "openhands",
268
+ }
269
+ HARNESS_LOGO_DIR = "assets/harnesses"
270
+ HARNESS_LOGO_EXTENSIONS = ("svg", "png")
271
+
272
+
273
+ def get_harness_icon(agent_name: Optional[str]) -> Optional[dict]:
274
+ """Return {'path', 'name'} for the harness logo, or None if not usable.
275
+
276
+ Consumed by the Alternative Agents scatter plot to draw a composite
277
+ marker (model provider on top, harness on bottom). Returns None in any
278
+ of three cases, all of which make the caller skip the harness layer:
279
+
280
+ - ``agent_name`` is empty or missing from the dataframe row.
281
+ - ``agent_name`` isn't in ``HARNESS_LOGO_STEMS`` (new harness that
282
+ hasn't been registered yet β€” register it and drop in a logo).
283
+ - The logo file for that stem doesn't exist in ``assets/harnesses/``
284
+ yet (the repo ships only the README).
285
+
286
+ That third case is the important one: it lets the Alternative Agents
287
+ page work immediately after checkout even when the harness logo files
288
+ haven't been dropped in. The corresponding points just render like a
289
+ canonical-page marker (model logo only) until the file is added.
290
+ """
291
+ if not agent_name:
292
+ return None
293
+ stem = HARNESS_LOGO_STEMS.get(str(agent_name).strip())
294
+ if stem is None:
295
+ return None
296
+ for ext in HARNESS_LOGO_EXTENSIONS:
297
+ path = f"{HARNESS_LOGO_DIR}/{stem}.{ext}"
298
+ if os.path.exists(path):
299
+ return {"path": path, "name": agent_name}
300
+ return None
301
+
302
+
303
  # Standard layout configuration for all charts
304
  STANDARD_LAYOUT = dict(
305
  template="plotly_white",
 
708
  # Case 1: Handle fixed, special-case mappings first.
709
  fixed_mappings = {
710
  'id': 'id',
711
+ 'agent_name': 'Agent',
712
  'SDK version': 'SDK Version',
713
  'Openhands version': 'SDK Version', # Legacy support
714
  'Language model': 'Language Model',
 
869
  df_view = df_sorted.copy()
870
 
871
  # --- 3. Add Columns for Agent Openness ---
872
+ # Only include the "Agent" column when the dataframe actually has
873
+ # more than one distinct agent. On the canonical OpenHands pages
874
+ # every row says "OpenHands", so adding the column is just noise;
875
+ # on the Alternative Agents page rows differ (Claude Code / Codex
876
+ # / Gemini CLI / OpenHands Sub-agents), so the column carries
877
+ # signal and disambiguates same-model rows from different
878
+ # harnesses.
879
+ has_mixed_agents = (
880
+ "Agent" in df_view.columns
881
+ and df_view["Agent"].dropna().nunique() > 1
882
+ )
883
+ if has_mixed_agents:
884
+ base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
885
+ else:
886
+ base_cols = ["id", "Language Model", "SDK Version", "Source"]
887
  new_cols = ["Openness"]
888
  ending_cols = ["Date", "Logs", "Visualization"]
889
 
 
1086
  """
1087
  Builds the complete HTML string for the plot's hover tooltip.
1088
  Format: {lm_name} (SDK {version})
1089
+ Harness: {agent} (only when the row carries an Agent β€”
1090
+ Alternative Agents page only; the
1091
+ canonical OpenHands pages drop the
1092
+ Agent column in view() so this line
1093
+ is skipped there)
1094
  Average Score: {score}
1095
  Average Cost/Runtime: {value}
1096
  Openness: {openness}
1097
  """
1098
  h_pad = " "
1099
  parts = ["<br>"]
1100
+
1101
  # Get and clean the language model name
1102
  llm_base_value = row.get('Language Model', '')
1103
  llm_base_value = clean_llm_base_list(llm_base_value)
 
1105
  lm_name = llm_base_value[0]
1106
  else:
1107
  lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
1108
+
1109
  # Get SDK version
1110
  sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
1111
+
1112
  # Title line: {lm_name} (SDK {version})
1113
  parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
1114
+
1115
+ # Harness line β€” only on pages where the Agent column is present
1116
+ # (Alternative Agents). Without this, two rows for the same LM run
1117
+ # under different harnesses (e.g. Claude Code vs OpenHands Sub-agents
1118
+ # on claude-sonnet-4-5) are indistinguishable on hover.
1119
+ agent_value = row.get('Agent')
1120
+ if agent_value is not None and pd.notna(agent_value) and str(agent_value).strip():
1121
+ parts.append(f"{h_pad}Harness: <b>{agent_value}</b>{h_pad}<br>")
1122
+
1123
  # Average Score
1124
  parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
1125
 
 
1192
  y_min = min_score - 5 if min_score > 5 else 0
1193
  y_max = max_score + 5
1194
 
1195
+ # Cache base64-encoded logos across rows β€” every Claude model on the
1196
+ # Alternative Agents page points at the same assets/harnesses/claude-code.svg,
1197
+ # so decoding once per path is ~NΓ— cheaper than once per point.
1198
+ _logo_cache: dict[str, str] = {}
1199
+ def _encode_logo(path: str) -> Optional[str]:
1200
+ if path in _logo_cache:
1201
+ return _logo_cache[path]
1202
+ if not os.path.exists(path):
1203
+ return None
1204
+ try:
1205
+ with open(path, "rb") as f:
1206
+ encoded = base64.b64encode(f.read()).decode("utf-8")
1207
+ except Exception as e:
1208
+ logger.warning(f"Could not load logo {path}: {e}")
1209
+ return None
1210
+ mime = "svg+xml" if path.lower().endswith(".svg") else "png"
1211
+ uri = f"data:image/{mime};base64,{encoded}"
1212
+ _logo_cache[path] = uri
1213
+ return uri
1214
+
1215
+ # Composite markers: on the Alternative Agents page the dataframe carries
1216
+ # an "Agent" column (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents),
1217
+ # so a point for claude-sonnet-4-5 under Claude Code and under OpenHands
1218
+ # Sub-agents would otherwise share the exact same Anthropic logo marker
1219
+ # and be visually indistinguishable. When Agent is present, we stack
1220
+ # two logos at each point: model provider on top, harness on the bottom.
1221
+ # Canonical OpenHands pages drop the Agent column in view() (via the
1222
+ # has_mixed_agents check), so they fall through to the single-logo path
1223
+ # and render exactly as before.
1224
+ has_harness_column = (
1225
+ "Agent" in data_plot.columns
1226
+ and data_plot["Agent"].dropna().astype(str).str.strip().ne("").any()
1227
+ )
1228
+
1229
+ # Marker sizes. The composite variant fits two logos inside roughly the
1230
+ # same vertical footprint as a single marker, so each half is slightly
1231
+ # smaller and the two halves are offset symmetrically around the point's
1232
+ # true y-coordinate.
1233
+ SINGLE_SIZE_X, SINGLE_SIZE_Y = 0.04, 0.06
1234
+ STACKED_SIZE_X, STACKED_SIZE_Y = 0.035, 0.048
1235
+ STACKED_Y_OFFSET = 0.028 # half-separation between model (top) and harness (bottom)
1236
+
1237
  for _, row in data_plot.iterrows():
1238
  model_name = row.get('Language Model', '')
1239
  openness = row.get('Openness', '')
1240
  marker_info = get_marker_icon(model_name, openness, mark_by)
1241
+ model_logo_uri = _encode_logo(marker_info['path'])
1242
+ if model_logo_uri is None:
1243
+ continue
1244
+
1245
+ # Harness (only meaningful when the dataframe carries an Agent column).
1246
+ harness_uri = None
1247
+ if has_harness_column:
1248
+ harness_info = get_harness_icon(row.get("Agent"))
1249
+ if harness_info is not None:
1250
+ harness_uri = _encode_logo(harness_info["path"])
1251
+
1252
+ x_val = row[x_col_to_use]
1253
+ y_val = row[y_col_to_use]
1254
+
1255
+ # Convert to domain coordinates (0-1 range)
1256
+ # For log scale x: domain_x = (log10(x) - x_min_log) / (x_max_log - x_min_log)
1257
+ if x_val > 0:
1258
+ log_x = np.log10(x_val)
1259
+ domain_x = (log_x - x_min_log) / (x_max_log - x_min_log)
1260
+ else:
1261
+ domain_x = 0
1262
+
1263
+ # For linear y: domain_y = (y - y_min) / (y_max - y_min)
1264
+ domain_y = (y_val - y_min) / (y_max - y_min) if (y_max - y_min) > 0 else 0.5
1265
+
1266
+ # Clamp to valid range
1267
+ domain_x = max(0, min(1, domain_x))
1268
+ domain_y = max(0, min(1, domain_y))
1269
+
1270
+ if harness_uri is not None:
1271
+ # Composite: stack model on top, harness on bottom, clamping
1272
+ # each half to the plot area so markers near the edges don't
1273
+ # drift off-canvas.
1274
+ model_y = min(1, domain_y + STACKED_Y_OFFSET)
1275
+ harness_y = max(0, domain_y - STACKED_Y_OFFSET)
1276
+ layout_images.append(dict(
1277
+ source=model_logo_uri,
1278
+ xref="x domain", yref="y domain",
1279
+ x=domain_x, y=model_y,
1280
+ sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
1281
+ xanchor="center", yanchor="middle",
1282
+ layer="above",
1283
+ ))
1284
+ layout_images.append(dict(
1285
+ source=harness_uri,
1286
+ xref="x domain", yref="y domain",
1287
+ x=domain_x, y=harness_y,
1288
+ sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
1289
+ xanchor="center", yanchor="middle",
1290
+ layer="above",
1291
+ ))
1292
+ else:
1293
+ # Single marker (canonical OpenHands pages, or Alternative Agents
1294
+ # rows with an unknown harness name β€” the latter shouldn't happen
1295
+ # in practice since HARNESS_LOGO_STEMS covers every agent_name the
1296
+ # push-to-index script emits).
1297
+ layout_images.append(dict(
1298
+ source=model_logo_uri,
1299
+ xref="x domain", yref="y domain",
1300
+ x=domain_x, y=domain_y,
1301
+ sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
1302
+ xanchor="center", yanchor="middle",
1303
+ layer="above",
1304
+ ))
1305
 
1306
  # --- Section 7: Add Model Name Labels to Frontier Points ---
1307
  if frontier_rows:
setup_data.py CHANGED
@@ -70,27 +70,39 @@ def fetch_data_from_github():
70
 
71
  # Look for data files in the cloned repository
72
  results_source = clone_dir / "results"
73
-
74
  if not results_source.exists():
75
  print(f"Results directory not found in repository")
76
  return False
77
-
78
  # Check if there are any agent result directories
79
  result_dirs = list(results_source.iterdir())
80
  if not result_dirs:
81
  print(f"No agent results found in {results_source}")
82
  return False
83
-
84
  print(f"Found {len(result_dirs)} agent result directories")
85
-
86
  # Create target directory and copy the results structure
87
  os.makedirs(target_dir.parent, exist_ok=True)
88
  if target_dir.exists():
89
  shutil.rmtree(target_dir)
90
-
91
  # Copy the entire results directory
92
  target_results = target_dir / "results"
93
  shutil.copytree(results_source, target_results)
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
96
 
 
70
 
71
  # Look for data files in the cloned repository
72
  results_source = clone_dir / "results"
73
+
74
  if not results_source.exists():
75
  print(f"Results directory not found in repository")
76
  return False
77
+
78
  # Check if there are any agent result directories
79
  result_dirs = list(results_source.iterdir())
80
  if not result_dirs:
81
  print(f"No agent results found in {results_source}")
82
  return False
83
+
84
  print(f"Found {len(result_dirs)} agent result directories")
85
+
86
  # Create target directory and copy the results structure
87
  os.makedirs(target_dir.parent, exist_ok=True)
88
  if target_dir.exists():
89
  shutil.rmtree(target_dir)
90
+
91
  # Copy the entire results directory
92
  target_results = target_dir / "results"
93
  shutil.copytree(results_source, target_results)
94
+
95
+ # Also copy alternative_agents/ if present, so the loader can pick up
96
+ # ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
97
+ # alongside the default OpenHands agent results.
98
+ alt_source = clone_dir / "alternative_agents"
99
+ if alt_source.exists():
100
+ alt_target = target_dir / "alternative_agents"
101
+ shutil.copytree(alt_source, alt_target)
102
+ agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
103
+ print(f"Found alternative agent types: {agent_types}")
104
+ else:
105
+ print("No alternative_agents/ directory in repository (skipping)")
106
 
107
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
108
 
simple_data_loader.py CHANGED
@@ -96,17 +96,43 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
96
 
97
  class SimpleLeaderboardViewer:
98
  """Simple replacement for agent-eval's LeaderboardViewer."""
99
-
100
- def __init__(self, data_dir: str, config: str, split: str):
 
 
 
 
 
 
 
 
 
101
  """
102
  Args:
103
  data_dir: Path to data directory
104
  config: Config name (e.g., "1.0.0-dev1")
105
  split: Split name (e.g., "validation" or "test")
 
 
 
 
 
 
 
 
 
 
106
  """
 
 
 
 
 
 
107
  self.data_dir = Path(data_dir)
108
  self.config = config
109
  self.split = split
 
110
  self.config_path = self.data_dir / config
111
 
112
  # Benchmark to category mappings (single source of truth)
@@ -127,55 +153,115 @@ class SimpleLeaderboardViewer:
127
  if benchmark not in self.tag_map[category]:
128
  self.tag_map[category].append(benchmark)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _load_from_agent_dirs(self):
131
- """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
132
- results_dir = self.config_path / "results"
133
-
134
- if not results_dir.exists():
135
- return None # Fall back to old format
136
-
 
 
 
 
 
 
 
137
  all_records = []
138
  all_validation_errors = []
139
-
140
- # Iterate through each agent directory
141
- for agent_dir in results_dir.iterdir():
142
- if not agent_dir.is_dir():
143
- continue
144
-
145
- # Load and validate using pydantic models
146
- metadata, scores, errors = load_and_validate_agent_data(agent_dir)
147
-
148
- if errors:
149
- all_validation_errors.extend(errors)
150
-
151
- if metadata is None or scores is None:
152
- continue
153
-
154
- # Skip entries that are hidden from the leaderboard
155
- if metadata.get('hide_from_leaderboard', False):
156
- logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
157
- continue
158
-
159
- # Create one record per benchmark (mimicking old JSONL format)
160
- for score_entry in scores:
161
- record = {
162
- 'agent_version': metadata.get('agent_version', 'Unknown'),
163
- 'llm_base': metadata.get('model', 'unknown'),
164
- 'openness': metadata.get('openness', 'unknown'),
165
- 'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
166
- 'release_date': metadata.get('release_date', ''), # Model release date
167
- 'parameter_count_b': metadata.get('parameter_count_b'), # Total params in billions
168
- 'active_parameter_count_b': metadata.get('active_parameter_count_b'), # Active params for MoE
169
- 'score': score_entry.get('score'),
170
- 'metric': score_entry.get('metric', 'unknown'),
171
- 'cost_per_instance': score_entry.get('cost_per_instance'),
172
- 'average_runtime': score_entry.get('average_runtime'),
173
- 'tags': [score_entry.get('benchmark')],
174
- 'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
175
- 'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
176
- }
177
- all_records.append(record)
178
-
179
  # Log validation errors if any
180
  if all_validation_errors:
181
  logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
@@ -183,10 +269,10 @@ class SimpleLeaderboardViewer:
183
  logger.warning(f" - {error}")
184
  if len(all_validation_errors) > 5:
185
  logger.warning(f" ... and {len(all_validation_errors) - 5} more")
186
-
187
  if not all_records:
188
- return None # Fall back to old format
189
-
190
  return pd.DataFrame(all_records)
191
 
192
  def _load(self):
@@ -206,26 +292,36 @@ class SimpleLeaderboardViewer:
206
  # Group by agent (version + model combination) to aggregate results across datasets
207
  transformed_records = []
208
 
209
- # Create a unique identifier for each agent (version + model)
210
- df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
211
-
 
 
 
 
 
 
 
 
212
  for agent_id in df['agent_id'].unique():
213
  agent_records = df[df['agent_id'] == agent_id]
214
-
215
  # Build a single record for this agent
216
  first_record = agent_records.iloc[0]
217
  agent_version = first_record['agent_version']
218
-
 
219
  # Normalize openness to "open" or "closed"
220
  from aliases import OPENNESS_MAPPING
221
  raw_openness = first_record['openness']
222
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
223
-
224
  # All 5 categories for the leaderboard
225
  ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
226
-
227
  record = {
228
  # Core agent info - use final display names
 
229
  'SDK version': agent_version, # Will become "SDK Version"
230
  'Language model': first_record['llm_base'], # Will become "Language Model"
231
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
@@ -235,7 +331,7 @@ class SimpleLeaderboardViewer:
235
  'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
236
  'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
237
  # Additional columns expected by the transformer
238
- # Use agent_id (version_model) as unique identifier for Pareto frontier calculation
239
  'id': agent_id,
240
  'source': first_record.get('source', ''), # Will become "Source"
241
  'logs': first_record.get('logs', ''), # Will become "Logs"
 
96
 
97
  class SimpleLeaderboardViewer:
98
  """Simple replacement for agent-eval's LeaderboardViewer."""
99
+
100
+ AGENT_FILTER_OPENHANDS = "openhands"
101
+ AGENT_FILTER_ALTERNATIVE = "alternative"
102
+
103
+ def __init__(
104
+ self,
105
+ data_dir: str,
106
+ config: str,
107
+ split: str,
108
+ agent_filter: str = AGENT_FILTER_OPENHANDS,
109
+ ):
110
  """
111
  Args:
112
  data_dir: Path to data directory
113
  config: Config name (e.g., "1.0.0-dev1")
114
  split: Split name (e.g., "validation" or "test")
115
+ agent_filter: Which submissions to include.
116
+ ``"openhands"`` (default) loads only the default OpenHands
117
+ agent runs from ``results/{model}/`` β€” the canonical
118
+ leaderboard. ``"alternative"`` loads only third-party
119
+ harnesses (Claude Code / Codex / Gemini CLI / OpenHands
120
+ Sub-agents) from ``alternative_agents/{type}/{model}/``,
121
+ which power the standalone Alternative Agents page.
122
+ The two are kept on separate pages because their
123
+ cost/runtime numbers aren't apples-to-apples and mixing
124
+ them in one ranking would be misleading.
125
  """
126
+ if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
127
+ raise ValueError(
128
+ f"agent_filter must be one of "
129
+ f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
130
+ f"got {agent_filter!r}"
131
+ )
132
  self.data_dir = Path(data_dir)
133
  self.config = config
134
  self.split = split
135
+ self.agent_filter = agent_filter
136
  self.config_path = self.data_dir / config
137
 
138
  # Benchmark to category mappings (single source of truth)
 
153
  if benchmark not in self.tag_map[category]:
154
  self.tag_map[category].append(benchmark)
155
 
156
+ # Default agent_name when metadata.json doesn't carry one. Matches the
157
+ # default-agent value used by push_to_index_from_archive.py so legacy
158
+ # entries (which omit the field) still group cleanly with new entries.
159
+ DEFAULT_AGENT_NAME = "OpenHands"
160
+
161
+ def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
162
+ """Build per-benchmark records from a single agent directory.
163
+
164
+ Shared by ``_load_from_agent_dirs`` (default OpenHands results) and
165
+ ``_load_from_alternative_agents_dirs`` (acp-claude / acp-codex / etc.).
166
+ Returns ``(records, validation_errors)``. Returns an empty list of
167
+ records when the directory has no scores or is hidden from the
168
+ leaderboard.
169
+ """
170
+ records: list[dict] = []
171
+ metadata, scores, errors = load_and_validate_agent_data(agent_dir)
172
+
173
+ if metadata is None or scores is None:
174
+ return records, errors
175
+
176
+ if metadata.get('hide_from_leaderboard', False):
177
+ logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
178
+ return records, errors
179
+
180
+ # Resolve the agent display name. Prefer the value stamped into
181
+ # metadata.json by push-to-index; fall back to the directory's
182
+ # default (e.g. "Claude Code" for acp-claude/) and finally to
183
+ # "OpenHands" for legacy results/ entries that predate the field.
184
+ agent_name = (
185
+ metadata.get('agent_name')
186
+ or default_agent_name
187
+ or self.DEFAULT_AGENT_NAME
188
+ )
189
+
190
+ for score_entry in scores:
191
+ record = {
192
+ 'agent_name': agent_name,
193
+ 'agent_version': metadata.get('agent_version', 'Unknown'),
194
+ 'llm_base': metadata.get('model', 'unknown'),
195
+ 'openness': metadata.get('openness', 'unknown'),
196
+ 'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
197
+ 'release_date': metadata.get('release_date', ''),
198
+ 'parameter_count_b': metadata.get('parameter_count_b'),
199
+ 'active_parameter_count_b': metadata.get('active_parameter_count_b'),
200
+ 'score': score_entry.get('score'),
201
+ 'metric': score_entry.get('metric', 'unknown'),
202
+ 'cost_per_instance': score_entry.get('cost_per_instance'),
203
+ 'average_runtime': score_entry.get('average_runtime'),
204
+ 'tags': [score_entry.get('benchmark')],
205
+ 'full_archive': score_entry.get('full_archive', ''),
206
+ 'eval_visualization_page': score_entry.get('eval_visualization_page', ''),
207
+ }
208
+ records.append(record)
209
+ return records, errors
210
+
211
  def _load_from_agent_dirs(self):
212
+ """Load agent records based on ``self.agent_filter``.
213
+
214
+ - ``"openhands"`` (default): only ``{config}/results/{model}/``,
215
+ which is the canonical OpenHands leaderboard. The Home page and
216
+ the per-category subpages use this.
217
+ - ``"alternative"``: only
218
+ ``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
219
+ acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
220
+ Alternative Agents page uses this.
221
+
222
+ Returns ``None`` if no records were found (which makes the caller
223
+ render an empty-state placeholder).
224
+ """
225
  all_records = []
226
  all_validation_errors = []
227
+
228
+ if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
229
+ # Default OpenHands agent results
230
+ results_dir = self.config_path / "results"
231
+ if results_dir.exists():
232
+ for agent_dir in results_dir.iterdir():
233
+ if not agent_dir.is_dir():
234
+ continue
235
+ records, errors = self._records_from_agent_dir(agent_dir)
236
+ all_records.extend(records)
237
+ all_validation_errors.extend(errors)
238
+ else:
239
+ # Alternative agents (one subdirectory per agent_type, then per model)
240
+ # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
241
+ # map in OpenHands/evaluation push_to_index_from_archive.py β€” keeping
242
+ # it in sync ensures rows are labelled the same way the index repo
243
+ # records them.
244
+ agent_type_default_name = {
245
+ 'acp-claude': 'Claude Code',
246
+ 'acp-codex': 'Codex',
247
+ 'acp-gemini': 'Gemini CLI',
248
+ 'openhands_subagents': 'OpenHands Sub-agents',
249
+ }
250
+ alt_dir = self.config_path / "alternative_agents"
251
+ if alt_dir.exists():
252
+ for type_dir in alt_dir.iterdir():
253
+ if not type_dir.is_dir():
254
+ continue
255
+ default_name = agent_type_default_name.get(type_dir.name)
256
+ for agent_dir in type_dir.iterdir():
257
+ if not agent_dir.is_dir():
258
+ continue
259
+ records, errors = self._records_from_agent_dir(
260
+ agent_dir, default_agent_name=default_name
261
+ )
262
+ all_records.extend(records)
263
+ all_validation_errors.extend(errors)
264
+
 
 
265
  # Log validation errors if any
266
  if all_validation_errors:
267
  logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
 
269
  logger.warning(f" - {error}")
270
  if len(all_validation_errors) > 5:
271
  logger.warning(f" ... and {len(all_validation_errors) - 5} more")
272
+
273
  if not all_records:
274
+ return None # Caller will render empty-state placeholder
275
+
276
  return pd.DataFrame(all_records)
277
 
278
  def _load(self):
 
292
  # Group by agent (version + model combination) to aggregate results across datasets
293
  transformed_records = []
294
 
295
+ # Create a unique identifier per (agent_name, agent_version, model)
296
+ # tuple. Including agent_name keeps an OpenHands run and a Claude
297
+ # Code run on the same SDK version + model from collapsing into
298
+ # one row when both submit to the leaderboard.
299
+ df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
300
+ df['agent_id'] = (
301
+ df['agent_name'].astype(str)
302
+ + '_' + df['agent_version'].astype(str)
303
+ + '_' + df['llm_base'].astype(str)
304
+ )
305
+
306
  for agent_id in df['agent_id'].unique():
307
  agent_records = df[df['agent_id'] == agent_id]
308
+
309
  # Build a single record for this agent
310
  first_record = agent_records.iloc[0]
311
  agent_version = first_record['agent_version']
312
+ agent_name = first_record['agent_name']
313
+
314
  # Normalize openness to "open" or "closed"
315
  from aliases import OPENNESS_MAPPING
316
  raw_openness = first_record['openness']
317
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
318
+
319
  # All 5 categories for the leaderboard
320
  ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
321
+
322
  record = {
323
  # Core agent info - use final display names
324
+ 'agent_name': agent_name, # Will become "Agent"
325
  'SDK version': agent_version, # Will become "SDK Version"
326
  'Language model': first_record['llm_base'], # Will become "Language Model"
327
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
 
331
  'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
332
  'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
333
  # Additional columns expected by the transformer
334
+ # Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
335
  'id': agent_id,
336
  'source': first_record.get('source', ''), # Will become "Source"
337
  'logs': first_record.get('logs', ''), # Will become "Logs"
ui_components.py CHANGED
@@ -508,28 +508,36 @@ class DummyViewer:
508
  # The _load method returns the error DataFrame and an empty tag map
509
  return self._error_df, {}
510
 
511
- def get_leaderboard_viewer_instance(split: str):
 
 
 
512
  """
513
- Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
514
- re-downloading data. On error, returns a stable DummyViewer object.
 
 
515
  """
516
  global CACHED_VIEWERS, CACHED_TAG_MAPS
517
 
 
 
518
  with _cache_lock:
519
- if split in CACHED_VIEWERS:
520
  # Cache hit: return the cached viewer and tag map
521
- return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
522
 
523
  # --- Cache miss: try to load data from the source ---
524
  try:
525
  # First try to load from extracted data directory (local mock data)
526
  data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
527
-
528
- print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
529
  viewer = SimpleLeaderboardViewer(
530
  data_dir=data_dir,
531
  config=CONFIG_NAME,
532
- split=split
 
533
  )
534
 
535
  # Simplify tag map creation
@@ -537,14 +545,14 @@ def get_leaderboard_viewer_instance(split: str):
537
 
538
  # Cache the results for next time (thread-safe)
539
  with _cache_lock:
540
- CACHED_VIEWERS[split] = viewer
541
- CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
542
 
543
  return viewer, pretty_tag_map
544
 
545
  except Exception as e:
546
  # On ANY error, create a consistent error message and cache a DummyViewer
547
- error_message = f"Error loading data for split '{split}': {e}"
548
  print(format_error(error_message))
549
 
550
  dummy_df = pd.DataFrame({"Message": [error_message]})
@@ -553,8 +561,8 @@ def get_leaderboard_viewer_instance(split: str):
553
 
554
  # Cache the dummy objects so we don't try to fetch again on this run
555
  with _cache_lock:
556
- CACHED_VIEWERS[split] = dummy_viewer
557
- CACHED_TAG_MAPS[split] = dummy_tag_map
558
 
559
  return dummy_viewer, dummy_tag_map
560
 
@@ -1268,12 +1276,17 @@ def create_benchmark_details_display(
1268
  legend_markdown = create_legend_markdown(benchmark_name)
1269
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
1270
 
1271
- def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
 
 
 
1272
  """
1273
- Loads and transforms the complete dataset for a given split.
1274
- This function handles caching and returns the final "pretty" DataFrame and tag map.
 
 
1275
  """
1276
- viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
1277
 
1278
  if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
1279
  raw_df, _ = viewer_or_data._load()
 
508
  # The _load method returns the error DataFrame and an empty tag map
509
  return self._error_df, {}
510
 
511
+ def get_leaderboard_viewer_instance(
512
+ split: str,
513
+ agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
514
+ ):
515
  """
516
+ Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
517
+ thread-safe cache to avoid re-downloading data. The cache is keyed on
518
+ both axes so the OpenHands and Alternative Agents pages don't fight
519
+ over a single slot. On error, returns a stable DummyViewer.
520
  """
521
  global CACHED_VIEWERS, CACHED_TAG_MAPS
522
 
523
+ cache_key = (split, agent_filter)
524
+
525
  with _cache_lock:
526
+ if cache_key in CACHED_VIEWERS:
527
  # Cache hit: return the cached viewer and tag map
528
+ return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
529
 
530
  # --- Cache miss: try to load data from the source ---
531
  try:
532
  # First try to load from extracted data directory (local mock data)
533
  data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
534
+
535
+ print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
536
  viewer = SimpleLeaderboardViewer(
537
  data_dir=data_dir,
538
  config=CONFIG_NAME,
539
+ split=split,
540
+ agent_filter=agent_filter,
541
  )
542
 
543
  # Simplify tag map creation
 
545
 
546
  # Cache the results for next time (thread-safe)
547
  with _cache_lock:
548
+ CACHED_VIEWERS[cache_key] = viewer
549
+ CACHED_TAG_MAPS[cache_key] = pretty_tag_map # Cache the pretty map directly
550
 
551
  return viewer, pretty_tag_map
552
 
553
  except Exception as e:
554
  # On ANY error, create a consistent error message and cache a DummyViewer
555
+ error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
556
  print(format_error(error_message))
557
 
558
  dummy_df = pd.DataFrame({"Message": [error_message]})
 
561
 
562
  # Cache the dummy objects so we don't try to fetch again on this run
563
  with _cache_lock:
564
+ CACHED_VIEWERS[cache_key] = dummy_viewer
565
+ CACHED_TAG_MAPS[cache_key] = dummy_tag_map
566
 
567
  return dummy_viewer, dummy_tag_map
568
 
 
1276
  legend_markdown = create_legend_markdown(benchmark_name)
1277
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
1278
 
1279
+ def get_full_leaderboard_data(
1280
+ split: str,
1281
+ agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
1282
+ ) -> tuple[pd.DataFrame, dict]:
1283
  """
1284
+ Loads and transforms the complete dataset for a (split, agent_filter)
1285
+ pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
1286
+ that don't pass it stay on the canonical leaderboard. The Alternative
1287
+ Agents page passes ``"alternative"`` to get the third-party harnesses.
1288
  """
1289
+ viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
1290
 
1291
  if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
1292
  raw_df, _ = viewer_or_data._load()