simonrosenberg1 committed on
Commit
d7034b8
·
verified ·
1 Parent(s): bef7ade

Move alternative agents to a dedicated page

Browse files

Address review feedback from the original commit:

- Mixing default OpenHands runs and alternative-agent runs (Claude Code,
Codex, Gemini CLI, OpenHands Sub-agents) in the same leaderboard table
and the same scatter plot is misleading. Their cost/runtime numbers
aren't apples-to-apples across harnesses, and conflating them with the
OpenHands ranking on Home obscures what the index is actually
measuring.
- Move alternative-agent rows to a dedicated /alternative-agents page.
Home and the per-category subpages stay exactly as before
(default-OpenHands only).

Changes:

- simple_data_loader.SimpleLeaderboardViewer now takes an `agent_filter`
argument, defaulting to "openhands". With "openhands" it walks only
results/{model}/; with "alternative" it walks only
alternative_agents/{type}/{model}/. Any other value raises ValueError.
- ui_components.get_leaderboard_viewer_instance / get_full_leaderboard_data
thread `agent_filter` through and key the in-memory cache on
(split, agent_filter) so the two pages don't fight over a single slot.
- New alternative_agents_page.build_page renders the same
create_leaderboard_display UI but with agent_filter="alternative".
Single Overall view, no per-category subpages (the dataset is small
and the goal is "show me all the alternatives at a glance").
- app.py wires `with demo.route('Alternative Agents', '/alternative-agents'):`
between Information Gathering and About in the top nav.
- leaderboard_transformer.view_table only inserts the "Agent" column
when that column holds more than one distinct non-null value. The
OpenHands pages now hide it (every row would say "OpenHands"); the
Alternative Agents page shows it whenever its rows span more than one agent.

Local verification:

agent_filter=openhands -> 22 rows, all agent_name=OpenHands
agent_filter=alternative -> 7 rows: 2 Claude Code, 1 Codex, 2 OpenHands Sub-agents,
2 mislabelled Gemini entries (data fix needed
in openhands-index-results)

alternative_agents_page.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Alternative Agents leaderboard page.
2
+
3
+ The canonical OpenHands Index leaderboard (Home + the per-category pages)
4
+ ranks default OpenHands agent runs from ``results/{model}/`` in the
5
+ openhands-index-results repo. Third-party harnesses (Claude Code, Codex,
6
+ Gemini CLI, OpenHands Sub-agents, ...) live under
7
+ ``alternative_agents/{type}/{model}/`` and aren't directly comparable to
8
+ default OpenHands runs (different scaffolds, different cost/runtime
9
+ characteristics), so they get their own standalone page instead of being
10
+ mixed into the same ranking.
11
+
12
+ This page is intentionally a single Overall view (no per-category
13
+ subpages) — the alternative-agents dataset is small (one row per
14
+ harness × model) and the goal is "show me all the alternatives at a
15
+ glance", not "drill into Issue Resolution for Codex".
16
+ """
17
+ import matplotlib
18
+ matplotlib.use('Agg')
19
+ import gradio as gr
20
+
21
+ from simple_data_loader import SimpleLeaderboardViewer
22
+ from ui_components import (
23
+ create_leaderboard_display,
24
+ get_full_leaderboard_data,
25
+ )
26
+
27
+
28
+ ALTERNATIVE_AGENTS_INTRO = """
29
+ <div id="alternative-agents-intro">
30
+ <h2>Alternative Agents</h2>
31
+ <p>
32
+ Third-party agent harnesses running the OpenHands Index benchmarks.
33
+ These rows aren't part of the OpenHands ranking on the
34
+ <a href="/home">Home</a> page — they're tracked here as a comparison
35
+ point. Cost and runtime numbers come from each harness's own
36
+ instrumentation and aren't directly comparable across harnesses.
37
+ </p>
38
+ </div>
39
+ """
40
+
41
+
42
+ def build_page():
43
+ gr.HTML(ALTERNATIVE_AGENTS_INTRO)
44
+
45
+ gr.Markdown("---")
46
+
47
+ test_df, test_tag_map = get_full_leaderboard_data(
48
+ "test",
49
+ agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_ALTERNATIVE,
50
+ )
51
+
52
+ if test_df.empty:
53
+ gr.Markdown(
54
+ "No alternative agent submissions yet. New runs land in "
55
+ "`alternative_agents/{type}/{model}/` in "
56
+ "[openhands-index-results](https://github.com/OpenHands/openhands-index-results)."
57
+ )
58
+ return
59
+
60
+ create_leaderboard_display(
61
+ full_df=test_df,
62
+ tag_map=test_tag_map,
63
+ category_name="Overall",
64
+ split_name="test",
65
+ )
app.py CHANGED
@@ -35,6 +35,7 @@ from app_creation import build_page as build_app_creation_page
35
  from frontend_development import build_page as build_frontend_page
36
  from test_generation import build_page as build_test_generation_page
37
  from information_gathering import build_page as build_information_gathering_page
 
38
  from about import build_page as build_about_page
39
 
40
  logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
@@ -373,6 +374,9 @@ with demo.route("Testing", "/testing"):
373
  with demo.route("Information Gathering", "/information-gathering"):
374
  build_information_gathering_page()
375
 
 
 
 
376
  with demo.route("About", "/about"):
377
  build_about_page()
378
 
 
35
  from frontend_development import build_page as build_frontend_page
36
  from test_generation import build_page as build_test_generation_page
37
  from information_gathering import build_page as build_information_gathering_page
38
+ from alternative_agents_page import build_page as build_alternative_agents_page
39
  from about import build_page as build_about_page
40
 
41
  logger.info(f"All modules imported (LOCAL_DEBUG={LOCAL_DEBUG})")
 
374
  with demo.route("Information Gathering", "/information-gathering"):
375
  build_information_gathering_page()
376
 
377
+ with demo.route("Alternative Agents", "/alternative-agents"):
378
+ build_alternative_agents_page()
379
+
380
  with demo.route("About", "/about"):
381
  build_about_page()
382
 
leaderboard_transformer.py CHANGED
@@ -816,11 +816,21 @@ class DataTransformer:
816
  df_view = df_sorted.copy()
817
 
818
  # --- 3. Add Columns for Agent Openness ---
819
- # "Agent" sits between id and Language Model so OpenHands vs
820
- # alternative agents (Claude Code / Codex / Gemini CLI) are visible
821
- # at a glance, and so the same model with two different agents
822
- # doesn't look like a duplicate row.
823
- base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
 
 
 
 
 
 
 
 
 
 
824
  new_cols = ["Openness"]
825
  ending_cols = ["Date", "Logs", "Visualization"]
826
 
 
816
  df_view = df_sorted.copy()
817
 
818
  # --- 3. Add Columns for Agent Openness ---
819
+ # Only include the "Agent" column when the dataframe actually has
820
+ # more than one distinct agent. On the canonical OpenHands pages
821
+ # every row says "OpenHands", so adding the column is just noise;
822
+ # on the Alternative Agents page rows differ (Claude Code / Codex
823
+ # / Gemini CLI / OpenHands Sub-agents), so the column carries
824
+ # signal and disambiguates same-model rows from different
825
+ # harnesses.
826
+ has_mixed_agents = (
827
+ "Agent" in df_view.columns
828
+ and df_view["Agent"].dropna().nunique() > 1
829
+ )
830
+ if has_mixed_agents:
831
+ base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
832
+ else:
833
+ base_cols = ["id", "Language Model", "SDK Version", "Source"]
834
  new_cols = ["Openness"]
835
  ending_cols = ["Date", "Logs", "Visualization"]
836
 
simple_data_loader.py CHANGED
@@ -96,17 +96,43 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
96
 
97
  class SimpleLeaderboardViewer:
98
  """Simple replacement for agent-eval's LeaderboardViewer."""
99
-
100
- def __init__(self, data_dir: str, config: str, split: str):
 
 
 
 
 
 
 
 
 
101
  """
102
  Args:
103
  data_dir: Path to data directory
104
  config: Config name (e.g., "1.0.0-dev1")
105
  split: Split name (e.g., "validation" or "test")
 
 
 
 
 
 
 
 
 
 
106
  """
 
 
 
 
 
 
107
  self.data_dir = Path(data_dir)
108
  self.config = config
109
  self.split = split
 
110
  self.config_path = self.data_dir / config
111
 
112
  # Benchmark to category mappings (single source of truth)
@@ -183,52 +209,58 @@ class SimpleLeaderboardViewer:
183
  return records, errors
184
 
185
  def _load_from_agent_dirs(self):
186
- """Load default-agent results plus any alternative_agents/ entries.
 
 
 
 
 
 
 
 
187
 
188
- Reads ``{config}/results/{model}/`` for default OpenHands runs and
189
- ``{config}/alternative_agents/{type}/{model}/`` for ACP agent runs
190
- (acp-claude, acp-codex, acp-gemini, ...) so they all surface in the
191
- same leaderboard. Returns ``None`` if neither directory yields any
192
- records (which makes the caller render an empty-state placeholder).
193
  """
194
  all_records = []
195
  all_validation_errors = []
196
 
197
- # 1. Default OpenHands agent results
198
- results_dir = self.config_path / "results"
199
- if results_dir.exists():
200
- for agent_dir in results_dir.iterdir():
201
- if not agent_dir.is_dir():
202
- continue
203
- records, errors = self._records_from_agent_dir(agent_dir)
204
- all_records.extend(records)
205
- all_validation_errors.extend(errors)
206
-
207
- # 2. Alternative agents (one subdirectory per agent_type, then per model)
208
- # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE map
209
- # in OpenHands/evaluation push_to_index_from_archive.py — keeping it
210
- # in sync ensures rows are labelled the same way the index repo
211
- # records them.
212
- agent_type_default_name = {
213
- 'acp-claude': 'Claude Code',
214
- 'acp-codex': 'Codex',
215
- 'acp-gemini': 'Gemini CLI',
216
- 'openhands_subagents': 'OpenHands Sub-agents',
217
- }
218
- alt_dir = self.config_path / "alternative_agents"
219
- if alt_dir.exists():
220
- for type_dir in alt_dir.iterdir():
221
- if not type_dir.is_dir():
222
- continue
223
- default_name = agent_type_default_name.get(type_dir.name)
224
- for agent_dir in type_dir.iterdir():
225
  if not agent_dir.is_dir():
226
  continue
227
- records, errors = self._records_from_agent_dir(
228
- agent_dir, default_agent_name=default_name
229
- )
230
  all_records.extend(records)
231
  all_validation_errors.extend(errors)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
  # Log validation errors if any
234
  if all_validation_errors:
 
96
 
97
  class SimpleLeaderboardViewer:
98
  """Simple replacement for agent-eval's LeaderboardViewer."""
99
+
100
+ AGENT_FILTER_OPENHANDS = "openhands"
101
+ AGENT_FILTER_ALTERNATIVE = "alternative"
102
+
103
+ def __init__(
104
+ self,
105
+ data_dir: str,
106
+ config: str,
107
+ split: str,
108
+ agent_filter: str = AGENT_FILTER_OPENHANDS,
109
+ ):
110
  """
111
  Args:
112
  data_dir: Path to data directory
113
  config: Config name (e.g., "1.0.0-dev1")
114
  split: Split name (e.g., "validation" or "test")
115
+ agent_filter: Which submissions to include.
116
+ ``"openhands"`` (default) loads only the default OpenHands
117
+ agent runs from ``results/{model}/`` — the canonical
118
+ leaderboard. ``"alternative"`` loads only third-party
119
+ harnesses (Claude Code / Codex / Gemini CLI / OpenHands
120
+ Sub-agents) from ``alternative_agents/{type}/{model}/``,
121
+ which power the standalone Alternative Agents page.
122
+ The two are kept on separate pages because their
123
+ cost/runtime numbers aren't apples-to-apples and mixing
124
+ them in one ranking would be misleading.
125
  """
126
+ if agent_filter not in (self.AGENT_FILTER_OPENHANDS, self.AGENT_FILTER_ALTERNATIVE):
127
+ raise ValueError(
128
+ f"agent_filter must be one of "
129
+ f"{{{self.AGENT_FILTER_OPENHANDS!r}, {self.AGENT_FILTER_ALTERNATIVE!r}}}, "
130
+ f"got {agent_filter!r}"
131
+ )
132
  self.data_dir = Path(data_dir)
133
  self.config = config
134
  self.split = split
135
+ self.agent_filter = agent_filter
136
  self.config_path = self.data_dir / config
137
 
138
  # Benchmark to category mappings (single source of truth)
 
209
  return records, errors
210
 
211
  def _load_from_agent_dirs(self):
212
+ """Load agent records based on ``self.agent_filter``.
213
+
214
+ - ``"openhands"`` (default): only ``{config}/results/{model}/``,
215
+ which is the canonical OpenHands leaderboard. The Home page and
216
+ the per-category subpages use this.
217
+ - ``"alternative"``: only
218
+ ``{config}/alternative_agents/{type}/{model}/`` (acp-claude,
219
+ acp-codex, acp-gemini, openhands_subagents, ...). The dedicated
220
+ Alternative Agents page uses this.
221
 
222
+ Returns ``None`` if no records were found (which makes the caller
223
+ render an empty-state placeholder).
 
 
 
224
  """
225
  all_records = []
226
  all_validation_errors = []
227
 
228
+ if self.agent_filter == self.AGENT_FILTER_OPENHANDS:
229
+ # Default OpenHands agent results
230
+ results_dir = self.config_path / "results"
231
+ if results_dir.exists():
232
+ for agent_dir in results_dir.iterdir():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  if not agent_dir.is_dir():
234
  continue
235
+ records, errors = self._records_from_agent_dir(agent_dir)
 
 
236
  all_records.extend(records)
237
  all_validation_errors.extend(errors)
238
+ else:
239
+ # Alternative agents (one subdirectory per agent_type, then per model)
240
+ # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE
241
+ # map in OpenHands/evaluation push_to_index_from_archive.py — keeping
242
+ # it in sync ensures rows are labelled the same way the index repo
243
+ # records them.
244
+ agent_type_default_name = {
245
+ 'acp-claude': 'Claude Code',
246
+ 'acp-codex': 'Codex',
247
+ 'acp-gemini': 'Gemini CLI',
248
+ 'openhands_subagents': 'OpenHands Sub-agents',
249
+ }
250
+ alt_dir = self.config_path / "alternative_agents"
251
+ if alt_dir.exists():
252
+ for type_dir in alt_dir.iterdir():
253
+ if not type_dir.is_dir():
254
+ continue
255
+ default_name = agent_type_default_name.get(type_dir.name)
256
+ for agent_dir in type_dir.iterdir():
257
+ if not agent_dir.is_dir():
258
+ continue
259
+ records, errors = self._records_from_agent_dir(
260
+ agent_dir, default_agent_name=default_name
261
+ )
262
+ all_records.extend(records)
263
+ all_validation_errors.extend(errors)
264
 
265
  # Log validation errors if any
266
  if all_validation_errors:
ui_components.py CHANGED
@@ -508,28 +508,36 @@ class DummyViewer:
508
  # The _load method returns the error DataFrame and an empty tag map
509
  return self._error_df, {}
510
 
511
- def get_leaderboard_viewer_instance(split: str):
 
 
 
512
  """
513
- Fetches the LeaderboardViewer for a split, using a thread-safe cache to avoid
514
- re-downloading data. On error, returns a stable DummyViewer object.
 
 
515
  """
516
  global CACHED_VIEWERS, CACHED_TAG_MAPS
517
 
 
 
518
  with _cache_lock:
519
- if split in CACHED_VIEWERS:
520
  # Cache hit: return the cached viewer and tag map
521
- return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})
522
 
523
  # --- Cache miss: try to load data from the source ---
524
  try:
525
  # First try to load from extracted data directory (local mock data)
526
  data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
527
-
528
- print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
529
  viewer = SimpleLeaderboardViewer(
530
  data_dir=data_dir,
531
  config=CONFIG_NAME,
532
- split=split
 
533
  )
534
 
535
  # Simplify tag map creation
@@ -537,14 +545,14 @@ def get_leaderboard_viewer_instance(split: str):
537
 
538
  # Cache the results for next time (thread-safe)
539
  with _cache_lock:
540
- CACHED_VIEWERS[split] = viewer
541
- CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly
542
 
543
  return viewer, pretty_tag_map
544
 
545
  except Exception as e:
546
  # On ANY error, create a consistent error message and cache a DummyViewer
547
- error_message = f"Error loading data for split '{split}': {e}"
548
  print(format_error(error_message))
549
 
550
  dummy_df = pd.DataFrame({"Message": [error_message]})
@@ -553,8 +561,8 @@ def get_leaderboard_viewer_instance(split: str):
553
 
554
  # Cache the dummy objects so we don't try to fetch again on this run
555
  with _cache_lock:
556
- CACHED_VIEWERS[split] = dummy_viewer
557
- CACHED_TAG_MAPS[split] = dummy_tag_map
558
 
559
  return dummy_viewer, dummy_tag_map
560
 
@@ -1268,12 +1276,17 @@ def create_benchmark_details_display(
1268
  legend_markdown = create_legend_markdown(benchmark_name)
1269
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
1270
 
1271
- def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
 
 
 
1272
  """
1273
- Loads and transforms the complete dataset for a given split.
1274
- This function handles caching and returns the final "pretty" DataFrame and tag map.
 
 
1275
  """
1276
- viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)
1277
 
1278
  if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
1279
  raw_df, _ = viewer_or_data._load()
 
508
  # The _load method returns the error DataFrame and an empty tag map
509
  return self._error_df, {}
510
 
511
+ def get_leaderboard_viewer_instance(
512
+ split: str,
513
+ agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
514
+ ):
515
  """
516
+ Fetches the LeaderboardViewer for a (split, agent_filter) pair, using a
517
+ thread-safe cache to avoid re-downloading data. The cache is keyed on
518
+ both axes so the OpenHands and Alternative Agents pages don't fight
519
+ over a single slot. On error, returns a stable DummyViewer.
520
  """
521
  global CACHED_VIEWERS, CACHED_TAG_MAPS
522
 
523
+ cache_key = (split, agent_filter)
524
+
525
  with _cache_lock:
526
+ if cache_key in CACHED_VIEWERS:
527
  # Cache hit: return the cached viewer and tag map
528
+ return CACHED_VIEWERS[cache_key], CACHED_TAG_MAPS.get(cache_key, {"Overall": []})
529
 
530
  # --- Cache miss: try to load data from the source ---
531
  try:
532
  # First try to load from extracted data directory (local mock data)
533
  data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
534
+
535
+ print(f"Loading data for split '{split}' (agent_filter={agent_filter}) from: {data_dir}/{CONFIG_NAME}")
536
  viewer = SimpleLeaderboardViewer(
537
  data_dir=data_dir,
538
  config=CONFIG_NAME,
539
+ split=split,
540
+ agent_filter=agent_filter,
541
  )
542
 
543
  # Simplify tag map creation
 
545
 
546
  # Cache the results for next time (thread-safe)
547
  with _cache_lock:
548
+ CACHED_VIEWERS[cache_key] = viewer
549
+ CACHED_TAG_MAPS[cache_key] = pretty_tag_map # Cache the pretty map directly
550
 
551
  return viewer, pretty_tag_map
552
 
553
  except Exception as e:
554
  # On ANY error, create a consistent error message and cache a DummyViewer
555
+ error_message = f"Error loading data for split '{split}' (agent_filter={agent_filter}): {e}"
556
  print(format_error(error_message))
557
 
558
  dummy_df = pd.DataFrame({"Message": [error_message]})
 
561
 
562
  # Cache the dummy objects so we don't try to fetch again on this run
563
  with _cache_lock:
564
+ CACHED_VIEWERS[cache_key] = dummy_viewer
565
+ CACHED_TAG_MAPS[cache_key] = dummy_tag_map
566
 
567
  return dummy_viewer, dummy_tag_map
568
 
 
1276
  legend_markdown = create_legend_markdown(benchmark_name)
1277
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
1278
 
1279
+ def get_full_leaderboard_data(
1280
+ split: str,
1281
+ agent_filter: str = SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
1282
+ ) -> tuple[pd.DataFrame, dict]:
1283
  """
1284
+ Loads and transforms the complete dataset for a (split, agent_filter)
1285
+ pair. ``agent_filter`` defaults to ``"openhands"`` so existing pages
1286
+ that don't pass it stay on the canonical leaderboard. The Alternative
1287
+ Agents page passes ``"alternative"`` to get the third-party harnesses.
1288
  """
1289
+ viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split, agent_filter=agent_filter)
1290
 
1291
  if isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
1292
  raw_df, _ = viewer_or_data._load()