openhands openhands committed on
Commit
5998027
·
1 Parent(s): 55da48c

feat: Update leaderboard calculations and add incomplete entries toggle

Browse files

Changes:
1. Rename 'OpenHands Version' to 'SDK Version'
2. Rename 'Overall Score' to 'Average Score' - now divides by 5 regardless
of categories completed (missing categories count as 0)
3. Rename 'Overall Cost' to 'Total Cost' - now sums all category costs
4. Add 'Show incomplete entries' toggle (default: hidden) to filter
entries that don't have all 5 categories submitted
5. Fix agent grouping to use version+model combination instead of just
version (fixes issue where different models with same SDK version
were incorrectly merged)
6. Fix model_dump() to use mode='json' for proper enum serialization
7. Track 'categories_completed' count per agent

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (3) hide show
  1. leaderboard_transformer.py +16 -12
  2. simple_data_loader.py +42 -27
  3. ui_components.py +106 -67
leaderboard_transformer.py CHANGED
@@ -96,19 +96,23 @@ def _pretty_column_name(raw_col: str) -> str:
96
  """
97
  Takes a raw column name from the DataFrame and returns a "pretty" version.
98
  Handles three cases:
99
- 1. Fixed names (e.g., 'Openhands version' -> 'OpenHands Version', 'Language model' -> 'Language Model').
100
  2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
101
  3. Fallback for any other names.
102
  """
103
  # Case 1: Handle fixed, special-case mappings first.
104
  fixed_mappings = {
105
  'id': 'id',
106
- 'Openhands version': 'OpenHands Version',
 
107
  'Language model': 'Language Model',
108
  'Agent description': 'Agent Description',
109
  'Submission date': 'Date',
110
- 'Overall': 'Overall Score',
111
- 'Overall cost': 'Overall Cost',
 
 
 
112
  'Logs': 'Logs',
113
  'Openness': 'Openness',
114
  'LLM base': 'Model',
@@ -256,7 +260,7 @@ class DataTransformer:
256
  df_view = df_sorted.copy()
257
 
258
  # --- 3. Add Columns for Agent Openness ---
259
- base_cols = ["id","Language Model","OpenHands Version","Source"]
260
  new_cols = ["Openness"]
261
  ending_cols = ["Date", "Logs"]
262
 
@@ -310,7 +314,7 @@ class DataTransformer:
310
  data=df_view,
311
  x=primary_cost_col,
312
  y=primary_score_col,
313
- agent_col="OpenHands Version",
314
  name=primary_metric
315
  ) if use_plotly else go.Figure()
316
  # Use a consistent key for easy retrieval later
@@ -324,7 +328,7 @@ class DataTransformer:
324
  plots['scatter_plot'] = go.Figure()
325
  return df_view, plots
326
 
327
- DEFAULT_Y_COLUMN = "Overall Score"
328
  DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
329
 
330
  def _plot_scatter_plotly(
@@ -551,7 +555,7 @@ def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
551
  - If both cost and score are null, it becomes "Not Attempted".
552
  Args:
553
  df: The DataFrame to modify.
554
- cost_col_name: The name of the cost column to format (e.g., "Overall Cost").
555
  Returns:
556
  The DataFrame with the formatted cost column.
557
  """
@@ -584,10 +588,10 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
584
  Applies custom formatting to a score column for display.
585
  - If a score is 0 or NaN, it's displayed as a colored "0".
586
  - Other scores are formatted to two decimal places.
587
- - Overall Score values are displayed in bold.
588
  """
589
  status_color = "#ec4899" # The same color as your other status text
590
- is_overall_score = (score_col_name == "Overall Score")
591
 
592
  def apply_formatting(score_value):
593
  # Explicitly handle missing values without turning them into zeros
@@ -601,8 +605,8 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
601
  else:
602
  formatted = str(score_value)
603
 
604
- # Make Overall Score bold
605
- if is_overall_score and score_value != 0:
606
  return f"<strong>{formatted}</strong>"
607
  return formatted
608
 
 
96
  """
97
  Takes a raw column name from the DataFrame and returns a "pretty" version.
98
  Handles three cases:
99
+ 1. Fixed names (e.g., 'SDK version' -> 'SDK Version', 'Language model' -> 'Language Model').
100
  2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
101
  3. Fallback for any other names.
102
  """
103
  # Case 1: Handle fixed, special-case mappings first.
104
  fixed_mappings = {
105
  'id': 'id',
106
+ 'SDK version': 'SDK Version',
107
+ 'Openhands version': 'SDK Version', # Legacy support
108
  'Language model': 'Language Model',
109
  'Agent description': 'Agent Description',
110
  'Submission date': 'Date',
111
+ 'average score': 'Average Score',
112
+ 'Overall': 'Average Score', # Legacy support
113
+ 'total cost': 'Total Cost',
114
+ 'Overall cost': 'Total Cost', # Legacy support
115
+ 'categories_completed': 'Categories Completed',
116
  'Logs': 'Logs',
117
  'Openness': 'Openness',
118
  'LLM base': 'Model',
 
260
  df_view = df_sorted.copy()
261
 
262
  # --- 3. Add Columns for Agent Openness ---
263
+ base_cols = ["id","Language Model","SDK Version","Source"]
264
  new_cols = ["Openness"]
265
  ending_cols = ["Date", "Logs"]
266
 
 
314
  data=df_view,
315
  x=primary_cost_col,
316
  y=primary_score_col,
317
+ agent_col="SDK Version",
318
  name=primary_metric
319
  ) if use_plotly else go.Figure()
320
  # Use a consistent key for easy retrieval later
 
328
  plots['scatter_plot'] = go.Figure()
329
  return df_view, plots
330
 
331
+ DEFAULT_Y_COLUMN = "Average Score"
332
  DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
333
 
334
  def _plot_scatter_plotly(
 
555
  - If both cost and score are null, it becomes "Not Attempted".
556
  Args:
557
  df: The DataFrame to modify.
558
+ cost_col_name: The name of the cost column to format (e.g., "Total Cost").
559
  Returns:
560
  The DataFrame with the formatted cost column.
561
  """
 
588
  Applies custom formatting to a score column for display.
589
  - If a score is 0 or NaN, it's displayed as a colored "0".
590
  - Other scores are formatted to two decimal places.
591
+ - Average Score values are displayed in bold.
592
  """
593
  status_color = "#ec4899" # The same color as your other status text
594
+ is_average_score = (score_col_name == "Average Score")
595
 
596
  def apply_formatting(score_value):
597
  # Explicitly handle missing values without turning them into zeros
 
605
  else:
606
  formatted = str(score_value)
607
 
608
+ # Make Average Score bold
609
+ if is_average_score and score_value != 0:
610
  return f"<strong>{formatted}</strong>"
611
  return formatted
612
 
simple_data_loader.py CHANGED
@@ -65,7 +65,8 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
65
  if _ensure_schema_models() and Metadata and ScoreEntry:
66
  try:
67
  validated_metadata = Metadata(**metadata_raw)
68
- metadata_dict = validated_metadata.model_dump()
 
69
  except Exception as e:
70
  errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
71
  metadata_dict = metadata_raw # Fall back to raw data
@@ -74,7 +75,8 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
74
  for i, score in enumerate(scores_raw):
75
  try:
76
  validated_score = ScoreEntry(**score)
77
- validated_scores.append(validated_score.model_dump())
 
78
  except Exception as e:
79
  errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
80
  validated_scores.append(score) # Fall back to raw data
@@ -223,23 +225,30 @@ class SimpleLeaderboardViewer:
223
  try:
224
 
225
  # Transform to expected format for leaderboard
226
- # Group by agent to aggregate results across datasets
227
  transformed_records = []
228
 
229
- for agent_version in df['agent_version'].unique():
230
- agent_records = df[df['agent_version'] == agent_version]
 
 
 
231
 
232
  # Build a single record for this agent
233
  first_record = agent_records.iloc[0]
 
234
 
235
  # Normalize openness to "open" or "closed"
236
  from aliases import OPENNESS_MAPPING
237
  raw_openness = first_record['openness']
238
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
239
 
 
 
 
240
  record = {
241
  # Core agent info - use final display names
242
- 'Openhands version': agent_version, # Will become "OpenHands Version"
243
  'Language model': first_record['llm_base'], # Will become "Language Model"
244
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
245
  'date': first_record['submission_time'], # Will become "Date"
@@ -273,30 +282,36 @@ class SimpleLeaderboardViewer:
273
  category_data[category]['scores'].append(row['score'])
274
  category_data[category]['costs'].append(row['total_cost'])
275
 
276
- # Calculate category-level aggregates
277
- category_avg_scores = []
278
- category_avg_costs = []
279
- for category, data in category_data.items():
280
- if data['scores']:
 
281
  avg_score = sum(data['scores']) / len(data['scores'])
282
  record[f'{category} score'] = avg_score
283
- category_avg_scores.append(avg_score)
284
- if data['costs']:
285
- avg_cost = sum(data['costs']) / len(data['costs'])
286
- record[f'{category} cost'] = avg_cost
287
- category_avg_costs.append(avg_cost)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- # Calculate overall score and cost as macro-average of category averages
290
- # This ensures each category contributes equally regardless of benchmark count
291
- if category_avg_scores:
292
- record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
293
- else:
294
- record['overall score'] = None
295
-
296
- if category_avg_costs:
297
- record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
298
- else:
299
- record['overall cost'] = None
300
 
301
  transformed_records.append(record)
302
 
 
65
  if _ensure_schema_models() and Metadata and ScoreEntry:
66
  try:
67
  validated_metadata = Metadata(**metadata_raw)
68
+ # Use mode='json' to serialize enums as strings
69
+ metadata_dict = validated_metadata.model_dump(mode='json')
70
  except Exception as e:
71
  errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
72
  metadata_dict = metadata_raw # Fall back to raw data
 
75
  for i, score in enumerate(scores_raw):
76
  try:
77
  validated_score = ScoreEntry(**score)
78
+ # Use mode='json' to serialize enums as strings
79
+ validated_scores.append(validated_score.model_dump(mode='json'))
80
  except Exception as e:
81
  errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
82
  validated_scores.append(score) # Fall back to raw data
 
225
  try:
226
 
227
  # Transform to expected format for leaderboard
228
+ # Group by agent (version + model combination) to aggregate results across datasets
229
  transformed_records = []
230
 
231
+ # Create a unique identifier for each agent (version + model)
232
+ df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
233
+
234
+ for agent_id in df['agent_id'].unique():
235
+ agent_records = df[df['agent_id'] == agent_id]
236
 
237
  # Build a single record for this agent
238
  first_record = agent_records.iloc[0]
239
+ agent_version = first_record['agent_version']
240
 
241
  # Normalize openness to "open" or "closed"
242
  from aliases import OPENNESS_MAPPING
243
  raw_openness = first_record['openness']
244
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
245
 
246
+ # All 5 categories for the leaderboard
247
+ ALL_CATEGORIES = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
248
+
249
  record = {
250
  # Core agent info - use final display names
251
+ 'SDK version': agent_version, # Will become "SDK Version"
252
  'Language model': first_record['llm_base'], # Will become "Language Model"
253
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
254
  'date': first_record['submission_time'], # Will become "Date"
 
282
  category_data[category]['scores'].append(row['score'])
283
  category_data[category]['costs'].append(row['total_cost'])
284
 
285
+ # Calculate category-level aggregates and track total cost
286
+ total_cost = 0.0
287
+ categories_with_scores = 0
288
+ for category in ALL_CATEGORIES:
289
+ if category in category_data and category_data[category]['scores']:
290
+ data = category_data[category]
291
  avg_score = sum(data['scores']) / len(data['scores'])
292
  record[f'{category} score'] = avg_score
293
+ categories_with_scores += 1
294
+ if data['costs']:
295
+ cat_cost = sum(data['costs'])
296
+ record[f'{category} cost'] = cat_cost
297
+ total_cost += cat_cost
298
+ else:
299
+ # Category not submitted - will show as NA
300
+ pass
301
+
302
+ # Calculate average score: always divide by 5 (treating missing categories as 0)
303
+ # This penalizes incomplete submissions
304
+ score_sum = sum(
305
+ record.get(f'{cat} score', 0) or 0
306
+ for cat in ALL_CATEGORIES
307
+ )
308
+ record['average score'] = score_sum / 5
309
+
310
+ # Total cost is the sum of all category costs
311
+ record['total cost'] = total_cost if total_cost > 0 else None
312
 
313
+ # Track how many categories were completed
314
+ record['categories_completed'] = categories_with_scores
 
 
 
 
 
 
 
 
 
315
 
316
  transformed_records.append(record)
317
 
ui_components.py CHANGED
@@ -147,10 +147,10 @@ def build_descriptions_tooltip_content(table) -> str:
147
  """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
148
  if table == "Overall":
149
  return """
150
- <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
151
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
152
- <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the five category-level average scores. Each category contributes equally.</div>
153
- <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
154
  <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
155
  <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
156
  <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
@@ -166,7 +166,7 @@ def build_descriptions_tooltip_content(table) -> str:
166
  """
167
  elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
168
  return f"""
169
- <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
170
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
171
  <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
172
  <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
@@ -178,7 +178,7 @@ def build_descriptions_tooltip_content(table) -> str:
178
  else:
179
  # Fallback for any other table type, e.g., individual benchmarks
180
  return f"""
181
- <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
182
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
183
  <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
184
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
@@ -360,70 +360,83 @@ def create_leaderboard_display(
360
  # 1. Instantiate the transformer and get the specific view for this category.
361
  # The function no longer loads data itself; it filters the data it receives.
362
  transformer = DataTransformer(full_df, tag_map)
363
- df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
364
- pareto_df = get_pareto_df(df_view)
365
- # Get the list of agents on the frontier. We'll use this list later.
366
- trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
367
- trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
368
- if not pareto_df.empty and 'id' in pareto_df.columns:
369
- pareto_agent_names = pareto_df['id'].tolist()
370
- else:
371
- pareto_agent_names = []
372
- df_view['Pareto'] = df_view.apply(
373
- lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
374
- axis=1
375
- )
376
- # Generate openness icons for each row
377
- def get_openness_icon_html(row):
378
- openness_val = row.get('Openness', '')
379
- uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
380
- return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
381
 
382
- df_view['Icon'] = df_view.apply(get_openness_icon_html, axis=1)
383
-
384
- # Format cost columns
385
- for col in df_view.columns:
386
- if "Cost" in col:
387
- df_view = format_cost_column(df_view, col)
388
-
389
- # Apply score formatting without coercing NaN to 0
390
- for col in df_view.columns:
391
- if "Score" in col:
392
- df_view = format_score_column(df_view, col)
393
- scatter_plot = plots_dict.get('scatter_plot', go.Figure())
394
- #Make pretty and format the Language Model column
395
- df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
396
- df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
397
- # append the repro url to the end of the OpenHands Version
398
- if 'Source' in df_view.columns:
399
- df_view['OpenHands Version'] = df_view.apply(
400
- lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
401
  axis=1
402
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
 
404
- all_cols = df_view.columns.tolist()
405
- # Remove pareto and Icon columns and insert it at the beginning
406
- all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
407
- all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
408
- df_view = df_view[all_cols]
409
- # Drop internally used columns that are not needed in the display
410
- columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
411
- df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
412
-
413
- header_rename_map = {
414
- "Pareto": "",
415
- "Icon": "",
416
- }
417
- # Rename columns first before getting headers
418
- df_view = df_view.rename(columns=header_rename_map)
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
  # Now get headers from the renamed dataframe
421
- df_headers = df_view.columns.tolist()
422
  df_datatypes = []
423
  for col in df_headers:
424
  if col == "Logs" or "Cost" in col or "Score" in col:
425
  df_datatypes.append("markdown")
426
- elif col in ["OpenHands Version","Language Model", ""]: # "" for renamed Pareto/Icon columns
427
  df_datatypes.append("html")
428
  else:
429
  df_datatypes.append("str")
@@ -451,9 +464,21 @@ def create_leaderboard_display(
451
 
452
  # Put table and key into an accordion
453
  with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
 
 
 
 
 
 
 
 
 
 
 
 
454
  dataframe_component = gr.DataFrame(
455
  headers=df_headers,
456
- value=df_view,
457
  datatype=df_datatypes,
458
  interactive=False,
459
  wrap=True,
@@ -462,6 +487,20 @@ def create_leaderboard_display(
462
  show_search="search",
463
  elem_id="main-leaderboard"
464
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  legend_markdown = create_legend_markdown(category_name)
466
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
467
 
@@ -508,7 +547,7 @@ def create_benchmark_details_display(
508
  benchmark_cost_col = f"{benchmark_name} Cost"
509
 
510
  # Define the columns needed for the detailed table
511
- table_cols = ['OpenHands Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
512
 
513
  # Filter to only columns that actually exist in the full dataframe
514
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -543,10 +582,10 @@ def create_benchmark_details_display(
543
  #Make pretty and format the Language Model column
544
  benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
545
  benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
546
- # append the repro url to the end of the OpenHands Version
547
  if 'Source' in benchmark_table_df.columns:
548
- benchmark_table_df['OpenHands Version'] = benchmark_table_df.apply(
549
- lambda row: f"{row['OpenHands Version']} {row['Source']}" if row['Source'] else row['OpenHands Version'],
550
  axis=1
551
  )
552
 
@@ -574,7 +613,7 @@ def create_benchmark_details_display(
574
  'Pareto',
575
  'Icon',
576
  'Language Model',
577
- 'OpenHands Version',
578
  'Attempted Benchmark',
579
  benchmark_score_col,
580
  benchmark_cost_col,
@@ -603,7 +642,7 @@ def create_benchmark_details_display(
603
  for col in df_headers:
604
  if "Logs" in col or "Cost" in col or "Score" in col:
605
  df_datatypes.append("markdown")
606
- elif col in ["OpenHands Version", "Language Model", ""]: # "" for renamed Pareto/Icon columns
607
  df_datatypes.append("html")
608
  else:
609
  df_datatypes.append("str")
@@ -611,7 +650,7 @@ def create_benchmark_details_display(
611
  data=full_df,
612
  x=benchmark_cost_col,
613
  y=benchmark_score_col,
614
- agent_col="OpenHands Version",
615
  name=benchmark_name
616
  )
617
  with gr.Row():
 
147
  """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
148
  if table == "Overall":
149
  return """
150
+ <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div>
151
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
152
+ <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
153
+ <div class="tooltip-description-item"><b>Total Cost:</b> Sum of costs across all submitted categories, in USD.</div>
154
  <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
155
  <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
156
  <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
 
166
  """
167
  elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
168
  return f"""
169
+ <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div>
170
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
171
  <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
172
  <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
 
178
  else:
179
  # Fallback for any other table type, e.g., individual benchmarks
180
  return f"""
181
+ <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div>
182
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
183
  <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
184
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
 
360
  # 1. Instantiate the transformer and get the specific view for this category.
361
  # The function no longer loads data itself; it filters the data it receives.
362
  transformer = DataTransformer(full_df, tag_map)
363
+ df_view_full, plots_dict = transformer.view(tag=category_name, use_plotly=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
+ def prepare_df_for_display(df_view):
366
+ """Prepare a DataFrame for display with all formatting applied."""
367
+ df_display = df_view.copy()
368
+
369
+ pareto_df = get_pareto_df(df_display)
370
+ trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
371
+ trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
372
+ if not pareto_df.empty and 'id' in pareto_df.columns:
373
+ pareto_agent_names = pareto_df['id'].tolist()
374
+ else:
375
+ pareto_agent_names = []
376
+ df_display['Pareto'] = df_display.apply(
377
+ lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
 
 
 
 
 
 
378
  axis=1
379
  )
380
+
381
+ def get_openness_icon_html(row):
382
+ openness_val = row.get('Openness', '')
383
+ uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
384
+ return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
385
+
386
+ df_display['Icon'] = df_display.apply(get_openness_icon_html, axis=1)
387
+
388
+ for col in df_display.columns:
389
+ if "Cost" in col:
390
+ df_display = format_cost_column(df_display, col)
391
+
392
+ for col in df_display.columns:
393
+ if "Score" in col:
394
+ df_display = format_score_column(df_display, col)
395
+
396
+ df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
397
+ df_display['Language Model'] = df_display['Language Model'].apply(format_llm_base_with_html)
398
+
399
+ if 'Source' in df_display.columns:
400
+ df_display['SDK Version'] = df_display.apply(
401
+ lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'],
402
+ axis=1
403
+ )
404
 
405
+ all_cols = df_display.columns.tolist()
406
+ all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
407
+ all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
408
+ df_display = df_display[all_cols]
409
+
410
+ columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source', 'Categories Completed']
411
+ df_display = df_display.drop(columns=columns_to_drop, errors='ignore')
412
+
413
+ header_rename_map = {
414
+ "Pareto": "",
415
+ "Icon": "",
416
+ }
417
+ df_display = df_display.rename(columns=header_rename_map)
418
+
419
+ return df_display
420
+
421
+ # Prepare both complete and all entries versions
422
+ # Complete entries have all 5 categories submitted
423
+ if 'Categories Completed' in df_view_full.columns:
424
+ df_view_complete = df_view_full[df_view_full['Categories Completed'] == 5].copy()
425
+ else:
426
+ df_view_complete = df_view_full.copy()
427
+
428
+ df_display_complete = prepare_df_for_display(df_view_complete)
429
+ df_display_all = prepare_df_for_display(df_view_full)
430
+
431
+ scatter_plot = plots_dict.get('scatter_plot', go.Figure())
432
 
433
  # Now get headers from the renamed dataframe
434
+ df_headers = df_display_complete.columns.tolist()
435
  df_datatypes = []
436
  for col in df_headers:
437
  if col == "Logs" or "Cost" in col or "Score" in col:
438
  df_datatypes.append("markdown")
439
+ elif col in ["SDK Version","Language Model", ""]: # "" for renamed Pareto/Icon columns
440
  df_datatypes.append("html")
441
  else:
442
  df_datatypes.append("str")
 
464
 
465
  # Put table and key into an accordion
466
  with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
467
+ # Add toggle for showing incomplete entries
468
+ num_complete = len(df_display_complete)
469
+ num_total = len(df_display_all)
470
+ num_incomplete = num_total - num_complete
471
+
472
+ show_incomplete_checkbox = gr.Checkbox(
473
+ label=f"Show incomplete entries ({num_incomplete} entries with fewer than 5 categories)",
474
+ value=False,
475
+ elem_id="show-incomplete-toggle"
476
+ )
477
+
478
+ # Start with complete entries only (default)
479
  dataframe_component = gr.DataFrame(
480
  headers=df_headers,
481
+ value=df_display_complete,
482
  datatype=df_datatypes,
483
  interactive=False,
484
  wrap=True,
 
487
  show_search="search",
488
  elem_id="main-leaderboard"
489
  )
490
+
491
+ # Update function for the toggle
492
+ def update_table(show_incomplete):
493
+ if show_incomplete:
494
+ return df_display_all
495
+ else:
496
+ return df_display_complete
497
+
498
+ show_incomplete_checkbox.change(
499
+ fn=update_table,
500
+ inputs=[show_incomplete_checkbox],
501
+ outputs=[dataframe_component]
502
+ )
503
+
504
  legend_markdown = create_legend_markdown(category_name)
505
  gr.HTML(value=legend_markdown, elem_id="legend-markdown")
506
 
 
547
  benchmark_cost_col = f"{benchmark_name} Cost"
548
 
549
  # Define the columns needed for the detailed table
550
+ table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
551
 
552
  # Filter to only columns that actually exist in the full dataframe
553
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
 
582
  #Make pretty and format the Language Model column
583
  benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
584
  benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
585
+ # append the repro url to the end of the SDK Version
586
  if 'Source' in benchmark_table_df.columns:
587
+ benchmark_table_df['SDK Version'] = benchmark_table_df.apply(
588
+ lambda row: f"{row['SDK Version']} {row['Source']}" if row['Source'] else row['SDK Version'],
589
  axis=1
590
  )
591
 
 
613
  'Pareto',
614
  'Icon',
615
  'Language Model',
616
+ 'SDK Version',
617
  'Attempted Benchmark',
618
  benchmark_score_col,
619
  benchmark_cost_col,
 
642
  for col in df_headers:
643
  if "Logs" in col or "Cost" in col or "Score" in col:
644
  df_datatypes.append("markdown")
645
+ elif col in ["SDK Version", "Language Model", ""]: # "" for renamed Pareto/Icon columns
646
  df_datatypes.append("html")
647
  else:
648
  df_datatypes.append("str")
 
650
  data=full_df,
651
  x=benchmark_cost_col,
652
  y=benchmark_score_col,
653
+ agent_col="SDK Version",
654
  name=benchmark_name
655
  )
656
  with gr.Row():