Fix: scatter plot zoom not working and the 'Show all labels' toggle having no effect

#29
leaderboard_transformer.py CHANGED
@@ -971,7 +971,7 @@ def _plot_scatter_plotly(
971
  name: Optional[str] = None,
972
  plot_type: str = 'cost', # 'cost' or 'runtime'
973
  mark_by: Optional[str] = None, # 'Company', 'Openness', or 'Country'
974
- show_all_labels: bool = False # Show labels for all points vs only Pareto frontier
975
  ) -> go.Figure:
976
  from constants import MARK_BY_DEFAULT
977
  if mark_by is None:
@@ -1268,107 +1268,93 @@ def _plot_scatter_plotly(
1268
  domain_x = max(0, min(1, domain_x))
1269
  domain_y = max(0, min(1, domain_y))
1270
 
1271
- # Convert to data coordinates
1272
- # For log scale x: use log10(x) to match the axis type
1273
- x_log = np.log10(x_val) if x_val > 0 else x_min_log
1274
-
1275
  if harness_uri is not None:
1276
- # Composite: stack model on top, harness on bottom
1277
- # Use data coordinates (x, y) so logos zoom/pan together with labels
1278
- y_offset = 0.8 # Offset above the data point (in score units)
 
 
1279
  layout_images.append(dict(
1280
  source=model_logo_uri,
1281
- xref="x", yref="y",
1282
- x=x_log, y=y_val + y_offset,
1283
- sizex=STACKED_SIZE_X * (x_max_log - x_min_log),
1284
- sizey=STACKED_SIZE_Y * (y_max - y_min),
1285
  xanchor="center", yanchor="middle",
1286
  layer="above",
1287
  ))
1288
  layout_images.append(dict(
1289
  source=harness_uri,
1290
- xref="x", yref="y",
1291
- x=x_log, y=y_val - y_offset,
1292
- sizex=STACKED_SIZE_X * (x_max_log - x_min_log),
1293
- sizey=STACKED_SIZE_Y * (y_max - y_min),
1294
  xanchor="center", yanchor="middle",
1295
  layer="above",
1296
  ))
1297
  else:
1298
- # Single marker - use data coordinates so logo zooms/pans with labels
 
 
 
1299
  layout_images.append(dict(
1300
  source=model_logo_uri,
1301
- xref="x", yref="y",
1302
- x=x_log, y=y_val,
1303
- sizex=SINGLE_SIZE_X * (x_max_log - x_min_log),
1304
- sizey=SINGLE_SIZE_Y * (y_max - y_min),
1305
  xanchor="center", yanchor="middle",
1306
  layer="above",
1307
  ))
1308
 
1309
- # --- Section 7: Add Model Name Labels ---
1310
- # Show labels for all points if show_all_labels is True, otherwise just Pareto frontier
1311
- if show_all_labels:
1312
- # Label all data points
1313
- labels_data = []
1314
- for _, row in data_plot.iterrows():
1315
- x_val = row[x_col_to_use]
1316
- y_val = row[y_col_to_use]
1317
-
1318
- model_name = row.get('Language Model', '')
1319
- if isinstance(model_name, list):
1320
- model_name = model_name[0] if model_name else ''
1321
- model_name = str(model_name).split('/')[-1]
1322
- if len(model_name) > 25:
1323
- model_name = model_name[:22] + '...'
1324
-
1325
- labels_data.append({'x': x_val, 'y': y_val, 'label': model_name})
1326
- elif frontier_rows:
1327
- # Label only Pareto frontier points
1328
- labels_data = []
1329
 
1330
  for row in frontier_rows:
1331
  x_val = row[x_col_to_use]
1332
  y_val = row[y_col_to_use]
1333
 
 
1334
  model_name = row.get('Language Model', '')
1335
  if isinstance(model_name, list):
1336
  model_name = model_name[0] if model_name else ''
 
1337
  model_name = str(model_name).split('/')[-1]
 
1338
  if len(model_name) > 25:
1339
  model_name = model_name[:22] + '...'
1340
 
1341
- labels_data.append({'x': x_val, 'y': y_val, 'label': model_name})
1342
- else:
1343
- labels_data = []
1344
-
1345
- # Add annotations for each label
1346
- # For log scale x-axis, annotations need log10(x) coordinates (Plotly issue #2580)
1347
- for item in labels_data:
1348
- x_val = item['x']
1349
- y_val = item['y']
1350
- label = item['label']
1351
 
1352
- # Transform x to log10 for annotation positioning on log scale
1353
- if x_val > 0:
1354
- x_log = np.log10(x_val)
1355
- else:
1356
- x_log = x_min_log
1357
-
1358
- fig.add_annotation(
1359
- x=x_log,
1360
- y=y_val,
1361
- text=label,
1362
- showarrow=False,
1363
- yshift=25, # Move label higher above the icon
1364
- font=dict(
1365
- size=10,
1366
- color='#0D0D0F', # neutral-950
1367
- family=FONT_FAMILY_SHORT
1368
- ),
1369
- xanchor='center',
1370
- yanchor='bottom'
1371
- )
 
 
 
 
 
 
 
1372
 
1373
  # --- Section 8: Configure Layout ---
1374
  # Use the same axis ranges as calculated for domain coordinates
@@ -1487,38 +1473,47 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
1487
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
1488
 
1489
 
 
 
 
 
 
 
 
 
 
1490
  def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
1491
  """
1492
  Applies custom formatting to a runtime column based on its corresponding score column.
1493
  - If runtime is not null, formats as time with 's' suffix.
1494
  - If runtime is null but score is not, it becomes "Missing".
1495
  - If both runtime and score are null, it becomes "Not Submitted".
 
1496
  Args:
1497
  df: The DataFrame to modify.
1498
  runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
1499
  Returns:
1500
  The DataFrame with the formatted runtime column.
1501
  """
1502
- # Find the corresponding score column by replacing "Runtime" with "Score"
1503
  score_col_name = runtime_col_name.replace("Runtime", "Score")
1504
 
1505
- # Ensure the score column actually exists to avoid errors
1506
  if score_col_name not in df.columns:
1507
- return df # Return the DataFrame unmodified if there's no matching score
1508
 
1509
  def apply_formatting_logic(row):
1510
  runtime_value = row[runtime_col_name]
1511
  score_value = row[score_col_name]
1512
  status_color = "#ec4899"
 
 
1513
 
1514
  if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
1515
- return f"{runtime_value:.0f}s"
1516
  elif pd.notna(score_value):
1517
- return f'<span style="color: {status_color};">Missing</span>' # Score exists, but runtime is missing
1518
  else:
1519
- return f'<span style="color: {status_color};">Not Submitted</span>' # Neither score nor runtime exists
1520
 
1521
- # Apply the logic to the specified runtime column and update the DataFrame
1522
  df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
1523
 
1524
  return df
 
971
  name: Optional[str] = None,
972
  plot_type: str = 'cost', # 'cost' or 'runtime'
973
  mark_by: Optional[str] = None, # 'Company', 'Openness', or 'Country'
974
+ show_all_labels: bool = False
975
  ) -> go.Figure:
976
  from constants import MARK_BY_DEFAULT
977
  if mark_by is None:
 
1268
  domain_x = max(0, min(1, domain_x))
1269
  domain_y = max(0, min(1, domain_y))
1270
 
 
 
 
 
1271
  if harness_uri is not None:
1272
+ # Composite: stack model on top, harness on bottom, clamping
1273
+ # each half to the plot area so markers near the edges don't
1274
+ # drift off-canvas.
1275
+ model_y = min(1, domain_y + STACKED_Y_OFFSET)
1276
+ harness_y = max(0, domain_y - STACKED_Y_OFFSET)
1277
  layout_images.append(dict(
1278
  source=model_logo_uri,
1279
+ xref="x domain", yref="y domain",
1280
+ x=domain_x, y=model_y,
1281
+ sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
 
1282
  xanchor="center", yanchor="middle",
1283
  layer="above",
1284
  ))
1285
  layout_images.append(dict(
1286
  source=harness_uri,
1287
+ xref="x domain", yref="y domain",
1288
+ x=domain_x, y=harness_y,
1289
+ sizex=STACKED_SIZE_X, sizey=STACKED_SIZE_Y,
 
1290
  xanchor="center", yanchor="middle",
1291
  layer="above",
1292
  ))
1293
  else:
1294
+ # Single marker (canonical OpenHands pages, or Alternative Agents
1295
+ # rows with an unknown harness name — the latter shouldn't happen
1296
+ # in practice since HARNESS_LOGO_PATHS covers every agent_name the
1297
+ # push-to-index script emits).
1298
  layout_images.append(dict(
1299
  source=model_logo_uri,
1300
+ xref="x domain", yref="y domain",
1301
+ x=domain_x, y=domain_y,
1302
+ sizex=SINGLE_SIZE_X, sizey=SINGLE_SIZE_Y,
 
1303
  xanchor="center", yanchor="middle",
1304
  layer="above",
1305
  ))
1306
 
1307
+ # --- Section 7: Add Model Name Labels to Frontier Points ---
1308
+ if frontier_rows:
1309
+ frontier_labels_data = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1310
 
1311
  for row in frontier_rows:
1312
  x_val = row[x_col_to_use]
1313
  y_val = row[y_col_to_use]
1314
 
1315
+ # Get the model name for the label
1316
  model_name = row.get('Language Model', '')
1317
  if isinstance(model_name, list):
1318
  model_name = model_name[0] if model_name else ''
1319
+ # Clean the model name (remove path prefixes)
1320
  model_name = str(model_name).split('/')[-1]
1321
+ # Truncate long names
1322
  if len(model_name) > 25:
1323
  model_name = model_name[:22] + '...'
1324
 
1325
+ frontier_labels_data.append({
1326
+ 'x': x_val,
1327
+ 'y': y_val,
1328
+ 'label': model_name
1329
+ })
 
 
 
 
 
1330
 
1331
+ # Add annotations for each frontier label
1332
+ # For log scale x-axis, annotations need log10(x) coordinates (Plotly issue #2580)
1333
+ for item in frontier_labels_data:
1334
+ x_val = item['x']
1335
+ y_val = item['y']
1336
+ label = item['label']
1337
+
1338
+ # Transform x to log10 for annotation positioning on log scale
1339
+ if x_val > 0:
1340
+ x_log = np.log10(x_val)
1341
+ else:
1342
+ x_log = x_min_log
1343
+
1344
+ fig.add_annotation(
1345
+ x=x_log,
1346
+ y=y_val,
1347
+ text=label,
1348
+ showarrow=False,
1349
+ yshift=25, # Move label higher above the icon
1350
+ font=dict(
1351
+ size=10,
1352
+ color='#0D0D0F', # neutral-950
1353
+ family=FONT_FAMILY_SHORT
1354
+ ),
1355
+ xanchor='center',
1356
+ yanchor='bottom'
1357
+ )
1358
 
1359
  # --- Section 8: Configure Layout ---
1360
  # Use the same axis ranges as calculated for domain coordinates
 
1473
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
1474
 
1475
 
1476
+ def _hidden_runtime_sort_key(runtime_value: float | int | None, score_value: float | int | None) -> str:
1477
+ """Build a hidden prefix so Gradio's string-based runtime sorting behaves numerically."""
1478
+ if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
1479
+ return f"{float(runtime_value):020.6f}"
1480
+ if pd.notna(score_value):
1481
+ return "99999999999999999998"
1482
+ return "99999999999999999999"
1483
+
1484
+
1485
  def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
1486
  """
1487
  Applies custom formatting to a runtime column based on its corresponding score column.
1488
  - If runtime is not null, formats as time with 's' suffix.
1489
  - If runtime is null but score is not, it becomes "Missing".
1490
  - If both runtime and score are null, it becomes "Not Submitted".
1491
+ - Adds a hidden, zero-padded numeric prefix so Gradio sorts the column numerically.
1492
  Args:
1493
  df: The DataFrame to modify.
1494
  runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
1495
  Returns:
1496
  The DataFrame with the formatted runtime column.
1497
  """
 
1498
  score_col_name = runtime_col_name.replace("Runtime", "Score")
1499
 
 
1500
  if score_col_name not in df.columns:
1501
+ return df
1502
 
1503
  def apply_formatting_logic(row):
1504
  runtime_value = row[runtime_col_name]
1505
  score_value = row[score_col_name]
1506
  status_color = "#ec4899"
1507
+ sort_key = _hidden_runtime_sort_key(runtime_value, score_value)
1508
+ hidden_sort_prefix = f'<span style="display:none">{sort_key}</span>'
1509
 
1510
  if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
1511
+ return f"{hidden_sort_prefix}{runtime_value:.0f}s"
1512
  elif pd.notna(score_value):
1513
+ return f'{hidden_sort_prefix}<span style="color: {status_color};">Missing</span>'
1514
  else:
1515
+ return f'{hidden_sort_prefix}<span style="color: {status_color};">Not Submitted</span>'
1516
 
 
1517
  df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
1518
 
1519
  return df
simple_data_loader.py CHANGED
@@ -245,6 +245,7 @@ class SimpleLeaderboardViewer:
245
  'acp-claude': 'Claude Code',
246
  'acp-codex': 'Codex',
247
  'acp-gemini': 'Gemini CLI',
 
248
  }
249
  alt_dir = self.config_path / "alternative_agents"
250
  if alt_dir.exists():
@@ -252,8 +253,6 @@ class SimpleLeaderboardViewer:
252
  if not type_dir.is_dir():
253
  continue
254
  default_name = agent_type_default_name.get(type_dir.name)
255
- if default_name is None:
256
- continue # skip unlisted agent types (e.g. openhands_subagents)
257
  for agent_dir in type_dir.iterdir():
258
  if not agent_dir.is_dir():
259
  continue
 
245
  'acp-claude': 'Claude Code',
246
  'acp-codex': 'Codex',
247
  'acp-gemini': 'Gemini CLI',
248
+ 'openhands_subagents': 'OpenHands Sub-agents',
249
  }
250
  alt_dir = self.config_path / "alternative_agents"
251
  if alt_dir.exists():
 
253
  if not type_dir.is_dir():
254
  continue
255
  default_name = agent_type_default_name.get(type_dir.name)
 
 
256
  for agent_dir in type_dir.iterdir():
257
  if not agent_dir.is_dir():
258
  continue
ui_components.py CHANGED
@@ -954,7 +954,7 @@ def create_leaderboard_display(
954
  if not new_df.empty:
955
  new_transformer = DataTransformer(new_df, new_tag_map)
956
  new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
957
-
958
  # Prepare both complete and all entries versions
959
  if 'Categories Attempted' in new_df_view_full.columns:
960
  new_df_view_complete = new_df_view_full[new_df_view_full['Categories Attempted'] == '5/5'].copy()
@@ -1014,22 +1014,16 @@ def create_leaderboard_display(
1014
 
1015
  # Connect the timer to the refresh function
1016
  if show_incomplete_checkbox is not None:
 
1017
  if show_open_only_checkbox is not None:
1018
- refresh_timer.tick(
1019
- fn=check_and_refresh_data,
1020
- inputs=[show_incomplete_checkbox, show_open_only_checkbox, mark_by_dropdown, show_all_labels_checkbox],
1021
- outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
1022
- )
1023
- else:
1024
- # No open/closed split in this dataset — gr.State can't fill the gap in a
1025
- # timer tick (no session context), so use a wrapper that fixes show_open_only=False.
1026
- def _timer_refresh_no_open(show_incomplete, mark_by, show_all_labels):
1027
- return check_and_refresh_data(show_incomplete, False, mark_by, show_all_labels)
1028
- refresh_timer.tick(
1029
- fn=_timer_refresh_no_open,
1030
- inputs=[show_incomplete_checkbox, mark_by_dropdown, show_all_labels_checkbox],
1031
- outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
1032
- )
1033
  else:
1034
  # If no incomplete checkbox, always show all data (but still filter by open if needed)
1035
  def check_and_refresh_all(show_open_only=False, mark_by=MARK_BY_DEFAULT, show_all_labels=False):
 
954
  if not new_df.empty:
955
  new_transformer = DataTransformer(new_df, new_tag_map)
956
  new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
957
+
958
  # Prepare both complete and all entries versions
959
  if 'Categories Attempted' in new_df_view_full.columns:
960
  new_df_view_complete = new_df_view_full[new_df_view_full['Categories Attempted'] == '5/5'].copy()
 
1014
 
1015
  # Connect the timer to the refresh function
1016
  if show_incomplete_checkbox is not None:
1017
+ timer_inputs = [show_incomplete_checkbox]
1018
  if show_open_only_checkbox is not None:
1019
+ timer_inputs.append(show_open_only_checkbox)
1020
+ timer_inputs.append(mark_by_dropdown) # Always include mark_by
1021
+ timer_inputs.append(show_all_labels_checkbox)
1022
+ refresh_timer.tick(
1023
+ fn=check_and_refresh_data,
1024
+ inputs=timer_inputs,
1025
+ outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
1026
+ )
 
 
 
 
 
 
 
1027
  else:
1028
  # If no incomplete checkbox, always show all data (but still filter by open if needed)
1029
  def check_and_refresh_all(show_open_only=False, mark_by=MARK_BY_DEFAULT, show_all_labels=False):