openhands openhands committed on
Commit
2854ddd
·
1 Parent(s): 16307c3

Add runtime column and Cost/Performance + Runtime/Performance charts to all pages

Browse files

- Add runtime tracking to data loader (simple_data_loader.py)
- Add format_runtime_column function (leaderboard_transformer.py)
- Update _plot_scatter_plotly with plot_type parameter ('cost' or 'runtime')
- Update chart titles to 'OpenHands Index XXX Cost/Performance' and 'OpenHands Index XXX Runtime/Performance'
- Add Runtime column to main leaderboard and benchmark details tables
- Display both cost/performance and runtime/performance scatter plots on all pages

Co-authored-by: openhands <openhands@all-hands.dev>

leaderboard_transformer.py CHANGED
@@ -164,6 +164,7 @@ def _pretty_column_name(raw_col: str) -> str:
164
  'average cost': 'Average Cost',
165
  'total cost': 'Average Cost', # Legacy support
166
  'Overall cost': 'Average Cost', # Legacy support
 
167
  'categories_completed': 'Categories Completed',
168
  'Logs': 'Logs',
169
  'Openness': 'Openness',
@@ -395,7 +396,8 @@ def _plot_scatter_plotly(
395
  x: Optional[str],
396
  y: str,
397
  agent_col: str = 'Agent',
398
- name: Optional[str] = None
 
399
  ) -> go.Figure:
400
 
401
  # --- Section 1: Define Mappings ---
@@ -427,7 +429,11 @@ def _plot_scatter_plotly(
427
  data_plot = data.copy()
428
  data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
429
 
430
- x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
 
 
 
 
431
  max_reported_cost = 0
432
  divider_line_x = 0
433
 
@@ -500,12 +506,12 @@ def _plot_scatter_plotly(
500
  ))
501
 
502
  # --- Section 5: Prepare for Marker Plotting ---
503
- def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
504
  """
505
  Builds the complete HTML string for the plot's hover tooltip.
506
  Format: {lm_name} (SDK {version})
507
  Average Score: {score}
508
- Average Cost: {cost}
509
  Openness: {openness}
510
  """
511
  h_pad = " "
@@ -528,11 +534,17 @@ def _plot_scatter_plotly(
528
  # Average Score
529
  parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
530
 
531
- # Average Cost
532
- if divider_line_x > 0 and row[x_col] >= divider_line_x:
533
- parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
 
 
 
534
  else:
535
- parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
 
 
 
536
 
537
  # Openness
538
  parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
@@ -548,7 +560,8 @@ def _plot_scatter_plotly(
548
  x_axis_label=x_axis_label,
549
  x_col=x_col_to_use,
550
  y_col=y_col_to_use,
551
- divider_line_x=divider_line_x
 
552
  ),
553
  axis=1
554
  )
@@ -695,10 +708,16 @@ def _plot_scatter_plotly(
695
  range=[x_min_log, x_max_log] # Match domain coordinate calculation
696
  )
697
 
 
 
 
 
 
 
698
  # Build layout configuration
699
  layout_config = dict(
700
  template="plotly_white",
701
- title=f"OpenHands Index {name} Leaderboard",
702
  xaxis=xaxis_config,
703
  yaxis=dict(title="Average (mean) score", range=[y_min, y_max]), # Match domain calculation
704
  legend=dict(
@@ -789,6 +808,43 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
789
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
790
 
791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
  def get_pareto_df(data, cost_col=None, score_col=None):
793
  """
794
  Calculate the Pareto frontier for the given data.
 
164
  'average cost': 'Average Cost',
165
  'total cost': 'Average Cost', # Legacy support
166
  'Overall cost': 'Average Cost', # Legacy support
167
+ 'average runtime': 'Average Runtime',
168
  'categories_completed': 'Categories Completed',
169
  'Logs': 'Logs',
170
  'Openness': 'Openness',
 
396
  x: Optional[str],
397
  y: str,
398
  agent_col: str = 'Agent',
399
+ name: Optional[str] = None,
400
+ plot_type: str = 'cost' # 'cost' or 'runtime'
401
  ) -> go.Figure:
402
 
403
  # --- Section 1: Define Mappings ---
 
429
  data_plot = data.copy()
430
  data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
431
 
432
+ # Set axis labels based on plot type
433
+ if plot_type == 'runtime':
434
+ x_axis_label = f"Average (mean) runtime per problem (seconds)" if x else "Runtime (Data N/A)"
435
+ else:
436
+ x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
437
  max_reported_cost = 0
438
  divider_line_x = 0
439
 
 
506
  ))
507
 
508
  # --- Section 5: Prepare for Marker Plotting ---
509
+ def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x, is_runtime=False):
510
  """
511
  Builds the complete HTML string for the plot's hover tooltip.
512
  Format: {lm_name} (SDK {version})
513
  Average Score: {score}
514
+ Average Cost/Runtime: {value}
515
  Openness: {openness}
516
  """
517
  h_pad = " "
 
534
  # Average Score
535
  parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
536
 
537
+ # Average Cost or Runtime
538
+ if is_runtime:
539
+ if divider_line_x > 0 and row[x_col] >= divider_line_x:
540
+ parts.append(f"{h_pad}Average Runtime: <b>Missing</b>{h_pad}<br>")
541
+ else:
542
+ parts.append(f"{h_pad}Average Runtime: <b>{row[x_col]:.0f}s</b>{h_pad}<br>")
543
  else:
544
+ if divider_line_x > 0 and row[x_col] >= divider_line_x:
545
+ parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
546
+ else:
547
+ parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
548
 
549
  # Openness
550
  parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
 
560
  x_axis_label=x_axis_label,
561
  x_col=x_col_to_use,
562
  y_col=y_col_to_use,
563
+ divider_line_x=divider_line_x,
564
+ is_runtime=(plot_type == 'runtime')
565
  ),
566
  axis=1
567
  )
 
708
  range=[x_min_log, x_max_log] # Match domain coordinate calculation
709
  )
710
 
711
+ # Set title based on plot type
712
+ if plot_type == 'runtime':
713
+ plot_title = f"OpenHands Index {name} Runtime/Performance"
714
+ else:
715
+ plot_title = f"OpenHands Index {name} Cost/Performance"
716
+
717
  # Build layout configuration
718
  layout_config = dict(
719
  template="plotly_white",
720
+ title=plot_title,
721
  xaxis=xaxis_config,
722
  yaxis=dict(title="Average (mean) score", range=[y_min, y_max]), # Match domain calculation
723
  legend=dict(
 
808
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
809
 
810
 
811
+ def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
812
+ """
813
+ Applies custom formatting to a runtime column based on its corresponding score column.
814
+ - If runtime is not null, formats as time with 's' suffix.
815
+ - If runtime is null but score is not, it becomes "Missing".
816
+ - If both runtime and score are null, it becomes "Not Submitted".
817
+ Args:
818
+ df: The DataFrame to modify.
819
+ runtime_col_name: The name of the runtime column to format (e.g., "Average Runtime").
820
+ Returns:
821
+ The DataFrame with the formatted runtime column.
822
+ """
823
+ # Find the corresponding score column by replacing "Runtime" with "Score"
824
+ score_col_name = runtime_col_name.replace("Runtime", "Score")
825
+
826
+ # Ensure the score column actually exists to avoid errors
827
+ if score_col_name not in df.columns:
828
+ return df # Return the DataFrame unmodified if there's no matching score
829
+
830
+ def apply_formatting_logic(row):
831
+ runtime_value = row[runtime_col_name]
832
+ score_value = row[score_col_name]
833
+ status_color = "#ec4899"
834
+
835
+ if pd.notna(runtime_value) and isinstance(runtime_value, (int, float)):
836
+ return f"{runtime_value:.0f}s"
837
+ elif pd.notna(score_value):
838
+ return f'<span style="color: {status_color};">Missing</span>' # Score exists, but runtime is missing
839
+ else:
840
+ return f'<span style="color: {status_color};">Not Submitted</span>' # Neither score nor runtime exists
841
+
842
+ # Apply the logic to the specified runtime column and update the DataFrame
843
+ df[runtime_col_name] = df.apply(apply_formatting_logic, axis=1)
844
+
845
+ return df
846
+
847
+
848
  def get_pareto_df(data, cost_col=None, score_col=None):
849
  """
850
  Calculate the Pareto frontier for the given data.
simple_data_loader.py CHANGED
@@ -269,7 +269,7 @@ class SimpleLeaderboardViewer:
269
  dataset_costs = []
270
 
271
  # Track category-level data for aggregation
272
- category_data = {} # {category: {'scores': [...], 'costs': [...]}}
273
 
274
  for _, row in agent_records.iterrows():
275
  tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
@@ -277,6 +277,7 @@ class SimpleLeaderboardViewer:
277
  # Add columns for this specific dataset/benchmark
278
  record[f'{tag} score'] = row['score']
279
  record[f'{tag} cost'] = row['cost_per_instance']
 
280
  dataset_scores.append(row['score'])
281
  dataset_costs.append(row['cost_per_instance'])
282
 
@@ -289,12 +290,14 @@ class SimpleLeaderboardViewer:
289
  if tag in self.benchmark_to_categories:
290
  for category in self.benchmark_to_categories[tag]:
291
  if category not in category_data:
292
- category_data[category] = {'scores': [], 'costs': []}
293
  category_data[category]['scores'].append(row['score'])
294
  category_data[category]['costs'].append(row['cost_per_instance'])
 
295
 
296
- # Calculate category-level aggregates and track average cost
297
  all_costs = []
 
298
  categories_with_scores = 0
299
  for category in ALL_CATEGORIES:
300
  if category in category_data and category_data[category]['scores']:
@@ -308,6 +311,12 @@ class SimpleLeaderboardViewer:
308
  avg_cost = sum(valid_costs) / len(valid_costs)
309
  record[f'{category} cost'] = avg_cost
310
  all_costs.extend(valid_costs)
 
 
 
 
 
 
311
  else:
312
  # Category not submitted - will show as NA
313
  pass
@@ -323,6 +332,9 @@ class SimpleLeaderboardViewer:
323
  # Average cost per instance across all benchmarks
324
  record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None
325
 
 
 
 
326
  # Track how many categories were completed
327
  record['categories_completed'] = categories_with_scores
328
 
 
269
  dataset_costs = []
270
 
271
  # Track category-level data for aggregation
272
+ category_data = {} # {category: {'scores': [...], 'costs': [], 'runtimes': []}}
273
 
274
  for _, row in agent_records.iterrows():
275
  tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
 
277
  # Add columns for this specific dataset/benchmark
278
  record[f'{tag} score'] = row['score']
279
  record[f'{tag} cost'] = row['cost_per_instance']
280
+ record[f'{tag} runtime'] = row.get('average_runtime')
281
  dataset_scores.append(row['score'])
282
  dataset_costs.append(row['cost_per_instance'])
283
 
 
290
  if tag in self.benchmark_to_categories:
291
  for category in self.benchmark_to_categories[tag]:
292
  if category not in category_data:
293
+ category_data[category] = {'scores': [], 'costs': [], 'runtimes': []}
294
  category_data[category]['scores'].append(row['score'])
295
  category_data[category]['costs'].append(row['cost_per_instance'])
296
+ category_data[category]['runtimes'].append(row.get('average_runtime'))
297
 
298
+ # Calculate category-level aggregates and track average cost/runtime
299
  all_costs = []
300
+ all_runtimes = []
301
  categories_with_scores = 0
302
  for category in ALL_CATEGORIES:
303
  if category in category_data and category_data[category]['scores']:
 
311
  avg_cost = sum(valid_costs) / len(valid_costs)
312
  record[f'{category} cost'] = avg_cost
313
  all_costs.extend(valid_costs)
314
+ if data['runtimes']:
315
+ valid_runtimes = [r for r in data['runtimes'] if r is not None]
316
+ if valid_runtimes:
317
+ avg_runtime = sum(valid_runtimes) / len(valid_runtimes)
318
+ record[f'{category} runtime'] = avg_runtime
319
+ all_runtimes.extend(valid_runtimes)
320
  else:
321
  # Category not submitted - will show as NA
322
  pass
 
332
  # Average cost per instance across all benchmarks
333
  record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None
334
 
335
+ # Average runtime per instance across all benchmarks
336
+ record['average runtime'] = sum(all_runtimes) / len(all_runtimes) if all_runtimes else None
337
+
338
  # Track how many categories were completed
339
  record['categories_completed'] = categories_with_scores
340
 
ui_components.py CHANGED
@@ -17,6 +17,7 @@ from leaderboard_transformer import (
17
  _plot_scatter_plotly,
18
  format_cost_column,
19
  format_score_column,
 
20
  get_pareto_df,
21
  clean_llm_base_list,
22
  )
@@ -515,6 +516,10 @@ def create_leaderboard_display(
515
  if "Score" in col:
516
  df_display = format_score_column(df_display, col)
517
 
 
 
 
 
518
  # Clean the Language Model column first
519
  df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
520
 
@@ -583,47 +588,65 @@ def create_leaderboard_display(
583
  # If no complete entries exist, show all entries by default
584
  has_complete_entries = len(df_display_complete) > 0
585
 
586
- # Determine primary score/cost columns for scatter plot
587
  if category_name == "Overall":
588
  primary_score_col = "Average Score"
589
  primary_cost_col = "Average Cost"
 
590
  else:
591
  primary_score_col = f"{category_name} Score"
592
  primary_cost_col = f"{category_name} Cost"
 
593
 
594
- # Function to create scatter plot from data
595
- def create_scatter_plot(df_data):
596
  return _plot_scatter_plotly(
597
  data=df_data,
598
  x=primary_cost_col if primary_cost_col in df_data.columns else None,
599
  y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
600
  agent_col="SDK Version",
601
- name=category_name
 
 
 
 
 
 
 
 
 
 
 
 
602
  )
603
 
604
- # Create initial scatter plots for both complete and all data
605
- scatter_plot_complete = create_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
606
- scatter_plot_all = create_scatter_plot(df_view_full)
 
 
 
 
607
 
608
  # Now get headers from the renamed dataframe (use all entries to ensure headers are present)
609
  df_headers = df_display_all.columns.tolist()
610
  df_datatypes = []
611
  for col in df_headers:
612
- if col == "Logs" or "Cost" in col or "Score" in col:
613
  df_datatypes.append("markdown")
614
  elif col in ["SDK Version", "Language Model"]:
615
  df_datatypes.append("html")
616
  else:
617
  df_datatypes.append("str")
618
  # Dynamically set widths for the DataFrame columns
619
- # Order: Language Model, SDK Version, Average Score, Average Cost, ...
620
- fixed_start_widths = [280, 100, 100] # Language Model (with icons), SDK Version, Average Score
621
- num_score_cost_cols = 0
622
  remaining_headers = df_headers[len(fixed_start_widths):]
623
  for col in remaining_headers:
624
- if "Score" in col or "Cost" in col:
625
- num_score_cost_cols += 1
626
- dynamic_widths = [90] * num_score_cost_cols
627
  fixed_end_widths = [90, 100, 50] # Categories Attempted, Date, Logs
628
  # 5. Combine all the lists to create the final, fully dynamic list.
629
  final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
@@ -644,10 +667,18 @@ def create_leaderboard_display(
644
  show_incomplete_checkbox = None
645
  gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")
646
 
647
- # Plot component - show complete entries by default if available
648
- initial_plot = scatter_plot_complete if has_complete_entries else scatter_plot_all
649
- plot_component = gr.Plot(
650
- value=initial_plot,
 
 
 
 
 
 
 
 
651
  show_label=False,
652
  )
653
  gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
@@ -669,17 +700,17 @@ def create_leaderboard_display(
669
  elem_id="main-leaderboard"
670
  )
671
 
672
- # Update function for the toggle - updates both table and plot
673
  def update_display(show_incomplete):
674
  if show_incomplete:
675
- return df_display_all, scatter_plot_all
676
  else:
677
- return df_display_complete, scatter_plot_complete
678
 
679
  show_incomplete_checkbox.change(
680
  fn=update_display,
681
  inputs=[show_incomplete_checkbox],
682
- outputs=[dataframe_component, plot_component]
683
  )
684
  else:
685
  dataframe_component = gr.DataFrame(
@@ -719,21 +750,23 @@ def create_leaderboard_display(
719
  new_df_display_complete = prepare_df_for_display(new_df_view_complete)
720
  new_df_display_all = prepare_df_for_display(new_df_view_full)
721
 
722
- # Create new scatter plots
723
- new_scatter_complete = create_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
724
- new_scatter_all = create_scatter_plot(new_df_view_full)
 
 
725
 
726
  # Return the appropriate data based on checkbox state
727
  if current_checkbox_state:
728
- return new_df_display_all, new_scatter_all
729
  else:
730
- return new_df_display_complete, new_scatter_complete
731
 
732
  # No change, return current values
733
  if current_checkbox_state:
734
- return df_display_all, scatter_plot_all
735
  else:
736
- return df_display_complete, scatter_plot_complete
737
 
738
  # Create a timer that checks for updates every 60 seconds
739
  refresh_timer = gr.Timer(value=60)
@@ -743,7 +776,7 @@ def create_leaderboard_display(
743
  refresh_timer.tick(
744
  fn=check_and_refresh_data,
745
  inputs=[show_incomplete_checkbox],
746
- outputs=[dataframe_component, plot_component]
747
  )
748
  else:
749
  # If no checkbox, always show all data
@@ -756,18 +789,19 @@ def create_leaderboard_display(
756
  new_transformer = DataTransformer(new_df, new_tag_map)
757
  new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
758
  new_df_display_all = prepare_df_for_display(new_df_view_full)
759
- new_scatter_all = create_scatter_plot(new_df_view_full)
760
- return new_df_display_all, new_scatter_all
761
- return df_display_all, scatter_plot_all
 
762
 
763
  refresh_timer.tick(
764
  fn=check_and_refresh_all,
765
  inputs=[],
766
- outputs=[dataframe_component, plot_component]
767
  )
768
 
769
  # Return the components so they can be referenced elsewhere.
770
- return plot_component, dataframe_component
771
 
772
  # # --- Detailed Benchmark Display ---
773
  def create_benchmark_details_display(
@@ -807,10 +841,11 @@ def create_benchmark_details_display(
807
  # 3. Prepare the data for this specific benchmark's table and plot
808
  benchmark_score_col = f"{benchmark_name} Score"
809
  benchmark_cost_col = f"{benchmark_name} Cost"
 
810
  benchmark_download_col = f"{benchmark_name} Download"
811
 
812
  # Define the columns needed for the detailed table
813
- table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
814
 
815
  # Filter to only columns that actually exist in the full dataframe
816
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -914,6 +949,7 @@ def create_benchmark_details_display(
914
  'Attempted Benchmark',
915
  benchmark_score_col,
916
  benchmark_cost_col,
 
917
  'Date',
918
  'Logs',
919
  benchmark_download_col
@@ -922,10 +958,16 @@ def create_benchmark_details_display(
922
  if col not in benchmark_table_df.columns:
923
  benchmark_table_df[col] = pd.NA # Add as an empty column
924
  benchmark_table_df = benchmark_table_df[desired_cols_in_order]
 
 
 
 
 
925
  # Rename columns for a cleaner table display, as requested
926
  benchmark_table_df.rename(columns={
927
  benchmark_score_col: 'Score',
928
  benchmark_cost_col: 'Cost',
 
929
  benchmark_download_col: '⬇️', # Empty-ish header with icon hint
930
  }, inplace=True)
931
 
@@ -933,20 +975,34 @@ def create_benchmark_details_display(
933
  df_headers = benchmark_table_df.columns.tolist()
934
  df_datatypes = []
935
  for col in df_headers:
936
- if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
937
  df_datatypes.append("markdown")
938
  elif col in ["SDK Version", "Language Model"]:
939
  df_datatypes.append("html")
940
  else:
941
  df_datatypes.append("str")
942
- benchmark_plot = _plot_scatter_plotly(
 
 
943
  data=full_df,
944
  x=benchmark_cost_col,
945
  y=benchmark_score_col,
946
  agent_col="SDK Version",
947
- name=benchmark_name
 
 
 
 
 
 
 
 
 
 
 
 
948
  )
949
- gr.Plot(value=benchmark_plot, show_label=False)
950
  gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
951
 
952
  # Put table and key into an accordion
@@ -957,7 +1013,7 @@ def create_benchmark_details_display(
957
  datatype=df_datatypes,
958
  interactive=False,
959
  wrap=True,
960
- column_widths=[200, 80, 40, 80, 80, 150, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
961
  show_search="search",
962
  elem_classes=["wrap-header-df"]
963
  )
 
17
  _plot_scatter_plotly,
18
  format_cost_column,
19
  format_score_column,
20
+ format_runtime_column,
21
  get_pareto_df,
22
  clean_llm_base_list,
23
  )
 
516
  if "Score" in col:
517
  df_display = format_score_column(df_display, col)
518
 
519
+ for col in df_display.columns:
520
+ if "Runtime" in col:
521
+ df_display = format_runtime_column(df_display, col)
522
+
523
  # Clean the Language Model column first
524
  df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
525
 
 
588
  # If no complete entries exist, show all entries by default
589
  has_complete_entries = len(df_display_complete) > 0
590
 
591
+ # Determine primary score/cost/runtime columns for scatter plot
592
  if category_name == "Overall":
593
  primary_score_col = "Average Score"
594
  primary_cost_col = "Average Cost"
595
+ primary_runtime_col = "Average Runtime"
596
  else:
597
  primary_score_col = f"{category_name} Score"
598
  primary_cost_col = f"{category_name} Cost"
599
+ primary_runtime_col = f"{category_name} Runtime"
600
 
601
+ # Function to create cost/performance scatter plot from data
602
+ def create_cost_scatter_plot(df_data):
603
  return _plot_scatter_plotly(
604
  data=df_data,
605
  x=primary_cost_col if primary_cost_col in df_data.columns else None,
606
  y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
607
  agent_col="SDK Version",
608
+ name=category_name,
609
+ plot_type='cost'
610
+ )
611
+
612
+ # Function to create runtime/performance scatter plot from data
613
+ def create_runtime_scatter_plot(df_data):
614
+ return _plot_scatter_plotly(
615
+ data=df_data,
616
+ x=primary_runtime_col if primary_runtime_col in df_data.columns else None,
617
+ y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
618
+ agent_col="SDK Version",
619
+ name=category_name,
620
+ plot_type='runtime'
621
  )
622
 
623
+ # Create initial cost scatter plots for both complete and all data
624
+ cost_scatter_complete = create_cost_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
625
+ cost_scatter_all = create_cost_scatter_plot(df_view_full)
626
+
627
+ # Create initial runtime scatter plots for both complete and all data
628
+ runtime_scatter_complete = create_runtime_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
629
+ runtime_scatter_all = create_runtime_scatter_plot(df_view_full)
630
 
631
  # Now get headers from the renamed dataframe (use all entries to ensure headers are present)
632
  df_headers = df_display_all.columns.tolist()
633
  df_datatypes = []
634
  for col in df_headers:
635
+ if col == "Logs" or "Cost" in col or "Score" in col or "Runtime" in col:
636
  df_datatypes.append("markdown")
637
  elif col in ["SDK Version", "Language Model"]:
638
  df_datatypes.append("html")
639
  else:
640
  df_datatypes.append("str")
641
  # Dynamically set widths for the DataFrame columns
642
+ # Order: Language Model, SDK Version, Average Score, Average Cost, Average Runtime, ...
643
+ fixed_start_widths = [280, 100, 100, 90] # Language Model (with icons), SDK Version, Average Score, Average Runtime
644
+ num_score_cost_runtime_cols = 0
645
  remaining_headers = df_headers[len(fixed_start_widths):]
646
  for col in remaining_headers:
647
+ if "Score" in col or "Cost" in col or "Runtime" in col:
648
+ num_score_cost_runtime_cols += 1
649
+ dynamic_widths = [90] * num_score_cost_runtime_cols
650
  fixed_end_widths = [90, 100, 50] # Categories Attempted, Date, Logs
651
  # 5. Combine all the lists to create the final, fully dynamic list.
652
  final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
 
667
  show_incomplete_checkbox = None
668
  gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")
669
 
670
+ # Plot components - show complete entries by default if available
671
+ # Cost/Performance plot
672
+ initial_cost_plot = cost_scatter_complete if has_complete_entries else cost_scatter_all
673
+ cost_plot_component = gr.Plot(
674
+ value=initial_cost_plot,
675
+ show_label=False,
676
+ )
677
+
678
+ # Runtime/Performance plot
679
+ initial_runtime_plot = runtime_scatter_complete if has_complete_entries else runtime_scatter_all
680
+ runtime_plot_component = gr.Plot(
681
+ value=initial_runtime_plot,
682
  show_label=False,
683
  )
684
  gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
 
700
  elem_id="main-leaderboard"
701
  )
702
 
703
+ # Update function for the toggle - updates both table and plots
704
  def update_display(show_incomplete):
705
  if show_incomplete:
706
+ return df_display_all, cost_scatter_all, runtime_scatter_all
707
  else:
708
+ return df_display_complete, cost_scatter_complete, runtime_scatter_complete
709
 
710
  show_incomplete_checkbox.change(
711
  fn=update_display,
712
  inputs=[show_incomplete_checkbox],
713
+ outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
714
  )
715
  else:
716
  dataframe_component = gr.DataFrame(
 
750
  new_df_display_complete = prepare_df_for_display(new_df_view_complete)
751
  new_df_display_all = prepare_df_for_display(new_df_view_full)
752
 
753
+ # Create new scatter plots (both cost and runtime)
754
+ new_cost_scatter_complete = create_cost_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
755
+ new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full)
756
+ new_runtime_scatter_complete = create_runtime_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
757
+ new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full)
758
 
759
  # Return the appropriate data based on checkbox state
760
  if current_checkbox_state:
761
+ return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all
762
  else:
763
+ return new_df_display_complete, new_cost_scatter_complete, new_runtime_scatter_complete
764
 
765
  # No change, return current values
766
  if current_checkbox_state:
767
+ return df_display_all, cost_scatter_all, runtime_scatter_all
768
  else:
769
+ return df_display_complete, cost_scatter_complete, runtime_scatter_complete
770
 
771
  # Create a timer that checks for updates every 60 seconds
772
  refresh_timer = gr.Timer(value=60)
 
776
  refresh_timer.tick(
777
  fn=check_and_refresh_data,
778
  inputs=[show_incomplete_checkbox],
779
+ outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
780
  )
781
  else:
782
  # If no checkbox, always show all data
 
789
  new_transformer = DataTransformer(new_df, new_tag_map)
790
  new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
791
  new_df_display_all = prepare_df_for_display(new_df_view_full)
792
+ new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full)
793
+ new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full)
794
+ return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all
795
+ return df_display_all, cost_scatter_all, runtime_scatter_all
796
 
797
  refresh_timer.tick(
798
  fn=check_and_refresh_all,
799
  inputs=[],
800
+ outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
801
  )
802
 
803
  # Return the components so they can be referenced elsewhere.
804
+ return cost_plot_component, runtime_plot_component, dataframe_component
805
 
806
  # # --- Detailed Benchmark Display ---
807
  def create_benchmark_details_display(
 
841
  # 3. Prepare the data for this specific benchmark's table and plot
842
  benchmark_score_col = f"{benchmark_name} Score"
843
  benchmark_cost_col = f"{benchmark_name} Cost"
844
+ benchmark_runtime_col = f"{benchmark_name} Runtime"
845
  benchmark_download_col = f"{benchmark_name} Download"
846
 
847
  # Define the columns needed for the detailed table
848
+ table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_download_col, 'id', 'Language Model']
849
 
850
  # Filter to only columns that actually exist in the full dataframe
851
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
 
949
  'Attempted Benchmark',
950
  benchmark_score_col,
951
  benchmark_cost_col,
952
+ benchmark_runtime_col,
953
  'Date',
954
  'Logs',
955
  benchmark_download_col
 
958
  if col not in benchmark_table_df.columns:
959
  benchmark_table_df[col] = pd.NA # Add as an empty column
960
  benchmark_table_df = benchmark_table_df[desired_cols_in_order]
961
+
962
+ # Format the runtime column before renaming
963
+ if benchmark_runtime_col in benchmark_table_df.columns:
964
+ benchmark_table_df = format_runtime_column(benchmark_table_df, benchmark_runtime_col)
965
+
966
  # Rename columns for a cleaner table display, as requested
967
  benchmark_table_df.rename(columns={
968
  benchmark_score_col: 'Score',
969
  benchmark_cost_col: 'Cost',
970
+ benchmark_runtime_col: 'Runtime',
971
  benchmark_download_col: '⬇️', # Empty-ish header with icon hint
972
  }, inplace=True)
973
 
 
975
  df_headers = benchmark_table_df.columns.tolist()
976
  df_datatypes = []
977
  for col in df_headers:
978
+ if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
979
  df_datatypes.append("markdown")
980
  elif col in ["SDK Version", "Language Model"]:
981
  df_datatypes.append("html")
982
  else:
983
  df_datatypes.append("str")
984
+
985
+ # Cost/Performance plot
986
+ cost_benchmark_plot = _plot_scatter_plotly(
987
  data=full_df,
988
  x=benchmark_cost_col,
989
  y=benchmark_score_col,
990
  agent_col="SDK Version",
991
+ name=benchmark_name,
992
+ plot_type='cost'
993
+ )
994
+ gr.Plot(value=cost_benchmark_plot, show_label=False)
995
+
996
+ # Runtime/Performance plot
997
+ runtime_benchmark_plot = _plot_scatter_plotly(
998
+ data=full_df,
999
+ x=benchmark_runtime_col,
1000
+ y=benchmark_score_col,
1001
+ agent_col="SDK Version",
1002
+ name=benchmark_name,
1003
+ plot_type='runtime'
1004
  )
1005
+ gr.Plot(value=runtime_benchmark_plot, show_label=False)
1006
  gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
1007
 
1008
  # Put table and key into an accordion
 
1013
  datatype=df_datatypes,
1014
  interactive=False,
1015
  wrap=True,
1016
+ column_widths=[200, 80, 40, 80, 80, 80, 150, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Download
1017
  show_search="search",
1018
  elem_classes=["wrap-header-df"]
1019
  )