openhands openhands committed on
Commit
4d0ae13
·
1 Parent(s): b5317d7

Move Download column to benchmark-specific tables only

Browse files

- Download column now appears on benchmark-specific leaderboard tables (e.g., SWE-Bench, GAIA)
- Each benchmark has its own download link to its specific trajectory archive
- Removed Download from the Overall aggregate table (an aggregated row spans several benchmarks, so there is no single trajectory archive to link to)
- Added download icon explanation to legend on benchmark tables
- Column header shows just ⬇️ icon to save space

Co-authored-by: openhands <openhands@all-hands.dev>

leaderboard_transformer.py CHANGED
@@ -166,7 +166,6 @@ def _pretty_column_name(raw_col: str) -> str:
166
  'Overall cost': 'Average Cost', # Legacy support
167
  'categories_completed': 'Categories Completed',
168
  'Logs': 'Logs',
169
- 'Download': 'Download',
170
  'Openness': 'Openness',
171
  'LLM base': 'Model',
172
  'Source': 'Source',
@@ -316,7 +315,7 @@ class DataTransformer:
316
  # --- 3. Add Columns for Agent Openness ---
317
  base_cols = ["id","Language Model","SDK Version","Source"]
318
  new_cols = ["Openness"]
319
- ending_cols = ["Date", "Logs", "Download"]
320
 
321
  # For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
322
  if tag is None or tag == "Overall":
 
166
  'Overall cost': 'Average Cost', # Legacy support
167
  'categories_completed': 'Categories Completed',
168
  'Logs': 'Logs',
 
169
  'Openness': 'Openness',
170
  'LLM base': 'Model',
171
  'Source': 'Source',
 
315
  # --- 3. Add Columns for Agent Openness ---
316
  base_cols = ["id","Language Model","SDK Version","Source"]
317
  new_cols = ["Openness"]
318
+ ending_cols = ["Date", "Logs"]
319
 
320
  # For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
321
  if tag is None or tag == "Overall":
simple_data_loader.py CHANGED
@@ -271,9 +271,6 @@ class SimpleLeaderboardViewer:
271
  # Track category-level data for aggregation
272
  category_data = {} # {category: {'scores': [...], 'costs': [...]}}
273
 
274
- # Collect all full_archive URLs for this agent
275
- archive_urls = []
276
-
277
  for _, row in agent_records.iterrows():
278
  tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
279
  for tag in tags:
@@ -283,10 +280,9 @@ class SimpleLeaderboardViewer:
283
  dataset_scores.append(row['score'])
284
  dataset_costs.append(row['cost_per_instance'])
285
 
286
- # Store the full_archive URL for this benchmark
287
  full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
288
  if full_archive_url:
289
- archive_urls.append(full_archive_url)
290
  record[f'{tag} download'] = full_archive_url
291
 
292
  # Track category-level data for aggregation
@@ -330,10 +326,6 @@ class SimpleLeaderboardViewer:
330
  # Track how many categories were completed
331
  record['categories_completed'] = categories_with_scores
332
 
333
- # Store all download URLs (for overall view, we'll show the first one or all)
334
- # Use the first archive URL as the main download link
335
- record['download'] = archive_urls[0] if archive_urls else ''
336
-
337
  transformed_records.append(record)
338
 
339
  transformed_df = pd.DataFrame(transformed_records)
 
271
  # Track category-level data for aggregation
272
  category_data = {} # {category: {'scores': [...], 'costs': [...]}}
273
 
 
 
 
274
  for _, row in agent_records.iterrows():
275
  tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
276
  for tag in tags:
 
280
  dataset_scores.append(row['score'])
281
  dataset_costs.append(row['cost_per_instance'])
282
 
283
+ # Store the full_archive URL for this benchmark (for benchmark-specific download)
284
  full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
285
  if full_archive_url:
 
286
  record[f'{tag} download'] = full_archive_url
287
 
288
  # Track category-level data for aggregation
 
326
  # Track how many categories were completed
327
  record['categories_completed'] = categories_with_scores
328
 
 
 
 
 
329
  transformed_records.append(record)
330
 
331
  transformed_df = pd.DataFrame(transformed_records)
ui_components.py CHANGED
@@ -235,7 +235,6 @@ def build_descriptions_tooltip_content(table) -> str:
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
238
- <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
239
  """
240
  elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
241
  return f"""
@@ -286,6 +285,20 @@ def create_legend_markdown(which_table: str) -> str:
286
  """
287
  descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
288
  trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  legend_markdown = f"""
290
  <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
291
 
@@ -307,6 +320,8 @@ def create_legend_markdown(which_table: str) -> str:
307
  <div class="table-legend-item">{openness_html}</div>
308
  </div>
309
 
 
 
310
  <div><!-- Container for the Column Descriptions section -->
311
  <b>Column Descriptions</b>
312
  <span class="tooltip-icon-legend">
@@ -594,7 +609,7 @@ def create_leaderboard_display(
594
  df_headers = df_display_all.columns.tolist()
595
  df_datatypes = []
596
  for col in df_headers:
597
- if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
598
  df_datatypes.append("markdown")
599
  elif col in ["SDK Version", "Language Model"]:
600
  df_datatypes.append("html")
@@ -609,7 +624,7 @@ def create_leaderboard_display(
609
  if "Score" in col or "Cost" in col:
610
  num_score_cost_cols += 1
611
  dynamic_widths = [90] * num_score_cost_cols
612
- fixed_end_widths = [90, 100, 50, 60] # Categories Attempted, Date, Logs, Download
613
  # 5. Combine all the lists to create the final, fully dynamic list.
614
  final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
615
 
@@ -792,9 +807,10 @@ def create_benchmark_details_display(
792
  # 3. Prepare the data for this specific benchmark's table and plot
793
  benchmark_score_col = f"{benchmark_name} Score"
794
  benchmark_cost_col = f"{benchmark_name} Cost"
 
795
 
796
  # Define the columns needed for the detailed table
797
- table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
798
 
799
  # Filter to only columns that actually exist in the full dataframe
800
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -883,6 +899,15 @@ def create_benchmark_details_display(
883
  # 1. Format the cost and score columns
884
  benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
885
  benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
 
 
 
 
 
 
 
 
 
886
  desired_cols_in_order = [
887
  'Language Model',
888
  'SDK Version',
@@ -890,23 +915,25 @@ def create_benchmark_details_display(
890
  benchmark_score_col,
891
  benchmark_cost_col,
892
  'Date',
893
- 'Logs'
 
894
  ]
895
  for col in desired_cols_in_order:
896
  if col not in benchmark_table_df.columns:
897
  benchmark_table_df[col] = pd.NA # Add as an empty column
898
  benchmark_table_df = benchmark_table_df[desired_cols_in_order]
899
  # Rename columns for a cleaner table display, as requested
900
- benchmark_table_df.rename({
901
  benchmark_score_col: 'Score',
902
  benchmark_cost_col: 'Cost',
 
903
  }, inplace=True)
904
 
905
  # Now get headers from the renamed dataframe
906
  df_headers = benchmark_table_df.columns.tolist()
907
  df_datatypes = []
908
  for col in df_headers:
909
- if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
910
  df_datatypes.append("markdown")
911
  elif col in ["SDK Version", "Language Model"]:
912
  df_datatypes.append("html")
@@ -930,7 +957,7 @@ def create_benchmark_details_display(
930
  datatype=df_datatypes,
931
  interactive=False,
932
  wrap=True,
933
- column_widths=[40, 40, 40, 200, 150, 175, 85, 100, 100, 80, 40],
934
  show_search="search",
935
  elem_classes=["wrap-header-df"]
936
  )
@@ -959,15 +986,6 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
959
  # Apply the function to the "Logs" column
960
  pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
961
 
962
- if "Download" in pretty_df.columns:
963
- def format_download_to_html(raw_url):
964
- # Handle empty or NaN values, returning a blank string.
965
- if pd.isna(raw_url) or raw_url == "": return ""
966
- # Create a download link with a download icon
967
- return f'<a href="{raw_url}" target="_blank" title="Download trajectories">⬇️</a>'
968
- # Apply the function to the "Download" column
969
- pretty_df["Download"] = pretty_df["Download"].apply(format_download_to_html)
970
-
971
  if "Source" in pretty_df.columns:
972
  def format_source_url_to_html(raw_url):
973
  # Handle empty or NaN values, returning a blank string.
 
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
 
238
  """
239
  elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
240
  return f"""
 
285
  """
286
  descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
287
  trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
288
+
289
+ # Add download section for benchmark-specific tables (not Overall or category pages)
290
+ download_section = ""
291
+ if which_table not in ["Overall", "Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
292
+ download_section = """
293
+ <div> <!-- Container for the Download section -->
294
+ <b>Download</b>
295
+ <div class="table-legend-item">
296
+ <span style="font-size: 16px; margin-right: 4px;">⬇️</span>
297
+ <span>Trajectories</span>
298
+ </div>
299
+ </div>
300
+ """
301
+
302
  legend_markdown = f"""
303
  <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
304
 
 
320
  <div class="table-legend-item">{openness_html}</div>
321
  </div>
322
 
323
+ {download_section}
324
+
325
  <div><!-- Container for the Column Descriptions section -->
326
  <b>Column Descriptions</b>
327
  <span class="tooltip-icon-legend">
 
609
  df_headers = df_display_all.columns.tolist()
610
  df_datatypes = []
611
  for col in df_headers:
612
+ if col == "Logs" or "Cost" in col or "Score" in col:
613
  df_datatypes.append("markdown")
614
  elif col in ["SDK Version", "Language Model"]:
615
  df_datatypes.append("html")
 
624
  if "Score" in col or "Cost" in col:
625
  num_score_cost_cols += 1
626
  dynamic_widths = [90] * num_score_cost_cols
627
+ fixed_end_widths = [90, 100, 50] # Categories Attempted, Date, Logs
628
  # 5. Combine all the lists to create the final, fully dynamic list.
629
  final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
630
 
 
807
  # 3. Prepare the data for this specific benchmark's table and plot
808
  benchmark_score_col = f"{benchmark_name} Score"
809
  benchmark_cost_col = f"{benchmark_name} Cost"
810
+ benchmark_download_col = f"{benchmark_name} Download"
811
 
812
  # Define the columns needed for the detailed table
813
+ table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
814
 
815
  # Filter to only columns that actually exist in the full dataframe
816
  existing_table_cols = [col for col in table_cols if col in full_df.columns]
 
899
  # 1. Format the cost and score columns
900
  benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
901
  benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
902
+
903
+ # Format download column as clickable icon
904
+ if benchmark_download_col in benchmark_table_df.columns:
905
+ def format_download_link(url):
906
+ if pd.isna(url) or url == "":
907
+ return ""
908
+ return f"[⬇️]({url})"
909
+ benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
910
+
911
  desired_cols_in_order = [
912
  'Language Model',
913
  'SDK Version',
 
915
  benchmark_score_col,
916
  benchmark_cost_col,
917
  'Date',
918
+ 'Logs',
919
+ benchmark_download_col
920
  ]
921
  for col in desired_cols_in_order:
922
  if col not in benchmark_table_df.columns:
923
  benchmark_table_df[col] = pd.NA # Add as an empty column
924
  benchmark_table_df = benchmark_table_df[desired_cols_in_order]
925
  # Rename columns for a cleaner table display, as requested
926
+ benchmark_table_df.rename(columns={
927
  benchmark_score_col: 'Score',
928
  benchmark_cost_col: 'Cost',
929
+ benchmark_download_col: '⬇️', # Empty-ish header with icon hint
930
  }, inplace=True)
931
 
932
  # Now get headers from the renamed dataframe
933
  df_headers = benchmark_table_df.columns.tolist()
934
  df_datatypes = []
935
  for col in df_headers:
936
+ if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
937
  df_datatypes.append("markdown")
938
  elif col in ["SDK Version", "Language Model"]:
939
  df_datatypes.append("html")
 
957
  datatype=df_datatypes,
958
  interactive=False,
959
  wrap=True,
960
+ column_widths=[200, 80, 40, 80, 80, 150, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
961
  show_search="search",
962
  elem_classes=["wrap-header-df"]
963
  )
 
986
  # Apply the function to the "Logs" column
987
  pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
988
 
 
 
 
 
 
 
 
 
 
989
  if "Source" in pretty_df.columns:
990
  def format_source_url_to_html(raw_url):
991
  # Handle empty or NaN values, returning a blank string.