openhands committed on
Commit
b5317d7
·
1 Parent(s): 1739efc

Add Download column for trajectory archives and increase table font size

Browse files

Changes:
- Add Download column to leaderboard table with download icon (⬇️) linking to trajectory archives
- Preserve full_archive URLs from score data even when Pydantic validation is used
- Increase table font size from 14px to 15px for better readability
- Add Download column documentation to tooltip descriptions

Co-authored-by: openhands <openhands@all-hands.dev>

content.py CHANGED
@@ -342,7 +342,7 @@ table.svelte-1e98i6s td {
342
  vertical-align: top !important;
343
  }
344
  table.gr-table {
345
- font-size: 14px !important;
346
  }
347
  .html-container {
348
  padding-top: 0 !important;
 
342
  vertical-align: top !important;
343
  }
344
  table.gr-table {
345
+ font-size: 15px !important;
346
  }
347
  .html-container {
348
  padding-top: 0 !important;
leaderboard_transformer.py CHANGED
@@ -166,6 +166,7 @@ def _pretty_column_name(raw_col: str) -> str:
166
  'Overall cost': 'Average Cost', # Legacy support
167
  'categories_completed': 'Categories Completed',
168
  'Logs': 'Logs',
 
169
  'Openness': 'Openness',
170
  'LLM base': 'Model',
171
  'Source': 'Source',
@@ -315,7 +316,7 @@ class DataTransformer:
315
  # --- 3. Add Columns for Agent Openness ---
316
  base_cols = ["id","Language Model","SDK Version","Source"]
317
  new_cols = ["Openness"]
318
- ending_cols = ["Date", "Logs"]
319
 
320
  # For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
321
  if tag is None or tag == "Overall":
 
166
  'Overall cost': 'Average Cost', # Legacy support
167
  'categories_completed': 'Categories Completed',
168
  'Logs': 'Logs',
169
+ 'Download': 'Download',
170
  'Openness': 'Openness',
171
  'LLM base': 'Model',
172
  'Source': 'Source',
 
316
  # --- 3. Add Columns for Agent Openness ---
317
  base_cols = ["id","Language Model","SDK Version","Source"]
318
  new_cols = ["Openness"]
319
+ ending_cols = ["Date", "Logs", "Download"]
320
 
321
  # For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
322
  if tag is None or tag == "Overall":
simple_data_loader.py CHANGED
@@ -76,7 +76,12 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
76
  try:
77
  validated_score = ScoreEntry(**score)
78
  # Use mode='json' to serialize enums as strings
79
- validated_scores.append(validated_score.model_dump(mode='json'))
 
 
 
 
 
80
  except Exception as e:
81
  errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
82
  validated_scores.append(score) # Fall back to raw data
@@ -194,6 +199,7 @@ class SimpleLeaderboardViewer:
194
  'cost_per_instance': score_entry.get('cost_per_instance'),
195
  'average_runtime': score_entry.get('average_runtime'),
196
  'tags': [score_entry.get('benchmark')],
 
197
  }
198
  all_records.append(record)
199
 
@@ -265,6 +271,9 @@ class SimpleLeaderboardViewer:
265
  # Track category-level data for aggregation
266
  category_data = {} # {category: {'scores': [...], 'costs': [...]}}
267
 
 
 
 
268
  for _, row in agent_records.iterrows():
269
  tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
270
  for tag in tags:
@@ -274,6 +283,12 @@ class SimpleLeaderboardViewer:
274
  dataset_scores.append(row['score'])
275
  dataset_costs.append(row['cost_per_instance'])
276
 
 
 
 
 
 
 
277
  # Track category-level data for aggregation
278
  if tag in self.benchmark_to_categories:
279
  for category in self.benchmark_to_categories[tag]:
@@ -315,6 +330,10 @@ class SimpleLeaderboardViewer:
315
  # Track how many categories were completed
316
  record['categories_completed'] = categories_with_scores
317
 
 
 
 
 
318
  transformed_records.append(record)
319
 
320
  transformed_df = pd.DataFrame(transformed_records)
 
76
  try:
77
  validated_score = ScoreEntry(**score)
78
  # Use mode='json' to serialize enums as strings
79
+ validated_dict = validated_score.model_dump(mode='json')
80
+ # Preserve any extra fields from raw data (like full_archive)
81
+ for key, value in score.items():
82
+ if key not in validated_dict:
83
+ validated_dict[key] = value
84
+ validated_scores.append(validated_dict)
85
  except Exception as e:
86
  errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
87
  validated_scores.append(score) # Fall back to raw data
 
199
  'cost_per_instance': score_entry.get('cost_per_instance'),
200
  'average_runtime': score_entry.get('average_runtime'),
201
  'tags': [score_entry.get('benchmark')],
202
+ 'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
203
  }
204
  all_records.append(record)
205
 
 
271
  # Track category-level data for aggregation
272
  category_data = {} # {category: {'scores': [...], 'costs': [...]}}
273
 
274
+ # Collect all full_archive URLs for this agent
275
+ archive_urls = []
276
+
277
  for _, row in agent_records.iterrows():
278
  tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
279
  for tag in tags:
 
283
  dataset_scores.append(row['score'])
284
  dataset_costs.append(row['cost_per_instance'])
285
 
286
+ # Store the full_archive URL for this benchmark
287
+ full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
288
+ if full_archive_url:
289
+ archive_urls.append(full_archive_url)
290
+ record[f'{tag} download'] = full_archive_url
291
+
292
  # Track category-level data for aggregation
293
  if tag in self.benchmark_to_categories:
294
  for category in self.benchmark_to_categories[tag]:
 
330
  # Track how many categories were completed
331
  record['categories_completed'] = categories_with_scores
332
 
333
+ # Store all download URLs (for overall view, we'll show the first one or all)
334
+ # Use the first archive URL as the main download link
335
+ record['download'] = archive_urls[0] if archive_urls else ''
336
+
337
  transformed_records.append(record)
338
 
339
  transformed_df = pd.DataFrame(transformed_records)
ui_components.py CHANGED
@@ -235,6 +235,7 @@ def build_descriptions_tooltip_content(table) -> str:
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
 
238
  """
239
  elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
240
  return f"""
@@ -246,6 +247,7 @@ def build_descriptions_tooltip_content(table) -> str:
246
  <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
247
  <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
248
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
 
249
  """
250
  else:
251
  # Fallback for any other table type, e.g., individual benchmarks
@@ -256,6 +258,7 @@ def build_descriptions_tooltip_content(table) -> str:
256
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
257
  <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
258
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
 
259
  """
260
 
261
  # Create HTML for the "Openness" legend items for table using custom SVG lock icons
@@ -591,7 +594,7 @@ def create_leaderboard_display(
591
  df_headers = df_display_all.columns.tolist()
592
  df_datatypes = []
593
  for col in df_headers:
594
- if col == "Logs" or "Cost" in col or "Score" in col:
595
  df_datatypes.append("markdown")
596
  elif col in ["SDK Version", "Language Model"]:
597
  df_datatypes.append("html")
@@ -606,7 +609,7 @@ def create_leaderboard_display(
606
  if "Score" in col or "Cost" in col:
607
  num_score_cost_cols += 1
608
  dynamic_widths = [90] * num_score_cost_cols
609
- fixed_end_widths = [90, 100, 50]
610
  # 5. Combine all the lists to create the final, fully dynamic list.
611
  final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
612
 
@@ -903,7 +906,7 @@ def create_benchmark_details_display(
903
  df_headers = benchmark_table_df.columns.tolist()
904
  df_datatypes = []
905
  for col in df_headers:
906
- if "Logs" in col or "Cost" in col or "Score" in col:
907
  df_datatypes.append("markdown")
908
  elif col in ["SDK Version", "Language Model"]:
909
  df_datatypes.append("html")
@@ -956,6 +959,15 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
956
  # Apply the function to the "Logs" column
957
  pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
958
 
 
 
 
 
 
 
 
 
 
959
  if "Source" in pretty_df.columns:
960
  def format_source_url_to_html(raw_url):
961
  # Handle empty or NaN values, returning a blank string.
 
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
238
+ <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
239
  """
240
  elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
241
  return f"""
 
247
  <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
248
  <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
249
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
250
+ <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
251
  """
252
  else:
253
  # Fallback for any other table type, e.g., individual benchmarks
 
258
  <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
259
  <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
260
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
261
+ <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
262
  """
263
 
264
  # Create HTML for the "Openness" legend items for table using custom SVG lock icons
 
594
  df_headers = df_display_all.columns.tolist()
595
  df_datatypes = []
596
  for col in df_headers:
597
+ if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
598
  df_datatypes.append("markdown")
599
  elif col in ["SDK Version", "Language Model"]:
600
  df_datatypes.append("html")
 
609
  if "Score" in col or "Cost" in col:
610
  num_score_cost_cols += 1
611
  dynamic_widths = [90] * num_score_cost_cols
612
+ fixed_end_widths = [90, 100, 50, 60] # Categories Attempted, Date, Logs, Download
613
  # 5. Combine all the lists to create the final, fully dynamic list.
614
  final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
615
 
 
906
  df_headers = benchmark_table_df.columns.tolist()
907
  df_datatypes = []
908
  for col in df_headers:
909
+ if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
910
  df_datatypes.append("markdown")
911
  elif col in ["SDK Version", "Language Model"]:
912
  df_datatypes.append("html")
 
959
  # Apply the function to the "Logs" column
960
  pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
961
 
962
+ if "Download" in pretty_df.columns:
963
+ def format_download_to_html(raw_url):
964
+ # Handle empty or NaN values, returning a blank string.
965
+ if pd.isna(raw_url) or raw_url == "": return ""
966
+ # Create a download link with a download icon
967
+ return f'<a href="{raw_url}" target="_blank" title="Download trajectories">⬇️</a>'
968
+ # Apply the function to the "Download" column
969
+ pretty_df["Download"] = pretty_df["Download"].apply(format_download_to_html)
970
+
971
  if "Source" in pretty_df.columns:
972
  def format_source_url_to_html(raw_url):
973
  # Handle empty or NaN values, returning a blank string.