Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
b5317d7
1
Parent(s):
1739efc
Add Download column for trajectory archives and increase table font size
Browse files
Changes:
- Add Download column to leaderboard table with download icon (⬇️) linking to trajectory archives
- Preserve full_archive URLs from score data even when pydantic validation is used
- Increase table font size from 14px to 15px for better readability
- Add Download column documentation to tooltip descriptions
Co-authored-by: openhands <openhands@all-hands.dev>
- content.py +1 -1
- leaderboard_transformer.py +2 -1
- simple_data_loader.py +20 -1
- ui_components.py +15 -3
content.py
CHANGED
|
@@ -342,7 +342,7 @@ table.svelte-1e98i6s td {
|
|
| 342 |
vertical-align: top !important;
|
| 343 |
}
|
| 344 |
table.gr-table {
|
| 345 |
-
font-size:
|
| 346 |
}
|
| 347 |
.html-container {
|
| 348 |
padding-top: 0 !important;
|
|
|
|
| 342 |
vertical-align: top !important;
|
| 343 |
}
|
| 344 |
table.gr-table {
|
| 345 |
+
font-size: 15px !important;
|
| 346 |
}
|
| 347 |
.html-container {
|
| 348 |
padding-top: 0 !important;
|
leaderboard_transformer.py
CHANGED
|
@@ -166,6 +166,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
|
|
|
| 169 |
'Openness': 'Openness',
|
| 170 |
'LLM base': 'Model',
|
| 171 |
'Source': 'Source',
|
|
@@ -315,7 +316,7 @@ class DataTransformer:
|
|
| 315 |
# --- 3. Add Columns for Agent Openness ---
|
| 316 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 317 |
new_cols = ["Openness"]
|
| 318 |
-
ending_cols = ["Date", "Logs"]
|
| 319 |
|
| 320 |
# For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
|
| 321 |
if tag is None or tag == "Overall":
|
|
|
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
| 169 |
+
'Download': 'Download',
|
| 170 |
'Openness': 'Openness',
|
| 171 |
'LLM base': 'Model',
|
| 172 |
'Source': 'Source',
|
|
|
|
| 316 |
# --- 3. Add Columns for Agent Openness ---
|
| 317 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 318 |
new_cols = ["Openness"]
|
| 319 |
+
ending_cols = ["Date", "Logs", "Download"]
|
| 320 |
|
| 321 |
# For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
|
| 322 |
if tag is None or tag == "Overall":
|
simple_data_loader.py
CHANGED
|
@@ -76,7 +76,12 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
|
|
| 76 |
try:
|
| 77 |
validated_score = ScoreEntry(**score)
|
| 78 |
# Use mode='json' to serialize enums as strings
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
except Exception as e:
|
| 81 |
errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
|
| 82 |
validated_scores.append(score) # Fall back to raw data
|
|
@@ -194,6 +199,7 @@ class SimpleLeaderboardViewer:
|
|
| 194 |
'cost_per_instance': score_entry.get('cost_per_instance'),
|
| 195 |
'average_runtime': score_entry.get('average_runtime'),
|
| 196 |
'tags': [score_entry.get('benchmark')],
|
|
|
|
| 197 |
}
|
| 198 |
all_records.append(record)
|
| 199 |
|
|
@@ -265,6 +271,9 @@ class SimpleLeaderboardViewer:
|
|
| 265 |
# Track category-level data for aggregation
|
| 266 |
category_data = {} # {category: {'scores': [...], 'costs': [...]}}
|
| 267 |
|
|
|
|
|
|
|
|
|
|
| 268 |
for _, row in agent_records.iterrows():
|
| 269 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
| 270 |
for tag in tags:
|
|
@@ -274,6 +283,12 @@ class SimpleLeaderboardViewer:
|
|
| 274 |
dataset_scores.append(row['score'])
|
| 275 |
dataset_costs.append(row['cost_per_instance'])
|
| 276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
# Track category-level data for aggregation
|
| 278 |
if tag in self.benchmark_to_categories:
|
| 279 |
for category in self.benchmark_to_categories[tag]:
|
|
@@ -315,6 +330,10 @@ class SimpleLeaderboardViewer:
|
|
| 315 |
# Track how many categories were completed
|
| 316 |
record['categories_completed'] = categories_with_scores
|
| 317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
transformed_records.append(record)
|
| 319 |
|
| 320 |
transformed_df = pd.DataFrame(transformed_records)
|
|
|
|
| 76 |
try:
|
| 77 |
validated_score = ScoreEntry(**score)
|
| 78 |
# Use mode='json' to serialize enums as strings
|
| 79 |
+
validated_dict = validated_score.model_dump(mode='json')
|
| 80 |
+
# Preserve any extra fields from raw data (like full_archive)
|
| 81 |
+
for key, value in score.items():
|
| 82 |
+
if key not in validated_dict:
|
| 83 |
+
validated_dict[key] = value
|
| 84 |
+
validated_scores.append(validated_dict)
|
| 85 |
except Exception as e:
|
| 86 |
errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
|
| 87 |
validated_scores.append(score) # Fall back to raw data
|
|
|
|
| 199 |
'cost_per_instance': score_entry.get('cost_per_instance'),
|
| 200 |
'average_runtime': score_entry.get('average_runtime'),
|
| 201 |
'tags': [score_entry.get('benchmark')],
|
| 202 |
+
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
| 203 |
}
|
| 204 |
all_records.append(record)
|
| 205 |
|
|
|
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
category_data = {} # {category: {'scores': [...], 'costs': [...]}}
|
| 273 |
|
| 274 |
+
# Collect all full_archive URLs for this agent
|
| 275 |
+
archive_urls = []
|
| 276 |
+
|
| 277 |
for _, row in agent_records.iterrows():
|
| 278 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
| 279 |
for tag in tags:
|
|
|
|
| 283 |
dataset_scores.append(row['score'])
|
| 284 |
dataset_costs.append(row['cost_per_instance'])
|
| 285 |
|
| 286 |
+
# Store the full_archive URL for this benchmark
|
| 287 |
+
full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
|
| 288 |
+
if full_archive_url:
|
| 289 |
+
archive_urls.append(full_archive_url)
|
| 290 |
+
record[f'{tag} download'] = full_archive_url
|
| 291 |
+
|
| 292 |
# Track category-level data for aggregation
|
| 293 |
if tag in self.benchmark_to_categories:
|
| 294 |
for category in self.benchmark_to_categories[tag]:
|
|
|
|
| 330 |
# Track how many categories were completed
|
| 331 |
record['categories_completed'] = categories_with_scores
|
| 332 |
|
| 333 |
+
# Store all download URLs (for overall view, we'll show the first one or all)
|
| 334 |
+
# Use the first archive URL as the main download link
|
| 335 |
+
record['download'] = archive_urls[0] if archive_urls else ''
|
| 336 |
+
|
| 337 |
transformed_records.append(record)
|
| 338 |
|
| 339 |
transformed_df = pd.DataFrame(transformed_records)
|
ui_components.py
CHANGED
|
@@ -235,6 +235,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 238 |
"""
|
| 239 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 240 |
return f"""
|
|
@@ -246,6 +247,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 246 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 247 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 248 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 249 |
"""
|
| 250 |
else:
|
| 251 |
# Fallback for any other table type, e.g., individual benchmarks
|
|
@@ -256,6 +258,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 256 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 257 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 258 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 259 |
"""
|
| 260 |
|
| 261 |
# Create HTML for the "Openness" legend items for table using custom SVG lock icons
|
|
@@ -591,7 +594,7 @@ def create_leaderboard_display(
|
|
| 591 |
df_headers = df_display_all.columns.tolist()
|
| 592 |
df_datatypes = []
|
| 593 |
for col in df_headers:
|
| 594 |
-
if col
|
| 595 |
df_datatypes.append("markdown")
|
| 596 |
elif col in ["SDK Version", "Language Model"]:
|
| 597 |
df_datatypes.append("html")
|
|
@@ -606,7 +609,7 @@ def create_leaderboard_display(
|
|
| 606 |
if "Score" in col or "Cost" in col:
|
| 607 |
num_score_cost_cols += 1
|
| 608 |
dynamic_widths = [90] * num_score_cost_cols
|
| 609 |
-
fixed_end_widths = [90, 100, 50]
|
| 610 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 611 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 612 |
|
|
@@ -903,7 +906,7 @@ def create_benchmark_details_display(
|
|
| 903 |
df_headers = benchmark_table_df.columns.tolist()
|
| 904 |
df_datatypes = []
|
| 905 |
for col in df_headers:
|
| 906 |
-
if "Logs"
|
| 907 |
df_datatypes.append("markdown")
|
| 908 |
elif col in ["SDK Version", "Language Model"]:
|
| 909 |
df_datatypes.append("html")
|
|
@@ -956,6 +959,15 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
| 956 |
# Apply the function to the "Logs" column
|
| 957 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 958 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 959 |
if "Source" in pretty_df.columns:
|
| 960 |
def format_source_url_to_html(raw_url):
|
| 961 |
# Handle empty or NaN values, returning a blank string.
|
|
|
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 238 |
+
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 239 |
"""
|
| 240 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 241 |
return f"""
|
|
|
|
| 247 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 248 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 249 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 250 |
+
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 251 |
"""
|
| 252 |
else:
|
| 253 |
# Fallback for any other table type, e.g., individual benchmarks
|
|
|
|
| 258 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 259 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 260 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 261 |
+
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 262 |
"""
|
| 263 |
|
| 264 |
# Create HTML for the "Openness" legend items for table using custom SVG lock icons
|
|
|
|
| 594 |
df_headers = df_display_all.columns.tolist()
|
| 595 |
df_datatypes = []
|
| 596 |
for col in df_headers:
|
| 597 |
+
if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
|
| 598 |
df_datatypes.append("markdown")
|
| 599 |
elif col in ["SDK Version", "Language Model"]:
|
| 600 |
df_datatypes.append("html")
|
|
|
|
| 609 |
if "Score" in col or "Cost" in col:
|
| 610 |
num_score_cost_cols += 1
|
| 611 |
dynamic_widths = [90] * num_score_cost_cols
|
| 612 |
+
fixed_end_widths = [90, 100, 50, 60] # Categories Attempted, Date, Logs, Download
|
| 613 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 614 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 615 |
|
|
|
|
| 906 |
df_headers = benchmark_table_df.columns.tolist()
|
| 907 |
df_datatypes = []
|
| 908 |
for col in df_headers:
|
| 909 |
+
if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
|
| 910 |
df_datatypes.append("markdown")
|
| 911 |
elif col in ["SDK Version", "Language Model"]:
|
| 912 |
df_datatypes.append("html")
|
|
|
|
| 959 |
# Apply the function to the "Logs" column
|
| 960 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 961 |
|
| 962 |
+
if "Download" in pretty_df.columns:
|
| 963 |
+
def format_download_to_html(raw_url):
|
| 964 |
+
# Handle empty or NaN values, returning a blank string.
|
| 965 |
+
if pd.isna(raw_url) or raw_url == "": return ""
|
| 966 |
+
# Create a download link with a download icon
|
| 967 |
+
return f'<a href="{raw_url}" target="_blank" title="Download trajectories">⬇️</a>'
|
| 968 |
+
# Apply the function to the "Download" column
|
| 969 |
+
pretty_df["Download"] = pretty_df["Download"].apply(format_download_to_html)
|
| 970 |
+
|
| 971 |
if "Source" in pretty_df.columns:
|
| 972 |
def format_source_url_to_html(raw_url):
|
| 973 |
# Handle empty or NaN values, returning a blank string.
|