Spaces:
Running
Running
openhands openhands committed on
Commit ·
4d0ae13
1
Parent(s): b5317d7
Move Download column to benchmark-specific tables only
Browse files
- Download column now appears on benchmark-specific leaderboard tables (e.g., SWE-Bench, GAIA)
- Each benchmark has its own download link to its specific trajectory archive
- Removed Download from Overall aggregate table (doesn't make sense there)
- Added download icon explanation to legend on benchmark tables
- Column header shows just ⬇️ icon to save space
Co-authored-by: openhands <openhands@all-hands.dev>
- leaderboard_transformer.py +1 -2
- simple_data_loader.py +1 -9
- ui_components.py +35 -17
leaderboard_transformer.py
CHANGED
|
@@ -166,7 +166,6 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
| 169 |
-
'Download': 'Download',
|
| 170 |
'Openness': 'Openness',
|
| 171 |
'LLM base': 'Model',
|
| 172 |
'Source': 'Source',
|
|
@@ -316,7 +315,7 @@ class DataTransformer:
|
|
| 316 |
# --- 3. Add Columns for Agent Openness ---
|
| 317 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 318 |
new_cols = ["Openness"]
|
| 319 |
-
ending_cols = ["Date", "Logs", "Download"]
|
| 320 |
|
| 321 |
# For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
|
| 322 |
if tag is None or tag == "Overall":
|
|
|
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
|
|
|
| 169 |
'Openness': 'Openness',
|
| 170 |
'LLM base': 'Model',
|
| 171 |
'Source': 'Source',
|
|
|
|
| 315 |
# --- 3. Add Columns for Agent Openness ---
|
| 316 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 317 |
new_cols = ["Openness"]
|
| 318 |
+
ending_cols = ["Date", "Logs"]
|
| 319 |
|
| 320 |
# For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
|
| 321 |
if tag is None or tag == "Overall":
|
simple_data_loader.py
CHANGED
|
@@ -271,9 +271,6 @@ class SimpleLeaderboardViewer:
|
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
category_data = {} # {category: {'scores': [...], 'costs': [...]}}
|
| 273 |
|
| 274 |
-
# Collect all full_archive URLs for this agent
|
| 275 |
-
archive_urls = []
|
| 276 |
-
|
| 277 |
for _, row in agent_records.iterrows():
|
| 278 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
| 279 |
for tag in tags:
|
|
@@ -283,10 +280,9 @@ class SimpleLeaderboardViewer:
|
|
| 283 |
dataset_scores.append(row['score'])
|
| 284 |
dataset_costs.append(row['cost_per_instance'])
|
| 285 |
|
| 286 |
-
# Store the full_archive URL for this benchmark
|
| 287 |
full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
|
| 288 |
if full_archive_url:
|
| 289 |
-
archive_urls.append(full_archive_url)
|
| 290 |
record[f'{tag} download'] = full_archive_url
|
| 291 |
|
| 292 |
# Track category-level data for aggregation
|
|
@@ -330,10 +326,6 @@ class SimpleLeaderboardViewer:
|
|
| 330 |
# Track how many categories were completed
|
| 331 |
record['categories_completed'] = categories_with_scores
|
| 332 |
|
| 333 |
-
# Store all download URLs (for overall view, we'll show the first one or all)
|
| 334 |
-
# Use the first archive URL as the main download link
|
| 335 |
-
record['download'] = archive_urls[0] if archive_urls else ''
|
| 336 |
-
|
| 337 |
transformed_records.append(record)
|
| 338 |
|
| 339 |
transformed_df = pd.DataFrame(transformed_records)
|
|
|
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
category_data = {} # {category: {'scores': [...], 'costs': [...]}}
|
| 273 |
|
|
|
|
|
|
|
|
|
|
| 274 |
for _, row in agent_records.iterrows():
|
| 275 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
| 276 |
for tag in tags:
|
|
|
|
| 280 |
dataset_scores.append(row['score'])
|
| 281 |
dataset_costs.append(row['cost_per_instance'])
|
| 282 |
|
| 283 |
+
# Store the full_archive URL for this benchmark (for benchmark-specific download)
|
| 284 |
full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
|
| 285 |
if full_archive_url:
|
|
|
|
| 286 |
record[f'{tag} download'] = full_archive_url
|
| 287 |
|
| 288 |
# Track category-level data for aggregation
|
|
|
|
| 326 |
# Track how many categories were completed
|
| 327 |
record['categories_completed'] = categories_with_scores
|
| 328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
transformed_records.append(record)
|
| 330 |
|
| 331 |
transformed_df = pd.DataFrame(transformed_records)
|
ui_components.py
CHANGED
|
@@ -235,7 +235,6 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 238 |
-
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 239 |
"""
|
| 240 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 241 |
return f"""
|
|
@@ -286,6 +285,20 @@ def create_legend_markdown(which_table: str) -> str:
|
|
| 286 |
"""
|
| 287 |
descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
|
| 288 |
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
legend_markdown = f"""
|
| 290 |
<div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
|
| 291 |
|
|
@@ -307,6 +320,8 @@ def create_legend_markdown(which_table: str) -> str:
|
|
| 307 |
<div class="table-legend-item">{openness_html}</div>
|
| 308 |
</div>
|
| 309 |
|
|
|
|
|
|
|
| 310 |
<div><!-- Container for the Column Descriptions section -->
|
| 311 |
<b>Column Descriptions</b>
|
| 312 |
<span class="tooltip-icon-legend">
|
|
@@ -594,7 +609,7 @@ def create_leaderboard_display(
|
|
| 594 |
df_headers = df_display_all.columns.tolist()
|
| 595 |
df_datatypes = []
|
| 596 |
for col in df_headers:
|
| 597 |
-
if col
|
| 598 |
df_datatypes.append("markdown")
|
| 599 |
elif col in ["SDK Version", "Language Model"]:
|
| 600 |
df_datatypes.append("html")
|
|
@@ -609,7 +624,7 @@ def create_leaderboard_display(
|
|
| 609 |
if "Score" in col or "Cost" in col:
|
| 610 |
num_score_cost_cols += 1
|
| 611 |
dynamic_widths = [90] * num_score_cost_cols
|
| 612 |
-
fixed_end_widths = [90, 100, 50
|
| 613 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 614 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 615 |
|
|
@@ -792,9 +807,10 @@ def create_benchmark_details_display(
|
|
| 792 |
# 3. Prepare the data for this specific benchmark's table and plot
|
| 793 |
benchmark_score_col = f"{benchmark_name} Score"
|
| 794 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
|
|
|
| 795 |
|
| 796 |
# Define the columns needed for the detailed table
|
| 797 |
-
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
|
| 798 |
|
| 799 |
# Filter to only columns that actually exist in the full dataframe
|
| 800 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -883,6 +899,15 @@ def create_benchmark_details_display(
|
|
| 883 |
# 1. Format the cost and score columns
|
| 884 |
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
|
| 885 |
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 886 |
desired_cols_in_order = [
|
| 887 |
'Language Model',
|
| 888 |
'SDK Version',
|
|
@@ -890,23 +915,25 @@ def create_benchmark_details_display(
|
|
| 890 |
benchmark_score_col,
|
| 891 |
benchmark_cost_col,
|
| 892 |
'Date',
|
| 893 |
-
'Logs'
|
|
|
|
| 894 |
]
|
| 895 |
for col in desired_cols_in_order:
|
| 896 |
if col not in benchmark_table_df.columns:
|
| 897 |
benchmark_table_df[col] = pd.NA # Add as an empty column
|
| 898 |
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
|
| 899 |
# Rename columns for a cleaner table display, as requested
|
| 900 |
-
benchmark_table_df.rename({
|
| 901 |
benchmark_score_col: 'Score',
|
| 902 |
benchmark_cost_col: 'Cost',
|
|
|
|
| 903 |
}, inplace=True)
|
| 904 |
|
| 905 |
# Now get headers from the renamed dataframe
|
| 906 |
df_headers = benchmark_table_df.columns.tolist()
|
| 907 |
df_datatypes = []
|
| 908 |
for col in df_headers:
|
| 909 |
-
if col in ["Logs", "
|
| 910 |
df_datatypes.append("markdown")
|
| 911 |
elif col in ["SDK Version", "Language Model"]:
|
| 912 |
df_datatypes.append("html")
|
|
@@ -930,7 +957,7 @@ def create_benchmark_details_display(
|
|
| 930 |
datatype=df_datatypes,
|
| 931 |
interactive=False,
|
| 932 |
wrap=True,
|
| 933 |
-
column_widths=[
|
| 934 |
show_search="search",
|
| 935 |
elem_classes=["wrap-header-df"]
|
| 936 |
)
|
|
@@ -959,15 +986,6 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
| 959 |
# Apply the function to the "Logs" column
|
| 960 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 961 |
|
| 962 |
-
if "Download" in pretty_df.columns:
|
| 963 |
-
def format_download_to_html(raw_url):
|
| 964 |
-
# Handle empty or NaN values, returning a blank string.
|
| 965 |
-
if pd.isna(raw_url) or raw_url == "": return ""
|
| 966 |
-
# Create a download link with a download icon
|
| 967 |
-
return f'<a href="{raw_url}" target="_blank" title="Download trajectories">⬇️</a>'
|
| 968 |
-
# Apply the function to the "Download" column
|
| 969 |
-
pretty_df["Download"] = pretty_df["Download"].apply(format_download_to_html)
|
| 970 |
-
|
| 971 |
if "Source" in pretty_df.columns:
|
| 972 |
def format_source_url_to_html(raw_url):
|
| 973 |
# Handle empty or NaN values, returning a blank string.
|
|
|
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 238 |
"""
|
| 239 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 240 |
return f"""
|
|
|
|
| 285 |
"""
|
| 286 |
descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
|
| 287 |
trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
|
| 288 |
+
|
| 289 |
+
# Add download section for benchmark-specific tables (not Overall or category pages)
|
| 290 |
+
download_section = ""
|
| 291 |
+
if which_table not in ["Overall", "Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 292 |
+
download_section = """
|
| 293 |
+
<div> <!-- Container for the Download section -->
|
| 294 |
+
<b>Download</b>
|
| 295 |
+
<div class="table-legend-item">
|
| 296 |
+
<span style="font-size: 16px; margin-right: 4px;">⬇️</span>
|
| 297 |
+
<span>Trajectories</span>
|
| 298 |
+
</div>
|
| 299 |
+
</div>
|
| 300 |
+
"""
|
| 301 |
+
|
| 302 |
legend_markdown = f"""
|
| 303 |
<div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
|
| 304 |
|
|
|
|
| 320 |
<div class="table-legend-item">{openness_html}</div>
|
| 321 |
</div>
|
| 322 |
|
| 323 |
+
{download_section}
|
| 324 |
+
|
| 325 |
<div><!-- Container for the Column Descriptions section -->
|
| 326 |
<b>Column Descriptions</b>
|
| 327 |
<span class="tooltip-icon-legend">
|
|
|
|
| 609 |
df_headers = df_display_all.columns.tolist()
|
| 610 |
df_datatypes = []
|
| 611 |
for col in df_headers:
|
| 612 |
+
if col == "Logs" or "Cost" in col or "Score" in col:
|
| 613 |
df_datatypes.append("markdown")
|
| 614 |
elif col in ["SDK Version", "Language Model"]:
|
| 615 |
df_datatypes.append("html")
|
|
|
|
| 624 |
if "Score" in col or "Cost" in col:
|
| 625 |
num_score_cost_cols += 1
|
| 626 |
dynamic_widths = [90] * num_score_cost_cols
|
| 627 |
+
fixed_end_widths = [90, 100, 50] # Categories Attempted, Date, Logs
|
| 628 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 629 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 630 |
|
|
|
|
| 807 |
# 3. Prepare the data for this specific benchmark's table and plot
|
| 808 |
benchmark_score_col = f"{benchmark_name} Score"
|
| 809 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 810 |
+
benchmark_download_col = f"{benchmark_name} Download"
|
| 811 |
|
| 812 |
# Define the columns needed for the detailed table
|
| 813 |
+
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
|
| 814 |
|
| 815 |
# Filter to only columns that actually exist in the full dataframe
|
| 816 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 899 |
# 1. Format the cost and score columns
|
| 900 |
benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
|
| 901 |
benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
|
| 902 |
+
|
| 903 |
+
# Format download column as clickable icon
|
| 904 |
+
if benchmark_download_col in benchmark_table_df.columns:
|
| 905 |
+
def format_download_link(url):
|
| 906 |
+
if pd.isna(url) or url == "":
|
| 907 |
+
return ""
|
| 908 |
+
return f"[⬇️]({url})"
|
| 909 |
+
benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
|
| 910 |
+
|
| 911 |
desired_cols_in_order = [
|
| 912 |
'Language Model',
|
| 913 |
'SDK Version',
|
|
|
|
| 915 |
benchmark_score_col,
|
| 916 |
benchmark_cost_col,
|
| 917 |
'Date',
|
| 918 |
+
'Logs',
|
| 919 |
+
benchmark_download_col
|
| 920 |
]
|
| 921 |
for col in desired_cols_in_order:
|
| 922 |
if col not in benchmark_table_df.columns:
|
| 923 |
benchmark_table_df[col] = pd.NA # Add as an empty column
|
| 924 |
benchmark_table_df = benchmark_table_df[desired_cols_in_order]
|
| 925 |
# Rename columns for a cleaner table display, as requested
|
| 926 |
+
benchmark_table_df.rename(columns={
|
| 927 |
benchmark_score_col: 'Score',
|
| 928 |
benchmark_cost_col: 'Cost',
|
| 929 |
+
benchmark_download_col: '⬇️', # Empty-ish header with icon hint
|
| 930 |
}, inplace=True)
|
| 931 |
|
| 932 |
# Now get headers from the renamed dataframe
|
| 933 |
df_headers = benchmark_table_df.columns.tolist()
|
| 934 |
df_datatypes = []
|
| 935 |
for col in df_headers:
|
| 936 |
+
if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
|
| 937 |
df_datatypes.append("markdown")
|
| 938 |
elif col in ["SDK Version", "Language Model"]:
|
| 939 |
df_datatypes.append("html")
|
|
|
|
| 957 |
datatype=df_datatypes,
|
| 958 |
interactive=False,
|
| 959 |
wrap=True,
|
| 960 |
+
column_widths=[200, 80, 40, 80, 80, 150, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
|
| 961 |
show_search="search",
|
| 962 |
elem_classes=["wrap-header-df"]
|
| 963 |
)
|
|
|
|
| 986 |
# Apply the function to the "Logs" column
|
| 987 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 988 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 989 |
if "Source" in pretty_df.columns:
|
| 990 |
def format_source_url_to_html(raw_url):
|
| 991 |
# Handle empty or NaN values, returning a blank string.
|