Spaces:

OpenHands
/

openhands-index

Running

openhands openhands committed on Jan 22

Commit

4d0ae13

1 Parent(s): b5317d7

Move Download column to benchmark-specific tables only

- Download column now appears on benchmark-specific leaderboard tables (e.g., SWE-Bench, GAIA)
- Each benchmark has its own download link to its specific trajectory archive
- Removed Download from the Overall aggregate table (it spans multiple benchmarks, so no single trajectory archive applies there)
- Added download icon explanation to legend on benchmark tables
- Column header shows just the ⬇️ icon to save space

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (3) hide show

leaderboard_transformer.py +1 -2
simple_data_loader.py +1 -9
ui_components.py +35 -17

leaderboard_transformer.py CHANGED Viewed

@@ -166,7 +166,6 @@ def _pretty_column_name(raw_col: str) -> str:
         'Overall cost': 'Average Cost',  # Legacy support
         'categories_completed': 'Categories Completed',
         'Logs': 'Logs',
-        'Download': 'Download',
         'Openness': 'Openness',
         'LLM base': 'Model',
         'Source': 'Source',
@@ -316,7 +315,7 @@ class DataTransformer:
         # --- 3. Add Columns for Agent Openness ---
         base_cols = ["id","Language Model","SDK Version","Source"]
         new_cols = ["Openness"]
-        ending_cols = ["Date", "Logs", "Download"]
         # For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
         if tag is None or tag == "Overall":

         'Overall cost': 'Average Cost',  # Legacy support
         'categories_completed': 'Categories Completed',
         'Logs': 'Logs',
         'Openness': 'Openness',
         'LLM base': 'Model',
         'Source': 'Source',
         # --- 3. Add Columns for Agent Openness ---
         base_cols = ["id","Language Model","SDK Version","Source"]
         new_cols = ["Openness"]
+        ending_cols = ["Date", "Logs"]
         # For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
         if tag is None or tag == "Overall":

simple_data_loader.py CHANGED Viewed

@@ -271,9 +271,6 @@ class SimpleLeaderboardViewer:
                 # Track category-level data for aggregation
                 category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
-                # Collect all full_archive URLs for this agent
-                archive_urls = []
                 for _, row in agent_records.iterrows():
                     tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                     for tag in tags:
@@ -283,10 +280,9 @@ class SimpleLeaderboardViewer:
                         dataset_scores.append(row['score'])
                         dataset_costs.append(row['cost_per_instance'])
-                        # Store the full_archive URL for this benchmark
                         full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
                         if full_archive_url:
-                            archive_urls.append(full_archive_url)
                             record[f'{tag} download'] = full_archive_url
                         # Track category-level data for aggregation
@@ -330,10 +326,6 @@ class SimpleLeaderboardViewer:
                 # Track how many categories were completed
                 record['categories_completed'] = categories_with_scores
-                # Store all download URLs (for overall view, we'll show the first one or all)
-                # Use the first archive URL as the main download link
-                record['download'] = archive_urls[0] if archive_urls else ''
                 transformed_records.append(record)
             transformed_df = pd.DataFrame(transformed_records)

                 # Track category-level data for aggregation
                 category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
                 for _, row in agent_records.iterrows():
                     tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                     for tag in tags:
                         dataset_scores.append(row['score'])
                         dataset_costs.append(row['cost_per_instance'])
+                        # Store the full_archive URL for this benchmark (for benchmark-specific download)
                         full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
                         if full_archive_url:
                             record[f'{tag} download'] = full_archive_url
                         # Track category-level data for aggregation
                 # Track how many categories were completed
                 record['categories_completed'] = categories_with_scores
                 transformed_records.append(record)
             transformed_df = pd.DataFrame(transformed_records)

ui_components.py CHANGED Viewed

@@ -235,7 +235,6 @@ def build_descriptions_tooltip_content(table) -> str:
             <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
             <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
-            <div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
         """
     elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
         return f"""
@@ -286,6 +285,20 @@ def create_legend_markdown(which_table: str) -> str:
     """
     descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
     trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
     legend_markdown = f"""
     <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
@@ -307,6 +320,8 @@ def create_legend_markdown(which_table: str) -> str:
             <div class="table-legend-item">{openness_html}</div>
         </div>
         <div><!-- Container for the Column Descriptions section -->
             <b>Column Descriptions</b>
             <span class="tooltip-icon-legend">
@@ -594,7 +609,7 @@ def create_leaderboard_display(
     df_headers = df_display_all.columns.tolist()
     df_datatypes = []
     for col in df_headers:
-        if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
         elif col in ["SDK Version", "Language Model"]:
             df_datatypes.append("html")
@@ -609,7 +624,7 @@ def create_leaderboard_display(
         if "Score" in col or "Cost" in col:
             num_score_cost_cols += 1
     dynamic_widths = [90] * num_score_cost_cols
-    fixed_end_widths = [90, 100, 50, 60]  # Categories Attempted, Date, Logs, Download
     # 5. Combine all the lists to create the final, fully dynamic list.
     final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
@@ -792,9 +807,10 @@ def create_benchmark_details_display(
         # 3. Prepare the data for this specific benchmark's table and plot
         benchmark_score_col = f"{benchmark_name} Score"
         benchmark_cost_col = f"{benchmark_name} Cost"
         # Define the columns needed for the detailed table
-        table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
         # Filter to only columns that actually exist in the full dataframe
         existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -883,6 +899,15 @@ def create_benchmark_details_display(
         # 1. Format the cost and score columns
         benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
         benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
         desired_cols_in_order = [
             'Language Model',
             'SDK Version',
@@ -890,23 +915,25 @@ def create_benchmark_details_display(
             benchmark_score_col,
             benchmark_cost_col,
             'Date',
-            'Logs'
         ]
         for col in desired_cols_in_order:
             if col not in benchmark_table_df.columns:
                 benchmark_table_df[col] = pd.NA # Add as an empty column
         benchmark_table_df = benchmark_table_df[desired_cols_in_order]
         # Rename columns for a cleaner table display, as requested
-        benchmark_table_df.rename({
             benchmark_score_col: 'Score',
             benchmark_cost_col: 'Cost',
         }, inplace=True)
         # Now get headers from the renamed dataframe
         df_headers = benchmark_table_df.columns.tolist()
         df_datatypes = []
         for col in df_headers:
-            if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
                 df_datatypes.append("markdown")
             elif col in ["SDK Version", "Language Model"]:
                 df_datatypes.append("html")
@@ -930,7 +957,7 @@ def create_benchmark_details_display(
                 datatype=df_datatypes,
                 interactive=False,
                 wrap=True,
-                column_widths=[40, 40, 40, 200, 150, 175, 85, 100, 100, 80, 40],
                 show_search="search",
                 elem_classes=["wrap-header-df"]
             )
@@ -959,15 +986,6 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
             # Apply the function to the "Logs" column
             pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
-        if "Download" in pretty_df.columns:
-            def format_download_to_html(raw_url):
-                # Handle empty or NaN values, returning a blank string.
-                if pd.isna(raw_url) or raw_url == "": return ""
-                # Create a download link with a download icon
-                return f'<a href="{raw_url}" target="_blank" title="Download trajectories">⬇️</a>'
-            # Apply the function to the "Download" column
-            pretty_df["Download"] = pretty_df["Download"].apply(format_download_to_html)
         if "Source" in pretty_df.columns:
             def format_source_url_to_html(raw_url):
                 # Handle empty or NaN values, returning a blank string.

             <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
             <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
         """
     elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
         return f"""
     """
     descriptions_tooltip_content = build_descriptions_tooltip_content(which_table)
     trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
+    # Add download section for benchmark-specific tables (not Overall or category pages)
+    download_section = ""
+    if which_table not in ["Overall", "Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
+        download_section = """
+        <div> <!-- Container for the Download section -->
+            <b>Download</b>
+            <div class="table-legend-item">
+                <span style="font-size: 16px; margin-right: 4px;">⬇️</span>
+                <span>Trajectories</span>
+            </div>
+        </div>
+        """
     legend_markdown = f"""
     <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
             <div class="table-legend-item">{openness_html}</div>
         </div>
+        {download_section}
         <div><!-- Container for the Column Descriptions section -->
             <b>Column Descriptions</b>
             <span class="tooltip-icon-legend">
     df_headers = df_display_all.columns.tolist()
     df_datatypes = []
     for col in df_headers:
+        if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
         elif col in ["SDK Version", "Language Model"]:
             df_datatypes.append("html")
         if "Score" in col or "Cost" in col:
             num_score_cost_cols += 1
     dynamic_widths = [90] * num_score_cost_cols
+    fixed_end_widths = [90, 100, 50]  # Categories Attempted, Date, Logs
     # 5. Combine all the lists to create the final, fully dynamic list.
     final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
         # 3. Prepare the data for this specific benchmark's table and plot
         benchmark_score_col = f"{benchmark_name} Score"
         benchmark_cost_col = f"{benchmark_name} Cost"
+        benchmark_download_col = f"{benchmark_name} Download"
         # Define the columns needed for the detailed table
+        table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
         # Filter to only columns that actually exist in the full dataframe
         existing_table_cols = [col for col in table_cols if col in full_df.columns]
         # 1. Format the cost and score columns
         benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
         benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
+        # Format download column as clickable icon
+        if benchmark_download_col in benchmark_table_df.columns:
+            def format_download_link(url):
+                if pd.isna(url) or url == "":
+                    return ""
+                return f"[⬇️]({url})"
+            benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
         desired_cols_in_order = [
             'Language Model',
             'SDK Version',
             benchmark_score_col,
             benchmark_cost_col,
             'Date',
+            'Logs',
+            benchmark_download_col
         ]
         for col in desired_cols_in_order:
             if col not in benchmark_table_df.columns:
                 benchmark_table_df[col] = pd.NA # Add as an empty column
         benchmark_table_df = benchmark_table_df[desired_cols_in_order]
         # Rename columns for a cleaner table display, as requested
+        benchmark_table_df.rename(columns={
             benchmark_score_col: 'Score',
             benchmark_cost_col: 'Cost',
+            benchmark_download_col: '⬇️',  # Empty-ish header with icon hint
         }, inplace=True)
         # Now get headers from the renamed dataframe
         df_headers = benchmark_table_df.columns.tolist()
         df_datatypes = []
         for col in df_headers:
+            if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
                 df_datatypes.append("markdown")
             elif col in ["SDK Version", "Language Model"]:
                 df_datatypes.append("html")
                 datatype=df_datatypes,
                 interactive=False,
                 wrap=True,
+                column_widths=[200, 80, 40, 80, 80, 150, 40, 40],  # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
                 show_search="search",
                 elem_classes=["wrap-header-df"]
             )
             # Apply the function to the "Logs" column
             pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
         if "Source" in pretty_df.columns:
             def format_source_url_to_html(raw_url):
                 # Handle empty or NaN values, returning a blank string.