Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
b5317d7
1
Parent(s):
1739efc
Add Download column for trajectory archives and increase table font size
Browse files
Changes:
- Add Download column to leaderboard table with download icon (⬇️) linking to trajectory archives
- Preserve full_archive URLs from score data even when pydantic validation is used
- Increase table font size from 14px to 15px for better readability
- Add Download column documentation to tooltip descriptions
Co-authored-by: openhands <openhands@all-hands.dev>
- content.py +1 -1
- leaderboard_transformer.py +2 -1
- simple_data_loader.py +20 -1
- ui_components.py +15 -3
content.py
CHANGED
|
@@ -342,7 +342,7 @@ table.svelte-1e98i6s td {
|
|
| 342 |
vertical-align: top !important;
|
| 343 |
}
|
| 344 |
table.gr-table {
|
| 345 |
-
font-size:
|
| 346 |
}
|
| 347 |
.html-container {
|
| 348 |
padding-top: 0 !important;
|
|
|
|
| 342 |
vertical-align: top !important;
|
| 343 |
}
|
| 344 |
table.gr-table {
|
| 345 |
+
font-size: 15px !important;
|
| 346 |
}
|
| 347 |
.html-container {
|
| 348 |
padding-top: 0 !important;
|
leaderboard_transformer.py
CHANGED
|
@@ -166,6 +166,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
|
|
|
| 169 |
'Openness': 'Openness',
|
| 170 |
'LLM base': 'Model',
|
| 171 |
'Source': 'Source',
|
|
@@ -315,7 +316,7 @@ class DataTransformer:
|
|
| 315 |
# --- 3. Add Columns for Agent Openness ---
|
| 316 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 317 |
new_cols = ["Openness"]
|
| 318 |
-
ending_cols = ["Date", "Logs"]
|
| 319 |
|
| 320 |
# For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
|
| 321 |
if tag is None or tag == "Overall":
|
|
|
|
| 166 |
'Overall cost': 'Average Cost', # Legacy support
|
| 167 |
'categories_completed': 'Categories Completed',
|
| 168 |
'Logs': 'Logs',
|
| 169 |
+
'Download': 'Download',
|
| 170 |
'Openness': 'Openness',
|
| 171 |
'LLM base': 'Model',
|
| 172 |
'Source': 'Source',
|
|
|
|
| 316 |
# --- 3. Add Columns for Agent Openness ---
|
| 317 |
base_cols = ["id","Language Model","SDK Version","Source"]
|
| 318 |
new_cols = ["Openness"]
|
| 319 |
+
ending_cols = ["Date", "Logs", "Download"]
|
| 320 |
|
| 321 |
# For Overall view, use "Average Cost" (average cost per instance across all benchmarks)
|
| 322 |
if tag is None or tag == "Overall":
|
simple_data_loader.py
CHANGED
|
@@ -76,7 +76,12 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
|
|
| 76 |
try:
|
| 77 |
validated_score = ScoreEntry(**score)
|
| 78 |
# Use mode='json' to serialize enums as strings
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
except Exception as e:
|
| 81 |
errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
|
| 82 |
validated_scores.append(score) # Fall back to raw data
|
|
@@ -194,6 +199,7 @@ class SimpleLeaderboardViewer:
|
|
| 194 |
'cost_per_instance': score_entry.get('cost_per_instance'),
|
| 195 |
'average_runtime': score_entry.get('average_runtime'),
|
| 196 |
'tags': [score_entry.get('benchmark')],
|
|
|
|
| 197 |
}
|
| 198 |
all_records.append(record)
|
| 199 |
|
|
@@ -265,6 +271,9 @@ class SimpleLeaderboardViewer:
|
|
| 265 |
# Track category-level data for aggregation
|
| 266 |
category_data = {} # {category: {'scores': [...], 'costs': [...]}}
|
| 267 |
|
|
|
|
|
|
|
|
|
|
| 268 |
for _, row in agent_records.iterrows():
|
| 269 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
| 270 |
for tag in tags:
|
|
@@ -274,6 +283,12 @@ class SimpleLeaderboardViewer:
|
|
| 274 |
dataset_scores.append(row['score'])
|
| 275 |
dataset_costs.append(row['cost_per_instance'])
|
| 276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
# Track category-level data for aggregation
|
| 278 |
if tag in self.benchmark_to_categories:
|
| 279 |
for category in self.benchmark_to_categories[tag]:
|
|
@@ -315,6 +330,10 @@ class SimpleLeaderboardViewer:
|
|
| 315 |
# Track how many categories were completed
|
| 316 |
record['categories_completed'] = categories_with_scores
|
| 317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
transformed_records.append(record)
|
| 319 |
|
| 320 |
transformed_df = pd.DataFrame(transformed_records)
|
|
|
|
| 76 |
try:
|
| 77 |
validated_score = ScoreEntry(**score)
|
| 78 |
# Use mode='json' to serialize enums as strings
|
| 79 |
+
validated_dict = validated_score.model_dump(mode='json')
|
| 80 |
+
# Preserve any extra fields from raw data (like full_archive)
|
| 81 |
+
for key, value in score.items():
|
| 82 |
+
if key not in validated_dict:
|
| 83 |
+
validated_dict[key] = value
|
| 84 |
+
validated_scores.append(validated_dict)
|
| 85 |
except Exception as e:
|
| 86 |
errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
|
| 87 |
validated_scores.append(score) # Fall back to raw data
|
|
|
|
| 199 |
'cost_per_instance': score_entry.get('cost_per_instance'),
|
| 200 |
'average_runtime': score_entry.get('average_runtime'),
|
| 201 |
'tags': [score_entry.get('benchmark')],
|
| 202 |
+
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
| 203 |
}
|
| 204 |
all_records.append(record)
|
| 205 |
|
|
|
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
category_data = {} # {category: {'scores': [...], 'costs': [...]}}
|
| 273 |
|
| 274 |
+
# Collect all full_archive URLs for this agent
|
| 275 |
+
archive_urls = []
|
| 276 |
+
|
| 277 |
for _, row in agent_records.iterrows():
|
| 278 |
tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
|
| 279 |
for tag in tags:
|
|
|
|
| 283 |
dataset_scores.append(row['score'])
|
| 284 |
dataset_costs.append(row['cost_per_instance'])
|
| 285 |
|
| 286 |
+
# Store the full_archive URL for this benchmark
|
| 287 |
+
full_archive_url = row.get('full_archive', '') if hasattr(row, 'get') else row['full_archive'] if 'full_archive' in row.index else ''
|
| 288 |
+
if full_archive_url:
|
| 289 |
+
archive_urls.append(full_archive_url)
|
| 290 |
+
record[f'{tag} download'] = full_archive_url
|
| 291 |
+
|
| 292 |
# Track category-level data for aggregation
|
| 293 |
if tag in self.benchmark_to_categories:
|
| 294 |
for category in self.benchmark_to_categories[tag]:
|
|
|
|
| 330 |
# Track how many categories were completed
|
| 331 |
record['categories_completed'] = categories_with_scores
|
| 332 |
|
| 333 |
+
# Store all download URLs (for overall view, we'll show the first one or all)
|
| 334 |
+
# Use the first archive URL as the main download link
|
| 335 |
+
record['download'] = archive_urls[0] if archive_urls else ''
|
| 336 |
+
|
| 337 |
transformed_records.append(record)
|
| 338 |
|
| 339 |
transformed_df = pd.DataFrame(transformed_records)
|
ui_components.py
CHANGED
|
@@ -235,6 +235,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 238 |
"""
|
| 239 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 240 |
return f"""
|
|
@@ -246,6 +247,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 246 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 247 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 248 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 249 |
"""
|
| 250 |
else:
|
| 251 |
# Fallback for any other table type, e.g., individual benchmarks
|
|
@@ -256,6 +258,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 256 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 257 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 258 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 259 |
"""
|
| 260 |
|
| 261 |
# Create HTML for the "Openness" legend items for table using custom SVG lock icons
|
|
@@ -591,7 +594,7 @@ def create_leaderboard_display(
|
|
| 591 |
df_headers = df_display_all.columns.tolist()
|
| 592 |
df_datatypes = []
|
| 593 |
for col in df_headers:
|
| 594 |
-
if col
|
| 595 |
df_datatypes.append("markdown")
|
| 596 |
elif col in ["SDK Version", "Language Model"]:
|
| 597 |
df_datatypes.append("html")
|
|
@@ -606,7 +609,7 @@ def create_leaderboard_display(
|
|
| 606 |
if "Score" in col or "Cost" in col:
|
| 607 |
num_score_cost_cols += 1
|
| 608 |
dynamic_widths = [90] * num_score_cost_cols
|
| 609 |
-
fixed_end_widths = [90, 100, 50]
|
| 610 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 611 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 612 |
|
|
@@ -903,7 +906,7 @@ def create_benchmark_details_display(
|
|
| 903 |
df_headers = benchmark_table_df.columns.tolist()
|
| 904 |
df_datatypes = []
|
| 905 |
for col in df_headers:
|
| 906 |
-
if "Logs"
|
| 907 |
df_datatypes.append("markdown")
|
| 908 |
elif col in ["SDK Version", "Language Model"]:
|
| 909 |
df_datatypes.append("html")
|
|
@@ -956,6 +959,15 @@ def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
|
|
| 956 |
# Apply the function to the "Logs" column
|
| 957 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 958 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 959 |
if "Source" in pretty_df.columns:
|
| 960 |
def format_source_url_to_html(raw_url):
|
| 961 |
# Handle empty or NaN values, returning a blank string.
|
|
|
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 238 |
+
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 239 |
"""
|
| 240 |
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 241 |
return f"""
|
|
|
|
| 247 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 248 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 249 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 250 |
+
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 251 |
"""
|
| 252 |
else:
|
| 253 |
# Fallback for any other table type, e.g., individual benchmarks
|
|
|
|
| 258 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 259 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 260 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 261 |
+
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 262 |
"""
|
| 263 |
|
| 264 |
# Create HTML for the "Openness" legend items for table using custom SVG lock icons
|
|
|
|
| 594 |
df_headers = df_display_all.columns.tolist()
|
| 595 |
df_datatypes = []
|
| 596 |
for col in df_headers:
|
| 597 |
+
if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
|
| 598 |
df_datatypes.append("markdown")
|
| 599 |
elif col in ["SDK Version", "Language Model"]:
|
| 600 |
df_datatypes.append("html")
|
|
|
|
| 609 |
if "Score" in col or "Cost" in col:
|
| 610 |
num_score_cost_cols += 1
|
| 611 |
dynamic_widths = [90] * num_score_cost_cols
|
| 612 |
+
fixed_end_widths = [90, 100, 50, 60] # Categories Attempted, Date, Logs, Download
|
| 613 |
# 5. Combine all the lists to create the final, fully dynamic list.
|
| 614 |
final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
|
| 615 |
|
|
|
|
| 906 |
df_headers = benchmark_table_df.columns.tolist()
|
| 907 |
df_datatypes = []
|
| 908 |
for col in df_headers:
|
| 909 |
+
if col in ["Logs", "Download"] or "Cost" in col or "Score" in col:
|
| 910 |
df_datatypes.append("markdown")
|
| 911 |
elif col in ["SDK Version", "Language Model"]:
|
| 912 |
df_datatypes.append("html")
|
|
|
|
| 959 |
# Apply the function to the "Logs" column
|
| 960 |
pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html)
|
| 961 |
|
| 962 |
+
if "Download" in pretty_df.columns:
|
| 963 |
+
def format_download_to_html(raw_url):
|
| 964 |
+
# Handle empty or NaN values, returning a blank string.
|
| 965 |
+
if pd.isna(raw_url) or raw_url == "": return ""
|
| 966 |
+
# Create a download link with a download icon
|
| 967 |
+
return f'<a href="{raw_url}" target="_blank" title="Download trajectories">⬇️</a>'
|
| 968 |
+
# Apply the function to the "Download" column
|
| 969 |
+
pretty_df["Download"] = pretty_df["Download"].apply(format_download_to_html)
|
| 970 |
+
|
| 971 |
if "Source" in pretty_df.columns:
|
| 972 |
def format_source_url_to_html(raw_url):
|
| 973 |
# Handle empty or NaN values, returning a blank string.
|