Spaces:
Running
Running
openhands openhands committed on
Commit Β·
dfa8bfc
1
Parent(s): 0f8031a
feat: Add Visualization column for Laminar eval links
Browse files
Display eval_visualization_page URLs as clickable 📊 icons in a new
Visualization column next to Logs.
Changes:
- Load eval_visualization_page from score entries in simple_data_loader.py
- Store as '{benchmark} visualization' column similar to download URLs
- Add 📊 column to benchmark detail tables with clickable links
- Only show icon if visualization URL exists, otherwise blank
- Add tooltip documentation for the new column
Co-authored-by: openhands <openhands@all-hands.dev>
- simple_data_loader.py +6 -0
- ui_components.py +17 -4
simple_data_loader.py
CHANGED
|
@@ -172,6 +172,7 @@ class SimpleLeaderboardViewer:
|
|
| 172 |
'average_runtime': score_entry.get('average_runtime'),
|
| 173 |
'tags': [score_entry.get('benchmark')],
|
| 174 |
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
|
|
|
| 175 |
}
|
| 176 |
all_records.append(record)
|
| 177 |
|
|
@@ -262,6 +263,11 @@ class SimpleLeaderboardViewer:
|
|
| 262 |
if full_archive_url:
|
| 263 |
record[f'{tag} download'] = full_archive_url
|
| 264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
# Track category-level data for aggregation
|
| 266 |
if tag in self.benchmark_to_categories:
|
| 267 |
for category in self.benchmark_to_categories[tag]:
|
|
|
|
| 172 |
'average_runtime': score_entry.get('average_runtime'),
|
| 173 |
'tags': [score_entry.get('benchmark')],
|
| 174 |
'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
|
| 175 |
+
'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
|
| 176 |
}
|
| 177 |
all_records.append(record)
|
| 178 |
|
|
|
|
| 263 |
if full_archive_url:
|
| 264 |
record[f'{tag} download'] = full_archive_url
|
| 265 |
|
| 266 |
+
# Store the eval_visualization_page URL for this benchmark (for Laminar visualization)
|
| 267 |
+
viz_url = row.get('eval_visualization_page', '') if hasattr(row, 'get') else row['eval_visualization_page'] if 'eval_visualization_page' in row.index else ''
|
| 268 |
+
if viz_url:
|
| 269 |
+
record[f'{tag} visualization'] = viz_url
|
| 270 |
+
|
| 271 |
# Track category-level data for aggregation
|
| 272 |
if tag in self.benchmark_to_categories:
|
| 273 |
for category in self.benchmark_to_categories[tag]:
|
ui_components.py
CHANGED
|
@@ -181,6 +181,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 181 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 182 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 183 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 184 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 185 |
"""
|
| 186 |
else:
|
|
@@ -192,6 +193,7 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 192 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 193 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 194 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
|
|
|
| 195 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 196 |
"""
|
| 197 |
|
|
@@ -1073,9 +1075,10 @@ def create_benchmark_details_display(
|
|
| 1073 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 1074 |
benchmark_runtime_col = f"{benchmark_name} Runtime"
|
| 1075 |
benchmark_download_col = f"{benchmark_name} Download"
|
|
|
|
| 1076 |
|
| 1077 |
# Define the columns needed for the detailed table
|
| 1078 |
-
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_download_col, 'id', 'Language Model']
|
| 1079 |
|
| 1080 |
# Filter to only columns that actually exist in the full dataframe
|
| 1081 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
@@ -1173,6 +1176,14 @@ def create_benchmark_details_display(
|
|
| 1173 |
return f"[⬇️]({url})"
|
| 1174 |
benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
|
| 1175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1176 |
desired_cols_in_order = [
|
| 1177 |
'Language Model',
|
| 1178 |
'SDK Version',
|
|
@@ -1182,6 +1193,7 @@ def create_benchmark_details_display(
|
|
| 1182 |
benchmark_runtime_col,
|
| 1183 |
'Date',
|
| 1184 |
'Logs',
|
|
|
|
| 1185 |
benchmark_download_col
|
| 1186 |
]
|
| 1187 |
for col in desired_cols_in_order:
|
|
@@ -1202,14 +1214,15 @@ def create_benchmark_details_display(
|
|
| 1202 |
benchmark_score_col: 'Score',
|
| 1203 |
benchmark_cost_col: 'Cost',
|
| 1204 |
benchmark_runtime_col: 'Runtime',
|
| 1205 |
-
|
|
|
|
| 1206 |
}, inplace=True)
|
| 1207 |
|
| 1208 |
# Now get headers from the renamed dataframe
|
| 1209 |
df_headers = benchmark_table_df.columns.tolist()
|
| 1210 |
df_datatypes = []
|
| 1211 |
for col in df_headers:
|
| 1212 |
-
if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 1213 |
df_datatypes.append("markdown")
|
| 1214 |
elif col in ["SDK Version", "Language Model"]:
|
| 1215 |
df_datatypes.append("html")
|
|
@@ -1247,7 +1260,7 @@ def create_benchmark_details_display(
|
|
| 1247 |
datatype=df_datatypes,
|
| 1248 |
interactive=False,
|
| 1249 |
wrap=True,
|
| 1250 |
-
column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Download
|
| 1251 |
show_search="search",
|
| 1252 |
elem_classes=["wrap-header-df"]
|
| 1253 |
)
|
|
|
|
| 181 |
<div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
|
| 182 |
<div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
|
| 183 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 184 |
+
<div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div>
|
| 185 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 186 |
"""
|
| 187 |
else:
|
|
|
|
| 193 |
<div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
|
| 194 |
<div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
|
| 195 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 196 |
+
<div class="tooltip-description-item"><b>📊 Visualization:</b> View interactive evaluation visualization (when available).</div>
|
| 197 |
<div class="tooltip-description-item"><b>Download:</b> Download evaluation trajectories archive.</div>
|
| 198 |
"""
|
| 199 |
|
|
|
|
| 1075 |
benchmark_cost_col = f"{benchmark_name} Cost"
|
| 1076 |
benchmark_runtime_col = f"{benchmark_name} Runtime"
|
| 1077 |
benchmark_download_col = f"{benchmark_name} Download"
|
| 1078 |
+
benchmark_visualization_col = f"{benchmark_name} Visualization"
|
| 1079 |
|
| 1080 |
# Define the columns needed for the detailed table
|
| 1081 |
+
table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_visualization_col, benchmark_download_col, 'id', 'Language Model']
|
| 1082 |
|
| 1083 |
# Filter to only columns that actually exist in the full dataframe
|
| 1084 |
existing_table_cols = [col for col in table_cols if col in full_df.columns]
|
|
|
|
| 1176 |
return f"[⬇️]({url})"
|
| 1177 |
benchmark_table_df[benchmark_download_col] = benchmark_table_df[benchmark_download_col].apply(format_download_link)
|
| 1178 |
|
| 1179 |
+
# Format visualization column as clickable icon (bar chart emoji)
|
| 1180 |
+
if benchmark_visualization_col in benchmark_table_df.columns:
|
| 1181 |
+
def format_visualization_link(url):
|
| 1182 |
+
if pd.isna(url) or url == "":
|
| 1183 |
+
return ""
|
| 1184 |
+
return f"[📊]({url})"
|
| 1185 |
+
benchmark_table_df[benchmark_visualization_col] = benchmark_table_df[benchmark_visualization_col].apply(format_visualization_link)
|
| 1186 |
+
|
| 1187 |
desired_cols_in_order = [
|
| 1188 |
'Language Model',
|
| 1189 |
'SDK Version',
|
|
|
|
| 1193 |
benchmark_runtime_col,
|
| 1194 |
'Date',
|
| 1195 |
'Logs',
|
| 1196 |
+
benchmark_visualization_col,
|
| 1197 |
benchmark_download_col
|
| 1198 |
]
|
| 1199 |
for col in desired_cols_in_order:
|
|
|
|
| 1214 |
benchmark_score_col: 'Score',
|
| 1215 |
benchmark_cost_col: 'Cost',
|
| 1216 |
benchmark_runtime_col: 'Runtime',
|
| 1217 |
+
benchmark_visualization_col: '📊', # Visualization column header
|
| 1218 |
+
benchmark_download_col: '⬇️', # Download column header
|
| 1219 |
}, inplace=True)
|
| 1220 |
|
| 1221 |
# Now get headers from the renamed dataframe
|
| 1222 |
df_headers = benchmark_table_df.columns.tolist()
|
| 1223 |
df_datatypes = []
|
| 1224 |
for col in df_headers:
|
| 1225 |
+
if col in ["Logs", "📊", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
|
| 1226 |
df_datatypes.append("markdown")
|
| 1227 |
elif col in ["SDK Version", "Language Model"]:
|
| 1228 |
df_datatypes.append("html")
|
|
|
|
| 1260 |
datatype=df_datatypes,
|
| 1261 |
interactive=False,
|
| 1262 |
wrap=True,
|
| 1263 |
+
column_widths=[200, 80, 40, 80, 80, 80, 100, 120, 40, 40], # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Visualization, Download
|
| 1264 |
show_search="search",
|
| 1265 |
elem_classes=["wrap-header-df"]
|
| 1266 |
)
|