Spaces:

OpenHands
/

openhands-index

Running

openhands openhands committed on Jan 26

Commit

2854ddd

1 Parent(s): 16307c3

Add runtime column and Cost/Performance + Runtime/Performance charts to all pages

- Add runtime tracking to data loader (simple_data_loader.py)
- Add format_runtime_column function (leaderboard_transformer.py)
- Update _plot_scatter_plotly with plot_type parameter ('cost' or 'runtime')
- Update chart titles to 'OpenHands Index XXX Cost/Performance' and 'OpenHands Index XXX Runtime/Performance'
- Add Runtime column to main leaderboard and benchmark details tables
- Display both cost/performance and runtime/performance scatter plots on all pages

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (3) hide show

leaderboard_transformer.py +66 -10
simple_data_loader.py +15 -3
ui_components.py +97 -41

leaderboard_transformer.py CHANGED Viewed

@@ -164,6 +164,7 @@ def _pretty_column_name(raw_col: str) -> str:
         'average cost': 'Average Cost',
         'total cost': 'Average Cost',  # Legacy support
         'Overall cost': 'Average Cost',  # Legacy support
         'categories_completed': 'Categories Completed',
         'Logs': 'Logs',
         'Openness': 'Openness',
@@ -395,7 +396,8 @@ def _plot_scatter_plotly(
         x: Optional[str],
         y: str,
         agent_col: str = 'Agent',
-        name: Optional[str] = None
 ) -> go.Figure:
     # --- Section 1: Define Mappings ---
@@ -427,7 +429,11 @@ def _plot_scatter_plotly(
     data_plot = data.copy()
     data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
-    x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
     max_reported_cost = 0
     divider_line_x = 0
@@ -500,12 +506,12 @@ def _plot_scatter_plotly(
             ))
     # --- Section 5: Prepare for Marker Plotting ---
-    def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
         """
         Builds the complete HTML string for the plot's hover tooltip.
         Format: {lm_name} (SDK {version})
                 Average Score: {score}
-                Average Cost: {cost}
                 Openness: {openness}
         """
         h_pad = "   "
@@ -528,11 +534,17 @@ def _plot_scatter_plotly(
         # Average Score
         parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
-        # Average Cost
-        if divider_line_x > 0 and row[x_col] >= divider_line_x:
-            parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
         else:
-            parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
         # Openness
         parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
@@ -548,7 +560,8 @@ def _plot_scatter_plotly(
             x_axis_label=x_axis_label,
             x_col=x_col_to_use,
             y_col=y_col_to_use,
-            divider_line_x=divider_line_x
         ),
         axis=1
     )
@@ -695,10 +708,16 @@ def _plot_scatter_plotly(
         range=[x_min_log, x_max_log]  # Match domain coordinate calculation
     )
     # Build layout configuration
     layout_config = dict(
         template="plotly_white",
-        title=f"OpenHands Index {name} Leaderboard",
         xaxis=xaxis_config,
         yaxis=dict(title="Average (mean) score", range=[y_min, y_max]),  # Match domain calculation
         legend=dict(
@@ -789,6 +808,43 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
     return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
 def get_pareto_df(data, cost_col=None, score_col=None):
     """
     Calculate the Pareto frontier for the given data.

         'average cost': 'Average Cost',
         'total cost': 'Average Cost',  # Legacy support
         'Overall cost': 'Average Cost',  # Legacy support
+        'average runtime': 'Average Runtime',
         'categories_completed': 'Categories Completed',
         'Logs': 'Logs',
         'Openness': 'Openness',
         x: Optional[str],
         y: str,
         agent_col: str = 'Agent',
+        name: Optional[str] = None,
+        plot_type: str = 'cost'  # 'cost' or 'runtime'
 ) -> go.Figure:
     # --- Section 1: Define Mappings ---
     data_plot = data.copy()
     data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce')
+    # Set axis labels based on plot type
+    if plot_type == 'runtime':
+        x_axis_label = f"Average (mean) runtime per problem (seconds)" if x else "Runtime (Data N/A)"
+    else:
+        x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)"
     max_reported_cost = 0
     divider_line_x = 0
             ))
     # --- Section 5: Prepare for Marker Plotting ---
+    def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x, is_runtime=False):
         """
         Builds the complete HTML string for the plot's hover tooltip.
         Format: {lm_name} (SDK {version})
                 Average Score: {score}
+                Average Cost/Runtime: {value}
                 Openness: {openness}
         """
         h_pad = "   "
         # Average Score
         parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
+        # Average Cost or Runtime
+        if is_runtime:
+            if divider_line_x > 0 and row[x_col] >= divider_line_x:
+                parts.append(f"{h_pad}Average Runtime: <b>Missing</b>{h_pad}<br>")
+            else:
+                parts.append(f"{h_pad}Average Runtime: <b>{row[x_col]:.0f}s</b>{h_pad}<br>")
         else:
+            if divider_line_x > 0 and row[x_col] >= divider_line_x:
+                parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
+            else:
+                parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
         # Openness
         parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
             x_axis_label=x_axis_label,
             x_col=x_col_to_use,
             y_col=y_col_to_use,
+            divider_line_x=divider_line_x,
+            is_runtime=(plot_type == 'runtime')
         ),
         axis=1
     )
         range=[x_min_log, x_max_log]  # Match domain coordinate calculation
     )
+    # Set title based on plot type
+    if plot_type == 'runtime':
+        plot_title = f"OpenHands Index {name} Runtime/Performance"
+    else:
+        plot_title = f"OpenHands Index {name} Cost/Performance"
     # Build layout configuration
     layout_config = dict(
         template="plotly_white",
+        title=plot_title,
         xaxis=xaxis_config,
         yaxis=dict(title="Average (mean) score", range=[y_min, y_max]),  # Match domain calculation
         legend=dict(
     return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
def format_runtime_column(df: pd.DataFrame, runtime_col_name: str) -> pd.DataFrame:
    """
    Format a runtime column for display, using its matching score column for status.

    For each row:
    - If the runtime is a real number and not null, it is rendered as whole
      seconds with an 's' suffix (e.g. ``12.6`` -> ``"13s"``).
    - If the runtime is unavailable but the score is not null, the cell becomes
      a colored "Missing" marker.
    - If both runtime and score are null, the cell becomes a colored
      "Not Submitted" marker.

    The input DataFrame is not modified; a new DataFrame is returned, in the
    same way ``format_score_column`` returns ``df.assign(...)``.

    Args:
        df: The DataFrame containing the runtime column and its score column.
        runtime_col_name: The name of the runtime column to format
            (e.g., "Average Runtime").
    Returns:
        A new DataFrame with the runtime column formatted, or ``df`` itself,
        unchanged, when no matching score column exists.
    Raises:
        KeyError: If the score column exists but ``runtime_col_name`` does not.
    """
    # Local import so this function does not depend on the file's import block.
    import numbers

    # Find the corresponding score column by replacing "Runtime" with "Score".
    score_col_name = runtime_col_name.replace("Runtime", "Score")
    # Without a matching score column there is nothing to derive a status from.
    if score_col_name not in df.columns:
        return df

    status_color = "#ec4899"
    missing_html = f'<span style="color: {status_color};">Missing</span>'
    not_submitted_html = f'<span style="color: {status_color};">Not Submitted</span>'

    def _format_cell(runtime_value, score_value):
        # numbers.Real also accepts NumPy integer/float scalars (np.int64 is
        # not a subclass of int), so integer runtimes are not shown as Missing.
        # The type check runs first so pd.notna only ever sees a scalar here.
        if isinstance(runtime_value, numbers.Real) and pd.notna(runtime_value):
            return f"{runtime_value:.0f}s"
        # NOTE(review): if the score column was already formatted into strings
        # before this call, pd.notna() is True for every row and "Not Submitted"
        # can never be produced here -- confirm the intended call order.
        if pd.notna(score_value):
            return missing_html  # Score exists, but runtime is missing
        return not_submitted_html  # Neither score nor runtime exists

    # Iterating the two columns directly (instead of df.apply(axis=1)) keeps
    # the result a flat list even for an empty frame, where apply() would hand
    # back a DataFrame that cannot be assigned to a single column.
    formatted = [
        _format_cell(runtime_value, score_value)
        for runtime_value, score_value in zip(df[runtime_col_name], df[score_col_name])
    ]
    formatted_column = pd.Series(formatted, index=df.index, dtype=object)
    return df.assign(**{runtime_col_name: formatted_column})
 def get_pareto_df(data, cost_col=None, score_col=None):
     """
     Calculate the Pareto frontier for the given data.

simple_data_loader.py CHANGED Viewed

@@ -269,7 +269,7 @@ class SimpleLeaderboardViewer:
                 dataset_costs = []
                 # Track category-level data for aggregation
-                category_data = {}  # {category: {'scores': [...], 'costs': [...]}}
                 for _, row in agent_records.iterrows():
                     tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
@@ -277,6 +277,7 @@ class SimpleLeaderboardViewer:
                         # Add columns for this specific dataset/benchmark
                         record[f'{tag} score'] = row['score']
                         record[f'{tag} cost'] = row['cost_per_instance']
                         dataset_scores.append(row['score'])
                         dataset_costs.append(row['cost_per_instance'])
@@ -289,12 +290,14 @@ class SimpleLeaderboardViewer:
                         if tag in self.benchmark_to_categories:
                             for category in self.benchmark_to_categories[tag]:
                                 if category not in category_data:
-                                    category_data[category] = {'scores': [], 'costs': []}
                                 category_data[category]['scores'].append(row['score'])
                                 category_data[category]['costs'].append(row['cost_per_instance'])
-                # Calculate category-level aggregates and track average cost
                 all_costs = []
                 categories_with_scores = 0
                 for category in ALL_CATEGORIES:
                     if category in category_data and category_data[category]['scores']:
@@ -308,6 +311,12 @@ class SimpleLeaderboardViewer:
                                 avg_cost = sum(valid_costs) / len(valid_costs)
                                 record[f'{category} cost'] = avg_cost
                                 all_costs.extend(valid_costs)
                     else:
                         # Category not submitted - will show as NA
                         pass
@@ -323,6 +332,9 @@ class SimpleLeaderboardViewer:
                 # Average cost per instance across all benchmarks
                 record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None
                 # Track how many categories were completed
                 record['categories_completed'] = categories_with_scores

                 dataset_costs = []
                 # Track category-level data for aggregation
+                category_data = {}  # {category: {'scores': [...], 'costs': [], 'runtimes': []}}
                 for _, row in agent_records.iterrows():
                     tags = row['tags'] if isinstance(row['tags'], list) else [row['tags']]
                         # Add columns for this specific dataset/benchmark
                         record[f'{tag} score'] = row['score']
                         record[f'{tag} cost'] = row['cost_per_instance']
+                        record[f'{tag} runtime'] = row.get('average_runtime')
                         dataset_scores.append(row['score'])
                         dataset_costs.append(row['cost_per_instance'])
                         if tag in self.benchmark_to_categories:
                             for category in self.benchmark_to_categories[tag]:
                                 if category not in category_data:
+                                    category_data[category] = {'scores': [], 'costs': [], 'runtimes': []}
                                 category_data[category]['scores'].append(row['score'])
                                 category_data[category]['costs'].append(row['cost_per_instance'])
+                                category_data[category]['runtimes'].append(row.get('average_runtime'))
+                # Calculate category-level aggregates and track average cost/runtime
                 all_costs = []
+                all_runtimes = []
                 categories_with_scores = 0
                 for category in ALL_CATEGORIES:
                     if category in category_data and category_data[category]['scores']:
                                 avg_cost = sum(valid_costs) / len(valid_costs)
                                 record[f'{category} cost'] = avg_cost
                                 all_costs.extend(valid_costs)
+                        if data['runtimes']:
+                            valid_runtimes = [r for r in data['runtimes'] if r is not None]
+                            if valid_runtimes:
+                                avg_runtime = sum(valid_runtimes) / len(valid_runtimes)
+                                record[f'{category} runtime'] = avg_runtime
+                                all_runtimes.extend(valid_runtimes)
                     else:
                         # Category not submitted - will show as NA
                         pass
                 # Average cost per instance across all benchmarks
                 record['average cost'] = sum(all_costs) / len(all_costs) if all_costs else None
+                # Average runtime per instance across all benchmarks
+                record['average runtime'] = sum(all_runtimes) / len(all_runtimes) if all_runtimes else None
                 # Track how many categories were completed
                 record['categories_completed'] = categories_with_scores

ui_components.py CHANGED Viewed

@@ -17,6 +17,7 @@ from leaderboard_transformer import (
     _plot_scatter_plotly,
     format_cost_column,
     format_score_column,
     get_pareto_df,
     clean_llm_base_list,
 )
@@ -515,6 +516,10 @@ def create_leaderboard_display(
             if "Score" in col:
                 df_display = format_score_column(df_display, col)
         # Clean the Language Model column first
         df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
@@ -583,47 +588,65 @@ def create_leaderboard_display(
     # If no complete entries exist, show all entries by default
     has_complete_entries = len(df_display_complete) > 0
-    # Determine primary score/cost columns for scatter plot
     if category_name == "Overall":
         primary_score_col = "Average Score"
         primary_cost_col = "Average Cost"
     else:
         primary_score_col = f"{category_name} Score"
         primary_cost_col = f"{category_name} Cost"
-    # Function to create scatter plot from data
-    def create_scatter_plot(df_data):
         return _plot_scatter_plotly(
             data=df_data,
             x=primary_cost_col if primary_cost_col in df_data.columns else None,
             y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
             agent_col="SDK Version",
-            name=category_name
         )
-    # Create initial scatter plots for both complete and all data
-    scatter_plot_complete = create_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
-    scatter_plot_all = create_scatter_plot(df_view_full)
     # Now get headers from the renamed dataframe (use all entries to ensure headers are present)
     df_headers = df_display_all.columns.tolist()
     df_datatypes = []
     for col in df_headers:
-        if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
         elif col in ["SDK Version", "Language Model"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
     # Dynamically set widths for the DataFrame columns
-    # Order: Language Model, SDK Version, Average Score, Average Cost, ...
-    fixed_start_widths = [280, 100, 100]  # Language Model (with icons), SDK Version, Average Score
-    num_score_cost_cols = 0
     remaining_headers = df_headers[len(fixed_start_widths):]
     for col in remaining_headers:
-        if "Score" in col or "Cost" in col:
-            num_score_cost_cols += 1
-    dynamic_widths = [90] * num_score_cost_cols
     fixed_end_widths = [90, 100, 50]  # Categories Attempted, Date, Logs
     # 5. Combine all the lists to create the final, fully dynamic list.
     final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
@@ -644,10 +667,18 @@ def create_leaderboard_display(
         show_incomplete_checkbox = None
         gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")
-    # Plot component - show complete entries by default if available
-    initial_plot = scatter_plot_complete if has_complete_entries else scatter_plot_all
-    plot_component = gr.Plot(
-        value=initial_plot,
         show_label=False,
     )
     gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
@@ -669,17 +700,17 @@ def create_leaderboard_display(
                 elem_id="main-leaderboard"
             )
-            # Update function for the toggle - updates both table and plot
             def update_display(show_incomplete):
                 if show_incomplete:
-                    return df_display_all, scatter_plot_all
                 else:
-                    return df_display_complete, scatter_plot_complete
             show_incomplete_checkbox.change(
                 fn=update_display,
                 inputs=[show_incomplete_checkbox],
-                outputs=[dataframe_component, plot_component]
             )
         else:
             dataframe_component = gr.DataFrame(
@@ -719,21 +750,23 @@ def create_leaderboard_display(
                 new_df_display_complete = prepare_df_for_display(new_df_view_complete)
                 new_df_display_all = prepare_df_for_display(new_df_view_full)
-                # Create new scatter plots
-                new_scatter_complete = create_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
-                new_scatter_all = create_scatter_plot(new_df_view_full)
                 # Return the appropriate data based on checkbox state
                 if current_checkbox_state:
-                    return new_df_display_all, new_scatter_all
                 else:
-                    return new_df_display_complete, new_scatter_complete
         # No change, return current values
         if current_checkbox_state:
-            return df_display_all, scatter_plot_all
         else:
-            return df_display_complete, scatter_plot_complete
     # Create a timer that checks for updates every 60 seconds
     refresh_timer = gr.Timer(value=60)
@@ -743,7 +776,7 @@ def create_leaderboard_display(
         refresh_timer.tick(
             fn=check_and_refresh_data,
             inputs=[show_incomplete_checkbox],
-            outputs=[dataframe_component, plot_component]
         )
     else:
         # If no checkbox, always show all data
@@ -756,18 +789,19 @@ def create_leaderboard_display(
                     new_transformer = DataTransformer(new_df, new_tag_map)
                     new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
                     new_df_display_all = prepare_df_for_display(new_df_view_full)
-                    new_scatter_all = create_scatter_plot(new_df_view_full)
-                    return new_df_display_all, new_scatter_all
-            return df_display_all, scatter_plot_all
         refresh_timer.tick(
             fn=check_and_refresh_all,
             inputs=[],
-            outputs=[dataframe_component, plot_component]
         )
     # Return the components so they can be referenced elsewhere.
-    return plot_component, dataframe_component
 # # --- Detailed Benchmark Display ---
 def create_benchmark_details_display(
@@ -807,10 +841,11 @@ def create_benchmark_details_display(
         # 3. Prepare the data for this specific benchmark's table and plot
         benchmark_score_col = f"{benchmark_name} Score"
         benchmark_cost_col = f"{benchmark_name} Cost"
         benchmark_download_col = f"{benchmark_name} Download"
         # Define the columns needed for the detailed table
-        table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs', benchmark_download_col, 'id', 'Language Model']
         # Filter to only columns that actually exist in the full dataframe
         existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -914,6 +949,7 @@ def create_benchmark_details_display(
             'Attempted Benchmark',
             benchmark_score_col,
             benchmark_cost_col,
             'Date',
             'Logs',
             benchmark_download_col
@@ -922,10 +958,16 @@ def create_benchmark_details_display(
             if col not in benchmark_table_df.columns:
                 benchmark_table_df[col] = pd.NA # Add as an empty column
         benchmark_table_df = benchmark_table_df[desired_cols_in_order]
         # Rename columns for a cleaner table display, as requested
         benchmark_table_df.rename(columns={
             benchmark_score_col: 'Score',
             benchmark_cost_col: 'Cost',
             benchmark_download_col: '⬇️',  # Empty-ish header with icon hint
         }, inplace=True)
@@ -933,20 +975,34 @@ def create_benchmark_details_display(
         df_headers = benchmark_table_df.columns.tolist()
         df_datatypes = []
         for col in df_headers:
-            if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col:
                 df_datatypes.append("markdown")
             elif col in ["SDK Version", "Language Model"]:
                 df_datatypes.append("html")
             else:
                 df_datatypes.append("str")
-        benchmark_plot = _plot_scatter_plotly(
             data=full_df,
             x=benchmark_cost_col,
             y=benchmark_score_col,
             agent_col="SDK Version",
-            name=benchmark_name
         )
-        gr.Plot(value=benchmark_plot, show_label=False)
         gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
         # Put table and key into an accordion
@@ -957,7 +1013,7 @@ def create_benchmark_details_display(
                 datatype=df_datatypes,
                 interactive=False,
                 wrap=True,
-                column_widths=[200, 80, 40, 80, 80, 150, 40, 40],  # Language Model, SDK Version, Attempted, Score, Cost, Date, Logs, Download
                 show_search="search",
                 elem_classes=["wrap-header-df"]
             )

     _plot_scatter_plotly,
     format_cost_column,
     format_score_column,
+    format_runtime_column,
     get_pareto_df,
     clean_llm_base_list,
 )
             if "Score" in col:
                 df_display = format_score_column(df_display, col)
+        for col in df_display.columns:
+            if "Runtime" in col:
+                df_display = format_runtime_column(df_display, col)
         # Clean the Language Model column first
         df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
     # If no complete entries exist, show all entries by default
     has_complete_entries = len(df_display_complete) > 0
+    # Determine primary score/cost/runtime columns for scatter plot
     if category_name == "Overall":
         primary_score_col = "Average Score"
         primary_cost_col = "Average Cost"
+        primary_runtime_col = "Average Runtime"
     else:
         primary_score_col = f"{category_name} Score"
         primary_cost_col = f"{category_name} Cost"
+        primary_runtime_col = f"{category_name} Runtime"
+    # Function to create cost/performance scatter plot from data
+    def create_cost_scatter_plot(df_data):
         return _plot_scatter_plotly(
             data=df_data,
             x=primary_cost_col if primary_cost_col in df_data.columns else None,
             y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
             agent_col="SDK Version",
+            name=category_name,
+            plot_type='cost'
+        )
+    # Function to create runtime/performance scatter plot from data
+    def create_runtime_scatter_plot(df_data):
+        return _plot_scatter_plotly(
+            data=df_data,
+            x=primary_runtime_col if primary_runtime_col in df_data.columns else None,
+            y=primary_score_col if primary_score_col in df_data.columns else "Average Score",
+            agent_col="SDK Version",
+            name=category_name,
+            plot_type='runtime'
         )
+    # Create initial cost scatter plots for both complete and all data
+    cost_scatter_complete = create_cost_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
+    cost_scatter_all = create_cost_scatter_plot(df_view_full)
+    # Create initial runtime scatter plots for both complete and all data
+    runtime_scatter_complete = create_runtime_scatter_plot(df_view_complete) if has_complete_entries else go.Figure()
+    runtime_scatter_all = create_runtime_scatter_plot(df_view_full)
     # Now get headers from the renamed dataframe (use all entries to ensure headers are present)
     df_headers = df_display_all.columns.tolist()
     df_datatypes = []
     for col in df_headers:
+        if col == "Logs" or "Cost" in col or "Score" in col or "Runtime" in col:
             df_datatypes.append("markdown")
         elif col in ["SDK Version", "Language Model"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
     # Dynamically set widths for the DataFrame columns
+    # Order: Language Model, SDK Version, Average Score, Average Cost, Average Runtime, ...
+    fixed_start_widths = [280, 100, 100, 90]  # Language Model (with icons), SDK Version, Average Score, Average Runtime
+    num_score_cost_runtime_cols = 0
     remaining_headers = df_headers[len(fixed_start_widths):]
     for col in remaining_headers:
+        if "Score" in col or "Cost" in col or "Runtime" in col:
+            num_score_cost_runtime_cols += 1
+    dynamic_widths = [90] * num_score_cost_runtime_cols
     fixed_end_widths = [90, 100, 50]  # Categories Attempted, Date, Logs
     # 5. Combine all the lists to create the final, fully dynamic list.
     final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths
         show_incomplete_checkbox = None
         gr.Markdown(f"*No entries with all 5 categories completed yet. Showing all {num_total} entries.*")
+    # Plot components - show complete entries by default if available
+    # Cost/Performance plot
+    initial_cost_plot = cost_scatter_complete if has_complete_entries else cost_scatter_all
+    cost_plot_component = gr.Plot(
+        value=initial_cost_plot,
+        show_label=False,
+    )
+    # Runtime/Performance plot
+    initial_runtime_plot = runtime_scatter_complete if has_complete_entries else runtime_scatter_all
+    runtime_plot_component = gr.Plot(
+        value=initial_runtime_plot,
         show_label=False,
     )
     gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
                 elem_id="main-leaderboard"
             )
+            # Update function for the toggle - updates both table and plots
             def update_display(show_incomplete):
                 if show_incomplete:
+                    return df_display_all, cost_scatter_all, runtime_scatter_all
                 else:
+                    return df_display_complete, cost_scatter_complete, runtime_scatter_complete
             show_incomplete_checkbox.change(
                 fn=update_display,
                 inputs=[show_incomplete_checkbox],
+                outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
             )
         else:
             dataframe_component = gr.DataFrame(
                 new_df_display_complete = prepare_df_for_display(new_df_view_complete)
                 new_df_display_all = prepare_df_for_display(new_df_view_full)
+                # Create new scatter plots (both cost and runtime)
+                new_cost_scatter_complete = create_cost_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
+                new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full)
+                new_runtime_scatter_complete = create_runtime_scatter_plot(new_df_view_complete) if len(new_df_display_complete) > 0 else go.Figure()
+                new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full)
                 # Return the appropriate data based on checkbox state
                 if current_checkbox_state:
+                    return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all
                 else:
+                    return new_df_display_complete, new_cost_scatter_complete, new_runtime_scatter_complete
         # No change, return current values
         if current_checkbox_state:
+            return df_display_all, cost_scatter_all, runtime_scatter_all
         else:
+            return df_display_complete, cost_scatter_complete, runtime_scatter_complete
     # Create a timer that checks for updates every 60 seconds
     refresh_timer = gr.Timer(value=60)
         refresh_timer.tick(
             fn=check_and_refresh_data,
             inputs=[show_incomplete_checkbox],
+            outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
         )
     else:
         # If no checkbox, always show all data
                     new_transformer = DataTransformer(new_df, new_tag_map)
                     new_df_view_full, _ = new_transformer.view(tag=category_name, use_plotly=True)
                     new_df_display_all = prepare_df_for_display(new_df_view_full)
+                    new_cost_scatter_all = create_cost_scatter_plot(new_df_view_full)
+                    new_runtime_scatter_all = create_runtime_scatter_plot(new_df_view_full)
+                    return new_df_display_all, new_cost_scatter_all, new_runtime_scatter_all
+            return df_display_all, cost_scatter_all, runtime_scatter_all
         refresh_timer.tick(
             fn=check_and_refresh_all,
             inputs=[],
+            outputs=[dataframe_component, cost_plot_component, runtime_plot_component]
         )
     # Return the components so they can be referenced elsewhere.
+    return cost_plot_component, runtime_plot_component, dataframe_component
 # # --- Detailed Benchmark Display ---
 def create_benchmark_details_display(
         # 3. Prepare the data for this specific benchmark's table and plot
         benchmark_score_col = f"{benchmark_name} Score"
         benchmark_cost_col = f"{benchmark_name} Cost"
+        benchmark_runtime_col = f"{benchmark_name} Runtime"
         benchmark_download_col = f"{benchmark_name} Download"
         # Define the columns needed for the detailed table
+        table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col, benchmark_runtime_col, 'Logs', benchmark_download_col, 'id', 'Language Model']
         # Filter to only columns that actually exist in the full dataframe
         existing_table_cols = [col for col in table_cols if col in full_df.columns]
             'Attempted Benchmark',
             benchmark_score_col,
             benchmark_cost_col,
+            benchmark_runtime_col,
             'Date',
             'Logs',
             benchmark_download_col
             if col not in benchmark_table_df.columns:
                 benchmark_table_df[col] = pd.NA # Add as an empty column
         benchmark_table_df = benchmark_table_df[desired_cols_in_order]
+        # Format the runtime column before renaming
+        if benchmark_runtime_col in benchmark_table_df.columns:
+            benchmark_table_df = format_runtime_column(benchmark_table_df, benchmark_runtime_col)
         # Rename columns for a cleaner table display, as requested
         benchmark_table_df.rename(columns={
             benchmark_score_col: 'Score',
             benchmark_cost_col: 'Cost',
+            benchmark_runtime_col: 'Runtime',
             benchmark_download_col: '⬇️',  # Empty-ish header with icon hint
         }, inplace=True)
         df_headers = benchmark_table_df.columns.tolist()
         df_datatypes = []
         for col in df_headers:
+            if col in ["Logs", "⬇️"] or "Cost" in col or "Score" in col or "Runtime" in col:
                 df_datatypes.append("markdown")
             elif col in ["SDK Version", "Language Model"]:
                 df_datatypes.append("html")
             else:
                 df_datatypes.append("str")
+        # Cost/Performance plot
+        cost_benchmark_plot = _plot_scatter_plotly(
             data=full_df,
             x=benchmark_cost_col,
             y=benchmark_score_col,
             agent_col="SDK Version",
+            name=benchmark_name,
+            plot_type='cost'
+        )
+        gr.Plot(value=cost_benchmark_plot, show_label=False)
+        # Runtime/Performance plot
+        runtime_benchmark_plot = _plot_scatter_plotly(
+            data=full_df,
+            x=benchmark_runtime_col,
+            y=benchmark_score_col,
+            agent_col="SDK Version",
+            name=benchmark_name,
+            plot_type='runtime'
         )
+        gr.Plot(value=runtime_benchmark_plot, show_label=False)
         gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
         # Put table and key into an accordion
                 datatype=df_datatypes,
                 interactive=False,
                 wrap=True,
+                column_widths=[200, 80, 40, 80, 80, 80, 150, 40, 40],  # Language Model, SDK Version, Attempted, Score, Cost, Runtime, Date, Logs, Download
                 show_search="search",
                 elem_classes=["wrap-header-df"]
             )