Spaces:

OpenHands
/

openhands-index

Running

openhands openhands committed on Jan 16

Commit

5998027

1 Parent(s): 55da48c

feat: Update leaderboard calculations and add incomplete entries toggle

Changes:
1. Rename 'OpenHands Version' to 'SDK Version'
2. Rename 'Overall Score' to 'Average Score' - now divides by 5 regardless
of categories completed (missing categories count as 0)
3. Rename 'Overall Cost' to 'Total Cost' - now sums all category costs
4. Add 'Show incomplete entries' toggle (default: hidden) to filter
entries that don't have all 5 categories submitted
5. Fix agent grouping to use version+model combination instead of just
version (fixes issue where different models with same SDK version
were incorrectly merged)
6. Fix model_dump() to use mode='json' for proper enum serialization
7. Track 'categories_completed' count per agent

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (3) hide show

leaderboard_transformer.py +16 -12
simple_data_loader.py +42 -27
ui_components.py +106 -67

leaderboard_transformer.py CHANGED Viewed

@@ -96,19 +96,23 @@ def _pretty_column_name(raw_col: str) -> str:
     """
     Takes a raw column name from the DataFrame and returns a "pretty" version.
     Handles three cases:
-    1. Fixed names (e.g., 'Openhands version' -> 'OpenHands Version', 'Language model' -> 'Language Model').
     2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
     3. Fallback for any other names.
     """
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
-        'Openhands version': 'OpenHands Version',
         'Language model': 'Language Model',
         'Agent description': 'Agent Description',
         'Submission date': 'Date',
-        'Overall': 'Overall Score',
-        'Overall cost': 'Overall Cost',
         'Logs': 'Logs',
         'Openness': 'Openness',
         'LLM base': 'Model',
@@ -256,7 +260,7 @@ class DataTransformer:
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
-        base_cols = ["id","Language Model","OpenHands Version","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]
@@ -310,7 +314,7 @@ class DataTransformer:
                     data=df_view,
                     x=primary_cost_col,
                     y=primary_score_col,
-                    agent_col="OpenHands Version",
                     name=primary_metric
                 ) if use_plotly else go.Figure()
                 # Use a consistent key for easy retrieval later
@@ -324,7 +328,7 @@ class DataTransformer:
                 plots['scatter_plot'] = go.Figure()
         return df_view, plots
-DEFAULT_Y_COLUMN = "Overall Score"
 DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
 def _plot_scatter_plotly(
@@ -551,7 +555,7 @@ def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame:
     - If both cost and score are null, it becomes "Not Attempted".
     Args:
         df: The DataFrame to modify.
-        cost_col_name: The name of the cost column to format (e.g., "Overall Cost").
     Returns:
         The DataFrame with the formatted cost column.
     """
@@ -584,10 +588,10 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
     Applies custom formatting to a score column for display.
     - If a score is 0 or NaN, it's displayed as a colored "0".
     - Other scores are formatted to two decimal places.
-    - Overall Score values are displayed in bold.
     """
     status_color = "#ec4899"  # The same color as your other status text
-    is_overall_score = (score_col_name == "Overall Score")
     def apply_formatting(score_value):
         # Explicitly handle missing values without turning them into zeros
@@ -601,8 +605,8 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
         else:
             formatted = str(score_value)
-        # Make Overall Score bold
-        if is_overall_score and score_value != 0:
             return f"<strong>{formatted}</strong>"
         return formatted

     """
     Takes a raw column name from the DataFrame and returns a "pretty" version.
     Handles three cases:
+    1. Fixed names (e.g., 'SDK version' -> 'SDK Version', 'Language model' -> 'Language Model').
     2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
     3. Fallback for any other names.
     """
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
+        'SDK version': 'SDK Version',
+        'Openhands version': 'SDK Version',  # Legacy support
         'Language model': 'Language Model',
         'Agent description': 'Agent Description',
         'Submission date': 'Date',
+        'average score': 'Average Score',
+        'Overall': 'Average Score',  # Legacy support
+        'total cost': 'Total Cost',
+        'Overall cost': 'Total Cost',  # Legacy support
+        'categories_completed': 'Categories Completed',
         'Logs': 'Logs',
         'Openness': 'Openness',
         'LLM base': 'Model',
         df_view = df_sorted.copy()
         # --- 3. Add Columns for Agent Openness ---
+        base_cols = ["id","Language Model","SDK Version","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]
                     data=df_view,
                     x=primary_cost_col,
                     y=primary_score_col,
+                    agent_col="SDK Version",
                     name=primary_metric
                 ) if use_plotly else go.Figure()
                 # Use a consistent key for easy retrieval later
                 plots['scatter_plot'] = go.Figure()
         return df_view, plots
+DEFAULT_Y_COLUMN = "Average Score"
 DUMMY_X_VALUE_FOR_MISSING_COSTS = 0
 def _plot_scatter_plotly(
     - If both cost and score are null, it becomes "Not Attempted".
     Args:
         df: The DataFrame to modify.
+        cost_col_name: The name of the cost column to format (e.g., "Total Cost").
     Returns:
         The DataFrame with the formatted cost column.
     """
     Applies custom formatting to a score column for display.
     - If a score is 0 or NaN, it's displayed as a colored "0".
     - Other scores are formatted to two decimal places.
+    - Average Score values are displayed in bold.
     """
     status_color = "#ec4899"  # The same color as your other status text
+    is_average_score = (score_col_name == "Average Score")
     def apply_formatting(score_value):
         # Explicitly handle missing values without turning them into zeros
         else:
             formatted = str(score_value)
+        # Make Average Score bold
+        if is_average_score and score_value != 0:
             return f"<strong>{formatted}</strong>"
         return formatted

simple_data_loader.py CHANGED Viewed

@@ -65,7 +65,8 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
     if _ensure_schema_models() and Metadata and ScoreEntry:
         try:
             validated_metadata = Metadata(**metadata_raw)
-            metadata_dict = validated_metadata.model_dump()
         except Exception as e:
             errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
             metadata_dict = metadata_raw  # Fall back to raw data
@@ -74,7 +75,8 @@ def load_and_validate_agent_data(agent_dir: Path) -> tuple[Optional[dict], Optio
         for i, score in enumerate(scores_raw):
             try:
                 validated_score = ScoreEntry(**score)
-                validated_scores.append(validated_score.model_dump())
             except Exception as e:
                 errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
                 validated_scores.append(score)  # Fall back to raw data
@@ -223,23 +225,30 @@ class SimpleLeaderboardViewer:
         try:
             # Transform to expected format for leaderboard
-            # Group by agent to aggregate results across datasets
             transformed_records = []
-            for agent_version in df['agent_version'].unique():
-                agent_records = df[df['agent_version'] == agent_version]
                 # Build a single record for this agent
                 first_record = agent_records.iloc[0]
                 # Normalize openness to "open" or "closed"
                 from aliases import OPENNESS_MAPPING
                 raw_openness = first_record['openness']
                 normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
                 record = {
                     # Core agent info - use final display names
-                    'Openhands version': agent_version,  # Will become "OpenHands Version"
                     'Language model': first_record['llm_base'],  # Will become "Language Model"
                     'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                     'date': first_record['submission_time'],  # Will become "Date"
@@ -273,30 +282,36 @@ class SimpleLeaderboardViewer:
                                 category_data[category]['scores'].append(row['score'])
                                 category_data[category]['costs'].append(row['total_cost'])
-                # Calculate category-level aggregates
-                category_avg_scores = []
-                category_avg_costs = []
-                for category, data in category_data.items():
-                    if data['scores']:
                         avg_score = sum(data['scores']) / len(data['scores'])
                         record[f'{category} score'] = avg_score
-                        category_avg_scores.append(avg_score)
-                    if data['costs']:
-                        avg_cost = sum(data['costs']) / len(data['costs'])
-                        record[f'{category} cost'] = avg_cost
-                        category_avg_costs.append(avg_cost)
-                # Calculate overall score and cost as macro-average of category averages
-                # This ensures each category contributes equally regardless of benchmark count
-                if category_avg_scores:
-                    record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
-                else:
-                    record['overall score'] = None
-                if category_avg_costs:
-                    record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
-                else:
-                    record['overall cost'] = None
                 transformed_records.append(record)

     if _ensure_schema_models() and Metadata and ScoreEntry:
         try:
             validated_metadata = Metadata(**metadata_raw)
+            # Use mode='json' to serialize enums as strings
+            metadata_dict = validated_metadata.model_dump(mode='json')
         except Exception as e:
             errors.append(f"Metadata validation error in {agent_dir.name}: {e}")
             metadata_dict = metadata_raw  # Fall back to raw data
         for i, score in enumerate(scores_raw):
             try:
                 validated_score = ScoreEntry(**score)
+                # Use mode='json' to serialize enums as strings
+                validated_scores.append(validated_score.model_dump(mode='json'))
             except Exception as e:
                 errors.append(f"Score entry {i} validation error in {agent_dir.name}: {e}")
                 validated_scores.append(score)  # Fall back to raw data
         try:
             # Transform to expected format for leaderboard
+            # Group by agent (version + model combination) to aggregate results across datasets
             transformed_records = []
+            # Create a unique identifier for each agent (version + model)
+            df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
+            for agent_id in df['agent_id'].unique():
+                agent_records = df[df['agent_id'] == agent_id]
                 # Build a single record for this agent
                 first_record = agent_records.iloc[0]
+                agent_version = first_record['agent_version']
                 # Normalize openness to "open" or "closed"
                 from aliases import OPENNESS_MAPPING
                 raw_openness = first_record['openness']
                 normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
+                # All 5 categories for the leaderboard
+                ALL_CATEGORIES = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
                 record = {
                     # Core agent info - use final display names
+                    'SDK version': agent_version,  # Will become "SDK Version"
                     'Language model': first_record['llm_base'],  # Will become "Language Model"
                     'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                     'date': first_record['submission_time'],  # Will become "Date"
                                 category_data[category]['scores'].append(row['score'])
                                 category_data[category]['costs'].append(row['total_cost'])
+                # Calculate category-level aggregates and track total cost
+                total_cost = 0.0
+                categories_with_scores = 0
+                for category in ALL_CATEGORIES:
+                    if category in category_data and category_data[category]['scores']:
+                        data = category_data[category]
                         avg_score = sum(data['scores']) / len(data['scores'])
                         record[f'{category} score'] = avg_score
+                        categories_with_scores += 1
+                        if data['costs']:
+                            cat_cost = sum(data['costs'])
+                            record[f'{category} cost'] = cat_cost
+                            total_cost += cat_cost
+                    else:
+                        # Category not submitted - will show as NA
+                        pass
+                # Calculate average score: always divide by 5 (treating missing categories as 0)
+                # This penalizes incomplete submissions
+                score_sum = sum(
+                    record.get(f'{cat} score', 0) or 0
+                    for cat in ALL_CATEGORIES
+                )
+                record['average score'] = score_sum / 5
+                # Total cost is the sum of all category costs
+                record['total cost'] = total_cost if total_cost > 0 else None
+                # Track how many categories were completed
+                record['categories_completed'] = categories_with_scores
                 transformed_records.append(record)

ui_components.py CHANGED Viewed

@@ -147,10 +147,10 @@ def build_descriptions_tooltip_content(table) -> str:
     """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
     if table == "Overall":
         return """
-            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
-            <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the five category-level average scores. Each category contributes equally.</div>
-            <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
             <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
             <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
             <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
@@ -166,7 +166,7 @@ def build_descriptions_tooltip_content(table) -> str:
         """
     elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
         return f"""
-            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
             <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
@@ -178,7 +178,7 @@ def build_descriptions_tooltip_content(table) -> str:
     else:
         # Fallback for any other table type, e.g., individual benchmarks
         return f"""
-            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
             <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
@@ -360,70 +360,83 @@ def create_leaderboard_display(
     # 1. Instantiate the transformer and get the specific view for this category.
     # The function no longer loads data itself; it filters the data it receives.
     transformer = DataTransformer(full_df, tag_map)
-    df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
-    pareto_df = get_pareto_df(df_view)
-    # Get the list of agents on the frontier. We'll use this list later.
-    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
-    trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
-    if not pareto_df.empty and 'id' in pareto_df.columns:
-        pareto_agent_names = pareto_df['id'].tolist()
-    else:
-        pareto_agent_names = []
-    df_view['Pareto'] = df_view.apply(
-        lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
-        axis=1
-    )
-    # Generate openness icons for each row
-    def get_openness_icon_html(row):
-        openness_val = row.get('Openness', '')
-        uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
-        return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
-    df_view['Icon'] = df_view.apply(get_openness_icon_html, axis=1)
-    # Format cost columns
-    for col in df_view.columns:
-        if "Cost" in col:
-            df_view = format_cost_column(df_view, col)
-    # Apply score formatting without coercing NaN to 0
-    for col in df_view.columns:
-        if "Score" in col:
-            df_view = format_score_column(df_view, col)
-    scatter_plot = plots_dict.get('scatter_plot', go.Figure())
-    #Make pretty and format the Language Model column
-    df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
-    df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
-    # append the repro url to the end of the OpenHands Version
-    if 'Source' in df_view.columns:
-        df_view['OpenHands Version'] = df_view.apply(
-            lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
             axis=1
         )
-    all_cols = df_view.columns.tolist()
-    # Remove pareto and Icon columns and insert it at the beginning
-    all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
-    all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
-    df_view = df_view[all_cols]
-    # Drop internally used columns that are not needed in the display
-    columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
-    df_view = df_view.drop(columns=columns_to_drop, errors='ignore')
-    header_rename_map = {
-        "Pareto": "",
-        "Icon": "",
-    }
-    # Rename columns first before getting headers
-    df_view = df_view.rename(columns=header_rename_map)
     # Now get headers from the renamed dataframe
-    df_headers = df_view.columns.tolist()
     df_datatypes = []
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["OpenHands Version","Language Model", ""]:  # "" for renamed Pareto/Icon columns
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
@@ -451,9 +464,21 @@ def create_leaderboard_display(
     # Put table and key into an accordion
     with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
         dataframe_component = gr.DataFrame(
             headers=df_headers,
-            value=df_view,
             datatype=df_datatypes,
             interactive=False,
             wrap=True,
@@ -462,6 +487,20 @@ def create_leaderboard_display(
             show_search="search",
             elem_id="main-leaderboard"
         )
         legend_markdown = create_legend_markdown(category_name)
         gr.HTML(value=legend_markdown, elem_id="legend-markdown")
@@ -508,7 +547,7 @@ def create_benchmark_details_display(
         benchmark_cost_col = f"{benchmark_name} Cost"
         # Define the columns needed for the detailed table
-        table_cols = ['OpenHands Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
         # Filter to only columns that actually exist in the full dataframe
         existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -543,10 +582,10 @@ def create_benchmark_details_display(
         #Make pretty and format the Language Model column
         benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
         benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
-        # append the repro url to the end of the OpenHands Version
         if 'Source' in benchmark_table_df.columns:
-            benchmark_table_df['OpenHands Version'] = benchmark_table_df.apply(
-                lambda row: f"{row['OpenHands Version']} {row['Source']}" if row['Source'] else row['OpenHands Version'],
                 axis=1
             )
@@ -574,7 +613,7 @@ def create_benchmark_details_display(
             'Pareto',
             'Icon',
             'Language Model',
-            'OpenHands Version',
             'Attempted Benchmark',
             benchmark_score_col,
             benchmark_cost_col,
@@ -603,7 +642,7 @@ def create_benchmark_details_display(
         for col in df_headers:
             if "Logs" in col or "Cost" in col or "Score" in col:
                 df_datatypes.append("markdown")
-            elif col in ["OpenHands Version", "Language Model", ""]:  # "" for renamed Pareto/Icon columns
                 df_datatypes.append("html")
             else:
                 df_datatypes.append("str")
@@ -611,7 +650,7 @@ def create_benchmark_details_display(
             data=full_df,
             x=benchmark_cost_col,
             y=benchmark_score_col,
-            agent_col="OpenHands Version",
             name=benchmark_name
         )
         with gr.Row():

     """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
     if table == "Overall":
         return """
+            <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands SDK evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
+            <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
+            <div class="tooltip-description-item"><b>Total Cost:</b> Sum of costs across all submitted categories, in USD.</div>
             <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
             <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
             <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
         """
     elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
         return f"""
+            <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
             <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
     else:
         # Fallback for any other table type, e.g., individual benchmarks
         return f"""
+            <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
             <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
     # 1. Instantiate the transformer and get the specific view for this category.
     # The function no longer loads data itself; it filters the data it receives.
     transformer = DataTransformer(full_df, tag_map)
+    df_view_full, plots_dict = transformer.view(tag=category_name, use_plotly=True)
+    def prepare_df_for_display(df_view):
+        """Prepare a DataFrame for display with all formatting applied."""
+        df_display = df_view.copy()
+        pareto_df = get_pareto_df(df_display)
+        trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
+        trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
+        if not pareto_df.empty and 'id' in pareto_df.columns:
+            pareto_agent_names = pareto_df['id'].tolist()
+        else:
+            pareto_agent_names = []
+        df_display['Pareto'] = df_display.apply(
+            lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '',
             axis=1
         )
+        def get_openness_icon_html(row):
+            openness_val = row.get('Openness', '')
+            uri = get_svg_as_data_uri(OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg"))
+            return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'
+        df_display['Icon'] = df_display.apply(get_openness_icon_html, axis=1)
+        for col in df_display.columns:
+            if "Cost" in col:
+                df_display = format_cost_column(df_display, col)
+        for col in df_display.columns:
+            if "Score" in col:
+                df_display = format_score_column(df_display, col)
+        df_display['Language Model'] = df_display['Language Model'].apply(clean_llm_base_list)
+        df_display['Language Model'] = df_display['Language Model'].apply(format_llm_base_with_html)
+        if 'Source' in df_display.columns:
+            df_display['SDK Version'] = df_display.apply(
+                lambda row: f"{row['SDK Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['SDK Version'],
+                axis=1
+            )
+        all_cols = df_display.columns.tolist()
+        all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
+        all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
+        df_display = df_display[all_cols]
+        columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source', 'Categories Completed']
+        df_display = df_display.drop(columns=columns_to_drop, errors='ignore')
+        header_rename_map = {
+            "Pareto": "",
+            "Icon": "",
+        }
+        df_display = df_display.rename(columns=header_rename_map)
+        return df_display
+    # Prepare both complete and all entries versions
+    # Complete entries have all 5 categories submitted
+    if 'Categories Completed' in df_view_full.columns:
+        df_view_complete = df_view_full[df_view_full['Categories Completed'] == 5].copy()
+    else:
+        df_view_complete = df_view_full.copy()
+    df_display_complete = prepare_df_for_display(df_view_complete)
+    df_display_all = prepare_df_for_display(df_view_full)
+    scatter_plot = plots_dict.get('scatter_plot', go.Figure())
     # Now get headers from the renamed dataframe
+    df_headers = df_display_complete.columns.tolist()
     df_datatypes = []
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
+        elif col in ["SDK Version","Language Model", ""]:  # "" for renamed Pareto/Icon columns
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
     # Put table and key into an accordion
     with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
+        # Add toggle for showing incomplete entries
+        num_complete = len(df_display_complete)
+        num_total = len(df_display_all)
+        num_incomplete = num_total - num_complete
+        show_incomplete_checkbox = gr.Checkbox(
+            label=f"Show incomplete entries ({num_incomplete} entries with fewer than 5 categories)",
+            value=False,
+            elem_id="show-incomplete-toggle"
+        )
+        # Start with complete entries only (default)
         dataframe_component = gr.DataFrame(
             headers=df_headers,
+            value=df_display_complete,
             datatype=df_datatypes,
             interactive=False,
             wrap=True,
             show_search="search",
             elem_id="main-leaderboard"
         )
+        # Update function for the toggle
+        def update_table(show_incomplete):
+            if show_incomplete:
+                return df_display_all
+            else:
+                return df_display_complete
+        show_incomplete_checkbox.change(
+            fn=update_table,
+            inputs=[show_incomplete_checkbox],
+            outputs=[dataframe_component]
+        )
         legend_markdown = create_legend_markdown(category_name)
         gr.HTML(value=legend_markdown, elem_id="legend-markdown")
         benchmark_cost_col = f"{benchmark_name} Cost"
         # Define the columns needed for the detailed table
+        table_cols = ['SDK Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
         # Filter to only columns that actually exist in the full dataframe
         existing_table_cols = [col for col in table_cols if col in full_df.columns]
         #Make pretty and format the Language Model column
         benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
         benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
+        # append the repro url to the end of the SDK Version
         if 'Source' in benchmark_table_df.columns:
+            benchmark_table_df['SDK Version'] = benchmark_table_df.apply(
+                lambda row: f"{row['SDK Version']} {row['Source']}" if row['Source'] else row['SDK Version'],
                 axis=1
             )
             'Pareto',
             'Icon',
             'Language Model',
+            'SDK Version',
             'Attempted Benchmark',
             benchmark_score_col,
             benchmark_cost_col,
         for col in df_headers:
             if "Logs" in col or "Cost" in col or "Score" in col:
                 df_datatypes.append("markdown")
+            elif col in ["SDK Version", "Language Model", ""]:  # "" for renamed Pareto/Icon columns
                 df_datatypes.append("html")
             else:
                 df_datatypes.append("str")
             data=full_df,
             x=benchmark_cost_col,
             y=benchmark_score_col,
+            agent_col="SDK Version",
             name=benchmark_name
         )
         with gr.Row():