openhands committed
Commit e734bf6 · Parent(s): 0e14c25

Fix score calculation to match AstaBench methodology and update categories
Critical fixes:
- Changed the overall score calculation from a simple average of all benchmarks to a macro-average of category averages
- This ensures each category contributes equally regardless of benchmark count
- Updated Categories Attempted from 4 to 5 categories (Bug Fixing, Frontend Dev, App Creation, Test Gen, Info Gathering)
- Updated category list in leaderboard_transformer.py line 281
- Updated tooltip text to reflect 5 categories
This matches the AstaBench calculation methodology.
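
As a quick illustration of why this matters, here is a minimal sketch of the two calculations (the category grouping follows the new list, but the benchmark counts and scores are hypothetical):

# Hypothetical per-benchmark scores, grouped by category (illustrative only).
category_scores = {
    'Bug Fixing': [0.60, 0.70, 0.80],  # three benchmarks
    'App Creation': [0.20],            # one benchmark
}

# Old behavior: simple average over every benchmark score.
all_scores = [s for scores in category_scores.values() for s in scores]
simple_avg = sum(all_scores) / len(all_scores)        # (0.6 + 0.7 + 0.8 + 0.2) / 4 = 0.575

# New behavior: macro-average of the per-category averages.
category_avgs = [sum(s) / len(s) for s in category_scores.values()]
macro_avg = sum(category_avgs) / len(category_avgs)   # (0.70 + 0.20) / 2 = 0.45

Under the simple average, Bug Fixing's three benchmarks carry three times the weight of App Creation's single benchmark; the macro-average gives each category one equal vote.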
- leaderboard_transformer.py +2 -2
- simple_data_loader.py +16 -6
- ui_components.py +1 -1
leaderboard_transformer.py
CHANGED
@@ -278,9 +278,9 @@ class DataTransformer:
         # Calculated and add "Categories Attempted" column
         if primary_metric == "Overall":
             def calculate_attempted(row):
-                main_categories = ['
+                main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
                 count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
-                return f"{count}/4"
+                return f"{count}/5"
 
             # Apply the function row-wise to create the new column
             attempted_column = df_view.apply(calculate_attempted, axis=1)
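
A minimal usage sketch of the updated helper (the DataFrame below is hypothetical; only the "{category} Score" column convention is taken from the diff):

import pandas as pd

main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation',
                   'Test Generation', 'Information Gathering']

def calculate_attempted(row):
    # A category counts as attempted when its score column is non-zero.
    count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
    return f"{count}/5"

df_view = pd.DataFrame([
    {f"{c} Score": 0.5 for c in main_categories},              # attempted all five
    {**{f"{c} Score": 0.5 for c in main_categories},
     "App Creation Score": 0.0, "Test Generation Score": 0.0}, # skipped two
])

df_view["Categories Attempted"] = df_view.apply(calculate_attempted, axis=1)
print(df_view["Categories Attempted"].tolist())  # ['5/5', '3/5']

One caveat: row.get returns None for a column that is absent entirely, and None != 0.0 is true, so a missing column would still count as attempted. The code evidently assumes every score column is present, with 0.0 meaning "not attempted".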
simple_data_loader.py
CHANGED
@@ -167,18 +167,28 @@ class SimpleLeaderboardViewer:
             category_data[category]['costs'].append(row['total_cost'])
 
         # Calculate category-level aggregates
+        category_avg_scores = []
+        category_avg_costs = []
         for category, data in category_data.items():
             if data['scores']:
-
+                avg_score = sum(data['scores']) / len(data['scores'])
+                record[f'{category} score'] = avg_score
+                category_avg_scores.append(avg_score)
             if data['costs']:
-
+                avg_cost = sum(data['costs']) / len(data['costs'])
+                record[f'{category} cost'] = avg_cost
+                category_avg_costs.append(avg_cost)
 
-        # Calculate overall score and cost
-
-
-        record['overall
+        # Calculate overall score and cost as macro-average of category averages
+        # This ensures each category contributes equally regardless of benchmark count
+        if category_avg_scores:
+            record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
         else:
             record['overall score'] = None
+
+        if category_avg_costs:
+            record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
+        else:
             record['overall cost'] = None
 
         transformed_records.append(record)
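
Pulled out of the loader, the new aggregation path reads roughly as follows (a self-contained sketch; the aggregate function name and the sample figures are ours, while the category_data shape and record key names come from the diff):

def aggregate(category_data):
    """Macro-average per-category scores/costs so each category counts once."""
    record = {}
    category_avg_scores, category_avg_costs = [], []
    for category, data in category_data.items():
        if data['scores']:
            avg_score = sum(data['scores']) / len(data['scores'])
            record[f'{category} score'] = avg_score
            category_avg_scores.append(avg_score)
        if data['costs']:
            avg_cost = sum(data['costs']) / len(data['costs'])
            record[f'{category} cost'] = avg_cost
            category_avg_costs.append(avg_cost)

    # Overall values fall back to None when no category was attempted at all.
    record['overall score'] = (sum(category_avg_scores) / len(category_avg_scores)
                               if category_avg_scores else None)
    record['overall cost'] = (sum(category_avg_costs) / len(category_avg_costs)
                              if category_avg_costs else None)
    return record

sample = {
    'Bug Fixing': {'scores': [0.6, 0.8], 'costs': [1.0, 3.0]},
    'App Creation': {'scores': [0.3], 'costs': [2.0]},
}
print(aggregate(sample)['overall score'])  # (0.7 + 0.3) / 2 = 0.5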
ui_components.py
CHANGED
@@ -159,7 +159,7 @@ def build_descriptions_tooltip_content(table) -> str:
     <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
     <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
     <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD)across End-to-End Discovery benchmarks.</div>
-    <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 4).</div>
+    <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
     <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
     """
     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]: