openhands committed
Commit e734bf6 · Parent(s): 0e14c25

Fix score calculation to match AstaBench methodology and update categories
Critical fixes:
- Changed the overall score calculation from a simple average of all benchmarks to a macro-average of category averages
- This ensures each category contributes equally regardless of benchmark count
- Updated Categories Attempted from 4 to 5 categories (Bug Fixing, Frontend Dev, App Creation, Test Gen, Info Gathering)
- Updated category list in leaderboard_transformer.py line 281
- Updated tooltip text to reflect 5 categories
This matches the AstaBench calculation methodology.
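
As a quick illustration of why this matters, here is a minimal sketch of the two calculations (the category grouping follows the new list, but the benchmark counts and scores are hypothetical):

# Hypothetical per-benchmark scores, grouped by category (illustrative only).
category_scores = {
    'Bug Fixing': [0.60, 0.70, 0.80],  # three benchmarks
    'App Creation': [0.20],            # one benchmark
}

# Old behavior: simple average over every benchmark score.
all_scores = [s for scores in category_scores.values() for s in scores]
simple_avg = sum(all_scores) / len(all_scores)        # (0.6 + 0.7 + 0.8 + 0.2) / 4 = 0.575

# New behavior: macro-average of the per-category averages.
category_avgs = [sum(s) / len(s) for s in category_scores.values()]
macro_avg = sum(category_avgs) / len(category_avgs)   # (0.70 + 0.20) / 2 = 0.45

Under the simple average, Bug Fixing's three benchmarks carry three times the weight of App Creation's single benchmark; the macro-average gives each category one equal vote.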
- leaderboard_transformer.py +2 -2
- simple_data_loader.py +16 -6
- ui_components.py +1 -1
leaderboard_transformer.py
CHANGED
@@ -278,9 +278,9 @@ class DataTransformer:
         # Calculated and add "Categories Attempted" column
         if primary_metric == "Overall":
             def calculate_attempted(row):
-                main_categories = ['
+                main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
                 count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
-                return f"{count}/4"
+                return f"{count}/5"
 
             # Apply the function row-wise to create the new column
             attempted_column = df_view.apply(calculate_attempted, axis=1)
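
A minimal usage sketch of the updated helper (the DataFrame below is hypothetical; only the "{category} Score" column convention is taken from the diff):

import pandas as pd

main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation',
                   'Test Generation', 'Information Gathering']

def calculate_attempted(row):
    # A category counts as attempted when its score column is non-zero.
    count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
    return f"{count}/5"

df_view = pd.DataFrame([
    {f"{c} Score": 0.5 for c in main_categories},              # attempted all five
    {**{f"{c} Score": 0.5 for c in main_categories},
     "App Creation Score": 0.0, "Test Generation Score": 0.0}, # skipped two
])

df_view["Categories Attempted"] = df_view.apply(calculate_attempted, axis=1)
print(df_view["Categories Attempted"].tolist())  # ['5/5', '3/5']

One caveat: row.get returns None for a column that is absent entirely, and None != 0.0 is true, so a missing column would still count as attempted. The code evidently assumes every score column is present, with 0.0 meaning "not attempted".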
simple_data_loader.py
CHANGED
@@ -167,18 +167,28 @@ class SimpleLeaderboardViewer:
             category_data[category]['costs'].append(row['total_cost'])
 
         # Calculate category-level aggregates
+        category_avg_scores = []
+        category_avg_costs = []
         for category, data in category_data.items():
             if data['scores']:
-
+                avg_score = sum(data['scores']) / len(data['scores'])
+                record[f'{category} score'] = avg_score
+                category_avg_scores.append(avg_score)
             if data['costs']:
-
+                avg_cost = sum(data['costs']) / len(data['costs'])
+                record[f'{category} cost'] = avg_cost
+                category_avg_costs.append(avg_cost)
 
-        # Calculate overall score and cost
-
-
-        record['overall
+        # Calculate overall score and cost as macro-average of category averages
+        # This ensures each category contributes equally regardless of benchmark count
+        if category_avg_scores:
+            record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
         else:
             record['overall score'] = None
+
+        if category_avg_costs:
+            record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
+        else:
             record['overall cost'] = None
 
         transformed_records.append(record)
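
Pulled out of the loader, the new aggregation path reads roughly as follows (a self-contained sketch; the aggregate function name and the sample figures are ours, while the category_data shape and record key names come from the diff):

def aggregate(category_data):
    """Macro-average per-category scores/costs so each category counts once."""
    record = {}
    category_avg_scores, category_avg_costs = [], []
    for category, data in category_data.items():
        if data['scores']:
            avg_score = sum(data['scores']) / len(data['scores'])
            record[f'{category} score'] = avg_score
            category_avg_scores.append(avg_score)
        if data['costs']:
            avg_cost = sum(data['costs']) / len(data['costs'])
            record[f'{category} cost'] = avg_cost
            category_avg_costs.append(avg_cost)

    # Overall values fall back to None when no category was attempted at all.
    record['overall score'] = (sum(category_avg_scores) / len(category_avg_scores)
                               if category_avg_scores else None)
    record['overall cost'] = (sum(category_avg_costs) / len(category_avg_costs)
                              if category_avg_costs else None)
    return record

sample = {
    'Bug Fixing': {'scores': [0.6, 0.8], 'costs': [1.0, 3.0]},
    'App Creation': {'scores': [0.3], 'costs': [2.0]},
}
print(aggregate(sample)['overall score'])  # (0.7 + 0.3) / 2 = 0.5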
ui_components.py
CHANGED
@@ -159,7 +159,7 @@ def build_descriptions_tooltip_content(table) -> str:
     <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
     <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
     <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD)across End-to-End Discovery benchmarks.</div>
-    <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 4).</div>
+    <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
     <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
     """
     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]: