openhands committed
Commit e734bf6 · Parent: 0e14c25

Fix score calculation to match AstaBench methodology and update categories

Critical fixes:
- Changed the overall score calculation from a simple average of all benchmarks to a macro-average of category averages
- This ensures each category contributes equally regardless of its benchmark count
- Updated "Categories Attempted" from 4 to 5 categories (Bug Fixing, Frontend Dev, App Creation, Test Gen, Info Gathering)
- Updated the category list in leaderboard_transformer.py line 281
- Updated the tooltip text to reflect 5 categories

This matches the AstaBench calculation methodology.
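
For context, a minimal sketch (with made-up numbers, not from this repo) of why the two averaging schemes diverge when categories have unequal benchmark counts:

# Hypothetical scores: category A has three benchmarks, category B has one.
scores_by_category = {
    "A": [0.90, 0.80, 0.70],  # category average: 0.80
    "B": [0.40],              # category average: 0.40
}

# Old behavior: simple average over all benchmarks; A's three entries dominate.
all_scores = [s for scores in scores_by_category.values() for s in scores]
micro = sum(all_scores) / len(all_scores)  # (0.9 + 0.8 + 0.7 + 0.4) / 4 = 0.70

# New behavior: macro-average of category averages; each category counts once.
category_averages = [sum(s) / len(s) for s in scores_by_category.values()]
macro = sum(category_averages) / len(category_averages)  # (0.80 + 0.40) / 2 = 0.60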

leaderboard_transformer.py CHANGED
@@ -278,9 +278,9 @@ class DataTransformer:
         # Calculated and add "Categories Attempted" column
         if primary_metric == "Overall":
             def calculate_attempted(row):
-                main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
+                main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
                 count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
-                return f"{count}/4"
+                return f"{count}/5"
 
             # Apply the function row-wise to create the new column
             attempted_column = df_view.apply(calculate_attempted, axis=1)
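
As a quick illustration, here is how the updated counter behaves on a hypothetical row (column names follow the `{category} Score` pattern used above):

main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation',
                   'Test Generation', 'Information Gathering']

# Hypothetical leaderboard row; a 0.0 score means the category was not attempted.
row = {
    "Bug Fixing Score": 0.62,
    "Frontend Development Score": 0.0,
    "App Creation Score": 0.41,
    "Test Generation Score": 0.55,
    "Information Gathering Score": 0.0,
}

count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
print(f"{count}/5")  # -> 3/5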
simple_data_loader.py CHANGED
@@ -167,18 +167,28 @@ class SimpleLeaderboardViewer:
                 category_data[category]['costs'].append(row['total_cost'])
 
         # Calculate category-level aggregates
+        category_avg_scores = []
+        category_avg_costs = []
         for category, data in category_data.items():
             if data['scores']:
-                record[f'{category} score'] = sum(data['scores']) / len(data['scores'])
+                avg_score = sum(data['scores']) / len(data['scores'])
+                record[f'{category} score'] = avg_score
+                category_avg_scores.append(avg_score)
             if data['costs']:
-                record[f'{category} cost'] = sum(data['costs']) / len(data['costs'])
+                avg_cost = sum(data['costs']) / len(data['costs'])
+                record[f'{category} cost'] = avg_cost
+                category_avg_costs.append(avg_cost)
 
-        # Calculate overall score and cost (average across all benchmarks)
-        if dataset_scores:
-            record['overall score'] = sum(dataset_scores) / len(dataset_scores)
-            record['overall cost'] = sum(dataset_costs) / len(dataset_costs)
+        # Calculate overall score and cost as macro-average of category averages
+        # This ensures each category contributes equally regardless of benchmark count
+        if category_avg_scores:
+            record['overall score'] = sum(category_avg_scores) / len(category_avg_scores)
         else:
             record['overall score'] = None
+
+        if category_avg_costs:
+            record['overall cost'] = sum(category_avg_costs) / len(category_avg_costs)
+        else:
             record['overall cost'] = None
 
         transformed_records.append(record)
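
A self-contained sketch of the new aggregation path, with hypothetical inputs, showing that a category with two benchmarks no longer outweighs a category with one:

# Hypothetical per-category raw results, as collected above.
category_data = {
    "Bug Fixing": {"scores": [0.6, 0.8], "costs": [0.10, 0.30]},
    "Test Generation": {"scores": [0.5], "costs": [0.05]},
}
record = {}

category_avg_scores, category_avg_costs = [], []
for category, data in category_data.items():
    if data["scores"]:
        avg_score = sum(data["scores"]) / len(data["scores"])
        record[f"{category} score"] = avg_score
        category_avg_scores.append(avg_score)
    if data["costs"]:
        avg_cost = sum(data["costs"]) / len(data["costs"])
        record[f"{category} cost"] = avg_cost
        category_avg_costs.append(avg_cost)

# Macro-average: Bug Fixing counts once despite having two benchmarks.
record["overall score"] = sum(category_avg_scores) / len(category_avg_scores)  # (0.7 + 0.5) / 2 = 0.60
record["overall cost"] = sum(category_avg_costs) / len(category_avg_costs)    # (0.2 + 0.05) / 2 = 0.125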
ui_components.py CHANGED
@@ -159,7 +159,7 @@ def build_descriptions_tooltip_content(table) -> str:
     <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
     <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
     <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD)across End-to-End Discovery benchmarks.</div>
-    <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 4).</div>
+    <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
     <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
     """
     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]: