Spaces:
Running
Fix Pareto frontier calculation and display
Two bugs were causing all models to show the frontier trophy icon:
1. The 'id' field was set to just the SDK version (e.g., 'v1.8.3') instead
of a unique identifier. This caused multiple models to match when
checking if they're on the Pareto frontier.
Fix: Use agent_id (version_model) as the unique identifier.
2. get_pareto_df() was using the first Cost/Score columns it found
(e.g., 'Swe-Bench-Multimodal Cost') instead of 'Average Cost' and
'Average Score' which should be used for the Overall leaderboard.
Fix: Default to Average Cost/Score columns when available.
Now only the true Pareto optimal models receive the trophy icon:
- deepseek-v3.2-reasoner (lowest cost)
- gemini-3-flash (best score at ~$0.54)
- gpt-5.2 (best score at ~$0.86)
- claude-4.5-opus (highest score)
Co-authored-by: openhands <openhands@all-hands.dev>
- leaderboard_transformer.py +34 -13
- simple_data_loader.py +2 -1
|
@@ -789,30 +789,51 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 789 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 790 |
|
| 791 |
|
| 792 |
-
def get_pareto_df(data):
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
return pd.DataFrame()
|
| 797 |
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
frontier_data =
|
| 801 |
-
frontier_data
|
| 802 |
-
frontier_data[x_col] = pd.to_numeric(frontier_data[x_col], errors='coerce')
|
| 803 |
-
frontier_data.dropna(subset=[x_col, y_col], inplace=True)
|
| 804 |
if frontier_data.empty:
|
| 805 |
return pd.DataFrame()
|
| 806 |
|
| 807 |
-
|
|
|
|
| 808 |
|
| 809 |
pareto_points = []
|
| 810 |
max_score_at_cost = -np.inf
|
| 811 |
|
| 812 |
for _, row in frontier_data.iterrows():
|
| 813 |
-
if row[
|
| 814 |
pareto_points.append(row)
|
| 815 |
-
max_score_at_cost = row[
|
| 816 |
|
| 817 |
return pd.DataFrame(pareto_points)
|
| 818 |
|
|
|
|
| 789 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 790 |
|
| 791 |
|
| 792 |
+
def get_pareto_df(data, cost_col=None, score_col=None):
    """Calculate the Pareto frontier (cost vs. score) for the given data.

    Args:
        data: DataFrame with cost and score columns.
        cost_col: Cost column to use. Defaults to 'Average Cost' when
            present, otherwise the first column whose name contains 'Cost'.
        score_col: Score column to use. Defaults to 'Average Score' when
            present, otherwise the first column whose name contains 'Score'.

    Returns:
        DataFrame containing only the rows on the Pareto frontier, i.e.
        rows for which no other row has lower-or-equal cost and a higher
        score. Returns an empty DataFrame when no usable cost/score
        columns exist or no rows survive numeric coercion.
    """
    # Prefer the Average columns so the Overall leaderboard frontier is not
    # driven by whichever benchmark-specific column happens to come first.
    if cost_col is None:
        cost_col = 'Average Cost' if 'Average Cost' in data.columns else None
        if cost_col is None:
            cost_cols = [c for c in data.columns if 'Cost' in c]
            cost_col = cost_cols[0] if cost_cols else None

    if score_col is None:
        score_col = 'Average Score' if 'Average Score' in data.columns else None
        if score_col is None:
            score_cols = [c for c in data.columns if 'Score' in c]
            score_col = score_cols[0] if score_cols else None

    if cost_col is None or score_col is None:
        return pd.DataFrame()

    # Coerce both columns to numeric; rows that cannot be compared are dropped.
    frontier_data = data.dropna(subset=[cost_col, score_col]).copy()
    frontier_data[score_col] = pd.to_numeric(frontier_data[score_col], errors='coerce')
    frontier_data[cost_col] = pd.to_numeric(frontier_data[cost_col], errors='coerce')
    frontier_data.dropna(subset=[cost_col, score_col], inplace=True)
    if frontier_data.empty:
        return pd.DataFrame()

    # Sort by cost ascending, then score descending, so a single left-to-right
    # sweep keeping only strict score improvements yields the frontier.
    frontier_data = frontier_data.sort_values(by=[cost_col, score_col], ascending=[True, False])

    pareto_points = []
    max_score_at_cost = -np.inf
    for _, row in frontier_data.iterrows():
        # Strict '>' (not '>='): a point that merely ties a cheaper model's
        # score is weakly dominated and must not receive the trophy.
        if row[score_col] > max_score_at_cost:
            pareto_points.append(row)
            max_score_at_cost = row[score_col]

    return pd.DataFrame(pareto_points)
|
| 839 |
|
|
@@ -252,7 +252,8 @@ class SimpleLeaderboardViewer:
|
|
| 252 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
| 253 |
'date': first_record['submission_time'], # Will become "Date"
|
| 254 |
# Additional columns expected by the transformer
|
| 255 |
-
|
|
|
|
| 256 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 257 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 258 |
}
|
|
|
|
| 252 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
| 253 |
'date': first_record['submission_time'], # Will become "Date"
|
| 254 |
# Additional columns expected by the transformer
|
| 255 |
+
# Use agent_id (version_model) as unique identifier for Pareto frontier calculation
|
| 256 |
+
'id': agent_id,
|
| 257 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 258 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 259 |
}
|