openhands openhands committed on
Commit
1739efc
·
1 Parent(s): eff7f34

Fix Pareto frontier calculation and display

Browse files

Two bugs were causing all models to show the frontier trophy icon:

1. The 'id' field was set to just the SDK version (e.g., 'v1.8.3') instead
of a unique identifier. This caused multiple models to match when
checking if they're on the Pareto frontier.

Fix: Use agent_id (version_model) as the unique identifier.

2. get_pareto_df() was using the first Cost/Score columns it found
(e.g., 'Swe-Bench-Multimodal Cost') instead of 'Average Cost' and
'Average Score' which should be used for the Overall leaderboard.

Fix: Default to Average Cost/Score columns when available.

Now only the true Pareto optimal models receive the trophy icon:
- deepseek-v3.2-reasoner (lowest cost)
- gemini-3-flash (best score at ~$0.54)
- gpt-5.2 (best score at ~$0.86)
- claude-4.5-opus (highest score)

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (2) hide show
  1. leaderboard_transformer.py +34 -13
  2. simple_data_loader.py +2 -1
leaderboard_transformer.py CHANGED
@@ -789,30 +789,51 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
789
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
790
 
791
 
792
def get_pareto_df(data, cost_col=None, score_col=None):
    """
    Return the rows of *data* that lie on the cost/score Pareto frontier.

    Args:
        data: DataFrame containing at least one cost column and one score
            column.
        cost_col: Cost column to use. Defaults to 'Average Cost' when
            present, otherwise the first column whose name contains 'Cost'.
        score_col: Score column to use. Defaults to 'Average Score' when
            present, otherwise the first column whose name contains 'Score'.

    Returns:
        DataFrame holding only the Pareto-optimal rows: no other row has
        lower-or-equal cost together with a strictly higher score. Empty
        DataFrame when no usable columns or rows exist.
    """
    # Prefer the Average columns so the Overall leaderboard is not ranked
    # by whichever benchmark column happens to come first.
    if cost_col is None:
        if 'Average Cost' in data.columns:
            cost_col = 'Average Cost'
        else:
            cost_cols = [c for c in data.columns if 'Cost' in c]
            cost_col = cost_cols[0] if cost_cols else None
    if score_col is None:
        if 'Average Score' in data.columns:
            score_col = 'Average Score'
        else:
            score_cols = [c for c in data.columns if 'Score' in c]
            score_col = score_cols[0] if score_cols else None

    if cost_col is None or score_col is None:
        return pd.DataFrame()

    frontier_data = data.copy()
    # Coerce first so non-numeric entries (e.g. '-') become NaN and are
    # dropped together with genuinely missing values in one pass.
    frontier_data[score_col] = pd.to_numeric(frontier_data[score_col], errors='coerce')
    frontier_data[cost_col] = pd.to_numeric(frontier_data[cost_col], errors='coerce')
    frontier_data.dropna(subset=[cost_col, score_col], inplace=True)
    if frontier_data.empty:
        return pd.DataFrame()

    # Cheapest first; on cost ties the best score comes first so only that
    # row of the tie can reach the frontier.
    frontier_data = frontier_data.sort_values(by=[cost_col, score_col], ascending=[True, False])

    pareto_points = []
    max_score_at_cost = -np.inf
    for _, row in frontier_data.iterrows():
        # Strictly greater: a row merely matching the best score seen so
        # far at a higher cost is dominated and stays off the frontier.
        if row[score_col] > max_score_at_cost:
            pareto_points.append(row)
            max_score_at_cost = row[score_col]

    return pd.DataFrame(pareto_points)
 
 
789
  return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
790
 
791
 
792
def get_pareto_df(data, cost_col=None, score_col=None):
    """
    Calculate the Pareto frontier for the given data.

    Args:
        data: DataFrame with cost and score columns
        cost_col: Specific cost column to use (default: 'Average Cost')
        score_col: Specific score column to use (default: 'Average Score')

    Returns:
        DataFrame containing only the rows on the Pareto frontier — rows
        for which no other row has lower-or-equal cost and a strictly
        higher score. Empty DataFrame when no usable columns/rows exist.
    """
    # Use Average Cost/Score by default for the Overall leaderboard
    if cost_col is None:
        cost_col = 'Average Cost' if 'Average Cost' in data.columns else None
        if cost_col is None:
            cost_cols = [c for c in data.columns if 'Cost' in c]
            cost_col = cost_cols[0] if cost_cols else None

    if score_col is None:
        score_col = 'Average Score' if 'Average Score' in data.columns else None
        if score_col is None:
            score_cols = [c for c in data.columns if 'Score' in c]
            score_col = score_cols[0] if score_cols else None

    if cost_col is None or score_col is None:
        return pd.DataFrame()

    # Coerce to numeric before dropping: non-numeric entries become NaN,
    # so one dropna after coercion covers both missing and bad values.
    frontier_data = data.copy()
    frontier_data[score_col] = pd.to_numeric(frontier_data[score_col], errors='coerce')
    frontier_data[cost_col] = pd.to_numeric(frontier_data[cost_col], errors='coerce')
    frontier_data.dropna(subset=[cost_col, score_col], inplace=True)

    if frontier_data.empty:
        return pd.DataFrame()

    # Sort by cost ascending, then by score descending
    frontier_data = frontier_data.sort_values(by=[cost_col, score_col], ascending=[True, False])

    pareto_points = []
    max_score_at_cost = -np.inf

    for _, row in frontier_data.iterrows():
        # Strict comparison: an equal score at a higher cost is dominated
        # and must not receive the frontier trophy.
        if row[score_col] > max_score_at_cost:
            pareto_points.append(row)
            max_score_at_cost = row[score_col]

    return pd.DataFrame(pareto_points)
839
 
simple_data_loader.py CHANGED
@@ -252,7 +252,8 @@ class SimpleLeaderboardViewer:
252
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
253
  'date': first_record['submission_time'], # Will become "Date"
254
  # Additional columns expected by the transformer
255
- 'id': first_record.get('id', agent_version), # Will become "Id"
 
256
  'source': first_record.get('source', ''), # Will become "Source"
257
  'logs': first_record.get('logs', ''), # Will become "Logs"
258
  }
 
252
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
253
  'date': first_record['submission_time'], # Will become "Date"
254
  # Additional columns expected by the transformer
255
+ # Use agent_id (version_model) as unique identifier for Pareto frontier calculation
256
+ 'id': agent_id,
257
  'source': first_record.get('source', ''), # Will become "Source"
258
  'logs': first_record.get('logs', ''), # Will become "Logs"
259
  }