Spaces:
Running
Fix Pareto frontier calculation and display
Two bugs were causing all models to show the frontier trophy icon:
1. The 'id' field was set to just the SDK version (e.g., 'v1.8.3') instead
of a unique identifier. This caused multiple models to match when
checking if they're on the Pareto frontier.
Fix: Use agent_id (version_model) as the unique identifier.
2. get_pareto_df() was using the first Cost/Score columns it found
(e.g., 'Swe-Bench-Multimodal Cost') instead of 'Average Cost' and
'Average Score' which should be used for the Overall leaderboard.
Fix: Default to Average Cost/Score columns when available.
Now only the true Pareto optimal models receive the trophy icon:
- deepseek-v3.2-reasoner (lowest cost)
- gemini-3-flash (best score at ~$0.54)
- gpt-5.2 (best score at ~$0.86)
- claude-4.5-opus (highest score)
Co-authored-by: openhands <openhands@all-hands.dev>
- leaderboard_transformer.py +34 -13
- simple_data_loader.py +2 -1
|
@@ -789,30 +789,51 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 789 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 790 |
|
| 791 |
|
| 792 |
-
def get_pareto_df(data):
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
return pd.DataFrame()
|
| 797 |
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
frontier_data =
|
| 801 |
-
frontier_data
|
| 802 |
-
frontier_data[x_col] = pd.to_numeric(frontier_data[x_col], errors='coerce')
|
| 803 |
-
frontier_data.dropna(subset=[x_col, y_col], inplace=True)
|
| 804 |
if frontier_data.empty:
|
| 805 |
return pd.DataFrame()
|
| 806 |
|
| 807 |
-
|
|
|
|
| 808 |
|
| 809 |
pareto_points = []
|
| 810 |
max_score_at_cost = -np.inf
|
| 811 |
|
| 812 |
for _, row in frontier_data.iterrows():
|
| 813 |
-
if row[
|
| 814 |
pareto_points.append(row)
|
| 815 |
-
max_score_at_cost = row[
|
| 816 |
|
| 817 |
return pd.DataFrame(pareto_points)
|
| 818 |
|
|
|
|
| 789 |
return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)})
|
| 790 |
|
| 791 |
|
| 792 |
+
def get_pareto_df(data, cost_col=None, score_col=None):
    """Calculate the Pareto frontier (cost vs. score) for the given data.

    Args:
        data: DataFrame with cost and score columns.
        cost_col: Cost column to use. Defaults to 'Average Cost' when
            present, otherwise the first column whose name contains 'Cost'.
        score_col: Score column to use. Defaults to 'Average Score' when
            present, otherwise the first column whose name contains 'Score'.

    Returns:
        DataFrame containing only the rows on the Pareto frontier, i.e.
        rows for which no other row has lower-or-equal cost and a higher
        score. Returns an empty DataFrame when no usable cost/score
        columns exist or no rows survive numeric coercion.
    """
    # Prefer the Average columns so the Overall leaderboard frontier is not
    # driven by whichever benchmark-specific column happens to come first.
    if cost_col is None:
        cost_col = 'Average Cost' if 'Average Cost' in data.columns else None
        if cost_col is None:
            cost_cols = [c for c in data.columns if 'Cost' in c]
            cost_col = cost_cols[0] if cost_cols else None

    if score_col is None:
        score_col = 'Average Score' if 'Average Score' in data.columns else None
        if score_col is None:
            score_cols = [c for c in data.columns if 'Score' in c]
            score_col = score_cols[0] if score_cols else None

    if cost_col is None or score_col is None:
        return pd.DataFrame()

    # Coerce both columns to numeric; rows that cannot be compared are dropped.
    frontier_data = data.dropna(subset=[cost_col, score_col]).copy()
    frontier_data[score_col] = pd.to_numeric(frontier_data[score_col], errors='coerce')
    frontier_data[cost_col] = pd.to_numeric(frontier_data[cost_col], errors='coerce')
    frontier_data.dropna(subset=[cost_col, score_col], inplace=True)
    if frontier_data.empty:
        return pd.DataFrame()

    # Sort by cost ascending, then score descending, so a single left-to-right
    # sweep keeping only strict score improvements yields the frontier.
    frontier_data = frontier_data.sort_values(by=[cost_col, score_col], ascending=[True, False])

    pareto_points = []
    max_score_at_cost = -np.inf
    for _, row in frontier_data.iterrows():
        # Strict '>' (not '>='): a point that merely ties a cheaper model's
        # score is weakly dominated and must not receive the trophy.
        if row[score_col] > max_score_at_cost:
            pareto_points.append(row)
            max_score_at_cost = row[score_col]

    return pd.DataFrame(pareto_points)
|
| 839 |
|
|
@@ -252,7 +252,8 @@ class SimpleLeaderboardViewer:
|
|
| 252 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
| 253 |
'date': first_record['submission_time'], # Will become "Date"
|
| 254 |
# Additional columns expected by the transformer
|
| 255 |
-
|
|
|
|
| 256 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 257 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 258 |
}
|
|
|
|
| 252 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
| 253 |
'date': first_record['submission_time'], # Will become "Date"
|
| 254 |
# Additional columns expected by the transformer
|
| 255 |
+
# Use agent_id (version_model) as unique identifier for Pareto frontier calculation
|
| 256 |
+
'id': agent_id,
|
| 257 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 258 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
| 259 |
}
|