Spaces:
Running
Running
terryyz
committed on
Commit
·
dfe9318
1
Parent(s):
5f8c666
fix
Browse files
- elo_calculation.py +14 -8
- ranking.py +27 -8
elo_calculation.py
CHANGED
|
@@ -211,8 +211,9 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
|
|
| 211 |
confidence_intervals = {} # Initialize to avoid uninitialized variable error
|
| 212 |
|
| 213 |
# Check if we have sufficient data for Bradley-Terry model
|
| 214 |
-
|
| 215 |
-
|
|
|
|
| 216 |
all_models = set(
|
| 217 |
battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
|
| 218 |
)
|
|
@@ -247,9 +248,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
|
|
| 247 |
for model in elo_ratings.index:
|
| 248 |
confidence_intervals[model] = 0
|
| 249 |
except Exception as bootstrap_error:
|
| 250 |
-
print(
|
| 251 |
-
f"Bootstrap calculation failed: {bootstrap_error}, skipping confidence intervals"
|
| 252 |
-
)
|
| 253 |
for model in elo_ratings.index:
|
| 254 |
confidence_intervals[model] = 0
|
| 255 |
else:
|
|
@@ -258,9 +256,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
|
|
| 258 |
confidence_intervals[model] = 0
|
| 259 |
except Exception as e:
|
| 260 |
# Fallback to old method if Bradley-Terry fails
|
| 261 |
-
print(
|
| 262 |
-
f"Bradley-Terry calculation failed: {e}, falling back to online Elo"
|
| 263 |
-
)
|
| 264 |
old_elo_ratings = compute_online_elo(battles_df)
|
| 265 |
elo_ratings = pd.Series(old_elo_ratings)
|
| 266 |
confidence_intervals = {model: 0 for model in elo_ratings.index}
|
|
@@ -270,6 +265,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
|
|
| 270 |
def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
|
| 271 |
"""
|
| 272 |
Create ranking DataFrame with all necessary columns
|
|
|
|
| 273 |
|
| 274 |
Args:
|
| 275 |
elo_ratings (pd.Series): Elo ratings for each model
|
|
@@ -278,13 +274,19 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
|
|
| 278 |
|
| 279 |
Returns:
|
| 280 |
pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
|
|
|
|
| 281 |
"""
|
| 282 |
# Load model metadata
|
| 283 |
metadata = load_model_metadata()
|
| 284 |
|
| 285 |
# Create ranking list with Elo ratings and confidence intervals
|
|
|
|
| 286 |
ranking_list = []
|
| 287 |
for model in elo_ratings.index:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
ci_margin = confidence_intervals.get(model, 0)
|
| 289 |
|
| 290 |
# Get metadata for this model
|
|
@@ -303,6 +305,10 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
|
|
| 303 |
}
|
| 304 |
)
|
| 305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
# Sort by Elo rating (highest first)
|
| 307 |
ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
|
| 308 |
ranking_df["Rank"] = range(1, len(ranking_df) + 1)
|
|
|
|
| 211 |
confidence_intervals = {} # Initialize to avoid uninitialized variable error
|
| 212 |
|
| 213 |
# Check if we have sufficient data for Bradley-Terry model
|
| 214 |
+
# Since we only display models with >= 10 votes, we need enough battles
|
| 215 |
+
if len(battles_df) < 10:
|
| 216 |
+
# Not enough battles for reliable ranking
|
| 217 |
all_models = set(
|
| 218 |
battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
|
| 219 |
)
|
|
|
|
| 248 |
for model in elo_ratings.index:
|
| 249 |
confidence_intervals[model] = 0
|
| 250 |
except Exception as bootstrap_error:
|
|
|
|
|
|
|
|
|
|
| 251 |
for model in elo_ratings.index:
|
| 252 |
confidence_intervals[model] = 0
|
| 253 |
else:
|
|
|
|
| 256 |
confidence_intervals[model] = 0
|
| 257 |
except Exception as e:
|
| 258 |
# Fallback to old method if Bradley-Terry fails
|
|
|
|
|
|
|
|
|
|
| 259 |
old_elo_ratings = compute_online_elo(battles_df)
|
| 260 |
elo_ratings = pd.Series(old_elo_ratings)
|
| 261 |
confidence_intervals = {model: 0 for model in elo_ratings.index}
|
|
|
|
| 265 |
def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
|
| 266 |
"""
|
| 267 |
Create ranking DataFrame with all necessary columns
|
| 268 |
+
Only includes models with at least 10 battles
|
| 269 |
|
| 270 |
Args:
|
| 271 |
elo_ratings (pd.Series): Elo ratings for each model
|
|
|
|
| 274 |
|
| 275 |
Returns:
|
| 276 |
pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
|
| 277 |
+
Empty DataFrame if no models have >= 10 votes
|
| 278 |
"""
|
| 279 |
# Load model metadata
|
| 280 |
metadata = load_model_metadata()
|
| 281 |
|
| 282 |
# Create ranking list with Elo ratings and confidence intervals
|
| 283 |
+
# Only include models with at least 10 battles
|
| 284 |
ranking_list = []
|
| 285 |
for model in elo_ratings.index:
|
| 286 |
+
# Skip models with fewer than 10 votes
|
| 287 |
+
if vote_counts.get(model, 0) < 10:
|
| 288 |
+
continue
|
| 289 |
+
|
| 290 |
ci_margin = confidence_intervals.get(model, 0)
|
| 291 |
|
| 292 |
# Get metadata for this model
|
|
|
|
| 305 |
}
|
| 306 |
)
|
| 307 |
|
| 308 |
+
# Return empty DataFrame if no models meet the minimum vote threshold
|
| 309 |
+
if not ranking_list:
|
| 310 |
+
return pd.DataFrame()
|
| 311 |
+
|
| 312 |
# Sort by Elo rating (highest first)
|
| 313 |
ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
|
| 314 |
ranking_df["Rank"] = range(1, len(ranking_df) + 1)
|
ranking.py
CHANGED
|
@@ -5,6 +5,7 @@ Handles model leaderboard functionality and data management
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
import pandas as pd
|
|
|
|
| 8 |
import datetime
|
| 9 |
import os
|
| 10 |
from collections import defaultdict
|
|
@@ -61,16 +62,28 @@ def load_ranking_data(hf_token=None, force_reload=False):
|
|
| 61 |
return pd.DataFrame()
|
| 62 |
|
| 63 |
# Filter to only include samples where both models have code in their responses
|
| 64 |
-
# code_a and code_b
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
if 'code_a' in df.columns and 'code_b' in df.columns:
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
]
|
| 70 |
|
|
|
|
| 71 |
if df.empty:
|
| 72 |
return pd.DataFrame()
|
| 73 |
-
|
| 74 |
# Convert vote format for Elo calculation and count votes
|
| 75 |
battle_data = []
|
| 76 |
vote_counts = defaultdict(int)
|
|
@@ -131,6 +144,10 @@ def update_ranking_display():
|
|
| 131 |
if df.empty:
|
| 132 |
return gr.update(value=df), "**Last Updated:** No data available"
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
last_update = (
|
| 135 |
ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
|
| 136 |
if ranking_last_updated
|
|
@@ -145,6 +162,10 @@ def force_update_ranking_display():
|
|
| 145 |
if df.empty:
|
| 146 |
return gr.update(value=df), "**Last Updated:** No data available"
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
last_update = (
|
| 149 |
ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
|
| 150 |
if ranking_last_updated
|
|
@@ -172,7 +193,6 @@ def create_ranking_tab():
|
|
| 172 |
"95% CI (±)",
|
| 173 |
"Votes",
|
| 174 |
"Organization",
|
| 175 |
-
"License",
|
| 176 |
],
|
| 177 |
datatype=[
|
| 178 |
"number",
|
|
@@ -181,7 +201,6 @@ def create_ranking_tab():
|
|
| 181 |
"str",
|
| 182 |
"number",
|
| 183 |
"str",
|
| 184 |
-
"str",
|
| 185 |
],
|
| 186 |
label="Model Rankings",
|
| 187 |
interactive=False,
|
|
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
import datetime
|
| 10 |
import os
|
| 11 |
from collections import defaultdict
|
|
|
|
| 62 |
return pd.DataFrame()
|
| 63 |
|
| 64 |
# Filter to only include samples where both models have code in their responses
|
| 65 |
+
# code_a and code_b are lists/arrays of dicts, check if each dict has non-empty "code"
|
| 66 |
+
def has_valid_code(x):
|
| 67 |
+
"""Check if x is a list/array of dicts where each dict has a non-empty 'code' field"""
|
| 68 |
+
# Handle None, NaN, and other non-list/array values
|
| 69 |
+
if x is None or (not isinstance(x, (list, np.ndarray))) or len(x) == 0:
|
| 70 |
+
return False
|
| 71 |
+
return all(
|
| 72 |
+
isinstance(item, dict) and
|
| 73 |
+
'code' in item and
|
| 74 |
+
item['code'] and
|
| 75 |
+
len(str(item['code']).strip()) > 0
|
| 76 |
+
for item in x
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
if 'code_a' in df.columns and 'code_b' in df.columns:
|
| 80 |
+
# Filter rows where both code_a and code_b have valid code
|
| 81 |
+
valid_code_a = df['code_a'].apply(has_valid_code)
|
| 82 |
+
valid_code_b = df['code_b'].apply(has_valid_code)
|
|
|
|
| 83 |
|
| 84 |
+
df = df[valid_code_a & valid_code_b]
|
| 85 |
if df.empty:
|
| 86 |
return pd.DataFrame()
|
|
|
|
| 87 |
# Convert vote format for Elo calculation and count votes
|
| 88 |
battle_data = []
|
| 89 |
vote_counts = defaultdict(int)
|
|
|
|
| 144 |
if df.empty:
|
| 145 |
return gr.update(value=df), "**Last Updated:** No data available"
|
| 146 |
|
| 147 |
+
# Drop License column if it exists
|
| 148 |
+
if 'License' in df.columns:
|
| 149 |
+
df = df.drop(columns=['License'])
|
| 150 |
+
|
| 151 |
last_update = (
|
| 152 |
ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
|
| 153 |
if ranking_last_updated
|
|
|
|
| 162 |
if df.empty:
|
| 163 |
return gr.update(value=df), "**Last Updated:** No data available"
|
| 164 |
|
| 165 |
+
# Drop License column if it exists
|
| 166 |
+
if 'License' in df.columns:
|
| 167 |
+
df = df.drop(columns=['License'])
|
| 168 |
+
|
| 169 |
last_update = (
|
| 170 |
ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
|
| 171 |
if ranking_last_updated
|
|
|
|
| 193 |
"95% CI (±)",
|
| 194 |
"Votes",
|
| 195 |
"Organization",
|
|
|
|
| 196 |
],
|
| 197 |
datatype=[
|
| 198 |
"number",
|
|
|
|
| 201 |
"str",
|
| 202 |
"number",
|
| 203 |
"str",
|
|
|
|
| 204 |
],
|
| 205 |
label="Model Rankings",
|
| 206 |
interactive=False,
|