terryyz committed on
Commit
dfe9318
·
1 Parent(s): 5f8c666
Files changed (2) hide show
  1. elo_calculation.py +14 -8
  2. ranking.py +27 -8
elo_calculation.py CHANGED
@@ -211,8 +211,9 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
211
  confidence_intervals = {} # Initialize to avoid uninitialized variable error
212
 
213
  # Check if we have sufficient data for Bradley-Terry model
214
- if len(battles_df) < 2:
215
- # Not enough battles, use default ratings
 
216
  all_models = set(
217
  battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
218
  )
@@ -247,9 +248,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
247
  for model in elo_ratings.index:
248
  confidence_intervals[model] = 0
249
  except Exception as bootstrap_error:
250
- print(
251
- f"Bootstrap calculation failed: {bootstrap_error}, skipping confidence intervals"
252
- )
253
  for model in elo_ratings.index:
254
  confidence_intervals[model] = 0
255
  else:
@@ -258,9 +256,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
258
  confidence_intervals[model] = 0
259
  except Exception as e:
260
  # Fallback to old method if Bradley-Terry fails
261
- print(
262
- f"Bradley-Terry calculation failed: {e}, falling back to online Elo"
263
- )
264
  old_elo_ratings = compute_online_elo(battles_df)
265
  elo_ratings = pd.Series(old_elo_ratings)
266
  confidence_intervals = {model: 0 for model in elo_ratings.index}
@@ -270,6 +265,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
270
  def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
271
  """
272
  Create ranking DataFrame with all necessary columns
 
273
 
274
  Args:
275
  elo_ratings (pd.Series): Elo ratings for each model
@@ -278,13 +274,19 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
278
 
279
  Returns:
280
  pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
 
281
  """
282
  # Load model metadata
283
  metadata = load_model_metadata()
284
 
285
  # Create ranking list with Elo ratings and confidence intervals
 
286
  ranking_list = []
287
  for model in elo_ratings.index:
 
 
 
 
288
  ci_margin = confidence_intervals.get(model, 0)
289
 
290
  # Get metadata for this model
@@ -303,6 +305,10 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
303
  }
304
  )
305
 
 
 
 
 
306
  # Sort by Elo rating (highest first)
307
  ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
308
  ranking_df["Rank"] = range(1, len(ranking_df) + 1)
 
211
  confidence_intervals = {} # Initialize to avoid uninitialized variable error
212
 
213
  # Check if we have sufficient data for Bradley-Terry model
214
+ # Since we only display models with >= 10 votes, we need enough battles
215
+ if len(battles_df) < 10:
216
+ # Not enough battles for reliable ranking
217
  all_models = set(
218
  battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
219
  )
 
248
  for model in elo_ratings.index:
249
  confidence_intervals[model] = 0
250
  except Exception as bootstrap_error:
 
 
 
251
  for model in elo_ratings.index:
252
  confidence_intervals[model] = 0
253
  else:
 
256
  confidence_intervals[model] = 0
257
  except Exception as e:
258
  # Fallback to old method if Bradley-Terry fails
 
 
 
259
  old_elo_ratings = compute_online_elo(battles_df)
260
  elo_ratings = pd.Series(old_elo_ratings)
261
  confidence_intervals = {model: 0 for model in elo_ratings.index}
 
265
  def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
266
  """
267
  Create ranking DataFrame with all necessary columns
268
+ Only includes models with at least 10 votes
269
 
270
  Args:
271
  elo_ratings (pd.Series): Elo ratings for each model
 
274
 
275
  Returns:
276
  pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
277
+ Empty DataFrame if no models have >= 10 votes
278
  """
279
  # Load model metadata
280
  metadata = load_model_metadata()
281
 
282
  # Create ranking list with Elo ratings and confidence intervals
283
+ # Only include models with at least 10 votes
284
  ranking_list = []
285
  for model in elo_ratings.index:
286
+ # Skip models with fewer than 10 votes
287
+ if vote_counts.get(model, 0) < 10:
288
+ continue
289
+
290
  ci_margin = confidence_intervals.get(model, 0)
291
 
292
  # Get metadata for this model
 
305
  }
306
  )
307
 
308
+ # Return empty DataFrame if no models meet the minimum vote threshold
309
+ if not ranking_list:
310
+ return pd.DataFrame()
311
+
312
  # Sort by Elo rating (highest first)
313
  ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
314
  ranking_df["Rank"] = range(1, len(ranking_df) + 1)
ranking.py CHANGED
@@ -5,6 +5,7 @@ Handles model leaderboard functionality and data management
5
 
6
  import gradio as gr
7
  import pandas as pd
 
8
  import datetime
9
  import os
10
  from collections import defaultdict
@@ -61,16 +62,28 @@ def load_ranking_data(hf_token=None, force_reload=False):
61
  return pd.DataFrame()
62
 
63
  # Filter to only include samples where both models have code in their responses
64
- # code_a and code_b should be non-empty lists
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  if 'code_a' in df.columns and 'code_b' in df.columns:
66
- df = df[
67
- df['code_a'].apply(lambda x: isinstance(x, list) and len(x) > 0) &
68
- df['code_b'].apply(lambda x: isinstance(x, list) and len(x) > 0)
69
- ]
70
 
 
71
  if df.empty:
72
  return pd.DataFrame()
73
-
74
  # Convert vote format for Elo calculation and count votes
75
  battle_data = []
76
  vote_counts = defaultdict(int)
@@ -131,6 +144,10 @@ def update_ranking_display():
131
  if df.empty:
132
  return gr.update(value=df), "**Last Updated:** No data available"
133
 
 
 
 
 
134
  last_update = (
135
  ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
136
  if ranking_last_updated
@@ -145,6 +162,10 @@ def force_update_ranking_display():
145
  if df.empty:
146
  return gr.update(value=df), "**Last Updated:** No data available"
147
 
 
 
 
 
148
  last_update = (
149
  ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
150
  if ranking_last_updated
@@ -172,7 +193,6 @@ def create_ranking_tab():
172
  "95% CI (±)",
173
  "Votes",
174
  "Organization",
175
- "License",
176
  ],
177
  datatype=[
178
  "number",
@@ -181,7 +201,6 @@ def create_ranking_tab():
181
  "str",
182
  "number",
183
  "str",
184
- "str",
185
  ],
186
  label="Model Rankings",
187
  interactive=False,
 
5
 
6
  import gradio as gr
7
  import pandas as pd
8
+ import numpy as np
9
  import datetime
10
  import os
11
  from collections import defaultdict
 
62
  return pd.DataFrame()
63
 
64
  # Filter to only include samples where both models have code in their responses
65
+ # code_a and code_b are lists/arrays of dicts, check if each dict has non-empty "code"
66
+ def has_valid_code(x):
67
+ """Check if x is a list/array of dicts where each dict has a non-empty 'code' field"""
68
+ # Handle None, NaN, and other non-list/array values
69
+ if x is None or (not isinstance(x, (list, np.ndarray))) or len(x) == 0:
70
+ return False
71
+ return all(
72
+ isinstance(item, dict) and
73
+ 'code' in item and
74
+ item['code'] and
75
+ len(str(item['code']).strip()) > 0
76
+ for item in x
77
+ )
78
+
79
  if 'code_a' in df.columns and 'code_b' in df.columns:
80
+ # Filter rows where both code_a and code_b have valid code
81
+ valid_code_a = df['code_a'].apply(has_valid_code)
82
+ valid_code_b = df['code_b'].apply(has_valid_code)
 
83
 
84
+ df = df[valid_code_a & valid_code_b]
85
  if df.empty:
86
  return pd.DataFrame()
 
87
  # Convert vote format for Elo calculation and count votes
88
  battle_data = []
89
  vote_counts = defaultdict(int)
 
144
  if df.empty:
145
  return gr.update(value=df), "**Last Updated:** No data available"
146
 
147
+ # Drop License column if it exists
148
+ if 'License' in df.columns:
149
+ df = df.drop(columns=['License'])
150
+
151
  last_update = (
152
  ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
153
  if ranking_last_updated
 
162
  if df.empty:
163
  return gr.update(value=df), "**Last Updated:** No data available"
164
 
165
+ # Drop License column if it exists
166
+ if 'License' in df.columns:
167
+ df = df.drop(columns=['License'])
168
+
169
  last_update = (
170
  ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
171
  if ranking_last_updated
 
193
  "95% CI (±)",
194
  "Votes",
195
  "Organization",
 
196
  ],
197
  datatype=[
198
  "number",
 
201
  "str",
202
  "number",
203
  "str",
 
204
  ],
205
  label="Model Rankings",
206
  interactive=False,