Spaces:

bigcode
/

arena

Running

App Files Community

terryyz committed on Oct 2, 2025

Commit

dfe9318

1 Parent(s): 5f8c666

fix

Browse files

Files changed (2) hide show

elo_calculation.py +14 -8
ranking.py +27 -8

elo_calculation.py CHANGED Viewed

@@ -211,8 +211,9 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
     confidence_intervals = {}  # Initialize to avoid uninitialized variable error
     # Check if we have sufficient data for Bradley-Terry model
-    if len(battles_df) < 2:
-        # Not enough battles, use default ratings
         all_models = set(
             battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
         )
@@ -247,9 +248,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
                         for model in elo_ratings.index:
                             confidence_intervals[model] = 0
                 except Exception as bootstrap_error:
-                    print(
-                        f"Bootstrap calculation failed: {bootstrap_error}, skipping confidence intervals"
-                    )
                     for model in elo_ratings.index:
                         confidence_intervals[model] = 0
             else:
@@ -258,9 +256,6 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
                     confidence_intervals[model] = 0
         except Exception as e:
             # Fallback to old method if Bradley-Terry fails
-            print(
-                f"Bradley-Terry calculation failed: {e}, falling back to online Elo"
-            )
             old_elo_ratings = compute_online_elo(battles_df)
             elo_ratings = pd.Series(old_elo_ratings)
             confidence_intervals = {model: 0 for model in elo_ratings.index}
@@ -270,6 +265,7 @@ def calculate_elo_with_confidence_intervals(battles_df, vote_counts):
 def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     """
     Create ranking DataFrame with all necessary columns
     Args:
         elo_ratings (pd.Series): Elo ratings for each model
@@ -278,13 +274,19 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     Returns:
         pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
     """
     # Load model metadata
     metadata = load_model_metadata()
     # Create ranking list with Elo ratings and confidence intervals
     ranking_list = []
     for model in elo_ratings.index:
         ci_margin = confidence_intervals.get(model, 0)
         # Get metadata for this model
@@ -303,6 +305,10 @@ def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
             }
         )
     # Sort by Elo rating (highest first)
     ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
     ranking_df["Rank"] = range(1, len(ranking_df) + 1)

     confidence_intervals = {}  # Initialize to avoid uninitialized variable error
     # Check if we have sufficient data for Bradley-Terry model
+    # Since we only display models with >= 10 votes, we need enough battles
+    if len(battles_df) < 10:
+        # Not enough battles for reliable ranking
         all_models = set(
             battles_df["model_a"].tolist() + battles_df["model_b"].tolist()
         )
                         for model in elo_ratings.index:
                             confidence_intervals[model] = 0
                 except Exception as bootstrap_error:
                     for model in elo_ratings.index:
                         confidence_intervals[model] = 0
             else:
                     confidence_intervals[model] = 0
         except Exception as e:
             # Fallback to old method if Bradley-Terry fails
             old_elo_ratings = compute_online_elo(battles_df)
             elo_ratings = pd.Series(old_elo_ratings)
             confidence_intervals = {model: 0 for model in elo_ratings.index}
 def create_ranking_dataframe(elo_ratings, confidence_intervals, vote_counts):
     """
     Create ranking DataFrame with all necessary columns
+    Only includes models with at least 10 battles
     Args:
         elo_ratings (pd.Series): Elo ratings for each model
     Returns:
         pd.DataFrame: Ranking table with columns [Rank, Model, Score, 95% CI (±), Votes, Organization, License]
+                     Empty DataFrame if no models have >= 10 votes
     """
     # Load model metadata
     metadata = load_model_metadata()
     # Create ranking list with Elo ratings and confidence intervals
+    # Only include models with at least 10 battles
     ranking_list = []
     for model in elo_ratings.index:
+        # Skip models with fewer than 10 votes
+        if vote_counts.get(model, 0) < 10:
+            continue
         ci_margin = confidence_intervals.get(model, 0)
         # Get metadata for this model
             }
         )
+    # Return empty DataFrame if no models meet the minimum vote threshold
+    if not ranking_list:
+        return pd.DataFrame()
     # Sort by Elo rating (highest first)
     ranking_df = pd.DataFrame(ranking_list).sort_values("Score", ascending=False)
     ranking_df["Rank"] = range(1, len(ranking_df) + 1)

ranking.py CHANGED Viewed

@@ -5,6 +5,7 @@ Handles model leaderboard functionality and data management
 import gradio as gr
 import pandas as pd
 import datetime
 import os
 from collections import defaultdict
@@ -61,16 +62,28 @@ def load_ranking_data(hf_token=None, force_reload=False):
             return pd.DataFrame()
         # Filter to only include samples where both models have code in their responses
-        # code_a and code_b should be non-empty lists
         if 'code_a' in df.columns and 'code_b' in df.columns:
-            df = df[
-                df['code_a'].apply(lambda x: isinstance(x, list) and len(x) > 0) &
-                df['code_b'].apply(lambda x: isinstance(x, list) and len(x) > 0)
-            ]
             if df.empty:
                 return pd.DataFrame()
         # Convert vote format for Elo calculation and count votes
         battle_data = []
         vote_counts = defaultdict(int)
@@ -131,6 +144,10 @@ def update_ranking_display():
     if df.empty:
         return gr.update(value=df), "**Last Updated:** No data available"
     last_update = (
         ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
         if ranking_last_updated
@@ -145,6 +162,10 @@ def force_update_ranking_display():
     if df.empty:
         return gr.update(value=df), "**Last Updated:** No data available"
     last_update = (
         ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
         if ranking_last_updated
@@ -172,7 +193,6 @@ def create_ranking_tab():
                 "95% CI (±)",
                 "Votes",
                 "Organization",
-                "License",
             ],
             datatype=[
                 "number",
@@ -181,7 +201,6 @@ def create_ranking_tab():
                 "str",
                 "number",
                 "str",
-                "str",
             ],
             label="Model Rankings",
             interactive=False,

 import gradio as gr
 import pandas as pd
+import numpy as np
 import datetime
 import os
 from collections import defaultdict
             return pd.DataFrame()
         # Filter to only include samples where both models have code in their responses
+        # code_a and code_b are lists/arrays of dicts, check if each dict has non-empty "code"
+        def has_valid_code(x):
+            """Check if x is a list/array of dicts where each dict has a non-empty 'code' field"""
+            # Handle None, NaN, and other non-list/array values
+            if x is None or (not isinstance(x, (list, np.ndarray))) or len(x) == 0:
+                return False
+            return all(
+                isinstance(item, dict) and
+                'code' in item and
+                item['code'] and
+                len(str(item['code']).strip()) > 0
+                for item in x
+            )
         if 'code_a' in df.columns and 'code_b' in df.columns:
+            # Filter rows where both code_a and code_b have valid code
+            valid_code_a = df['code_a'].apply(has_valid_code)
+            valid_code_b = df['code_b'].apply(has_valid_code)
+            df = df[valid_code_a & valid_code_b]
             if df.empty:
                 return pd.DataFrame()
         # Convert vote format for Elo calculation and count votes
         battle_data = []
         vote_counts = defaultdict(int)
     if df.empty:
         return gr.update(value=df), "**Last Updated:** No data available"
+    # Drop License column if it exists
+    if 'License' in df.columns:
+        df = df.drop(columns=['License'])
     last_update = (
         ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
         if ranking_last_updated
     if df.empty:
         return gr.update(value=df), "**Last Updated:** No data available"
+    # Drop License column if it exists
+    if 'License' in df.columns:
+        df = df.drop(columns=['License'])
     last_update = (
         ranking_last_updated.strftime("%Y-%m-%d %H:%M:%S")
         if ranking_last_updated
                 "95% CI (±)",
                 "Votes",
                 "Organization",
             ],
             datatype=[
                 "number",
                 "str",
                 "number",
                 "str",
             ],
             label="Model Rankings",
             interactive=False,