Use confidence interval and seed to have reproducible scoring
Browse files
- app.py +24 -19
- db.py +10 -2
- rating_systems.py +1 -1
app.py
CHANGED
@@ -49,30 +49,36 @@ commit_scheduler = CommitScheduler(
 def fetch_elo_scores():
     """Fetch and log Elo scores."""
     try:
-        …
+        median_elo_scores, model_rating_q025, model_rating_q975, variance = compute_elo_scores()
         logging.info("Elo scores successfully computed.")
-        return …
+        return median_elo_scores, model_rating_q025, model_rating_q975, variance
     except Exception as e:
         logging.error("Error computing Elo scores: %s", str(e))
         return None

 def update_rankings_table():
-    """Update and return the rankings table based on Elo scores."""
-    …
+    """Update and return the rankings table based on Elo scores and vote counts."""
+    median_elo_scores, model_rating_q025, model_rating_q975, variance = fetch_elo_scores() or {}
+    model_vote_counts = compute_votes_per_model()
     try:
-        …
+        # Create a list of models to iterate over
+        models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
+        rankings = []
+
+        for model in models:
+            elo_score = int(median_elo_scores.get(model, 0))
+            model_variance = int(variance.get(model, 0))
+            ci_95 = f"{int(model_rating_q025.get(model, 0))} - {int(model_rating_q975.get(model, 0))}"
+            vote_count = model_vote_counts.get(model, 0)
+            rankings.append([model, elo_score, model_variance, ci_95, vote_count])
+
+        # Sort rankings by Elo score in descending order
+        rankings.sort(key=lambda x: x[1], reverse=True)
     except KeyError as e:
         logging.error("Missing score for model: %s", str(e))
         return []
     return rankings

 def select_new_image():
     """Select a new image and its segmented versions."""
     max_attempts = 10

@@ -354,10 +360,10 @@ def gradio_interface():

         with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
             rankings_table = gr.Dataframe(
-                headers=["Model", "Elo score", "…
+                headers=["Model", "Elo score", "Variance", "95% CI", "Selections"],
                 value=update_rankings_table(),
                 label="Current Model Rankings",
-                column_widths=[180, 60, 60, 60],
+                column_widths=[180, 60, 60, 60, 60],
                 row_count=4
             )

@@ -368,12 +374,11 @@ def gradio_interface():

         # Explanation of Bootstrapped Elo Score
         explanation_text = """
-        The …
-        … variability and confidence in the model's ranking.
+        The Elo score was calculated using bootstrapping with num_rounds=1000. This method provides a
+        distribution of Elo scores by repeatedly sampling the data, which helps in
+        understanding the variability and confidence in the model's ranking.

-        We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py).
+        We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py#L153).
         """
         gr.Markdown(explanation_text)
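For reference, here is a minimal runnable sketch of the row construction the new `update_rankings_table` performs. All scores and counts below are hypothetical stand-ins for the real bootstrap output, not actual leaderboard values:

```python
# Hypothetical stand-ins for what compute_elo_scores() returns.
median_elo_scores = {"Photoroom": 1052.3, "RemoveBG": 1001.7, "BRIA RMBG 2.0": 946.0}
model_rating_q025 = {"Photoroom": 1030.1, "RemoveBG": 980.4, "BRIA RMBG 2.0": 921.8}
model_rating_q975 = {"Photoroom": 1074.9, "RemoveBG": 1022.5, "BRIA RMBG 2.0": 970.3}
variance = {"Photoroom": 130.2, "RemoveBG": 115.8, "BRIA RMBG 2.0": 142.1}
model_vote_counts = {"Photoroom": 412, "RemoveBG": 398, "BRIA RMBG 2.0": 377}

rankings = []
for model in ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]:
    # One row per model: name, median Elo, variance, "low - high" 95% CI, vote count.
    ci_95 = f"{int(model_rating_q025[model])} - {int(model_rating_q975[model])}"
    rankings.append([model, int(median_elo_scores[model]), int(variance[model]),
                     ci_95, model_vote_counts[model]])

# Highest median Elo first, matching the leaderboard ordering.
rankings.sort(key=lambda x: x[1], reverse=True)
print(rankings)
```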
db.py
CHANGED
@@ -5,6 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker, Session
 from datetime import datetime
 import pandas as pd
+import numpy as np
 from datasets import load_dataset
 from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap

@@ -115,10 +116,17 @@ def compute_elo_scores():
     logging.info("Initial votes count: %d", init_size)
     logging.info("Votes count after validation: %d", df.shape[0])

-    …
+    # Seed the random number generator for reproducibility
+    np.random.seed(42)
+
     bootstrap_elo_scores = compute_bootstrap_elo(df)
     median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
-    …
+
+    model_rating_q025 = bootstrap_elo_scores.quantile(0.025)
+    model_rating_q975 = bootstrap_elo_scores.quantile(0.975)
+    variance = bootstrap_elo_scores.var()
+
+    return median_elo_scores, model_rating_q025, model_rating_q975, variance

 # Function to compute the number of votes for each model
 def compute_votes_per_model():
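Assuming `bootstrap_elo_scores` is a pandas DataFrame with one row per bootstrap round and one column per model (which is what the column-wise `.quantile()` and `.var()` calls imply), the confidence-interval extraction works as in this sketch with synthetic data:

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for bootstrap_elo_scores: one row per bootstrap round,
# one column per model. Real values come from compute_bootstrap_elo(df).
np.random.seed(42)  # same global seeding as compute_elo_scores
bootstrap_elo_scores = pd.DataFrame({
    "Photoroom": np.random.normal(1050, 12, size=1000),
    "RemoveBG": np.random.normal(1000, 12, size=1000),
})

model_rating_q025 = bootstrap_elo_scores.quantile(0.025)  # per-model lower 95% CI bound
model_rating_q975 = bootstrap_elo_scores.quantile(0.975)  # per-model upper 95% CI bound
variance = bootstrap_elo_scores.var()  # per-model spread across rounds

# 95% of bootstrap rounds land inside [q025, q975] for each model.
print(model_rating_q025, model_rating_q975, variance, sep="\n\n")
```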
rating_systems.py
CHANGED
@@ -44,7 +44,7 @@ def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):


 def compute_bootstrap_elo(
-    df, num_round=…
+    df, num_round=1000, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
 ):
     matchups, outcomes, models = preprocess_for_elo(df)
     sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round))