import json import os import random import sqlite3 from datetime import datetime from typing import Dict, List, Tuple, Optional import numpy as np import pandas as pd import gradio as gr from filelock import FileLock # --------------------------- # Config # --------------------------- ARENAS = [ "Text", "WebDev", "Vision", "Text-to-Image", "Image Edit", "Search", "Text-to-Video", "Image-to-Video", ] DB_PATH = "arena.db" DB_LOCK = "arena.db.lock" SEED_PATH = os.path.join("data", "seed_snapshot.json") CSS_PATH = os.path.join("assets", "zen.css") DEFAULT_RATING = 1200.0 K_FACTOR = 16.0 # --------------------------- # Provider tagging (heuristic) # --------------------------- def guess_provider(model: str) -> str: m = (model or "").lower() if "gpt" in m or "chatgpt" in m or m.startswith("o3"): return "OpenAI" if "gemini" in m or "veo" in m: return "Google" if "claude" in m: return "Anthropic" if "grok" in m: return "xAI" if "sonar" in m or "ppl" in m: return "Perplexity" if "flux" in m: return "Black Forest Labs" if "kling" in m: return "Kuaishou" if "wan" in m: return "WAN" if "hunyuan" in m: return "Tencent" if "seedream" in m or "seedance" in m: return "ByteDance" return "Other" # --------------------------- # SQLite persistence # --------------------------- def db() -> sqlite3.Connection: conn = sqlite3.connect(DB_PATH, check_same_thread=False) conn.execute( """ CREATE TABLE IF NOT EXISTS ratings ( arena TEXT NOT NULL, model TEXT NOT NULL, provider TEXT NOT NULL, rating REAL NOT NULL, votes INTEGER NOT NULL, updated_at TEXT NOT NULL, PRIMARY KEY (arena, model) ) """ ) conn.execute( """ CREATE TABLE IF NOT EXISTS votes_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, arena TEXT NOT NULL, winner TEXT NOT NULL, loser TEXT NOT NULL, created_at TEXT NOT NULL ) """ ) conn.commit() return conn def now_iso() -> str: return datetime.utcnow().isoformat(timespec="seconds") + "Z" def ensure_model(arena: str, model: str, provider: Optional[str] = None, default_rating: float = DEFAULT_RATING) -> None: if arena not in ARENAS: return model = (model or "").strip() if not model: return provider = (provider or "").strip() or guess_provider(model) with FileLock(DB_LOCK): conn = db() conn.execute( "INSERT OR IGNORE INTO ratings (arena, model, provider, rating, votes, updated_at) VALUES (?, ?, ?, ?, ?, ?)", (arena, model, provider, float(default_rating), 0, now_iso()), ) conn.commit() conn.close() def get_rating(arena: str, model: str) -> Tuple[float, int, str]: with FileLock(DB_LOCK): conn = db() cur = conn.cursor() cur.execute("SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?", (arena, model)) row = cur.fetchone() conn.close() if row is None: return (DEFAULT_RATING, 0, guess_provider(model)) return (float(row[0]), int(row[1]), str(row[2])) def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]: ea = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0)) sa = 1.0 if a_wins else 0.0 new_a = r_a + k * (sa - ea) new_b = r_b + k * ((1.0 - sa) - (1.0 - ea)) return new_a, new_b def vote(arena: str, winner: str, loser: str) -> None: ensure_model(arena, winner) ensure_model(arena, loser) r_w, v_w, p_w = get_rating(arena, winner) r_l, v_l, p_l = get_rating(arena, loser) new_w, new_l = elo_update(r_w, r_l, True) with FileLock(DB_LOCK): conn = db() conn.execute( "UPDATE ratings SET rating=?, votes=?, updated_at=?, provider=? WHERE arena=? AND model=?", (float(new_w), int(v_w + 1), now_iso(), p_w, arena, winner), ) conn.execute( "UPDATE ratings SET rating=?, votes=?, updated_at=?, provider=? WHERE arena=? AND model=?", (float(new_l), int(v_l + 1), now_iso(), p_l, arena, loser), ) conn.execute( "INSERT INTO votes_log (arena, winner, loser, created_at) VALUES (?, ?, ?, ?)", (arena, winner, loser, now_iso()), ) conn.commit() conn.close() def seed_from_json(force: bool = False) -> Dict[str, object]: if not os.path.exists(SEED_PATH): return {"ok": False, "seeded_rows": 0, "note": "Missing data/seed_snapshot.json"} with open(SEED_PATH, "r", encoding="utf-8") as f: seed = json.load(f) seeded = 0 with FileLock(DB_LOCK): conn = db() cur = conn.cursor() if force: cur.execute("DELETE FROM ratings") cur.execute("DELETE FROM votes_log") conn.commit() for arena, rows in seed.items(): if arena not in ARENAS: continue for item in rows: model = str(item.get("model", "")).strip() if not model: continue score = float(item.get("score", DEFAULT_RATING)) votes_n = int(item.get("votes", 0)) provider = guess_provider(model) cur.execute( """ INSERT OR REPLACE INTO ratings (arena, model, provider, rating, votes, updated_at) VALUES (?, ?, ?, ?, ?, ?) """, (arena, model, provider, score, votes_n, now_iso()), ) seeded += 1 conn.commit() conn.close() return {"ok": True, "seeded_rows": seeded, "note": "Seeded successfully"} def ensure_seed_once() -> None: with FileLock(DB_LOCK): conn = db() cur = conn.cursor() cur.execute("SELECT COUNT(*) FROM ratings") n = cur.fetchone()[0] conn.close() if n == 0: seed_from_json(force=False) # --------------------------- # Query helpers # --------------------------- def providers_list() -> List[str]: ensure_seed_once() with FileLock(DB_LOCK): conn = db() cur = conn.cursor() cur.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC") rows = [r[0] for r in cur.fetchall()] conn.close() return ["All"] + rows def all_models() -> List[str]: ensure_seed_once() with FileLock(DB_LOCK): conn = db() cur = conn.cursor() cur.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC") rows = [r[0] for r in cur.fetchall()] conn.close() return rows def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame: ensure_seed_once() arena = arena if arena in ARENAS else "Text" search = (search or "").strip().lower() provider = provider or "All" min_votes = int(min_votes or 0) where = ["arena = ?"] params: List[object] = [arena] if search: where.append("LOWER(model) LIKE ?") params.append(f"%{search}%") if provider != "All": where.append("provider = ?") params.append(provider) if min_votes > 0: where.append("votes >= ?") params.append(min_votes) where_sql = " AND ".join(where) q = f""" SELECT model AS Model, provider AS Provider, rating AS Rating, votes AS Votes, updated_at AS Updated FROM ratings WHERE {where_sql} ORDER BY rating DESC LIMIT ? """ params.append(int(limit)) with FileLock(DB_LOCK): conn = db() df = pd.read_sql_query(q, conn, params=params) conn.close() if df.empty: return pd.DataFrame(columns=["Rank", "Model", "Provider", "Score", "Votes", "Updated"]) df["Score"] = df["Rating"].round().astype(int) df.drop(columns=["Rating"], inplace=True) df.insert(0, "Rank", np.arange(1, len(df) + 1)) return df[["Rank", "Model", "Provider", "Score", "Votes", "Updated"]] def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame: ensure_seed_once() search = (search or "").strip().lower() provider = provider or "All" min_votes = int(min_votes or 0) limit_models = int(limit_models or 200) with FileLock(DB_LOCK): conn = db() base = pd.read_sql_query("SELECT arena, model, provider, rating, votes FROM ratings", conn) conn.close() if base.empty: return pd.DataFrame() if provider != "All": base = base[base["provider"] == provider] if min_votes > 0: base = base[base["votes"] >= min_votes] if search: base = base[base["model"].str.lower().str.contains(search, na=False)] if base.empty: return pd.DataFrame() base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int) base["score"] = base["rating"].round().astype(int) pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min") avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values() chosen = avg_rank.head(limit_models).index base = base.set_index(["model", "provider"]) base = base.loc[base.index.isin(chosen)].reset_index() out = pd.DataFrame({"Model": [m for (m, p) in chosen], "Provider": [p for (m, p) in chosen]}).reset_index(drop=True) for a in ARENAS: sub = base[base["arena"] == a][["model", "provider", "rank", "score", "votes"]].copy() sub = sub.rename(columns={ "rank": f"{a} Rank", "score": f"{a} Score", "votes": f"{a} Votes", }) out = out.merge(sub, how="left", left_on=["Model", "Provider"], right_on=["model", "provider"]) for c in ["model", "provider"]: if c in out.columns: out.drop(columns=[c], inplace=True) out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True) out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"]) for a in ARENAS: for col in [f"{a} Rank", f"{a} Score", f"{a} Votes"]: if col in out.columns: out[col] = out[col].astype("Int64") return out def kpis() -> Dict[str, str]: ensure_seed_once() with FileLock(DB_LOCK): conn = db() cur = conn.cursor() cur.execute("SELECT COUNT(DISTINCT model) FROM ratings") models = cur.fetchone()[0] cur.execute("SELECT COUNT(*) FROM ratings") rows = cur.fetchone()[0] cur.execute("SELECT COUNT(*) FROM votes_log") votes_n = cur.fetchone()[0] cur.execute("SELECT MAX(created_at) FROM votes_log") last_vote = cur.fetchone()[0] conn.close() return { "models": str(models), "entries": str(rows), "votes": str(votes_n), "last_vote": last_vote or "—", } # --------------------------- # Voting / profiles # --------------------------- def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]: df = leaderboard_df(arena, provider=provider, min_votes=0, limit=50) models = df["Model"].tolist() if len(models) < 2: models = all_models() if len(models) < 2: return ("model-a", "model-b") return tuple(random.sample(models, 2)) def model_card_md(model: str, arena: Optional[str] = None) -> str: provider = guess_provider(model) out = [f"### {model}", f"{provider}"] if arena: r, v, _ = get_rating(arena, model) out += ["", f"**Arena:** {arena}", f"**Score:** {int(round(r))}", f"**Votes:** {v}"] return "\n".join(out) def model_profile(model: str) -> Tuple[pd.DataFrame, str]: ensure_seed_once() model = (model or "").strip() if not model: return pd.DataFrame(columns=["Arena", "Score", "Votes", "Provider"]), "
Multi-arena rankings (Text · WebDev · Vision · Image · Video · Search) with a cross-arena overview matrix and live Elo voting.