"""ZEN Model Arena Leaderboard.

A Gradio app that keeps per-arena Elo ratings in a SQLite database
(serialized across processes with a file lock), seeds them from a JSON
snapshot, and exposes leaderboards, a cross-arena matrix, pairwise voting,
model profiles and a few admin tools.
"""

import html
import json
import os
import random
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import Dict, Iterator, List, Tuple, Optional

import numpy as np
import pandas as pd
import gradio as gr
from filelock import FileLock

# ---------------------------
# Config
# ---------------------------
ARENAS = [
    "Text",
    "WebDev",
    "Vision",
    "Text-to-Image",
    "Image Edit",
    "Search",
    "Text-to-Video",
    "Image-to-Video",
]

DB_PATH = "arena.db"
DB_LOCK = "arena.db.lock"
SEED_PATH = os.path.join("data", "seed_snapshot.json")
CSS_PATH = os.path.join("assets", "zen.css")

DEFAULT_RATING = 1200.0
K_FACTOR = 16.0

# ---------------------------
# Provider tagging (heuristic)
# ---------------------------
# Checked in order; the first provider with a matching substring wins.
_PROVIDER_KEYWORDS = (
    ("Google", ("gemini", "veo")),
    ("Anthropic", ("claude",)),
    ("xAI", ("grok",)),
    ("Perplexity", ("sonar", "ppl")),
    ("Black Forest Labs", ("flux",)),
    ("Kuaishou", ("kling",)),
    ("WAN", ("wan",)),
    ("Tencent", ("hunyuan",)),
    ("ByteDance", ("seedream", "seedance")),
)


def guess_provider(model: str) -> str:
    """Guess a provider label from substrings of the model id.

    Matching is case-insensitive and purely heuristic; unknown ids map to
    ``"Other"``.
    """
    m = (model or "").lower()
    # "chatgpt" already contains "gpt", so a single substring test suffices.
    if "gpt" in m or m.startswith("o3"):
        return "OpenAI"
    for provider, needles in _PROVIDER_KEYWORDS:
        if any(needle in m for needle in needles):
            return provider
    return "Other"


# ---------------------------
# SQLite persistence
# ---------------------------
def db() -> sqlite3.Connection:
    """Open a connection to ``DB_PATH``, creating both tables if missing.

    The caller owns the returned connection and must close it.
    """
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    try:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS ratings (
                arena TEXT NOT NULL,
                model TEXT NOT NULL,
                provider TEXT NOT NULL,
                rating REAL NOT NULL,
                votes INTEGER NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (arena, model)
            )
            """
        )
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS votes_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                arena TEXT NOT NULL,
                winner TEXT NOT NULL,
                loser TEXT NOT NULL,
                created_at TEXT NOT NULL
            )
            """
        )
        conn.commit()
    except Exception:
        # Do not leak the handle when schema creation fails.
        conn.close()
        raise
    return conn


@contextmanager
def _locked_db() -> Iterator[sqlite3.Connection]:
    """Yield a connection while holding the file lock; always close it.

    Callers must not nest this (directly or via the public helpers): each
    call builds a fresh ``FileLock`` on the same path.
    """
    with FileLock(DB_LOCK):
        conn = db()
        try:
            yield conn
        finally:
            conn.close()


def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def _insert_model_if_missing(
    conn: sqlite3.Connection, arena: str, model: str, provider: str, rating: float
) -> None:
    """Insert a zero-vote ratings row unless (arena, model) already exists."""
    conn.execute(
        "INSERT OR IGNORE INTO ratings (arena, model, provider, rating, votes, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
        (arena, model, provider, float(rating), 0, now_iso()),
    )


def _read_rating(conn: sqlite3.Connection, arena: str, model: str) -> Tuple[float, int, str]:
    """Read (rating, votes, provider) on an open connection, with defaults."""
    row = conn.execute(
        "SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?",
        (arena, model),
    ).fetchone()
    if row is None:
        return (DEFAULT_RATING, 0, guess_provider(model))
    return (float(row[0]), int(row[1]), str(row[2]))


def ensure_model(arena: str, model: str, provider: Optional[str] = None, default_rating: float = DEFAULT_RATING) -> None:
    """Create the (arena, model) row if it does not exist yet.

    Unknown arenas and blank model ids are ignored. A blank ``provider``
    falls back to :func:`guess_provider`.
    """
    if arena not in ARENAS:
        return
    model = (model or "").strip()
    if not model:
        return
    provider = (provider or "").strip() or guess_provider(model)
    with _locked_db() as conn:
        _insert_model_if_missing(conn, arena, model, provider, default_rating)
        conn.commit()


def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
    """Return ``(rating, votes, provider)`` for a model in an arena.

    When no row exists, returns ``(DEFAULT_RATING, 0, guessed provider)``.
    """
    with _locked_db() as conn:
        return _read_rating(conn, arena, model)


def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]:
    """Apply one Elo update and return the new ``(rating_a, rating_b)``."""
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
    score_a = 1.0 if a_wins else 0.0
    new_a = r_a + k * (score_a - expected_a)
    new_b = r_b + k * ((1.0 - score_a) - (1.0 - expected_a))
    return new_a, new_b


def vote(arena: str, winner: str, loser: str) -> None:
    """Record a pairwise result and update both Elo ratings.

    Ensure/read/update/log all happen under one lock acquisition and one
    transaction, so concurrent votes cannot overwrite each other's rating
    changes. Votes for an unknown arena, a blank model id, or a model
    against itself are ignored.
    """
    winner = (winner or "").strip()
    loser = (loser or "").strip()
    if arena not in ARENAS or not winner or not loser or winner == loser:
        return

    with _locked_db() as conn:
        for name in (winner, loser):
            _insert_model_if_missing(conn, arena, name, guess_provider(name), DEFAULT_RATING)
        r_w, v_w, _ = _read_rating(conn, arena, winner)
        r_l, v_l, _ = _read_rating(conn, arena, loser)
        new_w, new_l = elo_update(r_w, r_l, True)
        stamp = now_iso()
        conn.executemany(
            "UPDATE ratings SET rating=?, votes=?, updated_at=? WHERE arena=? AND model=?",
            [
                (float(new_w), v_w + 1, stamp, arena, winner),
                (float(new_l), v_l + 1, stamp, arena, loser),
            ],
        )
        conn.execute(
            "INSERT INTO votes_log (arena, winner, loser, created_at) VALUES (?, ?, ?, ?)",
            (arena, winner, loser, stamp),
        )
        conn.commit()


def seed_from_json(force: bool = False) -> Dict[str, object]:
    """Load ratings from ``SEED_PATH`` into the database.

    The file is expected to map arena names to lists of objects with
    ``model`` and optional ``score`` / ``votes`` keys. Unknown arenas and
    malformed entries are skipped. With ``force=True`` both tables are wiped
    first; the wipe and the inserts share one transaction, so a failure
    leaves the previous data in place.

    Returns a dict with ``ok``, ``seeded_rows`` and ``note``.
    """
    try:
        with open(SEED_PATH, "r", encoding="utf-8") as f:
            seed = json.load(f)
    except FileNotFoundError:
        return {"ok": False, "seeded_rows": 0, "note": "Missing data/seed_snapshot.json"}

    if not isinstance(seed, dict):
        return {"ok": False, "seeded_rows": 0, "note": "Seed file must map arena names to lists of rows"}

    stamp = now_iso()
    records = []
    for arena, rows in seed.items():
        if arena not in ARENAS or not isinstance(rows, list):
            continue
        for item in rows:
            if not isinstance(item, dict):
                continue
            model = str(item.get("model") or "").strip()
            if not model:
                continue
            score = item.get("score")
            votes_n = item.get("votes")
            records.append(
                (
                    arena,
                    model,
                    guess_provider(model),
                    float(DEFAULT_RATING if score is None else score),
                    int(0 if votes_n is None else votes_n),
                    stamp,
                )
            )

    with _locked_db() as conn:
        if force:
            conn.execute("DELETE FROM ratings")
            conn.execute("DELETE FROM votes_log")
        conn.executemany(
            """
            INSERT OR REPLACE INTO ratings (arena, model, provider, rating, votes, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            records,
        )
        conn.commit()

    return {"ok": True, "seeded_rows": len(records), "note": "Seeded successfully"}


def ensure_seed_once() -> None:
    """Seed the database from the snapshot if the ratings table is empty."""
    with _locked_db() as conn:
        n = conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0]
    # Seed only after the lock is released: seed_from_json takes it again.
    if n == 0:
        seed_from_json(force=False)


# ---------------------------
# Query helpers
# ---------------------------
def providers_list() -> List[str]:
    """Return ``["All"]`` followed by the distinct providers, sorted."""
    ensure_seed_once()
    with _locked_db() as conn:
        rows = conn.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC").fetchall()
    return ["All"] + [r[0] for r in rows]


def all_models() -> List[str]:
    """Return every distinct model id across all arenas, sorted."""
    ensure_seed_once()
    with _locked_db() as conn:
        rows = conn.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC").fetchall()
    return [r[0] for r in rows]


def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame:
    """Build the ranked leaderboard for one arena.

    ``search`` is a case-insensitive literal substring of the model id.
    An unknown ``arena`` falls back to ``"Text"``. Returns columns
    Rank, Model, Provider, Score, Votes, Updated (empty frame if no rows).
    """
    ensure_seed_once()
    arena = arena if arena in ARENAS else "Text"
    search = (search or "").strip().lower()
    provider = provider or "All"
    min_votes = int(min_votes or 0)

    where = ["arena = ?"]
    params: List[object] = [arena]

    if search:
        # Escape LIKE metacharacters so user text is matched literally.
        literal = search.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        where.append("LOWER(model) LIKE ? ESCAPE '\\'")
        params.append(f"%{literal}%")
    if provider != "All":
        where.append("provider = ?")
        params.append(provider)
    if min_votes > 0:
        where.append("votes >= ?")
        params.append(min_votes)

    where_sql = " AND ".join(where)
    # Only fixed fragments are interpolated; all values are bound parameters.
    q = f"""
        SELECT model AS Model, provider AS Provider, rating AS Rating, votes AS Votes, updated_at AS Updated
        FROM ratings
        WHERE {where_sql}
        ORDER BY rating DESC, model ASC
        LIMIT ?
    """
    params.append(int(limit))

    with _locked_db() as conn:
        df = pd.read_sql_query(q, conn, params=params)

    columns = ["Rank", "Model", "Provider", "Score", "Votes", "Updated"]
    if df.empty:
        return pd.DataFrame(columns=columns)

    df["Score"] = df["Rating"].round().astype(int)
    df["Rank"] = np.arange(1, len(df) + 1)
    return df[columns]


def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame:
    """Build a wide matrix of Rank/Score/Votes per arena for each model.

    Rows are (model, provider) pairs, limited to the ``limit_models`` pairs
    with the best mean rank across the arenas they appear in, and sorted by
    that mean rank. Ranks are computed after the filters are applied.
    Returns an empty frame when nothing matches.
    """
    ensure_seed_once()
    search = (search or "").strip().lower()
    provider = provider or "All"
    min_votes = int(min_votes or 0)
    limit_models = int(limit_models or 200)

    with _locked_db() as conn:
        base = pd.read_sql_query("SELECT arena, model, provider, rating, votes FROM ratings", conn)

    if base.empty:
        return pd.DataFrame()

    keep = pd.Series(True, index=base.index)
    if provider != "All":
        keep &= base["provider"] == provider
    if min_votes > 0:
        keep &= base["votes"] >= min_votes
    if search:
        # regex=False: user text such as "c++" must not be parsed as a pattern.
        keep &= base["model"].str.lower().str.contains(search, regex=False, na=False)

    # Copy so the column assignments below do not write into a view.
    base = base[keep].copy()
    if base.empty:
        return pd.DataFrame()

    base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int)
    base["score"] = base["rating"].round().astype(int)

    pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min")
    avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values()
    chosen = avg_rank.head(limit_models).index

    base = base.set_index(["model", "provider"])
    base = base.loc[base.index.isin(chosen)].reset_index()

    out = pd.DataFrame(
        {"Model": [m for (m, _p) in chosen], "Provider": [p for (_m, p) in chosen]}
    ).reset_index(drop=True)

    for a in ARENAS:
        sub = base.loc[base["arena"] == a, ["model", "provider", "rank", "score", "votes"]].rename(
            columns={
                "model": "Model",
                "provider": "Provider",
                "rank": f"{a} Rank",
                "score": f"{a} Score",
                "votes": f"{a} Votes",
            }
        )
        out = out.merge(sub, how="left", on=["Model", "Provider"])

    out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True)
    out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"])

    # Nullable ints keep missing arena cells blank instead of turning into floats.
    for a in ARENAS:
        for col in (f"{a} Rank", f"{a} Score", f"{a} Votes"):
            if col in out.columns:
                out[col] = out[col].astype("Int64")

    return out


def kpis() -> Dict[str, str]:
    """Return headline counters (models, entries, votes, last vote) as strings."""
    ensure_seed_once()
    with _locked_db() as conn:
        cur = conn.cursor()
        models = cur.execute("SELECT COUNT(DISTINCT model) FROM ratings").fetchone()[0]
        rows = cur.execute("SELECT COUNT(*) FROM ratings").fetchone()[0]
        votes_n = cur.execute("SELECT COUNT(*) FROM votes_log").fetchone()[0]
        last_vote = cur.execute("SELECT MAX(created_at) FROM votes_log").fetchone()[0]
    return {
        "models": str(models),
        "entries": str(rows),
        "votes": str(votes_n),
        "last_vote": last_vote or "—",
    }


# ---------------------------
# Voting / profiles
# ---------------------------
def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]:
    """Pick two distinct random models for a matchup.

    Draws from the arena's top 50 (optionally filtered by provider), falls
    back to every known model, and finally to two placeholder ids.
    """
    models = leaderboard_df(arena, provider=provider, min_votes=0, limit=50)["Model"].tolist()
    if len(models) < 2:
        models = all_models()
    if len(models) < 2:
        return ("model-a", "model-b")
    first, second = random.sample(models, 2)
    return (first, second)


def model_card_md(model: str, arena: Optional[str] = None) -> str:
    """Render a small Markdown card for a model.

    With an ``arena``, the card shows the stored provider, score and votes
    from that arena; otherwise the provider is guessed from the id.
    """
    if not arena:
        return "\n".join([f"### {model}", f"{guess_provider(model)}"])
    r, v, provider = get_rating(arena, model)
    return "\n".join(
        [
            f"### {model}",
            f"{provider}",
            "",
            f"**Arena:** {arena}",
            f"**Score:** {int(round(r))}",
            f"**Votes:** {v}",
        ]
    )


def model_profile(model: str) -> Tuple[pd.DataFrame, str]:
    """Return a per-arena table and an HTML summary for one model.

    Arenas where the model has no row are reported with the default rating
    and zero votes. The model id is HTML-escaped in the summary.
    """
    ensure_seed_once()
    model = (model or "").strip()
    if not model:
        return pd.DataFrame(columns=["Arena", "Score", "Votes", "Provider"]), "\nNo model selected.\n"

    rows = []
    for a in ARENAS:
        r, v, p = get_rating(a, model)
        rows.append({"Arena": a, "Score": int(round(r)), "Votes": v, "Provider": p})
    df = pd.DataFrame(rows).sort_values("Score", ascending=False)

    best = df.iloc[0]
    worst = df.iloc[-1]
    summary = "".join(
        [
            "\n",
            "\nModel Profile\n",
            f"\n{html.escape(model)} · Provider: {guess_provider(model)}\n",
            "\n",
            "\n",
            f"Best arena: {best['Arena']} (Score {best['Score']}, Votes {best['Votes']}). ",
            f"Worst arena: {worst['Arena']} (Score {worst['Score']}, Votes {worst['Votes']}).",
            "\n",
            "\n",
        ]
    )
    return df, summary


# ---------------------------
# App UI
# ---------------------------
ensure_seed_once()

css = ""
if os.path.exists(CSS_PATH):
    with open(CSS_PATH, "r", encoding="utf-8") as f:
        css = f.read()

with gr.Blocks(title="ZEN Model Arena Leaderboard") as demo:
    if css:
        # Inject the stylesheet; an empty HTML component would drop it silently.
        gr.HTML(f"<style>{css}</style>")

    # KPIs are read once at build time; the header does not refresh on vote.
    k = kpis()
    header = "\n".join(
        [
            "",
            "ZEN Model Arena Leaderboard",
            "",
            "Multi-arena rankings (Text · WebDev · Vision · Image · Video · Search) with a cross-arena overview matrix and live Elo voting.",
            "",
            "Models",
            f"{k['models']}",
            "unique IDs tracked",
            "Arena Entries",
            f"{k['entries']}",
            "model × arena rows",
            "Votes Logged",
            f"{k['votes']}",
            "pairwise results",
            "Last Vote",
            f"{k['last_vote']}",
            "UTC",
            "Gradio 6.2.0 SQLite + FileLock Arena Matrix Search + Filters",
            "",
        ]
    )
    gr.HTML(header)

    provider_choices = providers_list()

    with gr.Tabs():
        # Overview
        with gr.Tab("Leaderboard Overview"):
            gr.Markdown("### Top 10 (live DB)\nMirrors the snapshot format, but runs off the DB.")
            with gr.Row():
                arena_sel_ov = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
                provider_sel_ov = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
                min_votes_ov = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
                search_ov = gr.Textbox(value="", label="Search models", placeholder="gpt, gemini, claude, flux...")
            df_ov = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
            refresh_ov = gr.Button("Refresh overview", variant="primary")

            def refresh_overview(arena, provider, min_votes, search):
                """Return the top-10 leaderboard for the selected filters."""
                return leaderboard_df(arena, search=search, provider=provider, min_votes=int(min_votes), limit=10)

            overview_inputs = [arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov]
            refresh_ov.click(refresh_overview, inputs=overview_inputs, outputs=[df_ov])
            demo.load(refresh_overview, inputs=overview_inputs, outputs=[df_ov])

        # Matrix
        with gr.Tab("Arena Overview Matrix"):
            gr.Markdown("### Cross-arena placements\nRank/Score/Votes per arena in one wide matrix.")
            with gr.Row():
                provider_sel_mx = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
                min_votes_mx = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
                search_mx = gr.Textbox(value="", label="Search")
                limit_mx = gr.Slider(10, 400, value=200, step=10, label="Max models")
            mx = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
            refresh_mx = gr.Button("Build / Refresh Matrix", variant="primary")

            def build_matrix(provider, min_votes, search, limit_models):
                """Return the cross-arena matrix for the selected filters."""
                return arena_overview_matrix(
                    search=search,
                    provider=provider,
                    min_votes=int(min_votes),
                    limit_models=int(limit_models),
                )

            matrix_inputs = [provider_sel_mx, min_votes_mx, search_mx, limit_mx]
            refresh_mx.click(build_matrix, inputs=matrix_inputs, outputs=[mx])
            demo.load(build_matrix, inputs=matrix_inputs, outputs=[mx])

        # Arena tabs
        for arena in ARENAS:
            with gr.Tab(arena):
                gr.Markdown(f"### {arena} Leaderboard (live DB)")
                with gr.Row():
                    provider_sel = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
                    min_votes = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
                    search = gr.Textbox(value="", label="Search")
                df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
                btn = gr.Button("Refresh", variant="primary")

                # `a=arena` binds the current arena; a bare closure would see
                # only the last value of the loop variable.
                btn.click(
                    lambda p, mv, s, a=arena: leaderboard_df(a, search=s, provider=p, min_votes=int(mv), limit=150),
                    inputs=[provider_sel, min_votes, search],
                    outputs=[df],
                )
                demo.load(lambda a=arena: leaderboard_df(a, limit=150), outputs=[df])

        # Voting
        with gr.Tab("Start Voting"):
            gr.Markdown("### Pairwise Voting (Elo)\nPick a winner for a specific arena. Scores update instantly.")
            with gr.Row():
                arena_vote = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
                provider_vote = gr.Dropdown(choices=provider_choices, value="All", label="Provider pool")
                new_match_btn = gr.Button("New Matchup", variant="primary")

            left_state = gr.State("")
            right_state = gr.State("")

            with gr.Row():
                left_md = gr.Markdown()
                right_md = gr.Markdown()

            with gr.Row():
                left_btn = gr.Button("⬅ Left Wins", variant="primary")
                right_btn = gr.Button("Right Wins ➡", variant="primary")

            vote_status = gr.HTML()

            def new_matchup(arena, provider):
                """Draw a fresh pair and return cards, state values and status."""
                a, b = pick_pair(arena, provider=provider)
                return model_card_md(a, arena), model_card_md(b, arena), a, b, "\nNew matchup ready.\n"

            def left_wins(arena, left, right, provider):
                """Record a win for the left model, then draw a new pair."""
                if left and right:
                    vote(arena, winner=left, loser=right)
                return new_matchup(arena, provider)

            def right_wins(arena, left, right, provider):
                """Record a win for the right model, then draw a new pair."""
                if left and right:
                    vote(arena, winner=right, loser=left)
                return new_matchup(arena, provider)

            matchup_outputs = [left_md, right_md, left_state, right_state, vote_status]
            vote_inputs = [arena_vote, left_state, right_state, provider_vote]
            new_match_btn.click(new_matchup, inputs=[arena_vote, provider_vote], outputs=matchup_outputs)
            left_btn.click(left_wins, inputs=vote_inputs, outputs=matchup_outputs)
            right_btn.click(right_wins, inputs=vote_inputs, outputs=matchup_outputs)
            demo.load(new_matchup, inputs=[arena_vote, provider_vote], outputs=matchup_outputs)

        # Profiles
        with gr.Tab("Model Profiles"):
            gr.Markdown("### Inspect a model across arenas")
            models = all_models()
            model_dd = gr.Dropdown(choices=models, value=(models[0] if models else None), label="Model")
            prof_summary = gr.HTML()
            prof_df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
            load_btn = gr.Button("Load Profile", variant="primary")
            load_btn.click(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
            demo.load(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])

        # Admin
        # NOTE(review): these tools (including the destructive force reseed)
        # are reachable by any visitor; nothing in this file restricts access.
        with gr.Tab("Admin"):
            gr.Markdown("### Admin Tools")
            with gr.Row():
                reseed_force = gr.Checkbox(value=False, label="Force reseed (wipe DB first)")
                reseed_btn = gr.Button("Seed from data/seed_snapshot.json", variant="primary")
            reseed_out = gr.JSON()
            reseed_btn.click(lambda force: seed_from_json(force=bool(force)), inputs=[reseed_force], outputs=[reseed_out])

            gr.Markdown("#### Add a model to arenas")
            new_model = gr.Textbox(label="Model ID", placeholder="e.g., gpt-5.2, gemini-3-pro, claude-opus-4-5-…")
            new_provider = gr.Textbox(label="Provider (optional)", placeholder="Leave blank for auto-detect")
            arenas_add = gr.CheckboxGroup(choices=ARENAS, value=["Text"], label="Arenas")
            add_btn = gr.Button("Add Model", variant="primary")
            add_out = gr.HTML()

            def add_model(model_id, provider, arenas):
                """Register a model in the chosen arenas and report the result."""
                model_id = (model_id or "").strip()
                if not model_id:
                    return "\nMissing model id.\n"
                provider = (provider or "").strip() or guess_provider(model_id)
                added = 0
                for a in arenas or []:
                    ensure_model(a, model_id, provider=provider)
                    added += 1
                # The result is rendered as HTML, so escape the user-typed text.
                return f"\nAdded {html.escape(model_id)} ({html.escape(provider)}) to {added} arenas.\n"

            add_btn.click(add_model, inputs=[new_model, new_provider, arenas_add], outputs=[add_out])

            gr.Markdown("#### Sanity")
            sanity = gr.JSON()

            def sanity_check():
                """Return a snapshot of paths, arenas and DB-derived counts."""
                return {
                    "time_utc": now_iso(),
                    "db_path_exists": os.path.exists(DB_PATH),
                    "seed_path_exists": os.path.exists(SEED_PATH),
                    "css_path_exists": os.path.exists(CSS_PATH),
                    "arenas": ARENAS,
                    "providers_detected": providers_list(),
                    "models_count": len(all_models()),
                }

            demo.load(sanity_check, outputs=[sanity])

gr.close_all()
demo.launch()