# ZENLLC's picture
# Update app.py
# 54d6710 verified
import json
import os
import random
import sqlite3
from datetime import datetime
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd
import gradio as gr
from filelock import FileLock
# ---------------------------
# Config
# ---------------------------
# Arena names. Each one gets its own leaderboard tab in the UI, and
# ensure_model / seed_from_json ignore any arena that is not listed here.
ARENAS = [
    "Text",
    "WebDev",
    "Vision",
    "Text-to-Image",
    "Image Edit",
    "Search",
    "Text-to-Video",
    "Image-to-Video",
]
# SQLite database file, and the lock file handed to FileLock around DB access.
DB_PATH = "arena.db"
DB_LOCK = "arena.db.lock"
# Optional inputs: the JSON snapshot read by seed_from_json and the stylesheet
# injected into the page when it exists.
SEED_PATH = os.path.join("data", "seed_snapshot.json")
CSS_PATH = os.path.join("assets", "zen.css")
# Rating used for models without a stored row (and for newly added models).
DEFAULT_RATING = 1200.0
# Default value of elo_update's `k` parameter.
K_FACTOR = 16.0
# ---------------------------
# Provider tagging (heuristic)
# ---------------------------
def guess_provider(model: str) -> str:
    """Guess the provider of a model from substrings of its lower-cased name.

    Rules are tried in order and the first match wins; names that match no
    rule (including empty or ``None``) map to ``"Other"``.
    """
    name = (model or "").lower()
    # The only prefix-based rule; it belongs to the first (OpenAI) group.
    if name.startswith("o3"):
        return "OpenAI"
    keyword_table = (
        (("gpt", "chatgpt"), "OpenAI"),
        (("gemini", "veo"), "Google"),
        (("claude",), "Anthropic"),
        (("grok",), "xAI"),
        (("sonar", "ppl"), "Perplexity"),
        (("flux",), "Black Forest Labs"),
        (("kling",), "Kuaishou"),
        (("wan",), "WAN"),
        (("hunyuan",), "Tencent"),
        (("seedream", "seedance"), "ByteDance"),
    )
    for keywords, label in keyword_table:
        if any(keyword in name for keyword in keywords):
            return label
    return "Other"
# ---------------------------
# SQLite persistence
# ---------------------------
def db() -> sqlite3.Connection:
    """Open a connection to ``DB_PATH`` and make sure the schema exists.

    Creates the ``ratings`` and ``votes_log`` tables when missing and commits.
    The caller owns the returned connection and must close it.

    Raises:
        sqlite3.Error: if the schema cannot be created; the connection is
            closed before the error propagates so it does not leak.
    """
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    try:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS ratings (
            arena TEXT NOT NULL,
            model TEXT NOT NULL,
            provider TEXT NOT NULL,
            rating REAL NOT NULL,
            votes INTEGER NOT NULL,
            updated_at TEXT NOT NULL,
            PRIMARY KEY (arena, model)
            )
            """
        )
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS votes_log (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            arena TEXT NOT NULL,
            winner TEXT NOT NULL,
            loser TEXT NOT NULL,
            created_at TEXT NOT NULL
            )
            """
        )
        conn.commit()
    except sqlite3.Error:
        # Do not hand back (or leak) a connection whose schema setup failed.
        conn.close()
        raise
    return conn
def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated; an aware clock formatted explicitly
    # yields the same second-resolution string with a literal "Z" suffix.
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def ensure_model(arena: str, model: str, provider: Optional[str] = None, default_rating: float = DEFAULT_RATING) -> None:
    """Insert a ratings row for ``model`` in ``arena`` if none exists yet.

    Does nothing when ``arena`` is not in ``ARENAS`` or the model name is
    blank. An existing row is left untouched (``INSERT OR IGNORE``).

    Args:
        arena: Arena name; must be one of ``ARENAS``.
        model: Model identifier; surrounding whitespace is stripped.
        provider: Provider label; falls back to ``guess_provider(model)``
            when missing or blank.
        default_rating: Initial rating for a newly created row.
    """
    if arena not in ARENAS:
        return
    model = (model or "").strip()
    if not model:
        return
    provider = (provider or "").strip() or guess_provider(model)
    with FileLock(DB_LOCK):
        conn = db()
        try:
            conn.execute(
                "INSERT OR IGNORE INTO ratings (arena, model, provider, rating, votes, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
                (arena, model, provider, float(default_rating), 0, now_iso()),
            )
            conn.commit()
        finally:
            # Close even when the insert fails so the connection cannot leak.
            conn.close()
def get_rating(arena: str, model: str) -> Tuple[float, int, str]:
    """Return ``(rating, votes, provider)`` for ``model`` in ``arena``.

    When no row is stored, returns ``DEFAULT_RATING``, zero votes and the
    provider guessed from the model name.
    """
    with FileLock(DB_LOCK):
        conn = db()
        try:
            row = conn.execute(
                "SELECT rating, votes, provider FROM ratings WHERE arena=? AND model=?",
                (arena, model),
            ).fetchone()
        finally:
            # Close even when the query fails so the connection cannot leak.
            conn.close()
    if row is None:
        return (DEFAULT_RATING, 0, guess_provider(model))
    return (float(row[0]), int(row[1]), str(row[2]))
def elo_update(r_a: float, r_b: float, a_wins: bool, k: float = K_FACTOR) -> Tuple[float, float]:
    """Apply one Elo update and return the new ``(rating_a, rating_b)``.

    Args:
        r_a: Current rating of player A.
        r_b: Current rating of player B.
        a_wins: True when A won the match, False when B won.
        k: Step size applied to the (actual - expected) score difference.
    """
    expected_a = 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400.0))
    expected_b = 1.0 - expected_a
    if a_wins:
        score_a, score_b = 1.0, 0.0
    else:
        score_a, score_b = 0.0, 1.0
    updated_a = r_a + k * (score_a - expected_a)
    updated_b = r_b + k * (score_b - expected_b)
    return updated_a, updated_b
def vote(arena: str, winner: str, loser: str) -> None:
    """Record one pairwise result and update both Elo ratings.

    The whole read-modify-write runs under a single file lock and a single
    SQLite transaction, so concurrent votes cannot overwrite each other's
    rating updates and a failure leaves the database unchanged.

    The vote is ignored when ``arena`` is not in ``ARENAS``, when either
    model name is blank, or when winner and loser are the same model.

    Args:
        arena: Arena the matchup belongs to.
        winner: Model that won; created with default rating if unknown.
        loser: Model that lost; created with default rating if unknown.
    """
    winner = (winner or "").strip()
    loser = (loser or "").strip()
    if arena not in ARENAS or not winner or not loser or winner == loser:
        return

    insert_sql = (
        "INSERT OR IGNORE INTO ratings (arena, model, provider, rating, votes, updated_at) "
        "VALUES (?, ?, ?, ?, ?, ?)"
    )
    select_sql = "SELECT rating, votes FROM ratings WHERE arena=? AND model=?"
    update_sql = "UPDATE ratings SET rating=?, votes=?, updated_at=? WHERE arena=? AND model=?"

    # The DB helpers are inlined rather than called: ensure_model/get_rating
    # each acquire FileLock(DB_LOCK) themselves, so they cannot be used while
    # this function already holds the lock.
    with FileLock(DB_LOCK):
        conn = db()
        try:
            stamp = now_iso()
            current = {}
            for name in (winner, loser):
                conn.execute(
                    insert_sql,
                    (arena, name, guess_provider(name), float(DEFAULT_RATING), 0, stamp),
                )
                rating, votes_n = conn.execute(select_sql, (arena, name)).fetchone()
                current[name] = (float(rating), int(votes_n))

            new_w, new_l = elo_update(current[winner][0], current[loser][0], True)
            conn.execute(update_sql, (float(new_w), current[winner][1] + 1, stamp, arena, winner))
            conn.execute(update_sql, (float(new_l), current[loser][1] + 1, stamp, arena, loser))
            conn.execute(
                "INSERT INTO votes_log (arena, winner, loser, created_at) VALUES (?, ?, ?, ?)",
                (arena, winner, loser, stamp),
            )
            conn.commit()
        finally:
            # Closing without a commit discards a partially applied vote.
            conn.close()
def seed_from_json(force: bool = False) -> Dict[str, object]:
    """Load ``SEED_PATH`` into the ratings table.

    The snapshot is expected to be a JSON object mapping arena names to
    lists of ``{"model", "score", "votes"}`` objects. Arenas not in
    ``ARENAS``, non-object entries and entries without a model name are
    skipped. Existing rows for the same (arena, model) are replaced.

    Args:
        force: When True, delete all ratings and logged votes first. The
            wipe and the inserts share one transaction, so a failure while
            seeding does not leave the database empty.

    Returns:
        A dict with ``ok``, ``seeded_rows`` and a human-readable ``note``.

    Raises:
        ValueError / TypeError: if a ``score`` or ``votes`` value cannot be
            converted to a number; nothing is committed in that case.
    """
    if not os.path.exists(SEED_PATH):
        return {"ok": False, "seeded_rows": 0, "note": "Missing data/seed_snapshot.json"}
    try:
        with open(SEED_PATH, "r", encoding="utf-8") as f:
            seed = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        return {"ok": False, "seeded_rows": 0, "note": f"Could not read data/seed_snapshot.json: {exc}"}
    if not isinstance(seed, dict):
        return {"ok": False, "seeded_rows": 0, "note": "Seed snapshot must be a JSON object keyed by arena"}

    seeded = 0
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.cursor()
            if force:
                cur.execute("DELETE FROM ratings")
                cur.execute("DELETE FROM votes_log")
            stamp = now_iso()
            for arena, rows in seed.items():
                if arena not in ARENAS or not isinstance(rows, list):
                    continue
                for item in rows:
                    if not isinstance(item, dict):
                        continue
                    model = str(item.get("model", "")).strip()
                    if not model:
                        continue
                    raw_score = item.get("score")
                    raw_votes = item.get("votes")
                    score = DEFAULT_RATING if raw_score is None else float(raw_score)
                    votes_n = 0 if raw_votes is None else int(raw_votes)
                    cur.execute(
                        """
                        INSERT OR REPLACE INTO ratings (arena, model, provider, rating, votes, updated_at)
                        VALUES (?, ?, ?, ?, ?, ?)
                        """,
                        (arena, model, guess_provider(model), score, votes_n, stamp),
                    )
                    seeded += 1
            conn.commit()
        finally:
            # Closing without a commit rolls back a half-finished seed.
            conn.close()
    return {"ok": True, "seeded_rows": seeded, "note": "Seeded successfully"}
def ensure_seed_once() -> None:
    """Seed the database from the JSON snapshot when ``ratings`` is empty."""
    with FileLock(DB_LOCK):
        conn = db()
        try:
            row_count = conn.execute("SELECT COUNT(*) FROM ratings").fetchone()[0]
        finally:
            # Close even when the query fails so the connection cannot leak.
            conn.close()
    # Seed after the lock is released: seed_from_json acquires it itself.
    if row_count == 0:
        seed_from_json(force=False)
# ---------------------------
# Query helpers
# ---------------------------
def providers_list() -> List[str]:
    """Return ``"All"`` followed by the distinct stored providers, sorted."""
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.execute("SELECT DISTINCT provider FROM ratings ORDER BY provider ASC")
            names = [record[0] for record in cur.fetchall()]
        finally:
            # Close even when the query fails so the connection cannot leak.
            conn.close()
    return ["All"] + names
def all_models() -> List[str]:
    """Return every distinct model name in the ratings table, sorted."""
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.execute("SELECT DISTINCT model FROM ratings ORDER BY model ASC")
            names = [record[0] for record in cur.fetchall()]
        finally:
            # Close even when the query fails so the connection cannot leak.
            conn.close()
    return names
def leaderboard_df(arena: str, search: str = "", provider: str = "All", min_votes: int = 0, limit: int = 100) -> pd.DataFrame:
    """Return the ranked leaderboard of one arena as a DataFrame.

    Args:
        arena: Arena name; anything not in ``ARENAS`` falls back to "Text".
        search: Case-insensitive substring to look for in model names. The
            text is matched literally: ``%`` and ``_`` are not wildcards.
        provider: Exact provider to keep, or "All" for no provider filter.
        min_votes: Keep only rows with at least this many votes (if > 0).
        limit: Maximum number of rows, taken from the top by rating.

    Returns:
        A frame with columns Rank, Model, Provider, Score, Votes, Updated;
        empty (but with those columns) when nothing matches.
    """
    ensure_seed_once()
    arena = arena if arena in ARENAS else "Text"
    search = (search or "").strip().lower()
    provider = provider or "All"
    min_votes = int(min_votes or 0)

    where = ["arena = ?"]
    params: List[object] = [arena]
    if search:
        # Escape LIKE metacharacters so user text such as "gpt_4" or "100%"
        # is matched literally instead of acting as a pattern.
        literal = search.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        where.append("LOWER(model) LIKE ? ESCAPE '\\'")
        params.append(f"%{literal}%")
    if provider != "All":
        where.append("provider = ?")
        params.append(provider)
    if min_votes > 0:
        where.append("votes >= ?")
        params.append(min_votes)
    # Only fixed fragments are joined here; every value travels as a parameter.
    where_sql = " AND ".join(where)
    q = f"""
        SELECT model AS Model, provider AS Provider, rating AS Rating, votes AS Votes, updated_at AS Updated
        FROM ratings
        WHERE {where_sql}
        ORDER BY rating DESC
        LIMIT ?
    """
    params.append(int(limit))

    with FileLock(DB_LOCK):
        conn = db()
        try:
            df = pd.read_sql_query(q, conn, params=params)
        finally:
            # Close even when the query fails so the connection cannot leak.
            conn.close()

    columns = ["Rank", "Model", "Provider", "Score", "Votes", "Updated"]
    if df.empty:
        return pd.DataFrame(columns=columns)
    df["Score"] = df["Rating"].round().astype(int)
    df.drop(columns=["Rating"], inplace=True)
    df.insert(0, "Rank", np.arange(1, len(df) + 1))
    return df[columns]
def arena_overview_matrix(search: str = "", provider: str = "All", min_votes: int = 0, limit_models: int = 200) -> pd.DataFrame:
    """Build a wide table with each model's rank, score and votes per arena.

    Args:
        search: Case-insensitive substring to look for in model names; the
            text is matched literally, not as a regular expression.
        provider: Exact provider to keep, or "All" for no provider filter.
        min_votes: Keep only rows with at least this many votes (if > 0).
        limit_models: Maximum number of (model, provider) pairs, chosen by
            best average rank across the arenas they appear in.

    Returns:
        One row per (model, provider) with ``"<arena> Rank"``,
        ``"<arena> Score"`` and ``"<arena> Votes"`` columns for every arena
        in ``ARENAS`` (nullable integers), sorted by average rank. An empty
        DataFrame when no rows survive the filters.
    """
    ensure_seed_once()
    search = (search or "").strip().lower()
    provider = provider or "All"
    min_votes = int(min_votes or 0)
    limit_models = int(limit_models or 200)

    with FileLock(DB_LOCK):
        conn = db()
        try:
            base = pd.read_sql_query("SELECT arena, model, provider, rating, votes FROM ratings", conn)
        finally:
            # Close even when the query fails so the connection cannot leak.
            conn.close()
    if base.empty:
        return pd.DataFrame()

    if provider != "All":
        base = base[base["provider"] == provider]
    if min_votes > 0:
        base = base[base["votes"] >= min_votes]
    if search:
        # regex=False: user text such as "(" or "c++" must not be compiled
        # as a pattern (it would raise re.error).
        base = base[base["model"].str.lower().str.contains(search, regex=False, na=False)]
    if base.empty:
        return pd.DataFrame()

    # Work on an owned copy so the column assignments below do not write
    # into a filtered view of the original frame.
    base = base.copy()
    base["rank"] = base.groupby("arena")["rating"].rank(ascending=False, method="min").astype(int)
    base["score"] = base["rating"].round().astype(int)

    # Keep the models with the best average rank over the arenas they are in.
    pivot_rank = base.pivot_table(index=["model", "provider"], columns="arena", values="rank", aggfunc="min")
    avg_rank = pivot_rank.mean(axis=1, skipna=True).sort_values()
    chosen = avg_rank.head(limit_models).index
    base = base.set_index(["model", "provider"])
    base = base.loc[base.index.isin(chosen)].reset_index()

    out = pd.DataFrame({"Model": [m for (m, p) in chosen], "Provider": [p for (m, p) in chosen]}).reset_index(drop=True)
    for a in ARENAS:
        sub = base.loc[base["arena"] == a, ["model", "provider", "rank", "score", "votes"]].rename(columns={
            "model": "Model",
            "provider": "Provider",
            "rank": f"{a} Rank",
            "score": f"{a} Score",
            "votes": f"{a} Votes",
        })
        out = out.merge(sub, how="left", on=["Model", "Provider"])

    out["_avg_rank"] = out[[f"{a} Rank" for a in ARENAS]].mean(axis=1, skipna=True)
    out = out.sort_values("_avg_rank", ascending=True).drop(columns=["_avg_rank"])

    # Left merges introduce NaN for missing arenas; nullable Int64 keeps the
    # remaining values displayed as integers.
    for a in ARENAS:
        for col in (f"{a} Rank", f"{a} Score", f"{a} Votes"):
            out[col] = out[col].astype("Int64")
    return out
def kpis() -> Dict[str, str]:
    """Return headline counters for the page header, all as strings.

    Keys: ``models`` (distinct model names), ``entries`` (rating rows),
    ``votes`` (logged votes) and ``last_vote`` (latest vote timestamp, or
    an em dash when no vote has been logged).
    """
    ensure_seed_once()
    with FileLock(DB_LOCK):
        conn = db()
        try:
            cur = conn.cursor()
            cur.execute("SELECT COUNT(DISTINCT model) FROM ratings")
            models = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM ratings")
            rows = cur.fetchone()[0]
            cur.execute("SELECT COUNT(*) FROM votes_log")
            votes_n = cur.fetchone()[0]
            cur.execute("SELECT MAX(created_at) FROM votes_log")
            last_vote = cur.fetchone()[0]
        finally:
            # Close even when a query fails so the connection cannot leak.
            conn.close()
    return {
        "models": str(models),
        "entries": str(rows),
        "votes": str(votes_n),
        "last_vote": last_vote or "—",
    }
# ---------------------------
# Voting / profiles
# ---------------------------
def pick_pair(arena: str, provider: str = "All") -> Tuple[str, str]:
    """Pick two distinct models at random for a matchup.

    Candidates are the top 50 of the arena leaderboard for the provider
    pool. With fewer than two candidates the pool widens to every known
    model; if that is still too small a placeholder pair is returned.
    """
    board = leaderboard_df(arena, provider=provider, min_votes=0, limit=50)
    candidates = board["Model"].tolist()
    if len(candidates) < 2:
        candidates = all_models()
        if len(candidates) < 2:
            return ("model-a", "model-b")
    first, second = random.sample(candidates, 2)
    return (first, second)
def model_card_md(model: str, arena: Optional[str] = None) -> str:
    """Render a small Markdown card for a model.

    The card shows the model name and a provider badge. When ``arena`` is
    given it also shows that arena's score and vote count, and the badge
    uses the provider stored for that row (falling back to the guessed one
    when the model has no row) so it agrees with the leaderboard tables.
    """
    # Local import keeps this block self-contained.
    from html import escape

    provider = guess_provider(model)
    stats: List[str] = []
    if arena:
        r, v, provider = get_rating(arena, model)
        stats = ["", f"**Arena:** {arena}", f"**Score:** {int(round(r))}", f"**Votes:** {v}"]
    # Model ids and provider labels can come from the free-text Admin form,
    # so escape them before embedding them next to raw HTML.
    out = [f"### {escape(str(model))}", f"<span class='zen-badge'>{escape(str(provider))}</span>"]
    return "\n".join(out + stats)
def model_profile(model: str) -> Tuple[pd.DataFrame, str]:
    """Return a per-arena table and an HTML summary card for one model.

    The table has one row per arena in ``ARENAS`` (Arena, Score, Votes,
    Provider) sorted by score, using the default rating where the model has
    no stored row. The summary names the best and worst arena.

    Args:
        model: Model identifier; surrounding whitespace is stripped.

    Returns:
        ``(table, summary_html)``; an empty table and a "No model selected."
        card when the name is blank.
    """
    # Local import keeps this block self-contained.
    from html import escape

    ensure_seed_once()
    model = (model or "").strip()
    if not model:
        return pd.DataFrame(columns=["Arena", "Score", "Votes", "Provider"]), "<div class='zen-card'>No model selected.</div>"

    rows = []
    for a in ARENAS:
        r, v, p = get_rating(a, model)
        rows.append({"Arena": a, "Score": int(round(r)), "Votes": v, "Provider": p})
    df = pd.DataFrame(rows).sort_values("Score", ascending=False)
    best = df.iloc[0]
    worst = df.iloc[-1]

    # The summary is rendered as raw HTML, so the model id (which may come
    # from the free-text Admin form) must be escaped.
    safe_model = escape(model)
    safe_provider = escape(guess_provider(model))
    summary = (
        "<div class='zen-card'>"
        "<div class='zen-title'>Model Profile</div>"
        f"<div class='zen-sub'><b>{safe_model}</b> · Provider: <b>{safe_provider}</b></div>"
        "<div class='zen-hr'></div>"
        "<div class='zen-note'>"
        f"Best arena: <b>{best['Arena']}</b> (Score {best['Score']}, Votes {best['Votes']}). "
        f"Worst arena: <b>{worst['Arena']}</b> (Score {worst['Score']}, Votes {worst['Votes']})."
        "</div>"
        "</div>"
    )
    return df, summary
# ---------------------------
# App UI
# ---------------------------
# Module-level UI script: seeds the DB if needed, builds the Gradio Blocks app
# and launches it.
# NOTE(review): the source this was reviewed from had lost its indentation, so
# the nesting below (in particular which components sit inside each gr.Row)
# is reconstructed from the statement order — confirm against the deployed
# layout.
ensure_seed_once()
# Load the optional stylesheet; css stays empty when the file is absent.
css = ""
if os.path.exists(CSS_PATH):
    with open(CSS_PATH, "r", encoding="utf-8") as f:
        css = f.read()
with gr.Blocks(title="ZEN Model Arena Leaderboard") as demo:
    # Inject the stylesheet inline through an HTML component.
    if css:
        gr.HTML(f"<style>{css}</style>")
    # KPI values are read once here, while the UI is being built; the header
    # string below is not recomputed by any of the callbacks in this script.
    k = kpis()
    header = f"""
<div class="zen-card">
<div class="zen-title">ZEN Model Arena Leaderboard</div>
<p class="zen-sub">
Multi-arena rankings (Text · WebDev · Vision · Image · Video · Search) with a cross-arena overview matrix and live Elo voting.
</p>
<div class="zen-kpi">
<div><div class="k">Models</div><div class="v">{k['models']}</div><div class="s">unique IDs tracked</div></div>
<div><div class="k">Arena Entries</div><div class="v">{k['entries']}</div><div class="s">model × arena rows</div></div>
<div><div class="k">Votes Logged</div><div class="v">{k['votes']}</div><div class="s">pairwise results</div></div>
<div><div class="k">Last Vote</div><div class="v" style="font-size:12px; font-weight:700;">{k['last_vote']}</div><div class="s">UTC</div></div>
</div>
<div class="zen-hr"></div>
<span class="zen-badge">Gradio 6.2.0</span>
<span class="zen-badge">SQLite + FileLock</span>
<span class="zen-badge">Arena Matrix</span>
<span class="zen-badge">Search + Filters</span>
</div>
"""
    gr.HTML(header)
    # Provider choices are also fetched once at build time and shared by
    # every provider dropdown below.
    provider_choices = providers_list()
    with gr.Tabs():
        # Overview
        with gr.Tab("Leaderboard Overview"):
            gr.Markdown("### Top 10 (live DB)\nMirrors the snapshot format, but runs off the DB.")
            with gr.Row():
                arena_sel_ov = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
                provider_sel_ov = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
                min_votes_ov = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
                search_ov = gr.Textbox(value="", label="Search models", placeholder="gpt, gemini, claude, flux...")
            df_ov = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
            refresh_ov = gr.Button("Refresh overview", variant="primary")

            def refresh_overview(arena, provider, min_votes, search):
                # Top 10 rows of the selected arena with the current filters.
                return leaderboard_df(arena, search=search, provider=provider, min_votes=int(min_votes), limit=10)

            # The same callback is wired to the button and to demo.load.
            refresh_ov.click(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
            demo.load(refresh_overview, inputs=[arena_sel_ov, provider_sel_ov, min_votes_ov, search_ov], outputs=[df_ov])
        # Matrix
        with gr.Tab("Arena Overview Matrix"):
            gr.Markdown("### Cross-arena placements\nRank/Score/Votes per arena in one wide matrix.")
            with gr.Row():
                provider_sel_mx = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
                min_votes_mx = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
                search_mx = gr.Textbox(value="", label="Search")
                limit_mx = gr.Slider(10, 400, value=200, step=10, label="Max models")
            mx = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
            refresh_mx = gr.Button("Build / Refresh Matrix", variant="primary")

            def build_matrix(provider, min_votes, search, limit_models):
                # Slider values are cast to int before being passed on.
                return arena_overview_matrix(search=search, provider=provider, min_votes=int(min_votes), limit_models=int(limit_models))

            refresh_mx.click(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
            demo.load(build_matrix, inputs=[provider_sel_mx, min_votes_mx, search_mx, limit_mx], outputs=[mx])
        # Arena tabs
        for arena in ARENAS:
            with gr.Tab(arena):
                gr.Markdown(f"### {arena} Leaderboard (live DB)")
                with gr.Row():
                    provider_sel = gr.Dropdown(choices=provider_choices, value="All", label="Provider")
                    min_votes = gr.Slider(0, 100000, value=0, step=50, label="Min votes")
                    search = gr.Textbox(value="", label="Search")
                df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-600"])
                btn = gr.Button("Refresh", variant="primary")
                # `a=arena` binds the current loop value as a default argument,
                # so each tab's callback keeps its own arena instead of the
                # last value of the loop variable.
                btn.click(
                    lambda p, mv, s, a=arena: leaderboard_df(a, search=s, provider=p, min_votes=int(mv), limit=150),
                    inputs=[provider_sel, min_votes, search],
                    outputs=[df],
                )
                demo.load(lambda a=arena: leaderboard_df(a, limit=150), outputs=[df])
        # Voting
        with gr.Tab("Start Voting"):
            gr.Markdown("### Pairwise Voting (Elo)\nPick a winner for a specific arena. Scores update instantly.")
            with gr.Row():
                arena_vote = gr.Dropdown(choices=ARENAS, value="Text", label="Arena")
                provider_vote = gr.Dropdown(choices=provider_choices, value="All", label="Provider pool")
                new_match_btn = gr.Button("New Matchup", variant="primary")
            # State components holding the model names of the current matchup;
            # the vote callbacks read them back as inputs.
            left_state = gr.State("")
            right_state = gr.State("")
            with gr.Row():
                left_md = gr.Markdown()
                right_md = gr.Markdown()
            with gr.Row():
                left_btn = gr.Button("⬅ Left Wins", variant="primary")
                right_btn = gr.Button("Right Wins ➡", variant="primary")
            vote_status = gr.HTML()

            def new_matchup(arena, provider):
                # Returns both cards, both model names (for the State
                # components) and a status message, in output order.
                a, b = pick_pair(arena, provider=provider)
                return model_card_md(a, arena), model_card_md(b, arena), a, b, "<div class='zen-note'>New matchup ready.</div>"

            def left_wins(arena, left, right, provider):
                # Record the vote only when both names are set, then always
                # draw a fresh matchup.
                if left and right:
                    vote(arena, winner=left, loser=right)
                return new_matchup(arena, provider)

            def right_wins(arena, left, right, provider):
                # Mirror of left_wins with winner and loser swapped.
                if left and right:
                    vote(arena, winner=right, loser=left)
                return new_matchup(arena, provider)

            new_match_btn.click(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
            left_btn.click(left_wins, inputs=[arena_vote, left_state, right_state, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
            right_btn.click(right_wins, inputs=[arena_vote, left_state, right_state, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
            demo.load(new_matchup, inputs=[arena_vote, provider_vote], outputs=[left_md, right_md, left_state, right_state, vote_status])
        # Profiles
        with gr.Tab("Model Profiles"):
            gr.Markdown("### Inspect a model across arenas")
            # The model list is read once at build time; models added later
            # through the Admin tab are not pushed into this dropdown by any
            # callback in this script.
            models = all_models()
            model_dd = gr.Dropdown(choices=models, value=(models[0] if models else None), label="Model")
            prof_summary = gr.HTML()
            prof_df = gr.Dataframe(interactive=False, wrap=True, elem_classes=["zen-table-520"])
            load_btn = gr.Button("Load Profile", variant="primary")
            # model_profile returns (table, summary), matching the outputs order.
            load_btn.click(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
            demo.load(model_profile, inputs=[model_dd], outputs=[prof_df, prof_summary])
        # Admin
        with gr.Tab("Admin"):
            gr.Markdown("### Admin Tools")
            with gr.Row():
                reseed_force = gr.Checkbox(value=False, label="Force reseed (wipe DB first)")
                reseed_btn = gr.Button("Seed from data/seed_snapshot.json", variant="primary")
            reseed_out = gr.JSON()
            # The result dict of seed_from_json is shown as JSON.
            reseed_btn.click(lambda force: seed_from_json(force=bool(force)), inputs=[reseed_force], outputs=[reseed_out])
            gr.Markdown("#### Add a model to arenas")
            new_model = gr.Textbox(label="Model ID", placeholder="e.g., gpt-5.2, gemini-3-pro, claude-opus-4-5-…")
            new_provider = gr.Textbox(label="Provider (optional)", placeholder="Leave blank for auto-detect")
            arenas_add = gr.CheckboxGroup(choices=ARENAS, value=["Text"], label="Arenas")
            add_btn = gr.Button("Add Model", variant="primary")
            add_out = gr.HTML()

            def add_model(model_id, provider, arenas):
                # Registers the model in each selected arena via ensure_model.
                # NOTE(review): model_id and provider are interpolated into
                # the returned HTML without escaping — consider html.escape.
                # NOTE(review): `added` counts selected arenas, not rows
                # actually inserted; ensure_model leaves existing rows as-is.
                model_id = (model_id or "").strip()
                if not model_id:
                    return "<div class='zen-note'>Missing model id.</div>"
                provider = (provider or "").strip() or guess_provider(model_id)
                added = 0
                for a in arenas or []:
                    ensure_model(a, model_id, provider=provider)
                    added += 1
                return f"<div class='zen-note'>Added <b>{model_id}</b> ({provider}) to <b>{added}</b> arenas.</div>"

            add_btn.click(add_model, inputs=[new_model, new_provider, arenas_add], outputs=[add_out])
            gr.Markdown("#### Sanity")
            sanity = gr.JSON()

            def sanity_check():
                # Snapshot of file presence and DB contents for quick checks.
                return {
                    "time_utc": now_iso(),
                    "db_path_exists": os.path.exists(DB_PATH),
                    "seed_path_exists": os.path.exists(SEED_PATH),
                    "css_path_exists": os.path.exists(CSS_PATH),
                    "arenas": ARENAS,
                    "providers_detected": providers_list(),
                    "models_count": len(all_models()),
                }

            demo.load(sanity_check, outputs=[sanity])
# gr.close_all() is called before launching this app.
gr.close_all()
demo.launch()