Linker1907's picture
Prepare for HF Space deployment
4f8462a
#!/usr/bin/env python3
"""
HF Hub Benchmark Dashboard — Gradio app.
Run: python app.py
"""
import html as _html
import json
import urllib.request
import urllib.error
import concurrent.futures
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
import gradio as gr
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Directory that contains this script; the cache file lives next to it.
ROOT = Path(__file__).resolve().parents[0]
# JSON snapshot of the most recent benchmark fetch.
CACHE_PATH = ROOT.joinpath("dashboard_cache.json")
# Maximum age of a cached snapshot, in seconds (6 hours).
CACHE_TTL_SECONDS = 6 * 3600
# Categories in the order they are listed in the sidebar.  "Other" is the
# fallback bucket for benchmarks that match no rule and is kept last.
CATEGORY_ORDER = [
    "Knowledge",
    "Math / Reasoning",
    "Code / Engineering",
    "Agents",
    "Vision",
    "Audio / Speech",
    "Document / OCR",
    "Retrieval / Embedding",
    "NLP / Classification",
    "Robotics",
    "Other",
]
# Emoji shown in front of each category name.  Keys mirror CATEGORY_ORDER.
# The literals were previously stored mis-encoded (UTF-8 bytes read through a
# single-byte code page); they are restored to the intended glyphs here.
CATEGORY_ICONS = {
    "Knowledge": "🧠",
    "Math / Reasoning": "🔢",
    "Code / Engineering": "💻",
    "Agents": "🤖",
    "Vision": "👁️",
    "Audio / Speech": "🔊",
    "Document / OCR": "📄",
    "Retrieval / Embedding": "🔎",
    "NLP / Classification": "🏷️",
    "Robotics": "🦾",
    "Other": "📦",
}
# Short, human-friendly labels keyed by Hub dataset id ("owner/name").
# Ids that are not listed here are labelled with the part after the last "/"
# (see the lookup in fetch_all_data).
BENCHMARK_DISPLAY_NAMES = {
    "openai/gsm8k": "GSM8K",
    "Idavidrein/gpqa": "GPQA",
    "allenai/olmOCR-bench": "olmOCR-Bench",
    "llamaindex/ParseBench": "ParseBench",
    "mercor/apex-agents": "APEX-Agents",
    "harborframework/terminal-bench-2.0": "Terminal-Bench 2.0",
    "SWE-bench/SWE-bench_Verified": "SWE-bench Verified",
    "TIGER-Lab/MMLU-Pro": "MMLU-Pro",
    "hf-audio/open-asr-leaderboard": "Open ASR Leaderboard",
    "MathArena/aime_2026": "AIME 2026",
    "claw-eval/Claw-Eval": "Claw-Eval",
    "cais/hle": "HLE",
    "likaixin/ScreenSpot-Pro": "ScreenSpot-Pro",
    "nvidia/compute-eval": "ComputeEval",
    "ScaleAI/SWE-bench_Pro": "SWE-bench Pro",
    "FutureMa/EvasionBench": "EvasionBench",
    "mteb/BRIGHT": "BRIGHT",
    "Delores-Lin/MDPBench": "MDPBench",
    "mteb/arguana": "ArguAna",
    "MMMU/MMMU_Pro": "MMMU-Pro",
    "LEXam-Benchmark/LEXam": "LEXam",
    "mercor/ACE": "ACE",
    "mercor/APEX-v1-extended": "APEX-v1",
    "VLABench/vlabench_primitive_ft_lerobot_video": "VLABench",
    "tiiuae/PBench": "PBench",
    "MathArena/hmmt_feb_2026": "HMMT Feb 2026",
    "collinear-ai/yc-bench": "YC-Bench",
    "internlm/WildClawBench": "WildClawBench",
    "MME-Benchmarks/Video-MME-v2": "Video-MME v2",
    "open-agent-leaderboard/results": "Open Agent Leaderboard",
}
# Stylesheet handed to gr.Blocks(css=...).  The "#..." selectors target the
# elem_id values assigned in build_app; the ".topbar*", ".stat-pill" and
# ".hf-*" classes are emitted by the HTML render helpers.
# The "Layout columns" and "Filter checkbox" groups used to be repeated
# verbatim further down; each now appears once.
# NOTE(review): the ".hf-sidebar*", ".hf-cat-*", ".hf-section-*", ".hf-card*"
# and "#cat_trigger, #bench_trigger" rules do not match anything created in
# build_app -- presumably left over from an earlier HTML/JS-driven layout.
# Confirm before removing them.
CUSTOM_CSS = """
/* ---- Topbar ---- */
.topbar {
display: flex; align-items: center; justify-content: space-between;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white; padding: 14px 24px; border-radius: 10px;
margin-bottom: 8px; flex-wrap: wrap; gap: 12px;
}
.topbar-title { font-size: 18px; font-weight: 700; margin-bottom: 2px; }
.topbar-meta { font-size: 11px; opacity: 0.85; }
.topbar-pills { display: flex; gap: 8px; flex-wrap: wrap; }
.stat-pill {
background: rgba(255,255,255,0.2); border-radius: 20px;
padding: 4px 14px; font-size: 12px; white-space: nowrap;
}
.stat-pill b { font-size: 14px; }
/* ---- Layout columns ---- */
#sidebar-col {
background: white !important; padding: 0 !important;
border-right: 1px solid #e5e7eb !important;
border-radius: 10px 0 0 10px !important;
}
#main-col {
background: #f8fafc !important; padding: 18px 22px !important;
border-radius: 0 10px 10px 0 !important; min-width: 0 !important;
}
#sidebar-col > .form, #main-col > .form {
background: transparent !important; box-shadow: none !important;
border: none !important; padding: 0 !important;
}
/* ---- Sidebar Radio → nav buttons ---- */
#cat_radio {
background: transparent !important; border: none !important;
box-shadow: none !important; padding: 0 !important;
}
#cat_radio > .wrap { flex-direction: column !important; gap: 0 !important; padding: 0 !important; }
#cat_radio label {
display: flex !important; align-items: center !important;
padding: 8px 12px !important; margin: 0 !important;
border-left: 3px solid transparent !important; border-radius: 0 !important;
cursor: pointer !important; font-size: 12px !important;
color: #374151 !important; background: white !important;
width: 100% !important; box-sizing: border-box !important; gap: 0 !important;
}
#cat_radio label:hover { background: #f3f4f6 !important; }
#cat_radio label:has(input:checked) {
background: #ede9fe !important; border-left-color: #7c3aed !important;
color: #5b21b6 !important; font-weight: 600 !important;
}
#cat_radio input[type="radio"] { display: none !important; }
#cat_radio .wrap span { margin-left: 0 !important; padding-left: 0 !important; }
/* ---- Bench cards Radio ---- */
#bench_radio {
background: transparent !important; border: none !important;
box-shadow: none !important; padding: 0 !important;
}
#bench_radio > .wrap {
flex-direction: row !important; flex-wrap: wrap !important;
gap: 10px !important; padding: 4px 0 12px !important;
}
#bench_radio label {
display: flex !important; align-items: center !important;
padding: 10px 14px !important; border: 2px solid #e5e7eb !important;
border-radius: 10px !important; cursor: pointer !important;
font-size: 12px !important; background: white !important;
color: #374151 !important; min-width: 150px !important;
margin: 0 !important; gap: 0 !important; transition: border-color 0.15s !important;
}
#bench_radio label:hover { border-color: #a78bfa !important; }
#bench_radio label:has(input:checked) {
border-color: #7c3aed !important; background: #faf5ff !important;
font-weight: 600 !important; color: #5b21b6 !important;
}
#bench_radio input[type="radio"] { display: none !important; }
#bench_radio .wrap span { margin-left: 0 !important; padding-left: 0 !important; }
/* ---- Filter checkbox ---- */
#providers-filter { margin: 2px 0 10px; }
#providers-filter > label { font-size: 12px !important; color: #6b7280 !important; }
/* ---- Sidebar HTML ---- */
.hf-sidebar { display: flex; flex-direction: column; padding: 10px 0; }
.hf-sidebar-label {
font-size: 10px; font-weight: 700; color: #9ca3af;
text-transform: uppercase; letter-spacing: 0.8px; padding: 0 16px 8px;
}
.hf-cat-btn {
display: flex; align-items: center; gap: 9px; width: 100%;
padding: 9px 16px; border: none; background: none; cursor: pointer;
border-left: 3px solid transparent; font-size: 13px; color: #374151;
text-align: left; transition: background 0.1s;
}
.hf-cat-btn:hover { background: #f3f4f6; }
.hf-cat-active {
background: #ede9fe !important; border-left-color: #7c3aed !important;
color: #5b21b6 !important; font-weight: 600;
}
.hf-cat-active .hf-cat-badge { background: #ddd6fe !important; color: #7c3aed !important; }
.hf-cat-icon { font-size: 15px; min-width: 20px; }
.hf-cat-name { flex: 1; }
.hf-cat-badge {
background: #f3f4f6; border-radius: 12px;
padding: 1px 8px; font-size: 11px; color: #6b7280;
}
/* ---- Benchmark cards HTML ---- */
.hf-section-head { display: flex; align-items: baseline; gap: 12px; margin-bottom: 12px; }
.hf-section-title { font-size: 16px; font-weight: 700; color: #111827; }
.hf-section-meta { font-size: 12px; color: #9ca3af; }
.hf-cards { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 14px; }
.hf-card {
border: 2px solid #e5e7eb; border-radius: 10px; padding: 10px 14px;
cursor: pointer; background: white; min-width: 150px;
transition: border-color 0.15s, box-shadow 0.15s;
}
.hf-card:hover { border-color: #a78bfa; box-shadow: 0 1px 6px rgba(124,58,237,0.1); }
.hf-card-active { border-color: #7c3aed !important; background: #faf5ff !important; }
.hf-card-name { font-size: 13px; font-weight: 600; color: #111827; }
.hf-card-active .hf-card-name { color: #5b21b6; }
.hf-card-count { font-size: 11px; color: #6b7280; margin-top: 3px; }
.hf-card-owner { font-size: 10px; color: #9ca3af; margin-top: 2px; }
/* ---- JS bridge textboxes (rendered but invisible) ---- */
#cat_trigger, #bench_trigger {
display: none !important;
position: absolute !important;
pointer-events: none !important;
}
/* ---- Leaderboard HTML ---- */
.hf-lb { border: 1px solid #e5e7eb; border-radius: 10px; overflow: hidden; background: white; }
.hf-lb-head {
display: flex; align-items: center; justify-content: space-between;
padding: 10px 16px; border-bottom: 1px solid #f3f4f6; background: #f9fafb;
}
.hf-lb-title { font-size: 13px; font-weight: 600; color: #374151; }
.hf-lb-meta { display: flex; align-items: center; gap: 12px; }
.hf-lb-count { font-size: 11px; color: #9ca3af; }
.hf-hub-link { font-size: 11px; color: #7c3aed; text-decoration: none; font-weight: 500; }
.hf-hub-link:hover { text-decoration: underline; }
.hf-lb-scroll { overflow-x: auto; }
.hf-table { width: 100%; border-collapse: collapse; font-size: 12px; }
.hf-table thead th {
padding: 7px 12px; text-align: left; font-size: 10px; font-weight: 700;
color: #6b7280; text-transform: uppercase; letter-spacing: 0.4px;
white-space: nowrap; background: white; border-bottom: 1px solid #f3f4f6;
}
.hf-table td { padding: 7px 12px; border-bottom: 1px solid #f3f4f6; vertical-align: middle; }
.hf-table tbody tr:last-child td { border-bottom: none; }
.hf-table tbody tr:nth-child(even) td { background: #fafafa; }
.hf-table tbody tr:hover td { background: #faf5ff !important; }
.hf-rank { width: 44px; text-align: center; font-size: 17px; }
.hf-rank-num { color: #9ca3af; font-size: 12px; font-variant-numeric: tabular-nums; }
.hf-model a { color: #2563eb; text-decoration: none; font-size: 11px; word-break: break-all; }
.hf-model a:hover { text-decoration: underline; }
.hf-score { font-variant-numeric: tabular-nums; font-weight: 600; color: #111827; white-space: nowrap; }
.hf-price { font-variant-numeric: tabular-nums; color: #059669; white-space: nowrap; }
.hf-ctx, .hf-params, .hf-ttft, .hf-tput { white-space: nowrap; }
.hf-ttft { color: #7c3aed; }
.hf-tput { color: #0369a1; }
.hf-lic { color: #6b7280; font-size: 11px; }
.hf-params { font-weight: 500; }
.hf-provs { display: flex; flex-wrap: wrap; gap: 3px; }
.hf-chip {
background: #dbeafe; color: #1e40af;
border-radius: 4px; padding: 1px 6px;
font-size: 10px; font-weight: 500; white-space: nowrap;
}
.hf-chip-more { background: #f3f4f6 !important; color: #6b7280 !important; }
.hf-na { color: #d1d5db; }
.hf-empty { padding: 48px 24px; text-align: center; color: #9ca3af; font-size: 14px; }
"""
# ---------------------------------------------------------------------------
# HF API helpers
# ---------------------------------------------------------------------------
def _http_get_json(url: str, token: str | None = None, timeout: int = 30):
    """Issue a GET request and return the parsed JSON body.

    Args:
        url: URL to request.
        token: Optional bearer token; when truthy it is sent in the
            ``Authorization`` header.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The result of ``json.loads`` on the UTF-8 decoded response body.

    Nothing is caught here: errors from ``urlopen``, from decoding, or from
    JSON parsing propagate to the caller.
    """
    headers = {"Accept": "application/json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    return json.loads(body.decode("utf-8"))
def _read_token() -> str | None:
import os
p = Path(os.path.expanduser("~/.cache/huggingface/token"))
if p.exists():
tok = p.read_text().strip()
if tok:
return tok
return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
def discover_benchmarks(token=None) -> list[dict]:
    """List the datasets the Hub returns for the ``benchmark:official`` filter.

    Args:
        token: Optional access token forwarded to ``_http_get_json``.

    Returns:
        One dict per dataset with the keys ``id``, ``tags`` and
        ``description`` (cut to at most 200 characters).  Response items that
        are not dicts or that carry no ``id`` are dropped.
    """
    payload = _http_get_json(
        "https://huggingface.co/api/datasets?filter=benchmark:official&limit=1000",
        token,
        timeout=30,
    )
    return [
        {
            "id": item["id"],
            "tags": item.get("tags", []),
            "description": (item.get("description") or "")[:200],
        }
        for item in payload
        if isinstance(item, dict) and "id" in item
    ]
def get_leaderboard(dataset_id: str, token=None) -> list[dict]:
    """Fetch the leaderboard entries of one benchmark dataset.

    This is a best-effort call: a benchmark whose leaderboard cannot be
    fetched or parsed yields an empty list, so a single bad benchmark does
    not abort a full crawl.

    Args:
        dataset_id: Hub dataset id, e.g. ``"openai/gsm8k"``.
        token: Optional access token forwarded to ``_http_get_json``.

    Returns:
        The entry dicts.  The endpoint's response may be a bare list or an
        object with an ``entries`` list; any other shape gives ``[]``.
        Items that are not dicts are dropped.
    """
    url = f"https://huggingface.co/api/datasets/{dataset_id}/leaderboard"
    try:
        payload = _http_get_json(url, token, timeout=30)
    except (OSError, ValueError):
        # OSError covers HTTPError/URLError as well as socket timeouts and
        # connection resets raised while reading the body; ValueError covers
        # invalid JSON and undecodable bytes.
        return []
    if isinstance(payload, dict):
        payload = payload.get("entries")
    if not isinstance(payload, list):
        return []
    return [entry for entry in payload if isinstance(entry, dict)]
# ---------------------------------------------------------------------------
# Categorisation
# ---------------------------------------------------------------------------
def categorize_benchmark(bench: dict) -> list[str]:
    """Assign a benchmark to one or more dashboard categories.

    Two rule sets are applied: exact membership of Hub tags in
    ``bench["tags"]``, and substring matches against the lower-cased dataset
    id.  A benchmark matching no rule is filed under ``"Other"``.

    Args:
        bench: Benchmark record with an ``id`` and, optionally, ``tags``.

    Returns:
        The matching category names, ordered as in ``CATEGORY_ORDER``.
    """
    tags = bench.get("tags", [])
    name = bench["id"].lower()

    # (category, tags): the category applies if any listed tag is present.
    tag_rules = (
        ("Audio / Speech", ("modality:audio", "modality:speech")),
        ("Vision", ("modality:image", "modality:video")),
        ("Document / OCR", ("modality:document",)),
        ("Robotics", ("task_categories:robotics",)),
        ("Retrieval / Embedding", ("task_categories:text-retrieval",)),
    )
    # (category, fragments): the category applies if any fragment occurs in
    # the lower-cased dataset id.
    keyword_rules = (
        ("Math / Reasoning", ("math", "aime", "hmmt", "gsm8k")),
        ("Code / Engineering", ("swe", "terminal", "compute-eval")),
        ("Agents", ("agent", "claw", "apex-agent", "wildclaw", "yc-bench")),
        ("Knowledge", ("mmlu", "gpqa", "hle")),
        ("Document / OCR", ("ocr", "parse", "mdp")),
        ("Audio / Speech", ("asr",)),
        ("Vision", ("screen", "mmmu", "video", "pbench")),
        ("NLP / Classification", ("evasion", "lex")),
        ("Retrieval / Embedding", ("bright", "arguana")),
    )

    matched = set()
    for category, wanted in tag_rules:
        if any(tag in tags for tag in wanted):
            matched.add(category)
    for category, fragments in keyword_rules:
        if any(fragment in name for fragment in fragments):
            matched.add(category)
    if not matched:
        matched.add("Other")
    return sorted(matched, key=lambda c: CATEGORY_ORDER.index(c) if c in CATEGORY_ORDER else 99)
# ---------------------------------------------------------------------------
# Data fetching & aggregation
# ---------------------------------------------------------------------------
def _summarize_entries(entries: list) -> list[dict]:
    """Turn raw leaderboard entries into rank-ordered ``model_details`` rows."""
    rows = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue  # malformed entry: there is nothing to read a model id from
        model_id = entry.get("modelId") or entry.get("model_id") or entry.get("model") or ""
        if not model_id:
            continue
        rows.append({
            "rank": entry.get("rank"),
            "model_id": model_id,
            "value": entry.get("value"),
            "verified": entry.get("verified", False),
        })
    # Ranked rows first (ascending), unranked rows last in their original
    # order.  The rank is compared against None explicitly so that a rank of
    # 0 is not mistaken for "missing".
    rows.sort(key=lambda row: (row["rank"] is None, 0 if row["rank"] is None else row["rank"]))
    return rows


def fetch_all_data() -> dict:
    """Crawl every official benchmark and build the dashboard snapshot.

    Discovers the benchmarks, fetches each leaderboard in turn, categorises
    each benchmark and aggregates per-category totals.

    Returns:
        A JSON-serialisable dict with the keys ``total_benchmarks``,
        ``total_unique_models``, ``benchmarks_with_entries``,
        ``benchmarks_empty``, ``timestamp`` (UTC, ISO 8601, second
        precision), ``all_models``, ``benchmark_data`` (one record per
        benchmark) and ``categories`` (benchmark and unique-model counts per
        category, in ``CATEGORY_ORDER`` order; categories without benchmarks
        are omitted).

    Errors raised by ``discover_benchmarks`` propagate to the caller.
    """
    token = _read_token()
    benchmarks = discover_benchmarks(token)

    all_models: set[str] = set()
    benchmark_data = []
    for bench in benchmarks:
        bid = bench["id"]
        model_details = _summarize_entries(get_leaderboard(bid, token))
        models = {row["model_id"] for row in model_details}
        all_models |= models
        benchmark_data.append({
            "id": bid,
            # Fall back to the part after the last "/" for unlisted ids.
            "display_name": BENCHMARK_DISPLAY_NAMES.get(bid, bid.split("/")[-1]),
            "categories": categorize_benchmark(bench),
            "num_models": len(models),
            "models": sorted(models),
            "model_details": model_details,
            "description": bench["description"],
        })

    # A benchmark may belong to several categories and is counted in each.
    cat_benchmarks: dict[str, list] = defaultdict(list)
    cat_models: dict[str, set] = defaultdict(set)
    for bd in benchmark_data:
        for cat in bd["categories"]:
            cat_benchmarks[cat].append(bd)
            cat_models[cat].update(bd["models"])

    return {
        "total_benchmarks": len(benchmarks),
        "total_unique_models": len(all_models),
        "benchmarks_with_entries": sum(1 for bd in benchmark_data if bd["num_models"] > 0),
        "benchmarks_empty": sum(1 for bd in benchmark_data if bd["num_models"] == 0),
        "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "all_models": sorted(all_models),
        "benchmark_data": benchmark_data,
        "categories": {
            cat: {
                "benchmarks": len(cat_benchmarks[cat]),
                "unique_models": len(cat_models[cat]),
            }
            for cat in CATEGORY_ORDER
            if cat in cat_benchmarks
        },
    }
def load_cached_data() -> dict | None:
    """Return the cached snapshot if it exists and is still fresh.

    Returns:
        The dict stored at ``CACHE_PATH`` when its ``timestamp`` is a
        timezone-aware ISO 8601 string less than ``CACHE_TTL_SECONDS`` old.
        ``None`` in every other case: no cache file, unreadable or invalid
        content, missing or malformed timestamp, or a stale snapshot.
    """
    try:
        cached = json.loads(CACHE_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Missing or unreadable file, or content that is not valid UTF-8 JSON.
        return None
    if not isinstance(cached, dict):
        return None
    stamp = cached.get("timestamp")
    if not isinstance(stamp, str) or not stamp:
        return None
    try:
        written_at = datetime.fromisoformat(stamp)
    except ValueError:
        return None
    if written_at.tzinfo is None:
        # A naive timestamp cannot be compared with an aware "now"; treat
        # the snapshot as unusable rather than guess its timezone.
        return None
    age_seconds = (datetime.now(timezone.utc) - written_at).total_seconds()
    return cached if age_seconds < CACHE_TTL_SECONDS else None
def save_cache(data: dict) -> None:
    """Serialise ``data`` as indented JSON and write it to ``CACHE_PATH``.

    Errors from serialisation or from writing the file propagate.
    """
    serialized = json.dumps(data, indent=2)
    CACHE_PATH.write_text(serialized)
# ---------------------------------------------------------------------------
# UI helpers
# ---------------------------------------------------------------------------
# Module-level state shared by the render helpers and the Gradio callbacks.
# Current dashboard snapshot (the dict produced by fetch_all_data).
_app_data: dict = dict()
# Inference-router summary per model:
# model_id -> {providers, cheapest_input, cheapest_output, context_length,
#              fastest_ttft_ms, fastest_throughput}
_router_data: dict = dict()
# Hub metadata per model: model_id -> {license, params}
_model_meta_cache: dict = dict()
def _render_topbar(data: dict) -> str:
    """Render the gradient banner with the title and the headline counters.

    Args:
        data: Dashboard snapshot.  ``total_benchmarks``,
            ``total_unique_models``, ``benchmarks_with_entries`` and
            ``benchmarks_empty`` are required; ``timestamp`` and
            ``benchmark_data`` are optional.

    Returns:
        An HTML fragment styled by the ``.topbar*`` / ``.stat-pill`` rules.

    Raises:
        KeyError: If one of the required counters is missing.
    """
    # Keep "YYYY-MM-DDTHH:MM:SS" and drop the UTC offset of the ISO timestamp.
    ts = data.get("timestamp", "?")[:19]
    total_entries = sum(bd["num_models"] for bd in data.get("benchmark_data", []))
    stats = [
        (data["total_benchmarks"], "benchmarks"),
        (data["total_unique_models"], "models"),
        (f"{total_entries:,}", "entries"),
        (data["benchmarks_with_entries"], "active"),
        (data["benchmarks_empty"], "empty"),
    ]
    pills = "".join(
        f'<div class="stat-pill"><b>{value}</b> {label}</div>' for value, label in stats
    )
    # The trophy and the middle dot were previously stored mis-encoded and
    # showed up as garbage characters in the banner.
    return (
        '<div class="topbar">'
        '<div><div class="topbar-title">🏆 HF Hub Benchmark Dashboard</div>'
        f'<div class="topbar-meta">Last updated: {ts} UTC · auto-refreshes every 6h</div></div>'
        f'<div class="topbar-pills">{pills}</div></div>'
    )
def _load_router_data(token: str | None = None) -> dict:
    """Summarise the models listed by the HF inference router.

    Only providers whose ``status`` is ``"live"`` are considered; models
    without a live provider are skipped.

    Args:
        token: Optional access token forwarded to ``_http_get_json``.

    Returns:
        ``{model_id: summary}`` where each summary has the keys
        ``providers``, ``cheapest_input``, ``cheapest_output``,
        ``context_length``, ``fastest_ttft_ms`` and ``fastest_throughput``.
        An empty dict if the request itself fails.
    """
    try:
        payload = _http_get_json("https://router.huggingface.co/v1/models", token, timeout=30)
    except Exception:
        return {}

    def _price(provider: dict, side: str, default=None):
        # Price for "input" or "output"; ``default`` when it is not listed.
        return provider.get("pricing", {}).get(side, default)

    summaries: dict = {}
    for model in payload.get("data", []):
        model_id = model.get("id", "")
        if not model_id:
            continue
        live = [prov for prov in model.get("providers", []) if prov.get("status") == "live"]
        if not live:
            continue
        # Providers without a listed price rank last (sentinel 1e9), so they
        # are only chosen when no live provider lists a price at all.
        by_output = min(live, key=lambda prov: _price(prov, "output", 1e9))
        by_input = min(live, key=lambda prov: _price(prov, "input", 1e9))
        latencies = [
            prov["first_token_latency_ms"] for prov in live if prov.get("first_token_latency_ms")
        ]
        speeds = [prov["throughput"] for prov in live if prov.get("throughput")]
        summaries[model_id] = {
            "providers": [prov["provider"] for prov in live],
            "cheapest_input": _price(by_input, "input"),
            "cheapest_output": _price(by_output, "output"),
            "context_length": max((prov.get("context_length") or 0) for prov in live),
            "fastest_ttft_ms": min(latencies) if latencies else None,
            "fastest_throughput": max(speeds) if speeds else None,
        }
    return summaries
def _load_model_metas(model_ids: list[str], token: str | None = None) -> None:
    """Fetch license and parameter count for models that are not cached yet.

    Results are stored in ``_model_meta_cache``.  A model whose lookup fails
    is cached as ``{}``, so it is not requested again by later calls.

    Args:
        model_ids: Hub model ids; duplicates and already-cached ids are
            skipped.
        token: Optional access token forwarded to ``_http_get_json``.
    """
    # dict.fromkeys de-duplicates while keeping the first-seen order.
    pending = [mid for mid in dict.fromkeys(model_ids) if mid not in _model_meta_cache]
    if not pending:
        return

    def _fetch_one(mid: str) -> tuple[str, dict]:
        # With ``expand[]`` the Hub returns only the requested properties, so
        # ``tags`` must be requested explicitly for the license fallback
        # below to have anything to look at.
        url = (
            f"https://huggingface.co/api/models/{mid}"
            "?expand[]=safetensors&expand[]=cardData&expand[]=tags"
        )
        try:
            info = _http_get_json(url, token, timeout=10)
        except Exception:
            return mid, {}
        if not isinstance(info, dict):
            return mid, {}
        lic = (info.get("cardData") or {}).get("license", "")
        if not lic:
            # Fall back to a "license:<name>" tag when the card has no license.
            for tag in info.get("tags") or []:
                if isinstance(tag, str) and tag.startswith("license:"):
                    lic = tag[len("license:"):]
                    break
        total = (info.get("safetensors") or {}).get("total", 0)
        params = ""
        if total:
            billions = total / 1e9
            params = f"{round(billions)}B" if billions >= 1 else f"{round(total / 1e6)}M"
        return mid, {"license": lic, "params": params}

    workers = min(20, len(pending))
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        # Results are written from this thread only; workers never touch the cache.
        for mid, meta in pool.map(_fetch_one, pending):
            _model_meta_cache[mid] = meta
def _sidebar_choices(data: dict) -> list[tuple[str, str]]:
    """Build the ``(label, value)`` choices for the category radio.

    Categories follow ``CATEGORY_ORDER``; those absent from
    ``data["categories"]`` are left out.  The label combines the icon, the
    category name and its benchmark count; the value is the bare name.
    """
    present = data.get("categories", {})
    return [
        (f"{CATEGORY_ICONS.get(name, '')} {name} ({present[name]['benchmarks']})", name)
        for name in CATEGORY_ORDER
        if name in present
    ]
def _card_choices(data: dict, category: str) -> list[tuple[str, str]]:
    """Build the ``(label, value)`` choices for the benchmark radio.

    Args:
        data: Dashboard snapshot with a ``benchmark_data`` list.
        category: Only benchmarks filed under this category are included.

    Returns:
        Choices ordered by descending model count.  The label has the form
        ``"<display name> · <n> models [<owner>]"`` (the owner part is
        omitted when the dataset id has none); the value is the dataset id.
    """
    in_category = [bd for bd in data["benchmark_data"] if category in bd["categories"]]
    in_category.sort(key=lambda bd: bd["num_models"], reverse=True)

    choices = []
    for bd in in_category:
        owner, slash, _ = bd["id"].partition("/")
        # The separator is a middle dot; it was previously stored
        # mis-encoded and rendered as two garbage characters.
        label = f"{bd['display_name']} · {bd['num_models']} models"
        if slash and owner:
            label += f" [{owner}]"
        choices.append((label, bd["id"]))
    return choices
def _cat_header(data: dict, cat: str) -> str:
    """Return the Markdown heading shown above the benchmark cards.

    Args:
        data: Dashboard snapshot; per-category counts are read from
            ``data["categories"]``.
        cat: Category name.  Unknown categories are shown with zero counts.

    Returns:
        A level-3 Markdown heading with the icon, the category name, the
        benchmark count and the unique-model count.
    """
    icon = CATEGORY_ICONS.get(cat, "")
    info = data.get("categories", {}).get(cat, {})
    # The separators are middle dots; they were previously stored mis-encoded.
    return (
        f"### {icon} {cat} &nbsp;·&nbsp; {info.get('benchmarks', 0)} benchmarks"
        f" · {info.get('unique_models', 0)} models"
    )
# Placeholder shown in leaderboard cells that have no value.  Shared by the
# row builder and the renderer, which compares the provider cell against it.
_NO_VALUE = "—"
# Medals shown instead of the numeric rank for the first three places.
_RANK_MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}
# Column headings, in the order of the values in each row from _lb_rows.
_LEADERBOARD_COLUMNS = [
    "", "Model", "Score", "In $/1M", "Out $/1M", "Context",
    "TTFT", "Throughput", "License", "Params", "Providers",
]


def _render_row(row: list) -> str:
    """Render one ``_lb_rows`` row as a ``<tr>`` element."""
    rank, model_id, score, price_in, price_out, ctx, ttft, tput, lic, params, provs = row

    medal = _RANK_MEDALS.get(rank) if isinstance(rank, (int, float)) else None
    rank_html = medal or f'<span class="hf-rank-num">{_html.escape(str(rank))}</span>'

    safe_mid = _html.escape(model_id)
    model_html = f'<a href="https://huggingface.co/{safe_mid}" target="_blank">{safe_mid}</a>'

    if provs == _NO_VALUE:
        prov_html = f'<span class="hf-na">{_NO_VALUE}</span>'
    else:
        chips = []
        for name in (part.strip() for part in provs.split(",")):
            # The "+N" overflow marker gets the muted chip style.
            css = "hf-chip hf-chip-more" if name.startswith("+") else "hf-chip"
            chips.append(f'<span class="{css}">{_html.escape(name)}</span>')
        prov_html = f'<div class="hf-provs">{"".join(chips)}</div>'

    text_cells = [
        ("hf-score", score),
        ("hf-price", price_in),
        ("hf-price", price_out),
        ("hf-ctx", ctx),
        ("hf-ttft", ttft),
        ("hf-tput", tput),
        ("hf-lic", lic),
        ("hf-params", params),
    ]
    return (
        "<tr>"
        f'<td class="hf-rank">{rank_html}</td>'
        f'<td class="hf-model">{model_html}</td>'
        + "".join(
            f'<td class="{css}">{_html.escape(str(value))}</td>' for css, value in text_cells
        )
        + f"<td>{prov_html}</td>"
        "</tr>"
    )


def _render_leaderboard(data: dict, bid: str, providers_only: bool = False) -> str:
    """Render the leaderboard table of one benchmark as an HTML fragment.

    Args:
        data: Dashboard snapshot with a ``benchmark_data`` list.
        bid: Dataset id of the benchmark; a falsy value yields a
            "select a benchmark" placeholder.
        providers_only: Forwarded to ``_lb_rows``; keep only models that
            have at least one inference provider.

    Returns:
        HTML styled by the ``.hf-lb*`` / ``.hf-table`` rules.  All values
        taken from the data are HTML-escaped.
    """
    if not bid:
        return '<div class="hf-empty">Select a benchmark to view its leaderboard.</div>'

    bench = {bd["id"]: bd for bd in data["benchmark_data"]}.get(bid, {})
    title = _html.escape(bench.get("display_name", bid))
    safe_bid = _html.escape(bid)
    hub_link = (
        f'<a class="hf-hub-link" href="https://huggingface.co/datasets/{safe_bid}" target="_blank">'
        f'↗ View on Hub</a>'
    )

    rows = _lb_rows(data, bid, providers_only)
    if not rows:
        return (
            f'<div class="hf-lb">'
            f'<div class="hf-lb-head">'
            f'<span class="hf-lb-title">{title}</span>'
            f'{hub_link}</div>'
            f'<div class="hf-empty">No entries yet.</div></div>'
        )

    thead = "<tr>" + "".join(f"<th>{h}</th>" for h in _LEADERBOARD_COLUMNS) + "</tr>"
    tbody = "".join(_render_row(row) for row in rows)
    return (
        f'<div class="hf-lb">'
        f'<div class="hf-lb-head">'
        f'<span class="hf-lb-title">{title}</span>'
        f'<div class="hf-lb-meta">'
        f'<span class="hf-lb-count">{len(rows)} entries</span>'
        f'{hub_link}</div></div>'
        f'<div class="hf-lb-scroll">'
        f'<table class="hf-table">'
        f'<thead>{thead}</thead>'
        f'<tbody>{tbody}</tbody>'
        f'</table></div></div>'
    )


def _fmt_ctx(n: int) -> str:
    """Format a context length compactly, e.g. ``128000`` -> ``"128K"``.

    Values of at least a million use an ``M`` suffix and values of at least
    a thousand a ``K`` suffix, with one decimal unless the scaled value is
    whole.  Zero is rendered as the "no value" placeholder.
    """
    for threshold, suffix in ((1_000_000, "M"), (1_000, "K")):
        if n >= threshold:
            scaled = n / threshold
            return f"{scaled:.0f}{suffix}" if scaled == int(scaled) else f"{scaled:.1f}{suffix}"
    return str(n) if n else _NO_VALUE


def _fmt_price(price) -> str:
    """Format a price with two decimals, or the placeholder when unknown."""
    return f"${price:.2f}" if price is not None else _NO_VALUE


def _lb_rows(data: dict, bid: str, providers_only: bool = False) -> list[list]:
    """Build the display rows for the first 50 entries of a benchmark.

    Args:
        data: Dashboard snapshot with a ``benchmark_data`` list.
        bid: Dataset id of the benchmark.
        providers_only: Keep only models for which ``_router_data`` lists
            at least one provider.

    Returns:
        One 11-item list per model, in the order of ``_LEADERBOARD_COLUMNS``:
        rank, model id, score, input price, output price, context length,
        time to first token, throughput, license, parameter count and a
        comma-separated provider string.  At most three providers are named;
        any remainder is appended as a separate ``+N`` item.

    Side effects:
        Fills ``_model_meta_cache`` for the returned models via
        ``_load_model_metas``.
    """
    lookup = {bd["id"]: bd for bd in data["benchmark_data"]}
    details = lookup.get(bid, {}).get("model_details", [])[:50]
    if providers_only:
        details = [m for m in details if _router_data.get(m["model_id"], {}).get("providers")]
    if not details:
        return []
    # Filter before fetching so metadata is only requested for shown models.
    _load_model_metas([m["model_id"] for m in details], _read_token())

    rows = []
    for m in details:
        mid = m["model_id"]
        meta = _model_meta_cache.get(mid, {})
        router = _router_data.get(mid, {})

        providers = router.get("providers", [])
        if providers:
            shown = list(providers[:3])
            if len(providers) > 3:
                # Its own comma-separated item so the renderer can style it.
                shown.append(f"+{len(providers) - 3}")
            prov_str = ", ".join(shown)
        else:
            prov_str = _NO_VALUE

        ttft = router.get("fastest_ttft_ms")
        tput = router.get("fastest_throughput")
        rows.append([
            m["rank"] if m["rank"] is not None else _NO_VALUE,
            mid,
            str(m["value"]) if m["value"] is not None else _NO_VALUE,
            _fmt_price(router.get("cheapest_input")),
            _fmt_price(router.get("cheapest_output")),
            _fmt_ctx(router.get("context_length") or 0),
            f"{ttft:,.0f} ms" if ttft is not None else _NO_VALUE,
            f"{tput:.0f} t/s" if tput is not None else _NO_VALUE,
            meta.get("license") or _NO_VALUE,
            meta.get("params") or _NO_VALUE,
            prov_str,
        ])
    return rows
# ---------------------------------------------------------------------------
# Gradio app
# ---------------------------------------------------------------------------
def build_app() -> gr.Blocks:
    """Load the dashboard data and assemble the Gradio interface.

    A fresh cache is used when available; otherwise all benchmarks are
    fetched and the result is cached.  Router data is always fetched anew.

    Side effects:
        Rebinds the module-level ``_app_data`` and ``_router_data``, may
        write the cache file and performs network requests.

    Returns:
        The ``gr.Blocks`` app, ready to be launched.
    """
    global _app_data, _router_data

    def _persist(snapshot: dict) -> None:
        # The cache only speeds up the next start.  Failing to write it
        # (e.g. on a read-only filesystem) must neither stop the dashboard
        # nor discard data that was fetched successfully.
        try:
            save_cache(snapshot)
        except OSError as exc:
            print(f"[dashboard] could not write cache file: {exc}")

    _app_data = load_cached_data()
    if _app_data is None:
        _app_data = fetch_all_data()
        _persist(_app_data)
    _router_data = _load_router_data(_read_token())

    s_choices = _sidebar_choices(_app_data)
    init_cat = s_choices[0][1] if s_choices else ""
    c_choices = _card_choices(_app_data, init_cat)
    init_bid = c_choices[0][1] if c_choices else ""

    with gr.Blocks(
        title="HF Hub Benchmark Dashboard",
        css=CUSTOM_CSS,
        theme=gr.themes.Soft(),
    ) as demo:
        topbar = gr.HTML(_render_topbar(_app_data))
        # Tracks the currently selected benchmark for the filter toggle.
        sel_bid = gr.State(init_bid)
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, min_width=170, elem_id="sidebar-col"):
                cat_radio = gr.Radio(
                    choices=s_choices, value=init_cat,
                    label="Categories", elem_id="cat_radio",
                )
            with gr.Column(scale=5, elem_id="main-col"):
                with gr.Row():
                    cat_header = gr.Markdown(_cat_header(_app_data, init_cat))
                    refresh_btn = gr.Button("🔄 Refresh Now", variant="primary", scale=0, min_width=150)
                bench_radio = gr.Radio(
                    choices=c_choices, value=init_bid,
                    show_label=False, elem_id="bench_radio",
                )
                providers_filter = gr.Checkbox(
                    label="Only show models with inference providers",
                    value=False, elem_id="providers-filter",
                )
                leaderboard = gr.HTML(_render_leaderboard(_app_data, init_bid))

        # ---- Event handlers ----
        def on_cat(cat: str, prov_only: bool):
            # New category: reload the cards and show the first benchmark.
            new_c = _card_choices(_app_data, cat)
            new_bid = new_c[0][1] if new_c else ""
            return (
                _cat_header(_app_data, cat),
                gr.update(choices=new_c, value=new_bid),
                _render_leaderboard(_app_data, new_bid, prov_only),
                new_bid,
            )

        def on_bench(bid: str, prov_only: bool):
            return _render_leaderboard(_app_data, bid, prov_only), bid

        def on_filter(bid: str, prov_only: bool):
            return _render_leaderboard(_app_data, bid, prov_only)

        def on_refresh(prov_only: bool):
            global _app_data, _router_data
            try:
                new_data = fetch_all_data()
                new_router = _load_router_data(_read_token())
            except Exception as e:
                # Keep showing the previous data and report the failure.  The
                # message is escaped because it is injected into HTML.
                err = (
                    f'<p style="color:#dc2626;padding:8px">'
                    f'⚠️ Refresh failed: {_html.escape(str(e))}</p>'
                )
                return (
                    _render_topbar(_app_data) + err,
                    gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
                )
            _app_data = new_data
            if new_router:
                # _load_router_data returns {} when its request fails; keep
                # the previous pricing data in that case.
                _router_data = new_router
            _persist(new_data)

            new_s = _sidebar_choices(_app_data)
            new_cat = new_s[0][1] if new_s else ""
            new_c = _card_choices(_app_data, new_cat)
            new_bid = new_c[0][1] if new_c else ""
            return (
                _render_topbar(_app_data),
                gr.update(choices=new_s, value=new_cat),
                _cat_header(_app_data, new_cat),
                gr.update(choices=new_c, value=new_bid),
                _render_leaderboard(_app_data, new_bid, prov_only),
                new_bid,
            )

        # NOTE(review): ``.change`` presumably also fires when a handler
        # updates the component's value, so a category switch may render the
        # leaderboard twice -- confirm against the Gradio event docs before
        # switching to an input-only listener.
        cat_radio.change(
            fn=on_cat,
            inputs=[cat_radio, providers_filter],
            outputs=[cat_header, bench_radio, leaderboard, sel_bid],
        )
        bench_radio.change(
            fn=on_bench,
            inputs=[bench_radio, providers_filter],
            outputs=[leaderboard, sel_bid],
        )
        providers_filter.change(
            fn=on_filter,
            inputs=[sel_bid, providers_filter],
            outputs=[leaderboard],
        )
        refresh_btn.click(
            fn=on_refresh,
            inputs=[providers_filter],
            outputs=[topbar, cat_radio, cat_header, bench_radio, leaderboard, sel_bid],
            show_progress="full",
        )
    return demo
# Script entry point: build the dashboard and start the Gradio server.
if __name__ == "__main__":
    demo = build_app()
    demo.launch()