#!/usr/bin/env python3 """ HF Hub Benchmark Dashboard โ Gradio app. Run: python app.py """ import html as _html import json import urllib.request import urllib.error import concurrent.futures from collections import defaultdict from datetime import datetime, timezone from pathlib import Path import gradio as gr # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- ROOT = Path(__file__).resolve().parent CACHE_PATH = ROOT / "dashboard_cache.json" CACHE_TTL_SECONDS = 6 * 60 * 60 # 6 hours CATEGORY_ORDER = [ "Knowledge", "Math / Reasoning", "Code / Engineering", "Agents", "Vision", "Audio / Speech", "Document / OCR", "Retrieval / Embedding", "NLP / Classification", "Robotics", "Other", ] CATEGORY_ICONS = { "Knowledge": "๐ง ", "Math / Reasoning": "๐ข", "Code / Engineering": "๐ป", "Agents": "๐ค", "Vision": "๐๏ธ", "Audio / Speech": "๐", "Document / OCR": "๐", "Retrieval / Embedding": "๐", "NLP / Classification": "๐ท๏ธ", "Robotics": "๐ฆพ", "Other": "๐ฆ", } BENCHMARK_DISPLAY_NAMES = { "openai/gsm8k": "GSM8K", "Idavidrein/gpqa": "GPQA", "allenai/olmOCR-bench": "olmOCR-Bench", "llamaindex/ParseBench": "ParseBench", "mercor/apex-agents": "APEX-Agents", "harborframework/terminal-bench-2.0": "Terminal-Bench 2.0", "SWE-bench/SWE-bench_Verified": "SWE-bench Verified", "TIGER-Lab/MMLU-Pro": "MMLU-Pro", "hf-audio/open-asr-leaderboard": "Open ASR Leaderboard", "MathArena/aime_2026": "AIME 2026", "claw-eval/Claw-Eval": "Claw-Eval", "cais/hle": "HLE", "likaixin/ScreenSpot-Pro": "ScreenSpot-Pro", "nvidia/compute-eval": "ComputeEval", "ScaleAI/SWE-bench_Pro": "SWE-bench Pro", "FutureMa/EvasionBench": "EvasionBench", "mteb/BRIGHT": "BRIGHT", "Delores-Lin/MDPBench": "MDPBench", "mteb/arguana": "ArguAna", "MMMU/MMMU_Pro": "MMMU-Pro", "LEXam-Benchmark/LEXam": "LEXam", "mercor/ACE": "ACE", "mercor/APEX-v1-extended": "APEX-v1", "VLABench/vlabench_primitive_ft_lerobot_video": "VLABench", "tiiuae/PBench": "PBench", "MathArena/hmmt_feb_2026": "HMMT Feb 2026", "collinear-ai/yc-bench": "YC-Bench", "internlm/WildClawBench": "WildClawBench", "MME-Benchmarks/Video-MME-v2": "Video-MME v2", "open-agent-leaderboard/results": "Open Agent Leaderboard", } CUSTOM_CSS = """ /* ---- Topbar ---- */ .topbar { display: flex; align-items: center; justify-content: space-between; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 14px 24px; border-radius: 10px; margin-bottom: 8px; flex-wrap: wrap; gap: 12px; } .topbar-title { font-size: 18px; font-weight: 700; margin-bottom: 2px; } .topbar-meta { font-size: 11px; opacity: 0.85; } .topbar-pills { display: flex; gap: 8px; flex-wrap: wrap; } .stat-pill { background: rgba(255,255,255,0.2); border-radius: 20px; padding: 4px 14px; font-size: 12px; white-space: nowrap; } .stat-pill b { font-size: 14px; } /* ---- Layout columns ---- */ #sidebar-col { background: white !important; padding: 0 !important; border-right: 1px solid #e5e7eb !important; border-radius: 10px 0 0 10px !important; } #main-col { background: #f8fafc !important; padding: 18px 22px !important; border-radius: 0 10px 10px 0 !important; min-width: 0 !important; } #sidebar-col > .form, #main-col > .form { background: transparent !important; box-shadow: none !important; border: none !important; padding: 0 !important; } /* ---- Sidebar Radio โ nav buttons ---- */ #cat_radio { background: transparent !important; border: none !important; box-shadow: none !important; padding: 0 !important; } #cat_radio > .wrap { flex-direction: column !important; gap: 0 !important; padding: 0 !important; } #cat_radio label { display: flex !important; align-items: center !important; padding: 8px 12px !important; margin: 0 !important; border-left: 3px solid transparent !important; border-radius: 0 !important; cursor: pointer !important; font-size: 12px !important; color: #374151 !important; background: white !important; width: 100% !important; box-sizing: border-box !important; gap: 0 !important; } #cat_radio label:hover { background: #f3f4f6 !important; } #cat_radio label:has(input:checked) { background: #ede9fe !important; border-left-color: #7c3aed !important; color: #5b21b6 !important; font-weight: 600 !important; } #cat_radio input[type="radio"] { display: none !important; } #cat_radio .wrap span { margin-left: 0 !important; padding-left: 0 !important; } /* ---- Bench cards Radio ---- */ #bench_radio { background: transparent !important; border: none !important; box-shadow: none !important; padding: 0 !important; } #bench_radio > .wrap { flex-direction: row !important; flex-wrap: wrap !important; gap: 10px !important; padding: 4px 0 12px !important; } #bench_radio label { display: flex !important; align-items: center !important; padding: 10px 14px !important; border: 2px solid #e5e7eb !important; border-radius: 10px !important; cursor: pointer !important; font-size: 12px !important; background: white !important; color: #374151 !important; min-width: 150px !important; margin: 0 !important; gap: 0 !important; transition: border-color 0.15s !important; } #bench_radio label:hover { border-color: #a78bfa !important; } #bench_radio label:has(input:checked) { border-color: #7c3aed !important; background: #faf5ff !important; font-weight: 600 !important; color: #5b21b6 !important; } #bench_radio input[type="radio"] { display: none !important; } #bench_radio .wrap span { margin-left: 0 !important; padding-left: 0 !important; } /* ---- Filter checkbox ---- */ #providers-filter { margin: 2px 0 10px; } #providers-filter > label { font-size: 12px !important; color: #6b7280 !important; } /* ---- Layout columns ---- */ #sidebar-col { background: white !important; padding: 0 !important; border-right: 1px solid #e5e7eb !important; border-radius: 10px 0 0 10px !important; } #main-col { background: #f8fafc !important; padding: 18px 22px !important; border-radius: 0 10px 10px 0 !important; min-width: 0 !important; } #sidebar-col > .form, #main-col > .form { background: transparent !important; box-shadow: none !important; border: none !important; padding: 0 !important; } /* ---- Sidebar HTML ---- */ .hf-sidebar { display: flex; flex-direction: column; padding: 10px 0; } .hf-sidebar-label { font-size: 10px; font-weight: 700; color: #9ca3af; text-transform: uppercase; letter-spacing: 0.8px; padding: 0 16px 8px; } .hf-cat-btn { display: flex; align-items: center; gap: 9px; width: 100%; padding: 9px 16px; border: none; background: none; cursor: pointer; border-left: 3px solid transparent; font-size: 13px; color: #374151; text-align: left; transition: background 0.1s; } .hf-cat-btn:hover { background: #f3f4f6; } .hf-cat-active { background: #ede9fe !important; border-left-color: #7c3aed !important; color: #5b21b6 !important; font-weight: 600; } .hf-cat-active .hf-cat-badge { background: #ddd6fe !important; color: #7c3aed !important; } .hf-cat-icon { font-size: 15px; min-width: 20px; } .hf-cat-name { flex: 1; } .hf-cat-badge { background: #f3f4f6; border-radius: 12px; padding: 1px 8px; font-size: 11px; color: #6b7280; } /* ---- Benchmark cards HTML ---- */ .hf-section-head { display: flex; align-items: baseline; gap: 12px; margin-bottom: 12px; } .hf-section-title { font-size: 16px; font-weight: 700; color: #111827; } .hf-section-meta { font-size: 12px; color: #9ca3af; } .hf-cards { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 14px; } .hf-card { border: 2px solid #e5e7eb; border-radius: 10px; padding: 10px 14px; cursor: pointer; background: white; min-width: 150px; transition: border-color 0.15s, box-shadow 0.15s; } .hf-card:hover { border-color: #a78bfa; box-shadow: 0 1px 6px rgba(124,58,237,0.1); } .hf-card-active { border-color: #7c3aed !important; background: #faf5ff !important; } .hf-card-name { font-size: 13px; font-weight: 600; color: #111827; } .hf-card-active .hf-card-name { color: #5b21b6; } .hf-card-count { font-size: 11px; color: #6b7280; margin-top: 3px; } .hf-card-owner { font-size: 10px; color: #9ca3af; margin-top: 2px; } /* ---- JS bridge textboxes (rendered but invisible) ---- */ #cat_trigger, #bench_trigger { display: none !important; position: absolute !important; pointer-events: none !important; } /* ---- Filter checkbox ---- */ #providers-filter { margin: 2px 0 10px; } #providers-filter > label { font-size: 12px !important; color: #6b7280 !important; } /* ---- Leaderboard HTML ---- */ .hf-lb { border: 1px solid #e5e7eb; border-radius: 10px; overflow: hidden; background: white; } .hf-lb-head { display: flex; align-items: center; justify-content: space-between; padding: 10px 16px; border-bottom: 1px solid #f3f4f6; background: #f9fafb; } .hf-lb-title { font-size: 13px; font-weight: 600; color: #374151; } .hf-lb-meta { display: flex; align-items: center; gap: 12px; } .hf-lb-count { font-size: 11px; color: #9ca3af; } .hf-hub-link { font-size: 11px; color: #7c3aed; text-decoration: none; font-weight: 500; } .hf-hub-link:hover { text-decoration: underline; } .hf-lb-scroll { overflow-x: auto; } .hf-table { width: 100%; border-collapse: collapse; font-size: 12px; } .hf-table thead th { padding: 7px 12px; text-align: left; font-size: 10px; font-weight: 700; color: #6b7280; text-transform: uppercase; letter-spacing: 0.4px; white-space: nowrap; background: white; border-bottom: 1px solid #f3f4f6; } .hf-table td { padding: 7px 12px; border-bottom: 1px solid #f3f4f6; vertical-align: middle; } .hf-table tbody tr:last-child td { border-bottom: none; } .hf-table tbody tr:nth-child(even) td { background: #fafafa; } .hf-table tbody tr:hover td { background: #faf5ff !important; } .hf-rank { width: 44px; text-align: center; font-size: 17px; } .hf-rank-num { color: #9ca3af; font-size: 12px; font-variant-numeric: tabular-nums; } .hf-model a { color: #2563eb; text-decoration: none; font-size: 11px; word-break: break-all; } .hf-model a:hover { text-decoration: underline; } .hf-score { font-variant-numeric: tabular-nums; font-weight: 600; color: #111827; white-space: nowrap; } .hf-price { font-variant-numeric: tabular-nums; color: #059669; white-space: nowrap; } .hf-ctx, .hf-params, .hf-ttft, .hf-tput { white-space: nowrap; } .hf-ttft { color: #7c3aed; } .hf-tput { color: #0369a1; } .hf-lic { color: #6b7280; font-size: 11px; } .hf-params { font-weight: 500; } .hf-provs { display: flex; flex-wrap: wrap; gap: 3px; } .hf-chip { background: #dbeafe; color: #1e40af; border-radius: 4px; padding: 1px 6px; font-size: 10px; font-weight: 500; white-space: nowrap; } .hf-chip-more { background: #f3f4f6 !important; color: #6b7280 !important; } .hf-na { color: #d1d5db; } .hf-empty { padding: 48px 24px; text-align: center; color: #9ca3af; font-size: 14px; } """ # --------------------------------------------------------------------------- # HF API helpers # --------------------------------------------------------------------------- def _http_get_json(url: str, token: str | None = None, timeout: int = 30): req = urllib.request.Request(url, headers={"Accept": "application/json"}) if token: req.add_header("Authorization", f"Bearer {token}") with urllib.request.urlopen(req, timeout=timeout) as resp: return json.loads(resp.read().decode("utf-8")) def _read_token() -> str | None: import os p = Path(os.path.expanduser("~/.cache/huggingface/token")) if p.exists(): tok = p.read_text().strip() if tok: return tok return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") def discover_benchmarks(token=None) -> list[dict]: url = "https://huggingface.co/api/datasets?filter=benchmark:official&limit=1000" data = _http_get_json(url, token, timeout=30) results = [] for d in data: if not isinstance(d, dict) or "id" not in d: continue results.append({ "id": d["id"], "tags": d.get("tags", []), "description": (d.get("description") or "")[:200], }) return results def get_leaderboard(dataset_id: str, token=None) -> list[dict]: url = f"https://huggingface.co/api/datasets/{dataset_id}/leaderboard" try: data = _http_get_json(url, token, timeout=30) except (urllib.error.HTTPError, urllib.error.URLError): return [] if isinstance(data, dict) and "entries" in data: data = data["entries"] return data if isinstance(data, list) else [] # --------------------------------------------------------------------------- # Categorisation # --------------------------------------------------------------------------- def categorize_benchmark(bench: dict) -> list[str]: tags = bench.get("tags", []) bid = bench["id"] bid_lower = bid.lower() categories = set() if any(t in tags for t in ["modality:audio", "modality:speech"]): categories.add("Audio / Speech") if any(t in tags for t in ["modality:image", "modality:video"]): categories.add("Vision") if any(t in tags for t in ["modality:document"]): categories.add("Document / OCR") if any(t in tags for t in ["task_categories:robotics"]): categories.add("Robotics") if any(t in tags for t in ["task_categories:text-retrieval"]): categories.add("Retrieval / Embedding") if "math" in bid_lower or "aime" in bid_lower or "hmmt" in bid_lower or "gsm8k" in bid_lower: categories.add("Math / Reasoning") if "swe" in bid_lower or "terminal" in bid_lower or "compute-eval" in bid_lower: categories.add("Code / Engineering") if "agent" in bid_lower or "claw" in bid_lower or "apex-agent" in bid_lower or "wildclaw" in bid_lower or "yc-bench" in bid_lower: categories.add("Agents") if "mmlu" in bid_lower or "gpqa" in bid_lower or "hle" in bid_lower: categories.add("Knowledge") if "ocr" in bid_lower or "parse" in bid_lower or "mdp" in bid_lower: categories.add("Document / OCR") if "asr" in bid_lower: categories.add("Audio / Speech") if "screen" in bid_lower or "mmmu" in bid_lower or "video" in bid_lower or "pbench" in bid_lower: categories.add("Vision") if "evasion" in bid_lower or "lex" in bid_lower: categories.add("NLP / Classification") if "bright" in bid_lower or "arguana" in bid_lower: categories.add("Retrieval / Embedding") if not categories: categories.add("Other") return sorted(categories, key=lambda c: CATEGORY_ORDER.index(c) if c in CATEGORY_ORDER else 99) # --------------------------------------------------------------------------- # Data fetching & aggregation # --------------------------------------------------------------------------- def fetch_all_data() -> dict: token = _read_token() benchmarks = discover_benchmarks(token) all_models: set[str] = set() benchmark_data = [] for bench in benchmarks: bid = bench["id"] entries = get_leaderboard(bid, token) models: set[str] = set() model_details = [] for entry in entries: mid = entry.get("modelId") or entry.get("model_id") or entry.get("model") or "" if not mid: continue models.add(mid) model_details.append({ "rank": entry.get("rank"), "model_id": mid, "value": entry.get("value"), "verified": entry.get("verified", False), }) model_details.sort(key=lambda x: (x["rank"] is None, x["rank"] or 999)) all_models.update(models) cats = categorize_benchmark(bench) display_name = BENCHMARK_DISPLAY_NAMES.get(bid, bid.split("/")[-1]) benchmark_data.append({ "id": bid, "display_name": display_name, "categories": cats, "num_models": len(models), "models": sorted(models), "model_details": model_details, "description": bench["description"], }) cat_benchmarks: dict[str, list] = defaultdict(list) cat_models: dict[str, set] = defaultdict(set) for bd in benchmark_data: for cat in bd["categories"]: cat_benchmarks[cat].append(bd) cat_models[cat].update(bd["models"]) return { "total_benchmarks": len(benchmarks), "total_unique_models": len(all_models), "benchmarks_with_entries": sum(1 for bd in benchmark_data if bd["num_models"] > 0), "benchmarks_empty": sum(1 for bd in benchmark_data if bd["num_models"] == 0), "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"), "all_models": sorted(all_models), "benchmark_data": benchmark_data, "categories": { cat: { "benchmarks": len(cat_benchmarks[cat]), "unique_models": len(cat_models[cat]), } for cat in CATEGORY_ORDER if cat in cat_benchmarks }, } def load_cached_data() -> dict | None: if not CACHE_PATH.exists(): return None try: d = json.loads(CACHE_PATH.read_text()) ts = d.get("timestamp", "") if ts: age = (datetime.now(timezone.utc) - datetime.fromisoformat(ts)).total_seconds() if age < CACHE_TTL_SECONDS: return d except Exception: pass return None def save_cache(data: dict) -> None: CACHE_PATH.write_text(json.dumps(data, indent=2)) # --------------------------------------------------------------------------- # UI helpers # --------------------------------------------------------------------------- _app_data: dict = {} _router_data: dict = {} # model_id โ {providers, cheapest_input, cheapest_output, context_length} _model_meta_cache: dict = {} # model_id โ {license, params} def _render_topbar(data: dict) -> str: ts = data.get("timestamp", "?")[:19] total_entries = sum(bd["num_models"] for bd in data.get("benchmark_data", [])) return ( f'
' ) def _load_router_data(token: str | None = None) -> dict: """Fetch all inference-available models from the HF router (pricing + context).""" try: resp = _http_get_json("https://router.huggingface.co/v1/models", token, timeout=30) except Exception: return {} out: dict = {} for m in resp.get("data", []): mid = m.get("id", "") if not mid: continue live = [p for p in m.get("providers", []) if p.get("status") == "live"] if not live: continue cheapest_out = min(live, key=lambda p: p.get("pricing", {}).get("output", 1e9)) cheapest_in = min(live, key=lambda p: p.get("pricing", {}).get("input", 1e9)) ttfts = [p["first_token_latency_ms"] for p in live if p.get("first_token_latency_ms")] throughputs = [p["throughput"] for p in live if p.get("throughput")] out[mid] = { "providers": [p["provider"] for p in live], "cheapest_input": cheapest_in.get("pricing", {}).get("input"), "cheapest_output": cheapest_out.get("pricing", {}).get("output"), "context_length": max((p.get("context_length") or 0) for p in live), "fastest_ttft_ms": min(ttfts) if ttfts else None, "fastest_throughput": max(throughputs) if throughputs else None, } return out def _load_model_metas(model_ids: list[str], token: str | None = None) -> None: """Fetch license + param count for model_ids not yet cached. Fills _model_meta_cache.""" to_fetch = [m for m in model_ids if m not in _model_meta_cache] if not to_fetch: return def _fetch_one(mid: str) -> tuple[str, dict]: url = f"https://huggingface.co/api/models/{mid}?expand[]=safetensors&expand[]=cardData" try: d = _http_get_json(url, token, timeout=10) except Exception: return mid, {} lic = (d.get("cardData") or {}).get("license", "") if not lic: for t in d.get("tags", []): if t.startswith("license:"): lic = t[8:] break total = (d.get("safetensors") or {}).get("total", 0) params = "" if total: b = total / 1e9 params = f"{round(b)}B" if b >= 1 else f"{round(total / 1e6)}M" return mid, {"license": lic, "params": params} with concurrent.futures.ThreadPoolExecutor(max_workers=20) as ex: for mid, meta in ex.map(_fetch_one, to_fetch): _model_meta_cache[mid] = meta def _sidebar_choices(data: dict) -> list[tuple[str, str]]: cats = data.get("categories", {}) result = [] for cat in CATEGORY_ORDER: if cat not in cats: continue icon = CATEGORY_ICONS.get(cat, "") count = cats[cat]["benchmarks"] result.append((f"{icon} {cat} ({count})", cat)) return result def _card_choices(data: dict, category: str) -> list[tuple[str, str]]: bds = sorted( [bd for bd in data["benchmark_data"] if category in bd["categories"]], key=lambda x: x["num_models"], reverse=True, ) choices = [] for bd in bds: owner = bd["id"].split("/")[0] if "/" in bd["id"] else "" label = f"{bd['display_name']} ยท {bd['num_models']} models" if owner: label += f" [{owner}]" choices.append((label, bd["id"])) return choices def _cat_header(data: dict, cat: str) -> str: icon = CATEGORY_ICONS.get(cat, "") info = data.get("categories", {}).get(cat, {}) return f"### {icon} {cat} ยท {info.get('benchmarks', 0)} benchmarks ยท {info.get('unique_models', 0)} models" def _render_leaderboard(data: dict, bid: str, providers_only: bool = False) -> str: if not bid: return 'โ ๏ธ Refresh failed: {e}
' return _render_topbar(_app_data) + err, gr.update(), gr.update(), gr.update(), gr.update(), gr.update() new_s = _sidebar_choices(_app_data) new_cat = new_s[0][1] if new_s else "" new_c = _card_choices(_app_data, new_cat) new_bid = new_c[0][1] if new_c else "" return ( _render_topbar(_app_data), gr.update(choices=new_s, value=new_cat), _cat_header(_app_data, new_cat), gr.update(choices=new_c, value=new_bid), _render_leaderboard(_app_data, new_bid, prov_only), new_bid, ) cat_radio.change( fn=on_cat, inputs=[cat_radio, providers_filter], outputs=[cat_header, bench_radio, leaderboard, sel_bid], ) bench_radio.change( fn=on_bench, inputs=[bench_radio, providers_filter], outputs=[leaderboard, sel_bid], ) providers_filter.change( fn=on_filter, inputs=[sel_bid, providers_filter], outputs=[leaderboard], ) refresh_btn.click( fn=on_refresh, inputs=[providers_filter], outputs=[topbar, cat_radio, cat_header, bench_radio, leaderboard, sel_bid], show_progress="full", ) return demo if __name__ == "__main__": demo = build_app() demo.launch()