<!--
all-bench / index.html
mayafree's picture
Update index.html
5ccb370 verified
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ALL Bench Leaderboard 2026 — Complete LLM Comparison</title>
<meta name="description" content="ALL Bench Leaderboard 2026: 31 LLMs compared across 25 metrics, combining FINAL Bench Metacognitive, ARC-AGI-2 and standard benchmarks.">
<!-- Open connections to the Google Fonts origins early. Font files are served from fonts.gstatic.com via CORS, hence the crossorigin attribute. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Sora:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600;700&display=swap" rel="stylesheet">
<!-- Kept as a blocking script on purpose: a deferred script would run after the inline script at the end of <body>. NOTE(review): the chart-drawing code is outside this view; confirm it needs Chart at parse time before adding defer. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.1/chart.umd.min.js"></script>
<style>
*{margin:0;padding:0;box-sizing:border-box;}
/* Design tokens: custom properties read through var(--…) by the rules below. */
:root{
/* Page backgrounds and card surfaces. */
--bg:#f8f9fc;--bg2:#f0f2f8;--surface:#ffffff;--surface-alt:#f5f6fa;
/* Border colours: default and hover. */
--border:#e2e5f0;--border-hover:#c7cce0;
/* Elevation shadows, from smallest to largest. */
--shadow-sm:0 1px 3px rgba(15,23,42,.04),0 1px 2px rgba(15,23,42,.06);
--shadow:0 4px 16px rgba(15,23,42,.06),0 1px 3px rgba(15,23,42,.08);
--shadow-lg:0 12px 40px rgba(15,23,42,.08),0 4px 12px rgba(15,23,42,.06);
/* Text colours: primary, secondary, muted. */
--text:#0f172a;--text-sec:#475569;--text-muted:#94a3b8;
/* Accent colour, a darker accent, and a translucent accent background. */
--ac:#6366f1;--ac2:#4f46e5;--ac-bg:rgba(99,102,241,.06);
/* Named palette colours. */
--teal:#0d9488;--amber:#d97706;--green:#16a34a;--rose:#e11d48;
/* Corner radii: large, small, extra small. */
--radius:16px;--radius-sm:10px;--radius-xs:6px;
/* Font stacks: body text and monospace. */
--font:'Sora',sans-serif;--font-mono:'JetBrains Mono',monospace;
/* Shared transition duration and easing. */
--tr:0.22s cubic-bezier(0.4,0,0.2,1);
}
html{scroll-behavior:smooth;}
body{font-family:var(--font);background:var(--bg);color:var(--text);min-height:100vh;-webkit-font-smoothing:antialiased;font-size:13px;}
::-webkit-scrollbar{width:5px;height:4px;}
::-webkit-scrollbar-track{background:transparent;}
::-webkit-scrollbar-thumb{background:rgba(99,102,241,.2);border-radius:10px;}
::-webkit-scrollbar-thumb:hover{background:rgba(99,102,241,.4);}
::selection{background:rgba(99,102,241,.12);}
body::before{content:"";position:fixed;inset:0;z-index:0;pointer-events:none;
background:radial-gradient(ellipse 70% 45% at 15% 8%,rgba(99,102,241,.05),transparent 55%),
radial-gradient(ellipse 55% 35% at 85% 92%,rgba(13,148,136,.04),transparent 50%);}
.wrap{position:relative;z-index:1;max-width:100%;margin:0 auto;padding:22px 12px 70px;}
/* HEADER */
/* Masthead: fades in on load. */
header{text-align:center;margin-bottom:20px;animation:fadeIn .6s ease-out;}
@keyframes fadeIn{from{opacity:0;transform:translateY(-10px)}to{opacity:1;transform:translateY(0)}}
.badge-row{display:flex;align-items:center;justify-content:center;gap:8px;margin-bottom:10px;}
.badge{display:inline-flex;align-items:center;gap:6px;background:var(--surface);border:1px solid var(--border);border-radius:100px;padding:4px 14px;font-family:var(--font-mono);font-size:9px;font-weight:600;letter-spacing:2px;text-transform:uppercase;color:var(--ac);box-shadow:var(--shadow-sm);}
/* Small pulsing dot inside the badge. */
.pulse{width:5px;height:5px;border-radius:50%;background:var(--ac);animation:p 2s infinite;}
@keyframes p{0%,100%{opacity:1;transform:scale(1)}50%{opacity:.4;transform:scale(.8)}}
/* Gradient title: the background is clipped to the glyphs and the text fill is made transparent.
   The unprefixed background-clip is declared after the prefixed one so standards-only engines clip too. */
h1{font-size:clamp(18px,2.8vw,34px);font-weight:800;line-height:1.1;letter-spacing:-1.5px;margin-bottom:6px;
background:linear-gradient(135deg,#1e1b4b 15%,#6366f1 50%,#0d9488 85%);background-size:200%;
-webkit-background-clip:text;background-clip:text;-webkit-text-fill-color:transparent;animation:shimmer 6s ease-in-out infinite;}
@keyframes shimmer{0%,100%{background-position:0%}50%{background-position:100%}}
/* Honour the OS "reduce motion" setting: drop the fade-in, pulse and shimmer animations. */
@media(prefers-reduced-motion:reduce){header,.pulse,h1{animation:none;}}
.sub{color:var(--text-muted);font-size:10px;line-height:1.8;}
.sub b{color:var(--text-sec);font-weight:600;-webkit-text-fill-color:var(--text-sec);}
/* STATS */
.stats{display:flex;flex-wrap:wrap;gap:7px;justify-content:center;margin-bottom:16px;}
.st{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius-sm);padding:8px 14px;text-align:center;min-width:80px;box-shadow:var(--shadow-sm);transition:var(--tr);}
.st:hover{box-shadow:var(--shadow);border-color:var(--border-hover);}
.stn{font-family:var(--font-mono);font-size:15px;font-weight:700;color:var(--ac);}
.stl{font-size:8.5px;color:var(--text-muted);margin-top:2px;text-transform:uppercase;letter-spacing:.5px;}
/* TABS */
.tab-bar{display:flex;gap:0;border-bottom:1px solid var(--border);background:var(--surface);border-radius:var(--radius-sm) var(--radius-sm) 0 0;overflow:hidden;box-shadow:var(--shadow-sm);}
.tab{padding:10px 20px;font-size:10.5px;font-family:var(--font-mono);font-weight:600;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;transition:var(--tr);user-select:none;white-space:nowrap;letter-spacing:.3px;}
.tab:hover{color:var(--text);background:var(--ac-bg);}
.tab.on{color:var(--ac);border-bottom-color:var(--ac);background:var(--ac-bg);}
.tpane{display:none;padding-top:12px;}
.tpane.on{display:block;}
/* TOOLBAR */
.toolbar{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:10px;align-items:center;}
.search-wrap{position:relative;flex:1;min-width:160px;max-width:240px;}
.search-wrap input{width:100%;padding:5px 8px 5px 28px;border:1px solid var(--border);border-radius:20px;background:var(--surface);font-family:var(--font-mono);font-size:10px;color:var(--text);outline:none;transition:var(--tr);}
.search-wrap input:focus{border-color:var(--ac);box-shadow:0 0 0 2px rgba(99,102,241,.1);}
.search-wrap::before{content:"⌕";position:absolute;left:9px;top:50%;transform:translateY(-50%);color:var(--text-muted);font-size:13px;pointer-events:none;}
.flbl{font-size:8.5px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:1px;font-weight:600;}
.fb{background:var(--surface);border:1px solid var(--border);color:var(--text-sec);padding:4px 10px;border-radius:20px;font-size:10px;font-weight:500;cursor:pointer;transition:var(--tr);box-shadow:var(--shadow-sm);font-family:var(--font);}
.fb:hover{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);}
.fb.on{background:linear-gradient(135deg,#6366f1,#4f46e5);border-color:transparent;color:#fff;box-shadow:0 3px 12px rgba(99,102,241,.25);}
/* COLUMN TOGGLE */
.col-toggle-wrap{position:relative;}
.col-toggle-btn{background:var(--surface);border:1px solid var(--border);color:var(--text-sec);padding:4px 10px;border-radius:20px;font-size:10px;font-weight:500;cursor:pointer;transition:var(--tr);box-shadow:var(--shadow-sm);font-family:var(--font);display:flex;align-items:center;gap:4px;}
.col-toggle-btn:hover{background:var(--ac-bg);border-color:rgba(99,102,241,.3);color:var(--ac);}
.col-dropdown{position:absolute;top:calc(100% + 6px);right:0;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius-sm);padding:10px;box-shadow:var(--shadow-lg);z-index:100;min-width:200px;display:none;}
.col-dropdown.open{display:grid;grid-template-columns:1fr 1fr;gap:4px;}
.col-chk{display:flex;align-items:center;gap:5px;font-size:9.5px;color:var(--text-sec);cursor:pointer;padding:3px 4px;border-radius:4px;transition:var(--tr);}
.col-chk:hover{background:var(--ac-bg);}
.col-chk input{accent-color:var(--ac);cursor:pointer;}
/* TABLE */
.tw{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);overflow-x:auto;box-shadow:var(--shadow);}
table{width:100%;border-collapse:collapse;font-size:10px;}
thead{background:var(--surface-alt);}
thead tr:last-child{border-bottom:2px solid var(--border);}
th{padding:7px 4px;text-align:center;font-size:7px;font-family:var(--font-mono);text-transform:uppercase;letter-spacing:.3px;color:var(--text-muted);white-space:nowrap;cursor:pointer;user-select:none;vertical-align:bottom;line-height:1.6;font-weight:600;}
th.c-model{text-align:left;padding-left:10px;min-width:160px;position:sticky;left:0;background:var(--surface-alt);z-index:2;}
th:hover,th.on{color:var(--ac);}
.sa{opacity:.5;font-size:6px;margin-left:2px;}
th a{color:inherit;text-decoration:none;}
th a:hover{color:var(--ac);text-decoration:underline;}
tbody tr{border-bottom:1px solid var(--border);transition:background var(--tr);}
tbody tr:last-child{border-bottom:none;}
tbody tr:hover{background:rgba(99,102,241,.022);}
tbody tr.hl{background:rgba(22,163,74,.025);}
tbody tr.hl:hover{background:rgba(22,163,74,.05);}
tbody tr.hidden{display:none;}
td{padding:6px 4px;text-align:center;vertical-align:middle;}
td.c-model{text-align:left;padding-left:10px;position:sticky;left:0;background:var(--surface);z-index:1;}
tbody tr.hl td.c-model{background:rgba(22,163,74,.025);}
tbody tr:hover td.c-model{background:rgba(99,102,241,.022);}
/* GROUP COLORS */
.gA{color:#b45309!important;font-weight:700;}
.gB{color:#4f46e5!important;}
.gC{color:#0891b2!important;}
.gF{color:#7c3aed!important;}
.gT{color:#d97706!important;}
.gM{color:#0891b2!important;}
.gN{color:#db2777!important;}
.gP{color:#64748b!important;}
/* MODEL CELL */
.mc{display:flex;flex-direction:column;gap:1px;}
.mn{font-weight:700;font-size:11px;color:var(--text);display:flex;align-items:center;gap:4px;flex-wrap:wrap;white-space:nowrap;}
.mn a{color:inherit;text-decoration:none;}
.mn a:hover{color:var(--ac);text-decoration:underline;}
.link-icon{font-size:8px;opacity:.5;transition:var(--tr);}
.mn:hover .link-icon{opacity:1;color:var(--ac);}
.ms{display:flex;gap:3px;align-items:center;margin-top:1px;}
.dot{width:5px;height:5px;border-radius:50%;flex-shrink:0;}
.mp{font-size:7.5px;color:var(--text-muted);font-family:var(--font-mono);}
/* BADGES */
.pb{display:inline-block;padding:1.5px 5px;border-radius:4px;font-size:6.5px;font-family:var(--font-mono);font-weight:700;text-transform:uppercase;letter-spacing:.3px;}
.ob{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.2);}
.cb{background:rgba(99,102,241,.1);color:#6366f1;border:1px solid rgba(99,102,241,.2);}
/* PROVIDER BADGE */
.prov{display:inline-flex;align-items:center;gap:3px;padding:2px 6px;border-radius:5px;font-size:7.5px;font-family:var(--font-mono);font-weight:700;white-space:nowrap;border:1px solid transparent;}
/* SCORE CELL */
.sc{display:flex;flex-direction:column;align-items:center;gap:2px;}
.sn{font-family:var(--font-mono);font-size:10px;font-weight:700;}
.sb{width:32px;height:2.5px;background:var(--border);border-radius:2px;overflow:hidden;}
.sf{height:100%;border-radius:2px;transition:width .9s cubic-bezier(0.4,0,0.2,1);}
.na{color:var(--text-muted);font-size:8.5px;font-family:var(--font-mono);}
/* COMPOSITE SCORE */
.comp{display:flex;flex-direction:column;align-items:center;gap:2px;}
.compN{font-family:var(--font-mono);font-size:12px;font-weight:700;}
.compB{width:38px;height:2.5px;background:var(--border);border-radius:2px;overflow:hidden;}
/* TOKENS */
.tk{font-family:var(--font-mono);font-size:10px;font-weight:700;}
/* LICENSES */
.lic{font-size:7.5px;font-family:var(--font-mono);padding:1.5px 5px;border-radius:4px;font-weight:700;white-space:nowrap;}
.la{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.2);}
.lm{background:rgba(59,130,246,.1);color:#3b82f6;border:1px solid rgba(59,130,246,.2);}
.lp{background:rgba(100,116,139,.1);color:#64748b;border:1px solid rgba(100,116,139,.2);}
.ll{background:rgba(139,92,246,.1);color:#7c3aed;border:1px solid rgba(139,92,246,.2);}
/* ARCH BADGES */
.at{display:flex;flex-direction:column;align-items:center;gap:2px;}
.atb{font-size:7px;font-family:var(--font-mono);padding:1.5px 4px;border-radius:4px;font-weight:700;}
.at-moe{background:rgba(217,119,6,.1);color:#d97706;border:1px solid rgba(217,119,6,.2);}
.at-den{background:rgba(99,102,241,.1);color:#6366f1;border:1px solid rgba(99,102,241,.2);}
.at-hyb{background:rgba(139,92,246,.1);color:#7c3aed;border:1px solid rgba(139,92,246,.2);}
/* VISION BADGES */
.vis{display:flex;flex-wrap:wrap;gap:2px;justify-content:center;}
.vb{font-size:7px;padding:1.5px 4px;border-radius:4px;font-weight:600;white-space:nowrap;}
.vi{background:rgba(22,163,74,.1);color:#16a34a;border:1px solid rgba(22,163,74,.18);}
.vv{background:rgba(59,130,246,.1);color:#3b82f6;border:1px solid rgba(59,130,246,.18);}
.va{background:rgba(219,39,119,.1);color:#db2777;border:1px solid rgba(219,39,119,.18);}
.vt{background:rgba(100,116,139,.1);color:#64748b;border:1px solid rgba(100,116,139,.18);}
/* PRICE */
.pr{display:flex;flex-direction:column;align-items:center;gap:1px;}
.pri{font-family:var(--font-mono);font-size:10px;font-weight:700;}
.pro{font-family:var(--font-mono);font-size:8px;color:var(--text-muted);}
/* ARC AGI */
.arc-col{background:rgba(14,165,233,.025);}
td.arc-col{background:rgba(14,165,233,.02);}
.meta-col{background:rgba(99,102,241,.02);}
/* VERTICAL RANKING CHART */
.vrank-section{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px 20px 12px;margin-bottom:14px;box-shadow:var(--shadow-sm);}
.vrank-header{display:flex;align-items:center;justify-content:space-between;margin-bottom:4px;}
.vrank-title{font-size:11px;font-family:var(--font-mono);font-weight:700;color:var(--ac);text-transform:uppercase;letter-spacing:.8px;}
.vrank-desc{font-size:9px;color:var(--text-muted);}
.vrank-legend{display:flex;flex-wrap:wrap;gap:10px;margin-top:10px;padding-top:10px;border-top:1px solid var(--border);}
.vrl{display:flex;align-items:center;gap:4px;font-size:8.5px;font-family:var(--font-mono);color:var(--text-sec);}
.vrl-dot{width:9px;height:9px;border-radius:3px;flex-shrink:0;}
.vrank-chart-wrap{position:relative;overflow-x:auto;padding-bottom:4px;}
/* LEGEND */
.leg{margin-top:12px;display:flex;flex-wrap:wrap;gap:10px;align-items:center;}
.lt{font-size:8.5px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:.8px;font-weight:600;}
.li{display:flex;align-items:center;gap:3px;font-size:9.5px;color:var(--text-sec);}
.ld{width:7px;height:7px;border-radius:50%;}
/* CHARTS EXPANDED */
.charts-grid{display:grid;grid-template-columns:1fr 1fr;gap:14px;margin-bottom:18px;}
.chart-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:18px;box-shadow:var(--shadow-sm);}
.chart-card.full{grid-column:1/-1;}
.chart-card.third{grid-column:span 1;}
.charts-grid-3{display:grid;grid-template-columns:1fr 1fr 1fr;gap:14px;margin-bottom:18px;}
.chart-card h3{font-size:10.5px;font-family:var(--font-mono);font-weight:700;color:var(--ac);margin-bottom:4px;text-transform:uppercase;letter-spacing:.7px;}
.chart-card p{font-size:9.5px;color:var(--text-muted);margin-bottom:12px;}
.chart-card canvas{max-width:100%;}
.chart-insight{margin-top:10px;padding:8px 10px;background:var(--ac-bg);border-radius:6px;font-size:8.5px;color:var(--text-sec);line-height:1.7;border-left:2px solid var(--ac);}
.chart-insight b{color:var(--ac);}
/* INFO TAB */
.info-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(240px,1fr));gap:9px;}
.fni{background:var(--surface-alt);border:1px solid var(--border);border-radius:var(--radius-sm);padding:11px 13px;transition:var(--tr);}
.fni:hover{border-color:var(--border-hover);box-shadow:var(--shadow-sm);}
.fni b{color:var(--text);font-size:9px;display:block;margin-bottom:3px;font-weight:700;}
.fni p{font-size:8.5px;color:var(--text-sec);line-height:1.75;}
.fni a{color:var(--ac);text-decoration:none;}
.fni a:hover{text-decoration:underline;}
.upd{text-align:center;margin-top:14px;font-size:8.5px;font-family:var(--font-mono);color:var(--text-muted);}
/* Narrow screens: collapse both chart grids to one column and let the tab strip scroll sideways instead of clipping its nowrap tabs. */
@media(max-width:900px){.charts-grid,.charts-grid-3{grid-template-columns:1fr;}.tab-bar{overflow-x:auto;}}
</style>
</head>
<body>
<div class="wrap">
<!-- Masthead: release badge, page title, and a subtitle stating the scope (models × metrics) and the data sources. -->
<header>
<div class="badge-row">
<div class="badge"><div class="pulse"></div>LIVE · 2026.03.04 · v1.0</div>
</div>
<h1>ALL Bench Leaderboard 2026</h1>
<!-- Subtitle lines are separated with <br>; the last line lists the source sites in a smaller font. -->
<p class="sub">
<b>31 Models × 25 Metrics</b> — The only leaderboard combining Metacognitive (FINAL Bench) + ARC-AGI-2 + 23 standard benchmarks<br>
<span style="color:#0d9488;font-weight:600">✓ Scores verified against Artificial Analysis Intelligence Index (2026.03.04) · AA #1 Gemini 3.1 Pro · AA #2 GPT-5.3 Codex · AA OpenSrc #1 GLM-5</span><br>
<span style="font-size:9px">Scale AI SEAL · artificialanalysis.ai · arcprize.org · FINAL-Bench/Metacognitive (HF Official) · Chatbot Arena · aimultiple.com</span>
</p>
</header>
<!-- Headline counters: one card per figure, value (.stn) above its label (.stl). -->
<div class="stats">
  <div class="st">
    <div class="stn">31</div>
    <div class="stl">Models</div>
  </div>
  <div class="st">
    <div class="stn">25</div>
    <div class="stl">Metrics</div>
  </div>
  <div class="st">
    <div class="stn">10</div>
    <div class="stl">Providers</div>
  </div>
  <div class="st">
    <div class="stn" style="color:#16a34a">17</div>
    <div class="stl">Open Source</div>
  </div>
  <div class="st">
    <div class="stn" style="color:#6366f1">+6</div>
    <div class="stl">v1.0 New</div>
  </div>
</div>
<!-- TABS -->
<!-- Tab strip. Each tab passes its pane id and itself to showTab (defined outside this view).
     role/tabindex make the tabs focusable and announced as buttons; the keydown handler
     activates them with Enter or Space, matching the click behaviour. -->
<div class="tab-bar">
<div class="tab on" role="button" tabindex="0" onclick="showTab('tbl',this)" onkeydown="if(event.key==='Enter'||event.key===' '){event.preventDefault();showTab('tbl',this);}">📊 Leaderboard</div>
<div class="tab" role="button" tabindex="0" onclick="showTab('charts',this)" onkeydown="if(event.key==='Enter'||event.key===' '){event.preventDefault();showTab('charts',this);}">📈 Charts</div>
<div class="tab" role="button" tabindex="0" onclick="showTab('info',this)" onkeydown="if(event.key==='Enter'||event.key===' '){event.preventDefault();showTab('info',this);}">📎 Benchmark Info</div>
</div>
<!-- ========== TAB: TABLE ========== -->
<div id="tbl" class="tpane on">
<!-- ========== VERTICAL RANKING BAR CHART (above table) ========== -->
<div class="vrank-section">
<div class="vrank-header">
<div>
<div class="vrank-title">🏆 ALL Bench Composite Score Ranking</div>
<div class="vrank-desc">Average of all available benchmarks · MMLU-Pro · GPQA · AIME · HLE · ARC-AGI-2 · Metacog · SWE-Pro · BFCL · IFEval · SWE-V · Colored by provider</div>
</div>
</div>
<!-- Wrapper scrolls horizontally because the canvas keeps a 900px minimum width. -->
<div class="vrank-chart-wrap">
<!-- role="img" plus aria-label give the canvas an accessible name; a bare canvas exposes nothing to assistive technology. -->
<canvas id="cVertRank" role="img" aria-label="Bar chart ranking models by ALL Bench composite score" style="min-width:900px;height:200px;"></canvas>
</div>
<!-- Empty legend container addressed by id. -->
<div class="vrank-legend" id="vrankLegend"></div>
</div>
<!-- Leaderboard toolbar: search box, preset filter buttons and the column toggle menu.
     The handlers (doSearch, flt, toggleColMenu) are defined outside this view. -->
<div class="toolbar" style="margin-top:12px">
<div class="search-wrap">
<!-- aria-label supplies the accessible name; a placeholder alone is not a label. -->
<input type="text" id="searchBox" placeholder="Search models..." aria-label="Search models" oninput="doSearch(this.value)">
</div>
<span class="flbl">Filter:</span>
<!-- Count corrected from 21 to 31 to match the model total shown in the page header and stats cards. -->
<button type="button" class="fb on" onclick="flt('all',this)">All 31</button>
<button type="button" class="fb" onclick="flt('open',this)">🔓 Open</button>
<button type="button" class="fb" onclick="flt('closed',this)">🔒 Closed</button>
<button type="button" class="fb" onclick="flt('qwen',this)">🟠 Qwen3.5</button>
<button type="button" class="fb" onclick="flt('gptoss',this)">⬛ GPT-OSS</button>
<button type="button" class="fb" onclick="flt('reasoning',this)">🧠 Reasoning</button>
<button type="button" class="fb" onclick="flt('moe',this)">⚡ MoE</button>
<button type="button" class="fb" onclick="flt('vision',this)">👁 Vision</button>
<button type="button" class="fb" onclick="flt('value',this)">💚 Value</button>
<button type="button" class="fb" onclick="flt('flagship',this)">👑 Flagship</button>
<!-- lang="ko" marks the Korean label inside the English document. -->
<button type="button" class="fb" lang="ko" onclick="flt('korean',this)" style="background:linear-gradient(135deg,#c9002b,#003478);color:#fff;border-color:#c9002b;font-weight:700">🇰🇷 소버린 AI</button>
<div class="col-toggle-wrap" style="margin-left:auto">
<button type="button" class="col-toggle-btn" onclick="toggleColMenu()" id="colBtn">⚙ Columns ▾</button>
<!-- Empty dropdown container addressed by id. -->
<div class="col-dropdown" id="colMenu"></div>
</div>
</div>
<div class="tw">
<table id="T">
<!-- Column headers. Cells with onclick call srt(columnIndex); data-col carries the same index on the columns that have it.
     scope="col" ties each header to its column for assistive technology, and rel="noopener" keeps
     pages opened in a new tab from reaching window.opener. -->
<thead>
<tr>
<th scope="col" class="c-model" onclick="srt(0)">Model<span class="sa"></span></th>
<th scope="col" class="gP" style="min-width:72px">Provider</th>
<th scope="col" onclick="srt(2)" class="gA" title="Average of available 100-pt benchmarks: MMLU-Pro·GPQA·AIME·HLE·ARC-AGI-2·Metacog·SWE-Pro·BFCL·IFEval·SWE-V" style="min-width:58px">🏆 Score<span class="sa"></span></th>
<th scope="col" onclick="srt(3)" class="gT" style="min-width:48px">📅 Release<span class="sa"></span></th>
<th scope="col" onclick="srt(4)" class="gB" data-col="4" title="MMLU-Pro: 57K questions, highest sample size &amp; reliability" style="min-width:52px"><a href="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro" target="_blank" rel="noopener">📚 MMLU-Pro</a><span class="sa"></span></th>
<th scope="col" onclick="srt(5)" class="gB" data-col="5" title="GPQA Diamond: PhD-level expert questions, highest discrimination" style="min-width:52px"><a href="https://huggingface.co/datasets/Idavidrein/gpqa" target="_blank" rel="noopener">🧠 GPQA◆</a><span class="sa"></span></th>
<th scope="col" onclick="srt(6)" class="gB" data-col="6" title="AIME 2025: Math olympiad, minimal contamination" style="min-width:50px"><a href="https://artofproblemsolving.com/wiki/index.php/2025_AIME" target="_blank" rel="noopener">📐 AIME25</a><span class="sa"></span></th>
<th scope="col" onclick="srt(7)" class="gB" data-col="7" title="HLE: Humanity's Last Exam — hardest existing benchmark, 2500 expert-sourced questions" style="min-width:48px"><a href="https://huggingface.co/datasets/centerforaisafety/hle" target="_blank" rel="noopener">🔭 HLE</a><span class="sa"></span></th>
<th scope="col" onclick="srt(8)" class="arc-col" style="color:#0ea5e9!important;min-width:56px" data-col="8" title="ARC-AGI-2: Abstract reasoning, novel visual puzzles — most contamination-proof"><a href="https://arcprize.org/arc-agi-2" target="_blank" rel="noopener" style="color:#0ea5e9">🧩 ARC-AGI-2★</a><span class="sa"></span></th>
<th scope="col" onclick="srt(9)" class="gF meta-col" data-col="9" title="FINAL-Bench Metacognitive: 100 tasks, measures self-correction &amp; error recovery (ER)" style="min-width:54px"><a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" rel="noopener" style="color:#7c3aed">🧬 Metacog★</a><span class="sa"></span></th>
<th scope="col" onclick="srt(10)" class="gB" data-col="10" title="SWE-Pro: Scale AI SEAL, 1865 real repos, contamination-free" style="min-width:52px"><a href="https://scale.com/leaderboard/coding" target="_blank" rel="noopener">🏗 SWE-Pro</a><span class="sa"></span></th>
<th scope="col" onclick="srt(11)" class="gB" data-col="11" title="BFCL v4: Tool use &amp; agent capability" style="min-width:48px"><a href="https://gorilla.cs.berkeley.edu/leaderboard.html" target="_blank" rel="noopener">🔧 BFCL</a><span class="sa"></span></th>
<th scope="col" onclick="srt(12)" class="gB" data-col="12" title="IFEval: Instruction following" style="min-width:48px"><a href="https://huggingface.co/datasets/google/IFEval" target="_blank" rel="noopener">📋 IFEval</a><span class="sa"></span></th>
<th scope="col" onclick="srt(13)" class="gB" data-col="13" title="LiveCodeBench: Competitive programming" style="min-width:44px"><a href="https://livecodebench.github.io/leaderboard.html" target="_blank" rel="noopener">🖥 LCB</a><span class="sa"></span></th>
<th scope="col" onclick="srt(14)" class="gB" style="opacity:.75" data-col="14" title="SWE-Verified: ⚠ Contamination risk, 59.4% tasks found defective by OpenAI audit" style="min-width:48px"><a href="https://www.swebench.com" target="_blank" rel="noopener">💻 SWE-V⚠</a><span class="sa"></span></th>
<th scope="col" onclick="srt(15)" class="gM" data-col="15" title="MMMLU: 50+ languages multilingual MMLU" style="min-width:52px"><a href="https://huggingface.co/datasets/openai/MMMLU" target="_blank" rel="noopener">🌍 MMMLU</a><span class="sa"></span></th>
<th scope="col" onclick="srt(16)" class="gT" data-col="16" style="min-width:44px">📥 CtxIn<span class="sa"></span></th>
<th scope="col" onclick="srt(17)" class="gT" data-col="17" style="min-width:44px">📤 CtxOut<span class="sa"></span></th>
<th scope="col" onclick="srt(18)" class="gT" data-col="18" style="min-width:44px">⚡ tok/s<span class="sa"></span></th>
<th scope="col" onclick="srt(19)" class="gN" data-col="19" title="TTFT — lower is faster" style="min-width:44px">⏱ TTFT<span class="sa"></span></th>
<th scope="col" class="gN" data-col="20" style="min-width:80px">👁 Vision</th>
<th scope="col" class="gN" data-col="21" style="min-width:80px">⚙ Arch</th>
<th scope="col" onclick="srt(22)" class="gP" data-col="22" style="min-width:48px">🏆 ELO<span class="sa"></span></th>
<th scope="col" class="gP" data-col="23" style="min-width:52px">📄 License</th>
<th scope="col" onclick="srt(24)" class="gP" data-col="24" style="min-width:50px">💰 $/M in<span class="sa"></span></th>
</tr>
</thead>
<tbody id="TB"></tbody>
</table>
</div>
<!-- Legend under the table: grade swatches with their score thresholds, followed by footnotes
     for the ★ marker, green rows, and the ARC-AGI-2 / Metacog sources. -->
<div class="leg">
<span class="lt">Grade:</span>
<div class="li"><div class="ld" style="background:#6366f1"></div>S≥90%</div>
<div class="li"><div class="ld" style="background:#0d9488"></div>A≥75%</div>
<div class="li"><div class="ld" style="background:#d97706"></div>B≥60%</div>
<!-- "<" is written as an entity so it is not parsed as the start of a tag. -->
<div class="li"><div class="ld" style="background:#e11d48"></div>C&lt;60%</div>
<span style="color:#db2777;font-size:9px;margin-left:8px">★ = New in v1.0</span>
<span style="color:#16a34a;font-size:9px;margin-left:6px">💚 Green row = Open-source value pick</span>
<span style="font-family:var(--font-mono);font-size:8px;color:#0ea5e9;margin-left:6px">🧩 ARC-AGI-2 = arcprize.org official</span>
<span style="font-family:var(--font-mono);font-size:8px;color:#7c3aed;margin-left:6px">🧬 Metacog = FINAL-Bench official (9 models measured)</span>
</div>
</div>
<!-- ========== TAB: CHARTS ========== -->
<div id="charts" class="tpane">
<!-- ROW 1: ARC-AGI-2 + Metacog Delta -->
<!-- Two chart cards. Each canvas carries role="img" and an aria-label so it has an accessible name. -->
<div class="charts-grid" style="margin-top:14px">
<div class="chart-card">
<h3>🧩 ARC-AGI-2 — Abstract Reasoning Frontier</h3>
<p>Official arcprize.org · Vertical bars by score · Contamination-proof visual reasoning benchmark</p>
<canvas id="cArc" role="img" aria-label="Bar chart of ARC-AGI-2 scores by model" height="220"></canvas>
<div class="chart-insight"><b>Key:</b> Gemini 3.1 Pro dominates at <b>88.1%</b> (Feb 2026 leaderboard #1). GPT-5.2 52.9% · Claude Opus 4.6 ~37.6%. Kimi K2.5 surprisingly low at 12.1% despite HLE dominance — distinct capability axis.</div>
</div>
<div class="chart-card">
<h3>🧬 Metacog: Baseline → Self-Correction Gain (Δ)</h3>
<p>FINAL-Bench official · Baseline FINAL Score vs MetaCog condition · Error Recovery drives 94.8% of gains</p>
<canvas id="cMetaDelta" role="img" aria-label="Chart of Metacog baseline score versus self-correction gain by model" height="220"></canvas>
<div class="chart-insight"><b>Key:</b> Claude Opus 4.6 has lowest baseline (rank 9) but <b>largest Δ gain (+20.13)</b> — strongest self-correction. Kimi K2.5 highest baseline but smallest gain. Declarative–Procedural gap persists across all models.</div>
</div>
</div>
<!-- ROW 2: Radar + Benchmark Category Breakdown -->
<!-- Two chart cards. Each canvas carries role="img" and an aria-label so it has an accessible name. -->
<div class="charts-grid">
<div class="chart-card">
<h3>🕸 Capability Radar — TOP 6 Multi-Axis Profile</h3>
<p>MMLU-Pro · GPQA · AIME · HLE · ARC-AGI-2 · MMMLU · Each axis normalized to 100</p>
<canvas id="cRadar" role="img" aria-label="Radar chart of the top 6 models across MMLU-Pro, GPQA, AIME, HLE, ARC-AGI-2 and MMMLU" height="260"></canvas>
<div class="chart-insight"><b>Key:</b> No single model dominates all axes. Gemini leads MMMLU+HLE, GPT-5.2 leads MMLU-Pro, Kimi K2.5 exceptional on MMLU-Pro 92.0. Different strengths suggest routing strategies.</div>
</div>
<div class="chart-card">
<h3>📊 Capability Domains — Reasoning vs Coding vs Language</h3>
<p>Grouped bars: Reasoning avg (GPQA+AIME+HLE) · Coding avg (SWE-Pro+LCB) · Language avg (MMLU-Pro+MMMLU+IFEval)</p>
<canvas id="cDomain" role="img" aria-label="Grouped bar chart of reasoning, coding and language averages by model" height="260"></canvas>
<div class="chart-insight"><b>Key:</b> Claude Opus 4.6 leads Coding domain. Gemini 3.1 Pro leads Language. GPT-5.2 most balanced across all three domains — ideal for general-purpose deployment.</div>
</div>
</div>
<!-- ROW 3: Perf vs Cost + Provider Comparison -->
<!-- Two chart cards. Each canvas carries role="img" and an aria-label so it has an accessible name. -->
<div class="charts-grid">
<div class="chart-card">
<h3>💰 Performance vs Cost — Value Frontier Map</h3>
<p>X = Input price log scale ($/M tokens) · Y = Composite Score · Top-left quadrant = elite value zone</p>
<canvas id="cScatter" role="img" aria-label="Scatter chart of composite score versus input price per million tokens" height="260"></canvas>
<div class="chart-insight"><b>Value leaders:</b> DeepSeek V3.2 ($0.14/M, score ~74) and GLM-5 ($0.35/M) offer exceptional open-weight value. GPT-OSS-120B is truly free with competitive performance.</div>
</div>
<div class="chart-card">
<h3>🏭 Provider Strength — Average Score by Company</h3>
<p>Average composite score across all models per provider · Shows lab-level consistency</p>
<canvas id="cProvider" role="img" aria-label="Chart of average composite score by provider" height="260"></canvas>
<div class="chart-insight"><b>Key:</b> OpenAI strongest average (combining closed+OSS models). Alibaba's Qwen3.5 family shows remarkable breadth. DeepSeek punches above weight with MIT-licensed models.</div>
</div>
</div>
<!-- ROW 4: Timeline + Open vs Closed -->
<!-- Two chart cards. Each canvas carries role="img" and an aria-label so it has an accessible name. -->
<div class="charts-grid">
<div class="chart-card">
<h3>📅 Intelligence Timeline — Score vs Release Date</h3>
<p>Bubble size = context window (log scale) · Color = provider · Rapid capability gains 2025→2026</p>
<canvas id="cTimeline" role="img" aria-label="Bubble chart of composite score versus release date" height="260"></canvas>
<div class="chart-insight"><b>Key:</b> ~15-point score jump from Jan 2025 to Feb 2026. Feb 2026 releases (GPT-5.2, Gemini 3.1 Pro) establish new ceiling. Context window growth independent of intelligence score.</div>
</div>
<div class="chart-card">
<h3>⚖ Open vs Closed — Distribution Comparison</h3>
<!-- Counts follow the stats cards at the top of the page (31 models, 17 open source); keep them in sync. -->
<p>Score distribution: Open-weight (17 models) vs Closed-API (14 models) · Box plot style with individual points</p>
<canvas id="cOpenClosed" role="img" aria-label="Chart comparing score distributions of open-weight and closed-API models" height="260"></canvas>
<div class="chart-insight"><b>Key:</b> Open-weight models now overlap significantly with closed-API. Top open models (Kimi K2.5, Qwen3.5-397B) match or exceed many closed offerings — open-source gap is closing.</div>
</div>
</div>
<!-- ROW 5: Benchmark Coverage + Heatmap -->
<!-- Variance chart plus the full-width heatmap. Each canvas carries role="img" and an aria-label so it has an accessible name. -->
<div class="charts-grid">
<div class="chart-card">
<h3>📐 Benchmark Score Variance — Consistency Analysis</h3>
<p>For each benchmark: show min/max/mean across all models · Reveals benchmark difficulty &amp; discrimination power</p>
<canvas id="cVariance" role="img" aria-label="Chart of minimum, maximum and mean score for each benchmark" height="220"></canvas>
<div class="chart-insight"><b>Key:</b> HLE shows widest variance (7.0–44.9) = best discrimination. ARC-AGI-2 also highly discriminating (12.1–88.1). AIME25 scores cluster high — many models saturating it.</div>
</div>
<div class="chart-card full">
<h3>🌡 Full Benchmark Heatmap — 31 Models × 11 Benchmarks</h3>
<p>Color intensity = score · White/light = unreported · Indigo = high · Reveals capability patterns across the entire landscape</p>
<canvas id="cHeat" role="img" aria-label="Heatmap of benchmark scores for 31 models across 11 benchmarks" style="width:100%;display:block;"></canvas>
</div>
</div>
</div>
<!-- ========== TAB: INFO ========== -->
<div id="info" class="tpane">
<div class="info-grid" style="margin-top:12px">
<div class="fni"><b>🧩 ARC-AGI-2 ★NEW — Abstract Reasoning</b><p>Tests novel visual pattern completion — cannot be solved by memorization. <a href="https://arcprize.org/arc-agi-2" target="_blank">arcprize.org</a>. Gemini 3.1 Pro 88.1% (Feb 2026 leaderboard 1st) · GPT-5.2 52.9% · Claude Opus 4.6 ~37.6% · Kimi K2.5 12.1%. Most contamination-proof benchmark available.</p></div>
<!-- Korean-language card: lang="ko" marks the text as Korean inside the lang="en" document. -->
<div class="fni" lang="ko" style="border-left:4px solid #c9002b;background:linear-gradient(135deg,#c9002b08,#00347808)"><b>🇰🇷 한국 소버린 AI — 독자 AI 파운데이션 모델 (독파모) 현황</b><p>
과기정통부 주관 '독자 AI 파운데이션 모델 프로젝트' 2026.02 기준 4개 정예팀: <b>LG AI연구원(K-EXAONE)</b> · <b>SK텔레콤(A.X K1)</b> · <b>업스테이지(Solar Open 100B)</b> · <b>모티프테크놀로지스</b>.<br>
• 1차 평가(2026.01.15): 5팀 → 3팀 (네이버클라우드 독자성 미달, NC AI 점수 미달 탈락)<br>
• 패자부활전(2026.02.20): 모티프테크놀로지스 추가 선정 → 4팀 체제<br>
• K-EXAONE: 1차 평가 1위 · 13개 벤치마크 평균 72점 · AA 오픈웨이트 톱10 · 236B MoE<br>
• Solar Open 100B: AIME 84.3% · 19.7T 토큰 · 100B MoE · arXiv 2601.07022<br>
• A.X K1: 국내 최초 500B 파라미터 · Apache 2.0 오픈소스<br>
• 목표: 글로벌 AI 모델 95% 이상 성능 확보 · 2027년 최종 2팀 선정 · 5,300억원 예산
</p></div>
<div class="fni"><b>🧬 Metacognitive ★NEW — FINAL-Bench</b><p>Official: <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank">HF FINAL-Bench/Metacognitive</a>. 100 tasks, 9 SOTA models tested. Baseline FINAL Score: Kimi K2.5 68.71 · GPT-5.2 62.76 · GLM-5 62.50 · Gemini 59.5 · Opus 4.6 56.04 (rank 9). ER (error recovery) accounts for 94.8% of self-correction gains. 14 models = not evaluated (—).</p></div>
<!-- Korean-language card: lang="ko" marks the text as Korean inside the lang="en" document. -->
<div class="fni" lang="ko"><b>📊 Composite Score — 공정 가중 평균</b><p>10개 벤치마크(MMLU-Pro · GPQA · AIME · HLE · ARC-AGI-2 · Metacog · SWE-Pro · BFCL · IFEval · SWE-V) 총합÷10. <b style="color:#e11d48">미제출(null)=0점 패널티</b> → 벤치마크 회피 시 불이익. 막대 상단 n/10 = 실제 커버리지 표시. 이전 버전의 "null 제외 평균" 방식은 Grok 4.1 Fast(2개만 제출→86점), DeepSeek R2(4개→88점) 같은 허위 1위 오류를 유발함.</p></div>
<div class="fni"><b>📚 MMLU-Pro</b><p><a href="https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro" target="_blank">HF: TIGER-Lab/MMLU-Pro</a>. 57,000 expert-level questions across disciplines. Largest sample size → highest statistical reliability. Much harder than original MMLU. Gold standard general knowledge benchmark.</p></div>
<div class="fni"><b>🧠 GPQA Diamond ⭐</b><p><a href="https://huggingface.co/datasets/Idavidrein/gpqa" target="_blank">HF: Idavidrein/gpqa</a>. 198 PhD-level questions in biology, chemistry, physics. Human expert average ~65%. Highest discrimination power among frontier models.</p></div>
<div class="fni"><b>📐 AIME 2025</b><p><a href="https://artofproblemsolving.com/wiki/index.php/2025_AIME" target="_blank">AoPS: 2025 AIME</a>. American Invitational Mathematics Examination. 2025 problem set minimizes contamination. Tests mathematical reasoning and creative problem solving.</p></div>
<div class="fni"><b>🔭 HLE — Humanity's Last Exam</b><p><a href="https://huggingface.co/datasets/centerforaisafety/hle" target="_blank">HF: centerforaisafety/hle</a>. 2,500 expert-submitted questions. Intended to be the final closed-ended academic benchmark. Kimi K2.5 44.9% · Gemini 3.1 Pro 44.7% lead.</p></div>
<div class="fni"><b>🏗 SWE-Pro ⭐ Recommended</b><p><a href="https://scale.com/leaderboard/coding" target="_blank">scale.com/leaderboard/coding</a>. Scale AI SEAL, 1865 real repos. Contamination-free. ~35pt lower than SWE-Verified — honest measure of real coding. OpenAI recommends over Verified.</p></div>
<div class="fni"><b>💻 SWE-Verified ⚠ Caution</b><p><a href="https://www.swebench.com" target="_blank">swebench.com</a>. 59.4% of tasks found defective in OpenAI audit. Memorization/contamination risk. Reference only. Prefer SWE-Pro for accurate assessment.</p></div>
<div class="fni"><b>🔧 BFCL v4</b><p><a href="https://gorilla.cs.berkeley.edu/leaderboard.html" target="_blank">gorilla.cs.berkeley.edu</a>. Berkeley Function-Calling Leaderboard. Measures tool use and agent capability. Qwen3.5-122B world #1.</p></div>
<div class="fni"><b>📋 IFEval</b><p><a href="https://huggingface.co/datasets/google/IFEval" target="_blank">HF: google/IFEval</a>. Instruction following evaluation. Verifiable output constraints. Tests precision compliance.</p></div>
<div class="fni"><b>🖥 LiveCodeBench</b><p><a href="https://livecodebench.github.io/leaderboard.html" target="_blank">livecodebench.github.io</a>. Competitive programming from LeetCode, AtCoder, Codeforces. Continuously updated to prevent contamination.</p></div>
<div class="fni"><b>🌍 MMMLU — Multilingual</b><p><a href="https://huggingface.co/datasets/openai/MMMLU" target="_blank">HF: openai/MMMLU</a>. MMLU in 57 languages. Gemini 3.1 Pro ~88% leads. Qwen3.5 officially supports 201 languages.</p></div>
<div class="fni"><b>⚙ Architecture</b><p>MoE = sparse activation (efficient), Dense = full params (quality), Hybrid = DeltaNet+MoE. Parentheses = active/total params. Active params determine inference cost. Qwen3.5-35B: 3B active → 194 tok/s.</p></div>
<div class="fni"><b>⏱ TTFT Latency</b><p>Time To First Token (seconds). Lower is faster. Mistral Large 3 0.3s · GPT-5.2 0.6s fastest. Reasoning models (DeepSeek R1 8s) are slower due to chain-of-thought. &lt;2s recommended for real-time apps.</p></div>
<div class="fni"><b>💰 Pricing</b><p>Input cost in $/million tokens. 0 = free open-weights. Qwen3.5-35B $0.10/M, DeepSeek V3.2 $0.14/M offer extreme value vs closed models. GPT-5.2 $1.75/M · Claude Opus 4.6 $5/M.</p></div>
</div>
<p class="upd">ALL Bench Leaderboard v1.0 · Updated March 4, 2026 · Scale AI SEAL · artificialanalysis.ai · arcprize.org · FINAL-Bench/Metacognitive (HF) · Chatbot Arena</p>
</div>
</div><!-- /wrap -->
<script>
// DATA STRUCTURE — every model is one positional array ("row") inside D:
// [0:name, 1:provider, 2:provColor, 3:type ("open"|"closed"), 4:group, 5:dotColor, 6:released (YYYY.MM),
// 7:mmluP, 8:gpqa, 9:aime, 10:hle, 11:arcagi2, 12:metacog, 13:swePro, 14:bfcl, 15:ifeval, 16:lcb, 17:sweV, 18:mmmlu,
// 19:maxIn (K tokens), 20:maxOut (K tokens), 21:tokPerSec, 22:ttft (seconds), 23:vis[], 24:archType, 25:archDetail,
// 26:elo, 27:license, 28:priceIn ($/M tokens), 29:priceOut ($/M tokens),
// 30:valScore, 31:valLabel, 32:csScore, 33:csLabel,
// 34:modelUrl]
// null marks a value that is not available for that model.
const D=[
["GPT-5.2","OpenAI","#10a37f","closed","flagship","#10a37f","2026.02",
80.0,93.2,100,35.4,52.9,62.76,29.9,null,90.5,null,80.0,82,
400,128,null,0.6,["Image"],"Dense","Reasoning",1510,"Prop",1.75,14.0,
3,"Top performance",4,"All-around leader · high cost",
"https://openai.com/gpt-5"],
["GPT-5.3 Codex","OpenAI","#10a37f","closed","flagship","#047857","2026.02",
82.9,91.5,95.0,36.0,null,null,78.2,null,null,null,null,83,
400,128,null,null,["Image"],"Dense","Reasoning(Coding)",1500,"Prop",7.50,30.0,
4,"AA 전체 2위(54) · 코딩 전문 frontier",4,"Terminal-Bench 77.3% · SWE 78.2% · coding SOTA",
"https://openai.com/codex"],
["Claude Opus 4.6","Anthropic","#d97706","closed","flagship","#d97706","2025.10",
78.5,91.3,100,36.7,37.6,56.04,45.0,null,93.1,null,80.8,80,
200,32,null,3.5,["Image","Video"],"Dense","Reasoning(Adaptive)",1498,"Prop",5.0,25.0,
3,"Coding & agents #1",5,"Agent SOTA · community fav",
"https://anthropic.com/claude"],
["Gemini 3.1 Pro","Google","#4285f4","closed","flagship","#4285f4","2026.01",
83.0,94.3,97,44.7,88.1,59.5,43.3,null,91.0,null,74.2,88,
2000,64,null,4.2,["Image","Video","Audio"],"Dense","Reasoning(DeepThink)",1501,"Prop",2.0,12.0,
4,"GPQA · HLE · Vision #1",4,"Multimodal SOTA · HLE leader",
"https://gemini.google.com"],
["Grok 4 Heavy","xAI","#1d9bf0","closed","flagship","#1d9bf0","2025.11",
85.0,87.5,95,28.0,null,null,25.0,null,88.0,null,72.0,76,
256,32,null,6.0,["Image"],"Dense","Reasoning",1460,"Prop",3.0,15.0,
3,"X real-time search",4,"Math & reasoning specialist",
"https://x.ai/grok"],
["Claude Sonnet 4.6","Anthropic","#d97706","closed","flagship","#f59e0b","2025.10",
76.0,84.5,93,22.0,null,null,43.6,null,92.0,null,72.7,79,
200,64,null,2.0,["Image","Video"],"Dense","Reasoning",1482,"Prop",3.0,15.0,
4,"Mid-tier best pick",4,"Most-used · balanced",
"https://anthropic.com/claude"],
["GPT-OSS-120B","OpenAI","#059669","open","gptoss","#059669","2025.12",
90.0,80.9,97.9,12.0,null,null,16.2,null,null,null,null,72,
128,32,null,null,["Text"],"MoE","Reasoning(5.1B/116.8B)",1380,"Apache2",0,0,
5,"80GB single-GPU local",4,"MMLU open-source #1 · o4-mini class",
"https://huggingface.co/openai"],
["GPT-OSS-20B","OpenAI","#059669","open","gptoss","#34d399","2025.12",
85.3,71.5,98.7,7.0,null,null,null,null,null,null,null,68,
128,32,null,null,["Text"],"MoE","Reasoning(3.6B/20.9B)",1340,"Apache2",0,0,
5,"16GB edge · AIME best",4,"Tiny AIME 98.7% champion",
"https://huggingface.co/openai"],
["Qwen3.5-397B","Alibaba","#f97316","open","qwen","#f97316","2026.01",
87.8,88.4,91.3,32.0,null,null,38.0,null,92.6,null,76.4,85,
262,32,45,5.0,["Image","Video"],"MoE+Hybrid","Reasoning(17B/397B)",1445,"Apache2",null,null,
4,"Open-source flagship",5,"IFBench world #1 · 201 langs",
"https://huggingface.co/Qwen"],
["Qwen3.5-122B","Alibaba","#f97316","open","qwen","#fb923c","2026.01",
null,87.6,85.0,24.0,null,null,28.0,72.2,93.4,null,72.2,82,
262,32,null,6.0,["Image","Video"],"MoE+Hybrid","Reasoning(10B/122B)",1420,"Apache2",0.40,1.20,
5,"BFCL world #1",5,"BFCL +30% vs GPT-5 mini",
"https://huggingface.co/Qwen"],
["Qwen3.5-27B","Alibaba","#f97316","open","qwen","#fdba74","2026.01",
86.1,85.5,null,18.0,null,null,20.0,null,null,null,72.4,80,
262,32,null,5.5,["Image","Video"],"Dense","Reasoning(27B)",1395,"Apache2",null,null,
5,"Dense coding specialist",4,"SWE 72.4% · GPT-5 mini class",
"https://huggingface.co/Qwen"],
["Qwen3.5-35B","Alibaba","#f97316","open","qwen","#fed7aa","2026.01",
null,83.0,null,15.0,null,null,18.0,null,null,null,68.0,78,
262,32,194,4.0,["Image","Video"],"MoE+Hybrid","Reasoning(3B/35B)",1380,"Apache2",0.10,0.40,
5,"3B active · 194 tok/s",5,"Beats old 235B · local #1",
"https://huggingface.co/Qwen"],
["Qwen3.5-Flash","Alibaba","#f97316","closed","qwen","#ea580c","2026.01",
null,null,null,null,null,null,null,null,null,null,null,75,
1000,32,null,3.0,["Image","Video"],"MoE+Hybrid","Non-Reasoning",null,"Prop",0.10,0.40,
5,"Ultra-low cost · 1M ctx",4,"Cheaper than DeepSeek+multimodal",
"https://huggingface.co/Qwen"],
["Qwen3.5-9B","Alibaba","#f97316","open","qwen","#c2410c","2026.01",
null,81.7,null,10.0,null,null,null,null,null,null,null,73,
262,32,null,3.5,["Image","Video"],"Dense","Reasoning(9B)",1300,"Apache2",null,null,
5,"9B beats 120B",5,"GPQA 81.7>80.9 · RTX 3060",
"https://huggingface.co/Qwen"],
["Qwen3.5-4B","Alibaba","#f97316","open","qwen","#9a3412","2026.01",
null,null,null,null,null,null,null,null,null,null,null,65,
262,32,null,2.0,["Image","Video"],"Dense","Reasoning(4B)",null,"Apache2",null,null,
5,"First 4B multimodal",4,"Video & minimal agent",
"https://huggingface.co/Qwen"],
["DeepSeek V3.2","DeepSeek","#6366f1","open","value","#6366f1","2025.12",
null,79.9,89.3,22.0,null,null,15.6,null,87.5,null,66.0,78,
128,8,null,7.0,["Text"],"MoE","Non-Reasoning(37B/671B)",1430,"MIT",0.14,0.28,
5,"MIT · value legend",5,"Reddit fav · unbeatable price",
"https://huggingface.co/deepseek-ai/DeepSeek-V3"],
["DeepSeek R1","DeepSeek","#6366f1","open","value","#818cf8","2025.01",
85.0,82.0,87.5,14.0,null,null,18.0,null,83.3,null,49.2,74,
128,8,null,8.0,["Text"],"MoE","Reasoning(37B/671B)",1440,"MIT",0.55,2.19,
5,"Reasoning specialist",5,"Math/reasoning legend",
"https://huggingface.co/deepseek-ai/DeepSeek-R1"],
["Kimi K2.5","Moonshot","#8b5cf6","open","flagship","#8b5cf6","2025.11",
92.0,87.6,96.1,44.9,12.1,68.71,27.7,null,94.0,null,76.8,81,
200,32,null,5.0,["Image","Video"],"MoE","Reasoning(1T)",1447,"MIT",0.55,2.50,
5,"HLE · MMLU elite",4,"HLE 44.9% #1 class · agent",
"https://huggingface.co/moonshotai"],
["GLM-5","Zhipu AI","#14b8a6","open","value","#0d9488","2026.02",
87.0,86.0,92.7,30.5,null,null,77.8,null,null,52.0,null,76,
200,16,null,null,["Text"],"MoE","Reasoning(40B/745B)",1451,"MIT",0.35,0.39,
5,"AA 오픈소스 1위(50) · ELO 1451 최고",5,"SWE 77.8% · Huawei Ascend only · 2026.02.11",
"https://huggingface.co/zai-org/GLM-5"],
["Llama 4 Scout","Meta","#0081fb","open","flagship","#0081fb","2025.04",
null,73.0,85.0,12.0,null,null,5.2,null,85.0,null,55.0,70,
10000,16,null,2.0,["Image","Video"],"MoE","Non-Reasoning(17B/400B)",1340,"Meta",0.11,0.34,
4,"10M ctx revolution",3,"Local fav · 10M context",
"https://huggingface.co/meta-llama"],
["Mistral Large 3","Mistral","#ff7043","open","flagship","#ff7043","2025.11",
null,78.0,82.0,11.0,null,null,12.0,null,86.0,null,60.0,72,
256,16,null,0.3,["Image"],"MoE","Non-Reasoning(675B)",1320,"Apache2",2.0,6.0,
3,"TTFT 0.3s fastest",3,"GDPR · EU preference",
"https://huggingface.co/mistralai"],
["Gemini 3 Flash","Google","#4285f4","closed","flagship","#34a853","2025.12",
88.6,90.4,95.0,33.7,34.0,null,71.2,null,88.2,null,72.5,83,
1000,64,218,1.2,["Image","Video","Audio"],"Dense","Non-Reasoning+Thinking",1490,"Prop",0.50,3.00,
5,"Flash beats last-gen Pro · 218 tok/s",5,"GPQA 90.4% Flash level · HF trending #1",
"https://deepmind.google/technologies/gemini/flash/"],
["Llama 4 Maverick","Meta","#0081fb","open","flagship","#1877f2","2025.10",
80.5,69.8,82.0,18.0,null,null,12.3,null,83.0,null,73.0,74,
1000,16,null,4.5,["Image","Video"],"MoE","Non-Reasoning(17B/400B)",1390,"Llama4",0.22,0.88,
4,"1M ctx · enterprise cloud default",4,"AWS/Azure built-in · Maverick > Scout",
"https://huggingface.co/meta-llama"],
["Claude Haiku 4.5","Anthropic","#d97706","closed","flagship","#b45309","2025.09",
72.0,75.0,null,null,null,null,14.0,null,86.5,null,68.0,71,
200,8,null,0.4,["Image"],"Dense","Non-Reasoning",1350,"Prop",1.00,5.00,
5,"Pareto frontier · fastest Anthropic",5,"TTFT 0.4s · Terminal Bench 3rd",
"https://anthropic.com/claude"],
["Grok 4.1 Fast","xAI","#1d9bf0","closed","flagship","#0ea5e9","2025.11",
null,85.3,88.0,null,null,null,null,null,null,null,null,72,
2000,16,null,1.5,["Image","Video"],"Dense","Reasoning",1380,"Prop",0.20,0.80,
4,"$0.20/M · 2M ctx · cheapest frontier",4,"τ²-bench 100% · ultra low cost",
"https://x.ai/grok"],
["DeepSeek R2","DeepSeek","#6366f1","open","value","#4f46e5","2026.02",
87.0,88.0,93.8,null,null,null,null,null,84.0,null,null,76,
128,8,null,9.0,["Text"],"MoE","Reasoning(671B)",1450,"MIT",0.55,2.19,
5,"AIME 93.8% math king · MIT",5,"Math/science #1 · R1 successor",
"https://huggingface.co/deepseek-ai"],
["Phi-4","Microsoft","#00a4ef","open","value","#0078d4","2024.12",
null,73.0,null,null,null,null,null,null,null,null,72.0,68,
16,4,null,2.5,["Image"],"Dense","Non-Reasoning(14B)",1310,"MIT",null,null,
5,"14B beats 70B · MIT · edge #1",5,"HF trending top 1% · RTX 3060 OK",
"https://huggingface.co/microsoft/phi-4"],
// ========== 🇰🇷 Korean sovereign AI — "Independent AI Foundation Model" project teams (as of 2026.02) ==========
["K-EXAONE","LG AI연구원","#a50034","open","korean","#c9002b","2025.12",
83.0,77.0,87.0,null,null,null,null,null,null,null,null,72,
260,16,null,null,["Image","Text"],"MoE","Reasoning(236B, K-독파모)",null,"Research",0,0,
5,"독파모 1차 1위 · AA 오픈웨이트 톱10",5,"LG 236B · GPT-OSS 120B 대비 103% · 소버린AI",
"https://huggingface.co/LGAI-EXAONE"],
["A.X K1","SK텔레콤","#e8002d","closed","korean","#ff1a1a","2025.12",
null,null,null,null,null,null,null,null,null,null,null,62,
64,16,null,null,["Text"],"MoE","Reasoning(500B, K-독파모)",null,"Apache2",0,0,
4,"국내 최대 500B · 한국어·산업 특화",4,"SKT 국내 첫 5천억 파라미터 · 소버린AI",
"https://www.sktelecom.com"],
["Solar Open 100B","업스테이지","#005baa","open","korean","#0a6fbb","2025.12",
80.4,68.1,84.3,null,null,null,74.2,null,null,null,null,68,
100,16,null,null,["Text"],"MoE","Reasoning(100B, K-독파모)",null,"Apache2",0,0,
5,"100B · AIME 84.3 · 19.7T 학습",5,"업스테이지 · 수학·코딩 특화 · 소버린AI",
"https://huggingface.co/upstage/Solar-Open-100B"],
["모티프 AI","모티프테크놀로지스","#2d6be4","closed","korean","#4285f4","2026.02",
null,null,null,null,null,null,null,null,null,null,null,55,
128,16,null,null,["Text"],"Dense","Non-Reasoning(K-독파모)",null,"Prop",0,0,
3,"패자부활전 선정 2026.02.20",3,"독파모 4번째 팀 · 벤치마크 공개 예정",
"https://motif.ai"]
];
// ========== COMPOSITE SCORE (missing = 0 penalty, mean over 10 benchmarks) ==========
/**
 * Composite score of one model row.
 *
 * Averages the ten benchmark fields at indices 7-15 and 17 (index 16 is not
 * part of the composite). A missing benchmark counts as 0, so models that
 * did not report a benchmark are penalised rather than skipped.
 *
 * @param {Array} r - model row from D
 * @returns {?number} mean rounded to one decimal, or null when the row has
 *   none of the ten benchmarks
 */
function compScore(r){
  const BENCH_IDX=[7,8,9,10,11,12,13,14,15,17];
  const scores=BENCH_IDX.map(i=>r[i]);
  const hasAny=scores.some(s=>s!==null&&s!==undefined);
  if(!hasAny)return null;
  // Missing entries contribute 0: the "benchmark not submitted" penalty.
  let total=0;
  for(const s of scores)total+=(s||0);
  return Math.round(total/10*10)/10;
}
// ========== GRADE COLOR ==========
/**
 * Map a score to its grade colour.
 *
 * @param {?number} v - raw score; null/undefined yields null
 * @param {number} [mx] - scale maximum; when falsy, v is used as a percentage as-is
 * @returns {?string} hex colour: >=90 indigo, >=75 teal, >=60 amber, else rose
 */
function gc(v,mx){
  if(v==null)return null;
  const pct=mx?(v/mx)*100:v;
  const bands=[[90,"#6366f1"],[75,"#0d9488"],[60,"#d97706"]];
  for(const [floor,color] of bands){
    if(pct>=floor)return color;
  }
  return "#e11d48";
}
// ========== SCORE CELL ==========
/**
 * Render a benchmark score as a coloured number with a progress bar.
 *
 * @param {?number} v - score; null/undefined renders the "—" placeholder
 * @param {number} [max] - scale maximum; when falsy, v is treated as a percentage
 * @param {string} [cls] - extra class appended to the wrapper
 * @returns {string} HTML fragment
 */
function scoreCell(v,max,cls){
  if(v==null)return `<span class="na">—</span>`;
  const color=gc(v,max);
  const rawPct=max?(v/max*100):v;
  // The bar never grows past the full cell width.
  const width=Math.min(rawPct,100);
  const extra=cls||'';
  return `<div class="sc ${extra}"><span class="sn" style="color:${color}">${v}</span><div class="sb"><div class="sf" style="width:${width}%;background:${color}"></div></div></div>`;
}
// ========== METACOG CELL ==========
/**
 * Render a metacognition score with its letter grade
 * (S >= 65, A >= 55, B >= 45, otherwise C).
 *
 * @param {?number} v - score; null/undefined renders the "—" placeholder
 * @returns {string} HTML fragment
 */
function metaCell(v){
  if(v==null)return `<span class="na">—</span>`;
  let color,grade;
  if(v>=65){color="#6366f1";grade="S";}
  else if(v>=55){color="#0d9488";grade="A";}
  else if(v>=45){color="#d97706";grade="B";}
  else{color="#e11d48";grade="C";}
  const width=Math.min(v,100);
  return `<div class="sc meta-col"><span class="sn" style="color:${color}">${v}<span style="font-size:7px;margin-left:1px;opacity:.7">${grade}</span></span><div class="sb"><div class="sf" style="width:${width}%;background:${color}"></div></div></div>`;
}
// ========== ARC-AGI CELL ==========
/**
 * Render an ARC-AGI-2 percentage with a blue-scale bar
 * (colour steps at 75, 40 and 20).
 *
 * @param {?number} v - percentage; null/undefined renders the "—" placeholder
 * @returns {string} HTML fragment
 */
function arcCell(v){
  if(v==null)return `<span class="na">—</span>`;
  let color="#64748b";
  if(v>=75)color="#0ea5e9";
  else if(v>=40)color="#06b6d4";
  else if(v>=20)color="#0891b2";
  const width=Math.min(v,100);
  return `<div class="sc arc-col"><span class="sn" style="color:${color}">${v}%</span><div class="sb"><div class="sf" style="width:${width}%;background:${color}"></div></div></div>`;
}
// ========== PROVIDER BADGE ==========
/**
 * Render the provider pill for a row, tinted with the provider colour.
 *
 * @param {Array} r - model row; r[1] is the provider name, r[2] its hex colour
 * @returns {string} HTML fragment
 */
function provBadge(r){
  const name=r[1];
  const color=r[2];
  // Two hex digits appended to the colour act as alpha: 1a for the fill, 40 for the border.
  const fill=`${color}1a`;
  const border=`${color}40`;
  return `<span class="prov" style="background:${fill};color:${color};border-color:${border}">${name}</span>`;
}
// ========== ARCH CELL ==========
/**
 * Render the architecture badge (r[24]) with its detail line (r[25]).
 *
 * Any type containing "MoE" (including "MoE+Hybrid") gets the at-moe class,
 * a type containing only "Hybrid" gets at-hyb, everything else at-den.
 * A row without an architecture type renders the same "—" placeholder the
 * other cell renderers use instead of throwing, and a missing detail
 * renders as an empty string rather than "null"/"undefined".
 *
 * @param {Array} r - model row
 * @returns {string} HTML fragment
 */
function archCell(r){
  const type=r[24],detail=r[25];
  if(type===null||type===undefined||type==='')return `<span class="na">—</span>`;
  const t=String(type);
  const cls=t.includes("MoE")?"at-moe":t.includes("Hybrid")?"at-hyb":"at-den";
  const d=(detail===null||detail===undefined)?'':detail;
  return `<div class="at"><span class="atb ${cls}">${t}</span><span style="font-size:7px;color:var(--text-muted);font-family:var(--font-mono)">${d}</span></div>`;
}
// ========== VISION CELL ==========
/**
 * Render the modality badges of a model.
 *
 * A missing list, an empty list, or a list holding only "Text" renders the
 * plain "Text" placeholder. Otherwise each entry becomes a badge:
 * Image/Video/Audio get short labels, anything else is shown verbatim.
 *
 * @param {?string[]} vis - modality names
 * @returns {string} HTML fragment
 */
function visCell(vis){
  const textOnly=`<span class="na">Text</span>`;
  if(!vis||vis.length===0)return textOnly;
  if(vis.length===1&&vis.includes("Text"))return textOnly;
  const known=new Map([
    ["Image",`<span class="vb vi">Img</span>`],
    ["Video",`<span class="vb vv">Vid</span>`],
    ["Audio",`<span class="vb va">Aud</span>`]
  ]);
  const badgeFor=kind=>known.has(kind)?known.get(kind):`<span class="vb vt">${kind}</span>`;
  const badges=vis.map(kind=>badgeFor(kind));
  return `<div class="vis">${badges.join('')}</div>`;
}
// ========== LICENSE ==========
/**
 * Render the licence badge.
 *
 * Known licence codes map to a badge class; unknown codes fall back to the
 * proprietary class "lp". "Llama4" shares the "ll" class with "Meta" so both
 * Meta Llama rows are styled alike. The lookup uses a Map so that a code
 * that happens to equal an Object.prototype member name cannot resolve to
 * an inherited property.
 *
 * @param {string} l - licence code from the data row ("Prop" is displayed as "Proprietary")
 * @returns {string} HTML fragment
 */
function licCell(l){
  const CLASS_BY_LICENSE=new Map([
    ["Apache2","la"],
    ["MIT","lm"],
    ["Prop","lp"],
    ["Meta","ll"],
    ["Llama4","ll"]
  ]);
  const cls=CLASS_BY_LICENSE.get(l)||'lp';
  const text=l==="Prop"?"Proprietary":l;
  return `<span class="lic ${cls}">${text}</span>`;
}
// ========== PRICE ==========
/**
 * Render the price cell from the input price (r[28]) and output price (r[29]).
 *
 * A missing input price renders the "—" placeholder, an input price of
 * exactly 0 renders the green "Free / open weights" badge, and anything
 * else renders "$in" with "out $out" beneath it.
 *
 * @param {Array} r - model row
 * @returns {string} HTML fragment
 */
function priceCell(r){
  const inPrice=r[28];
  const outPrice=r[29];
  if(inPrice==null){
    return `<span class="na">—</span>`;
  }
  if(inPrice===0){
    return `<div class="pr"><span class="pri" style="color:#16a34a">Free</span><span class="pro">open weights</span></div>`;
  }
  return `<div class="pr"><span class="pri">$${inPrice}</span><span class="pro">out $${outPrice}</span></div>`;
}
// ========== COMPOSITE DISPLAY ==========
/**
 * Render the composite score of a row as a coloured number plus bar.
 *
 * @param {Array} r - model row
 * @returns {string} HTML fragment; the "—" placeholder when compScore(r) is null
 */
function compCell(r){
  const score=compScore(r);
  if(score===null){
    return `<span class="na">—</span>`;
  }
  const color=gc(score,100);
  const width=Math.min(score,100);
  return `<div class="comp"><span class="compN" style="color:${color}">${score}</span><div class="compB"><div style="width:${width}%;height:100%;background:${color};border-radius:2px;transition:width .9s"></div></div></div>`;
}
// ========== BUILD TABLE ==========
/**
 * Rebuild the leaderboard body (#TB) from an array of model rows.
 *
 * Each row becomes a <tr> that carries data-* attributes (group, type, arch,
 * vis, val, name) read later by applyFilter/applySearch, plus one <td> per
 * column. Cells for toggleable columns carry data-col="N" so toggleCol can
 * hide them. Rows are assembled in a DocumentFragment and attached in a
 * single operation.
 *
 * The template text below is deliberately left unindented so the generated
 * markup (including whitespace between inline elements) stays the same.
 *
 * @param {Array[]} data - model rows in display order
 */
function buildTable(data){
  const tb=document.getElementById('TB');
  tb.innerHTML='';
  const frag=document.createDocumentFragment();
  data.forEach(r=>{
    // Highlight open-weight models with a value score of 4 or more.
    const isVal=r[30]>=4&&r[3]==='open';
    const tr=document.createElement('tr');
    tr.className=isVal?'hl':'';
    tr.dataset.group=r[4];
    tr.dataset.type=r[3];
    tr.dataset.arch=r[24]||'';
    tr.dataset.vis=JSON.stringify(r[23]||[]);
    tr.dataset.val=r[30]||0;
    tr.dataset.name=r[0].toLowerCase();
    tr.innerHTML=`
<td class="c-model">
<div class="mc">
<div class="mn">
<a href="${r[34]}" target="_blank" rel="noopener">${r[0]}</a>
<span class="link-icon">↗</span>
${r[4]==='korean'?'<span style="font-size:11px;background:linear-gradient(135deg,#c9002b22,#00347822);border:1px solid #c9002b44;border-radius:4px;padding:1px 4px;color:#c9002b;font-weight:700;font-family:var(--font-mono)">🇰🇷 K-AI</span>':''}
</div>
<div class="ms">
<div class="dot" style="background:${r[5]}"></div>
<span class="pb ${r[3]==='open'?'ob':'cb'}">${r[3]}</span>
<span class="mp">${r[6]}</span>
</div>
</div>
</td>
<td>${provBadge(r)}</td>
<td>${compCell(r)}</td>
<td><span class="rel" style="font-family:var(--font-mono);font-size:9px;color:var(--text-muted)">${r[6]}</span></td>
<td data-col="4">${scoreCell(r[7],100)}</td>
<td data-col="5">${scoreCell(r[8],100)}</td>
<td data-col="6">${scoreCell(r[9],100)}</td>
<td data-col="7">${scoreCell(r[10],100)}</td>
<td data-col="8" class="arc-col">${arcCell(r[11])}</td>
<td data-col="9" class="meta-col">${metaCell(r[12])}</td>
<td data-col="10">${scoreCell(r[13],100)}</td>
<td data-col="11">${scoreCell(r[14],100)}</td>
<td data-col="12">${scoreCell(r[15],100)}</td>
<td data-col="13">${scoreCell(r[16],100)}</td>
<td data-col="14" style="opacity:.75">${scoreCell(r[17],100)}</td>
<td data-col="15">${scoreCell(r[18],100)}</td>
<td data-col="16"><span class="tk">${r[19]?r[19]+'K':'—'}</span></td>
<td data-col="17"><span class="tk">${r[20]?r[20]+'K':'—'}</span></td>
<td data-col="18">${r[21]?`<span style="font-family:var(--font-mono);font-size:10px;color:#0d9488">${r[21]}</span>`:'<span class="na">—</span>'}</td>
<td data-col="19">${r[22]!==null&&r[22]!==undefined?`<span style="font-family:var(--font-mono);font-size:10px;font-weight:700;color:${r[22]<=1?'#16a34a':r[22]<=3?'#d97706':'#e11d48'}">${r[22]}s</span>`:'<span class="na">—</span>'}</td>
<td data-col="20">${visCell(r[23])}</td>
<td data-col="21">${archCell(r)}</td>
<td data-col="22">${r[26]?`<span class="eloc" style="font-family:var(--font-mono);font-size:10px;font-weight:700">${r[26]}</span>`:'<span class="na">—</span>'}</td>
<td data-col="23">${licCell(r[27])}</td>
<td data-col="24">${priceCell(r)}</td>
`;
    frag.appendChild(tr);
  });
  tb.appendChild(frag);
}
buildTable(D);
// ========== SORTING ==========
let sortDir=1,lastCol=-1;
/**
 * Sort the table by a header column and re-render it.
 *
 * Clicking the same column again flips the direction; a new column starts
 * ascending. Sortable columns: 0 (name), 2 (composite score), 3 (release),
 * 4-19 (row fields 7-22), 22 (ELO) and 24 (input price). Any other column
 * re-renders the rows in their original order.
 *
 * Rows with no value for the sorted column are always placed last, in both
 * directions, so an unknown price or latency is never ranked as the
 * cheapest/fastest (or, when reversed, as the top entry).
 *
 * After rebuilding, the active filter, search text and hidden columns are
 * re-applied because buildTable recreates every row.
 *
 * @param {number} col - header column index
 */
function srt(col){
  if(lastCol===col)sortDir*=-1; else sortDir=1;
  lastCol=col;
  const th=document.querySelectorAll('th');
  th.forEach(t=>t.classList.remove('on'));
  if(th[col])th[col].classList.add('on');

  // Header column -> index of the row field it sorts by.
  const FIELD_BY_COL={0:0,3:6,22:26,24:28};
  for(let c=4;c<=19;c++)FIELD_BY_COL[c]=c+3;
  const sortable=Object.prototype.hasOwnProperty.call(FIELD_BY_COL,col);
  const keyOf=row=>{
    if(col===2)return compScore(row);
    return sortable?row[FIELD_BY_COL[col]]:undefined;
  };
  const isMissing=v=>v===null||v===undefined;

  const arr=[...D].sort((a,b)=>{
    const va=keyOf(a),vb=keyOf(b);
    const ma=isMissing(va),mb=isMissing(vb);
    // Missing values sink to the bottom regardless of sortDir.
    if(ma||mb)return ma===mb?0:(ma?1:-1);
    if(typeof va==='string')return va.localeCompare(vb)*sortDir;
    return(va-vb)*sortDir;
  });
  buildTable(arr);
  applyFilter(currentFilter);
  applySearch(document.getElementById('searchBox').value);
  applyHiddenCols();
}
// ========== Default ordering: Composite Score, descending ==========
// Runs once at script load: records the sort state as "column 2, descending",
// renders the rows ranked by composite score (rows without a score count as 0)
// and marks the third header cell as the active sort column.
(function defaultSort(){
  sortDir=-1;
  lastCol=2;
  const scoreOf=row=>compScore(row)||0;
  const ranked=D.slice().sort((x,y)=>scoreOf(y)-scoreOf(x));
  buildTable(ranked);
  const heads=document.querySelectorAll('th');
  const compHead=heads[2];
  if(compHead){
    compHead.classList.add('on');
  }
})();
// Key of the filter currently applied to the table ('all' shows every row).
let currentFilter='all';
/**
 * Filter-button handler: remember the chosen filter, move the "on"
 * highlight to the clicked button and apply the filter to the table.
 *
 * @param {string} f - filter key understood by applyFilter
 * @param {Element} btn - the button that was clicked
 */
function flt(f,btn){
  currentFilter=f;
  for(const other of document.querySelectorAll('.fb')){
    other.classList.remove('on');
  }
  btn.classList.add('on');
  applyFilter(f);
}
/**
 * Show or hide table rows according to a filter key, then re-apply the
 * current search text.
 *
 * Keys: open, closed, qwen, gptoss, reasoning, moe, vision, value,
 * flagship, korean. Any other key (e.g. 'all') shows every row.
 *
 * The 'reasoning' filter needs the architecture *detail* string (row field
 * 25, e.g. "Reasoning(17B/397B)" vs "Non-Reasoning(675B)"). The row's
 * data-arch attribute only holds the architecture type ("Dense", "MoE", ...),
 * which never contains the word, so the detail is looked up in D by the
 * row's lower-cased name. Only details that START with "Reasoning" match,
 * which keeps "Non-Reasoning..." models out.
 *
 * @param {string} f - filter key
 */
function applyFilter(f){
  const detailByName=new Map(D.map(r=>[r[0].toLowerCase(),r[25]||'']));
  document.querySelectorAll('#TB tr').forEach(tr=>{
    const g=tr.dataset.group,tp=tr.dataset.type,arch=tr.dataset.arch||'';
    const vis=JSON.parse(tr.dataset.vis||'[]'),val=parseInt(tr.dataset.val||0,10);
    let show=true;
    switch(f){
      case 'open':show=tp==='open';break;
      case 'closed':show=tp==='closed';break;
      case 'qwen':show=g==='qwen';break;
      case 'gptoss':show=g==='gptoss';break;
      case 'reasoning':show=/^reasoning/i.test(detailByName.get(tr.dataset.name)||'');break;
      case 'moe':show=arch.toLowerCase().includes('moe');break;
      case 'vision':show=vis.some(v=>['Image','Video','Audio'].includes(v));break;
      case 'value':show=val>=4&&tp==='open';break;
      case 'flagship':show=g==='flagship';break;
      case 'korean':show=g==='korean';break;
      default:show=true;
    }
    tr.classList.toggle('hidden',!show);
  });
  applySearch(document.getElementById('searchBox').value);
}
// ========== SEARCH ==========
/**
 * Search entry point; forwards the query to applySearch.
 *
 * @param {string} q - raw query text
 */
function doSearch(q){
  applySearch(q);
}
/**
 * Hide every table row whose model name does not contain the query.
 *
 * Matching is a case-insensitive substring test against the row's
 * data-name attribute. An empty (or whitespace-only) query un-hides all
 * rows. Only the "search-hidden" class is touched.
 *
 * @param {string} q - raw query text
 */
function applySearch(q){
  const needle=q.toLowerCase().trim();
  for(const row of document.querySelectorAll('#TB tr')){
    const name=row.dataset.name||'';
    const miss=needle!==''&&!name.includes(needle);
    row.classList.toggle('search-hidden',miss);
  }
}
// ========== COLUMN TOGGLE ==========
// Toggleable columns: data-col index -> label shown in the column menu.
const colLabels={4:"MMLU-Pro",5:"GPQA◆",6:"AIME25",7:"HLE",8:"ARC-AGI-2",9:"Metacog",10:"SWE-Pro",11:"BFCL",12:"IFEval",13:"LCB",14:"SWE-V",15:"MMMLU",16:"CtxIn",17:"CtxOut",18:"tok/s",19:"TTFT",20:"Vision",21:"Arch",22:"ELO",23:"License",24:"$/M"};
// Column indices (numbers) that are currently hidden.
const hiddenCols=new Set();
/**
 * Fill #colMenu with one checked checkbox per entry of colLabels.
 *
 * The checkboxes are built with DOM calls and wired with addEventListener
 * instead of a string-built inline onchange attribute, so no JavaScript is
 * assembled as text and the menu works under a Content-Security-Policy
 * that forbids inline handlers. toggleCol still receives the column index
 * as a number, as it did with the inline handler.
 */
function buildColMenu(){
  const menu=document.getElementById('colMenu');
  menu.innerHTML='';
  Object.entries(colLabels).forEach(([ci,label])=>{
    const col=Number(ci);
    const box=document.createElement('input');
    box.type='checkbox';
    box.defaultChecked=true;
    box.addEventListener('change',()=>toggleCol(col,box.checked));
    const d=document.createElement('label');
    d.className='col-chk';
    d.append(box,' '+label);
    menu.appendChild(d);
  });
}
buildColMenu();
/** Open the column menu if it is closed, close it if it is open. */
function toggleColMenu(){
  document.getElementById('colMenu').classList.toggle('open');
}
// Close the column menu on any click that lands outside .col-toggle-wrap.
document.addEventListener('click',function(evt){
  const insideToggle=evt.target.closest('.col-toggle-wrap');
  if(insideToggle)return;
  document.getElementById('colMenu').classList.remove('open');
});
/**
 * Show or hide one table column and record the choice in hiddenCols.
 *
 * Every element carrying data-col="ci" gets its inline display reset
 * (shown) or set to "none" (hidden); the matching header cell is handled
 * explicitly as well.
 *
 * @param {number} ci - column index as used in data-col
 * @param {boolean} show - true to show the column, false to hide it
 */
function toggleCol(ci,show){
  const display=show?'':'none';
  for(const cell of document.querySelectorAll(`[data-col="${ci}"]`)){
    cell.style.display=display;
  }
  const head=document.querySelector(`th[data-col="${ci}"]`);
  if(head){
    head.style.display=display;
  }
  if(show){
    hiddenCols.delete(ci);
  }else{
    hiddenCols.add(ci);
  }
}
/** Re-hide every column recorded in hiddenCols (used after the table is rebuilt). */
function applyHiddenCols(){
  for(const ci of hiddenCols){
    toggleCol(ci,false);
  }
}
// ========== TABS ==========
/**
 * Activate one tab pane and its tab button.
 *
 * Removes the "on" class from all .tpane and .tab elements, then adds it to
 * the pane with the given id and to the clicked tab. The first time the
 * "charts" pane is shown, the charts are created via initCharts.
 *
 * @param {string} id - id of the pane to show
 * @param {Element} el - the tab element that was clicked
 */
function showTab(id,el){
  const deactivate=node=>{node.classList.remove('on');};
  document.querySelectorAll('.tpane').forEach(deactivate);
  document.querySelectorAll('.tab').forEach(deactivate);
  document.getElementById(id).classList.add('on');
  el.classList.add('on');
  if(id!=='charts')return;
  if(!chartsInit)initCharts();
}
// Set to true by initCharts so the charts tab is only initialised once.
let chartsInit=false;
// Draw the vertical ranking chart as soon as the window has loaded.
window.addEventListener('load',function(){
  initVertRank();
});
// ========== VERTICAL RANKING CHART (always shown in tab1) ==========
/**
 * Draw the composite-score ranking as vertical bars on canvas #cVertRank
 * and fill #vrankLegend with one colour chip per provider.
 *
 * Models are ordered by composite score, highest first; models without a
 * score go last and are drawn as a short grey stub labelled "N/A". Above
 * each bar sit the score and the benchmark coverage ("n/10": how many of
 * the ten composite benchmarks the model reports); below it the rank and
 * the (truncated, rotated) model name.
 *
 * Does nothing when the canvas is not in the document.
 */
function initVertRank(){
  const canvas=document.getElementById('cVertRank');
  if(!canvas)return;
  // Row indices of the ten benchmarks that make up the composite score.
  const BENCH_KEYS=[7,8,9,10,11,12,13,14,15,17];
  const sorted=D.map(r=>({
    n:r[0],s:compScore(r),c:pColors[r[1]]||'#6366f1',prov:r[1],
    cov:BENCH_KEYS.filter(k=>r[k]!==null&&r[k]!==undefined).length
  })).sort((a,b)=>{
    const sa=a.s??-1, sb=b.s??-1;
    return sb-sa;
  });
  const H=200;
  const W=Math.max(sorted.length*52+60,1100);
  canvas.width=W; canvas.height=H;
  const ctx=canvas.getContext('2d');
  const PAD_L=40,PAD_R=20,PAD_T=16,PAD_B=60;
  const chartW=W-PAD_L-PAD_R,chartH=H-PAD_T-PAD_B;
  const barW=Math.min(40,chartW/sorted.length-8);
  const minS=0; // zero baseline, so bar heights compare honestly
  const maxS=Math.max(...sorted.map(x=>x.s??0));
  // Guard the scale: with no rows or an all-zero maximum the original
  // (maxS-minS) divisor would be 0 (or -Infinity) and every y would be NaN.
  const span=(maxS-minS)>0?(maxS-minS):1;
  // Grid lines
  [0,20,40,60,80].forEach(v=>{
    const y=PAD_T+chartH-(v-minS)/span*chartH;
    if(y<PAD_T||y>PAD_T+chartH)return;
    ctx.beginPath();ctx.strokeStyle=v===0?'rgba(15,23,42,.15)':'rgba(15,23,42,.05)';
    ctx.lineWidth=v===0?1.5:.7;
    ctx.moveTo(PAD_L,y);ctx.lineTo(W-PAD_R,y);ctx.stroke();
    ctx.font='600 8px JetBrains Mono';ctx.fillStyle='#94a3b8';ctx.textAlign='right';
    ctx.fillText(v,PAD_L-4,y+3);
  });
  const gap=(chartW-(barW*sorted.length))/(sorted.length+1);
  const canRound=typeof ctx.roundRect==='function';
  sorted.forEach((d,i)=>{
    const x=PAD_L+gap*(i+1)+barW*i;
    const isNull=d.s===null||d.s===undefined;
    const score=isNull?0:d.s;
    // Every bar is at least 5px tall so it stays visible.
    const barH=isNull?5:Math.max((score-minS)/span*chartH,5);
    const y=PAD_T+chartH-barH;
    const rank=i+1;
    // Bar gradient
    const grad=ctx.createLinearGradient(0,y,0,PAD_T+chartH);
    grad.addColorStop(0,isNull?'#cbd5e1':d.c+'ff');
    grad.addColorStop(1,isNull?'#e2e8f0':d.c+'88');
    ctx.fillStyle=grad;
    ctx.beginPath();
    // roundRect is missing in older browsers; fall back to square corners
    // instead of aborting the whole chart with a TypeError.
    if(canRound)ctx.roundRect(x,y,barW,barH,4);
    else ctx.rect(x,y,barW,barH);
    ctx.fill();
    // Score label on top
    ctx.font='700 9px JetBrains Mono';ctx.fillStyle=isNull?'#94a3b8':d.c;ctx.textAlign='center';
    ctx.fillText(isNull?'N/A':d.s,x+barW/2,y-12);
    // Coverage badge (n/10)
    ctx.font='500 7px JetBrains Mono';ctx.fillStyle='#94a3b8';ctx.textAlign='center';
    ctx.fillText(d.cov+'/10',x+barW/2,y-3);
    // Rank badge
    ctx.fillStyle=d.c+'22';
    ctx.fillRect(x,PAD_T+chartH+2,barW,14);
    ctx.font='700 7px JetBrains Mono';ctx.fillStyle=d.c;ctx.textAlign='center';
    ctx.fillText('#'+rank,x+barW/2,PAD_T+chartH+11);
    // Model name (angled)
    ctx.save();
    ctx.translate(x+barW/2,PAD_T+chartH+22);
    ctx.rotate(-Math.PI/4.5);
    ctx.font='600 8px Sora,sans-serif';
    ctx.fillStyle='#475569';ctx.textAlign='right';
    const shortN=d.n.length>14?d.n.substring(0,13)+'…':d.n;
    ctx.fillText(shortN,0,0);
    ctx.restore();
  });
  // Legend
  const provs=[...new Set(D.map(r=>r[1]))];
  const leg=document.getElementById('vrankLegend');
  if(leg){
    leg.innerHTML=provs.map(p=>`<div class="vrl"><div class="vrl-dot" style="background:${pColors[p]||'#6366f1'}"></div>${p}</div>`).join('');
  }
}
// ========== CHART COLORS ==========
// Provider name -> hex colour used by the charts and the ranking legend.
// Keys must match the provider string stored at row index 1 of D; callers
// fall back to a default colour when a provider has no entry here.
const pColors={
"OpenAI":"#10a37f","Anthropic":"#d97706","Google":"#4285f4",
"xAI":"#1d9bf0","Alibaba":"#f97316","DeepSeek":"#6366f1",
"Moonshot":"#8b5cf6","Zhipu AI":"#14b8a6","Meta":"#0081fb","Mistral":"#ff7043",
"Microsoft":"#00a4ef",
"LG AI연구원":"#c9002b","SK텔레콤":"#e8002d","업스테이지":"#005baa","모티프테크놀로지스":"#2d6be4"
};
// Grid-line colour shared by the Chart.js charts.
const gridC='rgba(15,23,42,.06)';
// Tick/label colour shared by the Chart.js charts.
const tickC='#94a3b8';
function initCharts(){
chartsInit=true;
// 1. ARC-AGI-2 VERTICAL BAR
const arcData=D.filter(r=>r[11]!==null).map(r=>({n:r[0],v:r[11],c:pColors[r[1]]||'#6366f1'})).sort((a,b)=>b.v-a.v);
new Chart(document.getElementById('cArc'),{
type:'bar',
data:{labels:arcData.map(x=>x.n.length>10?x.n.substr(0,9)+'…':x.n),datasets:[{
label:'ARC-AGI-2 (%)',data:arcData.map(x=>x.v),
backgroundColor:arcData.map(x=>x.c+'bb'),borderColor:arcData.map(x=>x.c),
borderWidth:1.5,borderRadius:5,borderSkipped:false
}]},
options:{plugins:{legend:{display:false},tooltip:{callbacks:{label:c=>`ARC-AGI-2: ${c.raw}% — ${arcData[c.dataIndex].n}`}}},
scales:{y:{min:0,max:100,grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9}}},
x:{grid:{display:false},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8},maxRotation:35}}}}
});
// 2. METACOG BASELINE vs METACOG (GROUPED VERTICAL)
const metaFull=[
{n:"Kimi K2.5",prov:"Moonshot",base:68.71,meta:78.54},
{n:"GPT-5.2",prov:"OpenAI",base:62.76,meta:75.5},
{n:"GLM-5",prov:"Zhipu AI",base:62.50,meta:75.0},
{n:"Gemini 3.1 Pro",prov:"Google",base:59.5,meta:77.08},
{n:"Claude Opus 4.6",prov:"Anthropic",base:56.04,meta:76.17},
].sort((a,b)=>b.base-a.base);
new Chart(document.getElementById('cMetaDelta'),{
type:'bar',
data:{
labels:metaFull.map(x=>x.n.length>12?x.n.substr(0,11)+'…':x.n),
datasets:[
{label:'Baseline Score',data:metaFull.map(x=>x.base),backgroundColor:metaFull.map(x=>pColors[x.prov]+'88'),borderColor:metaFull.map(x=>pColors[x.prov]),borderWidth:1.5,borderRadius:4},
{label:'MetaCog (self-corrected)',data:metaFull.map(x=>x.meta),backgroundColor:metaFull.map(x=>pColors[x.prov]+'33'),borderColor:metaFull.map(x=>pColors[x.prov]),borderWidth:2,borderRadius:4,borderDash:[4,2]}
]
},
options:{
plugins:{legend:{labels:{color:tickC,font:{family:'JetBrains Mono',size:8},boxWidth:10}},
tooltip:{callbacks:{afterBody:items=>{
if(items[0]){const i=items[0].dataIndex;return[` Δ gain: +${(metaFull[i].meta-metaFull[i].base).toFixed(2)}`];}
}}}
},
scales:{y:{min:45,max:85,grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9}}},
x:{grid:{display:false},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8}}}}
}
});
// 3. RADAR TOP 6
const top6=["Claude Opus 4.6","GPT-5.2","Gemini 3.1 Pro","Kimi K2.5","Qwen3.5-397B","DeepSeek V3.2"];
const rColors=top6.map(n=>{const r=D.find(d=>d[0]===n);return r?pColors[r[1]]||'#6366f1':'#6366f1';});
const top6data=top6.map(n=>D.find(r=>r[0]===n));
const radarDatasets=top6data.map((r,i)=>({
label:r[0],
data:[r[7]||0,r[8]||0,r[9]||0,r[10]||0,Math.min((r[11]||0)*1.1,100),r[18]||0],
borderColor:rColors[i],backgroundColor:rColors[i]+'20',borderWidth:1.5,pointRadius:2.5,pointBackgroundColor:rColors[i]
}));
new Chart(document.getElementById('cRadar'),{
type:'radar',
data:{labels:['MMLU-Pro','GPQA◆','AIME25','HLE','ARC-AGI-2','MMMLU'],datasets:radarDatasets},
options:{plugins:{legend:{labels:{color:tickC,font:{family:'JetBrains Mono',size:7.5},boxWidth:10,padding:6}}},
scales:{r:{grid:{color:gridC},angleLines:{color:gridC},ticks:{display:false},
pointLabels:{color:tickC,font:{family:'JetBrains Mono',size:8.5}},suggestedMin:0,suggestedMax:100}}}
});
// 4. CAPABILITY DOMAIN BREAKDOWN
const domModels=["GPT-5.2","Claude Opus 4.6","Gemini 3.1 Pro","Kimi K2.5","Qwen3.5-397B","DeepSeek R1","GLM-5","Grok 4 Heavy"];
const domData=domModels.map(n=>{
const r=D.find(d=>d[0]===n);if(!r)return null;
const reasoning=[r[8],r[9],r[10]].filter(x=>x!==null);
const coding=[r[13],r[16]].filter(x=>x!==null);
const language=[r[7],r[18],r[15]].filter(x=>x!==null);
return{
n:n.length>12?n.substr(0,11)+'…':n,
c:pColors[r[1]]||'#6366f1',
reasoning:reasoning.length?Math.round(reasoning.reduce((a,b)=>a+b)/reasoning.length*10)/10:null,
coding:coding.length?Math.round(coding.reduce((a,b)=>a+b)/coding.length*10)/10:null,
language:language.length?Math.round(language.reduce((a,b)=>a+b)/language.length*10)/10:null
};
}).filter(Boolean);
new Chart(document.getElementById('cDomain'),{
type:'bar',
data:{
labels:domData.map(x=>x.n),
datasets:[
{label:'Reasoning (GPQA+AIME+HLE)',data:domData.map(x=>x.reasoning),backgroundColor:'rgba(99,102,241,.7)',borderColor:'#6366f1',borderWidth:1.5,borderRadius:3},
{label:'Coding (SWE-Pro+LCB)',data:domData.map(x=>x.coding),backgroundColor:'rgba(13,148,136,.7)',borderColor:'#0d9488',borderWidth:1.5,borderRadius:3},
{label:'Language (MMLU+MMMLU+IFEval)',data:domData.map(x=>x.language),backgroundColor:'rgba(217,119,6,.7)',borderColor:'#d97706',borderWidth:1.5,borderRadius:3}
]
},
options:{plugins:{legend:{labels:{color:tickC,font:{family:'JetBrains Mono',size:8},boxWidth:10,padding:5}}},
scales:{y:{min:0,max:100,grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9}}},
x:{grid:{display:false},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8},maxRotation:30}}}}
});
// 5. PERF vs COST SCATTER
const scData=D.filter(r=>r[28]!==null&&r[28]!==undefined&&compScore(r)!==null).map(r=>({
n:r[0],x:r[28]===0?0.01:r[28],y:compScore(r),c:pColors[r[1]]||'#6366f1',prov:r[1]
}));
new Chart(document.getElementById('cScatter'),{
type:'scatter',
data:{datasets:[{
data:scData.map(x=>({x:x.x,y:x.y})),
backgroundColor:scData.map(x=>x.c+'cc'),borderColor:scData.map(x=>x.c),
pointRadius:scData.map((x,i)=>i<3?9:7),pointHoverRadius:11,borderWidth:1.5
}]},
options:{
plugins:{legend:{display:false},tooltip:{callbacks:{label:ctx=>{
const d=scData[ctx.dataIndex];return[`${d.n}`,`Score: ${d.y}`,`Price: $${d.x}/M`];
}}}},
scales:{
x:{title:{display:true,text:'Input Price ($/M tokens) — log scale',color:tickC,font:{size:8.5,family:'JetBrains Mono'}},type:'logarithmic',
grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8.5}}},
y:{title:{display:true,text:'Composite Score',color:tickC,font:{size:8.5,family:'JetBrains Mono'}},min:40,
grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8.5}}}
}
}
});
// 6. PROVIDER AVERAGE SCORE
const provGroups={};
D.forEach(r=>{const cs=compScore(r);if(cs&&r[1]){if(!provGroups[r[1]])provGroups[r[1]]=[];provGroups[r[1]].push(cs);}});
const provAvg=Object.entries(provGroups).map(([p,arr])=>({
p,avg:Math.round(arr.reduce((a,b)=>a+b)/arr.length*10)/10,
cnt:arr.length,c:pColors[p]||'#6366f1',max:Math.max(...arr),min:Math.min(...arr)
})).sort((a,b)=>b.avg-a.avg);
new Chart(document.getElementById('cProvider'),{
type:'bar',
data:{
labels:provAvg.map(x=>x.p),
datasets:[
{label:'Avg Score',data:provAvg.map(x=>x.avg),backgroundColor:provAvg.map(x=>x.c+'bb'),borderColor:provAvg.map(x=>x.c),borderWidth:1.5,borderRadius:5,borderSkipped:false},
{label:'Best Model',data:provAvg.map(x=>x.max),type:'line',borderColor:provAvg.map(x=>x.c),pointBackgroundColor:provAvg.map(x=>x.c),pointRadius:5,fill:false,tension:.3,borderWidth:2}
]
},
options:{plugins:{legend:{labels:{color:tickC,font:{family:'JetBrains Mono',size:8},boxWidth:10}}},
scales:{y:{min:40,grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9}}},
x:{grid:{display:false},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9},maxRotation:30}}}}
});
// 7. INTELLIGENCE TIMELINE BUBBLE
// Bubble chart of composite score (y) against release month (x). The x value
// counts months with Jan 2025 = 1; bubble radius grows with log10 of r[19]
// (rows without r[19] are sized as if it were 100).
// Release months that always get a tick label, whether or not a model uses them.
const dateMap={"2025.01":1,"2025.04":4,"2025.11":11,"2025.12":12,"2026.01":13,"2026.02":14};
// Converts a release string into the month index used on the x axis.
// Known keys come straight from dateMap; any other "YYYY.MM" string is parsed
// so new releases no longer collapse onto Jan 2025. Values that cannot be
// parsed keep the previous fallback of 1.
// NOTE(review): assumes r[6] is a "YYYY.MM" string, as the dateMap keys suggest — confirm.
const tlMonthIndex=d=>{
  if(dateMap[d])return dateMap[d];
  const m=/^(\d{4})\.(\d{1,2})$/.exec(String(d));
  if(!m)return 1;
  const mo=Number(m[2]);
  if(mo<1||mo>12)return 1;
  return (Number(m[1])-2025)*12+mo;
};
const tlData=[];
D.forEach(r=>{
  // Score each row once; rows without a composite score are not plotted.
  const cs=compScore(r);
  if(cs===null)return;
  tlData.push({
    n:r[0],x:tlMonthIndex(r[6]),y:cs,c:pColors[r[1]]||'#6366f1',
    r:Math.log10((r[19]||100)*1000+1)*4+4
  });
});
// Axis range: at least the original 0–15 window, widened by one month of
// padding when a model falls outside it.
const tlXs=tlData.map(p=>p.x);
const tlMinX=Math.min(0,...tlXs.map(x=>x-1));
const tlMaxX=Math.max(15,...tlXs.map(x=>x+1));
// Only months that are known or actually used get a label, e.g. 13 -> "Jan 26".
const tlLabelled=new Set([...Object.values(dateMap),...tlXs]);
const tlMonthNames=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'];
const tlTickLabel=v=>{
  if(!tlLabelled.has(v))return'';
  const i=v-1; // zero-based months since Jan 2025
  const yr=2025+Math.floor(i/12);
  return `${tlMonthNames[((i%12)+12)%12]} ${String(yr).slice(-2)}`;
};
new Chart(document.getElementById('cTimeline'),{
  type:'bubble',
  data:{datasets:[{
    data:tlData.map(x=>({x:x.x,y:x.y,r:x.r})),
    backgroundColor:tlData.map(x=>x.c+'88'),borderColor:tlData.map(x=>x.c),borderWidth:1.5
  }]},
  options:{
    plugins:{legend:{display:false},tooltip:{callbacks:{label:ctx=>{
      // Two-line tooltip: model name, then its score.
      const d=tlData[ctx.dataIndex];return[d.n,`Score: ${d.y}`];
    }}}},
    scales:{
      x:{title:{display:true,text:'Release Timeline (months from Jan 2025)',color:tickC,font:{size:8.5,family:'JetBrains Mono'}},min:tlMinX,max:tlMaxX,
        grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8.5},callback:tlTickLabel}},
      y:{title:{display:true,text:'Composite Score',color:tickC,font:{size:8.5,family:'JetBrains Mono'}},min:40,
        grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:8.5}}}
    }
  }
});
// 8. OPEN vs CLOSED DISTRIBUTION
// Strip plot of composite scores in two columns: rows with r[3]==='open' are
// centred on x=0.5 and rows with r[3]==='closed' on x=1.5. Each point gets a
// random horizontal jitter so equal scores do not sit on top of each other.
const allOpen=D.filter(r=>r[3]==='open'&&compScore(r)!==null).map(r=>({n:r[0],s:compScore(r),c:pColors[r[1]]||'#16a34a'}));
const allClosed=D.filter(r=>r[3]==='closed'&&compScore(r)!==null).map(r=>({n:r[0],s:compScore(r),c:pColors[r[1]]||'#6366f1'}));
// Ascending score lists, derived from the rows filtered above.
// NOTE(review): not read by this chart — kept in case other code in the
// script relies on them; confirm before removing.
const openScores=allOpen.map(x=>x.s).sort((a,b)=>a-b);
const closedScores=allClosed.map(x=>x.s).sort((a,b)=>a-b);
new Chart(document.getElementById('cOpenClosed'),{
  type:'scatter',
  data:{datasets:[
    {label:'Open-weight',data:allOpen.map(x=>({x:0.2+Math.random()*.6,y:x.s})),
      backgroundColor:allOpen.map(x=>x.c+'cc'),pointRadius:7,borderWidth:1.5,borderColor:allOpen.map(x=>x.c)},
    {label:'Closed API',data:allClosed.map(x=>({x:1.2+Math.random()*.6,y:x.s})),
      backgroundColor:allClosed.map(x=>x.c+'cc'),pointRadius:8,borderWidth:1.5,borderColor:allClosed.map(x=>x.c)}
  ]},
  options:{
    plugins:{
      legend:{labels:{color:tickC,font:{family:'JetBrains Mono',size:9},boxWidth:12}},
      tooltip:{callbacks:{label:ctx=>{
        // Dataset 0 is the open-weight column, dataset 1 the closed one.
        const arr=ctx.datasetIndex===0?allOpen:allClosed;
        return arr[ctx.dataIndex]?`${arr[ctx.dataIndex].n}: ${arr[ctx.dataIndex].s}`:'';
      }}}
    },
    scales:{
      // stepSize pins the ticks to 0, 0.5, 1, 1.5, 2. Without it the automatic
      // step depends on the chart width and may never produce 0.5 / 1.5, in
      // which case the two column labels below would silently disappear.
      // autoSkip is off so the labelled ticks are never thinned out.
      x:{min:0,max:2,grid:{display:false},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9},stepSize:0.5,autoSkip:false,
        callback:v=>v===0.5?'Open-weight':v===1.5?'Closed API':''}},
      y:{min:40,max:100,grid:{color:gridC},ticks:{color:tickC,font:{family:'JetBrains Mono',size:9}},title:{display:true,text:'Composite Score',color:tickC,font:{size:8.5,family:'JetBrains Mono'}}}
    }
  }
});
// 9. BENCHMARK VARIANCE (min/max/mean)
// For every benchmark column of D, collects the values that are present and
// charts their minimum, mean and maximum side by side.
// k = column index in a D row, l = label shown on the x axis.
const benchDefs=[
  {k:7,l:'MMLU-Pro'},{k:8,l:'GPQA◆'},{k:9,l:'AIME25'},{k:10,l:'HLE'},
  {k:11,l:'ARC-AGI-2'},{k:12,l:'Metacog'},{k:13,l:'SWE-Pro'},
  {k:14,l:'BFCL'},{k:15,l:'IFEval'},{k:17,l:'SWE-V'},{k:18,l:'MMMLU'}
];
const varData=benchDefs.reduce((out,{k,l})=>{
  // Keep only the cells that hold a value (neither null nor undefined).
  const present=[];
  for(const row of D){
    const cell=row[k];
    if(cell!=null)present.push(cell);
  }
  // A benchmark with no values at all is left off the chart.
  if(present.length===0)return out;
  const oneDecimal=n=>Math.round(n*10)/10;
  const mn=oneDecimal(Math.min(...present));
  const mx=oneDecimal(Math.max(...present));
  let total=present[0];
  for(let i=1;i<present.length;i++)total+=present[i];
  out.push({l,mn,mx,avg:oneDecimal(total/present.length),range:mx-mn});
  return out;
},[]);
new Chart(document.getElementById('cVariance'),{
  type:'bar',
  data:{
    labels:varData.map(x=>x.l),
    // One bar series per statistic: [legend label, varData field, fill, outline].
    datasets:[
      ['Min','mn','rgba(225,29,72,.55)','#e11d48'],
      ['Mean','avg','rgba(99,102,241,.65)','#6366f1'],
      ['Max','mx','rgba(13,148,136,.55)','#0d9488']
    ].map(([label,field,backgroundColor,borderColor])=>({
      label,
      data:varData.map(x=>x[field]),
      backgroundColor,
      borderColor,
      borderWidth:1.2,
      borderRadius:2
    }))
  },
  options:{
    plugins:{
      legend:{labels:{color:tickC,font:{family:'JetBrains Mono',size:8},boxWidth:10}}
    },
    scales:{
      y:{
        min:0,max:100,
        grid:{color:gridC},
        ticks:{color:tickC,font:{family:'JetBrains Mono',size:8.5}}
      },
      x:{
        grid:{display:false},
        ticks:{color:tickC,font:{family:'JetBrains Mono',size:8},maxRotation:30}
      }
    }
  }
});
// 10. HEATMAP — full width
// Hand-drawn canvas grid: one row per entry of D, one column per benchmark.
// A cell's opacity scales with its score (capped at 100); missing scores are
// drawn as a pale cell with a dash.
const heatCols=['MMLU-P','GPQA','AIME25','HLE','ARC-AGI-2','Metacog','SWE-Pro','BFCL','IFEval','SWE-V','MMMLU'];
// Column index in a D row for each entry of heatCols (same order).
const heatKeys=[7,8,9,10,11,12,13,14,15,17,18];
const canvas=document.getElementById('cHeat');
const parentCard=canvas.closest('.chart-card');
const cardPad=36; // 18px padding × 2
const nRows=D.length; // one row per model
const hH=34; // header height
const bH=38; // row height — taller for readability
const mW=130; // model name column width
// Use the parent card's inner width. When the card is missing, or reports no
// usable width (clientWidth is 0 for an element with no layout box), fall back
// to the viewport so the canvas never gets a zero or negative width.
const heatCardW=parentCard?parentCard.clientWidth-cardPad:0;
const cW=heatCardW>mW?heatCardW:(window.innerWidth-80);
const totalH=hH+nRows*bH+10;
// Size the backing store in device pixels and the element in CSS pixels, then
// scale the context, so the drawing stays sharp on high-density displays while
// every coordinate below remains in CSS pixels.
const heatDpr=window.devicePixelRatio||1;
canvas.width=Math.round(cW*heatDpr);
canvas.height=Math.round(totalH*heatDpr);
canvas.style.width=cW+'px';
canvas.style.height=totalH+'px';
const ctx2=canvas.getContext('2d');
ctx2.setTransform(heatDpr,0,0,heatDpr,0,0);
ctx2.clearRect(0,0,cW,totalH);
const bW=(cW-mW)/heatCols.length; // width of one benchmark column
// Score font shrinks with the column width; it is the same for every cell.
const heatCellFont=`700 ${bW>50?10:bW>38?9:8}px JetBrains Mono`;
// Background alternating rows
D.forEach((r,i)=>{
  const y=hH+i*bH;
  ctx2.fillStyle=i%2===0?'rgba(248,249,252,0.8)':'rgba(255,255,255,0.6)';
  ctx2.fillRect(0,y,cW,bH);
});
// Column headers
heatCols.forEach((h,j)=>{
  const x=mW+(j+0.5)*bW;
  // Column bg stripe
  ctx2.fillStyle=j%2===0?'rgba(99,102,241,.04)':'rgba(99,102,241,.01)';
  ctx2.fillRect(mW+j*bW,0,bW,totalH);
  // Header text, rotated 30° counter-clockwise around the column centre
  ctx2.save();
  ctx2.translate(x,hH-6);
  ctx2.rotate(-Math.PI/6);
  ctx2.font='700 9px JetBrains Mono';
  ctx2.fillStyle='#6366f1';
  ctx2.textAlign='right';
  ctx2.fillText(h,0,0);
  ctx2.restore();
});
// Vertical grid lines
ctx2.strokeStyle='rgba(226,229,240,0.8)';ctx2.lineWidth=1;
for(let j=0;j<=heatCols.length;j++){
  ctx2.beginPath();ctx2.moveTo(mW+j*bW,0);ctx2.lineTo(mW+j*bW,totalH);ctx2.stroke();
}
// Horizontal grid lines
for(let i=0;i<=nRows;i++){
  const y=hH+i*bH;
  ctx2.strokeStyle='rgba(226,229,240,0.6)';ctx2.lineWidth=0.8;
  ctx2.beginPath();ctx2.moveTo(0,y);ctx2.lineTo(cW,y);ctx2.stroke();
}
// Rows: provider bar, row number, model name, then one cell per benchmark
D.forEach((r,i)=>{
  const y=hH+i*bH;
  // Provider color bar on left edge
  ctx2.fillStyle=pColors[r[1]]||'#6366f1';
  ctx2.fillRect(0,y+1,4,bH-2);
  // Row number
  ctx2.font='600 8px JetBrains Mono';ctx2.fillStyle='#94a3b8';ctx2.textAlign='center';
  ctx2.fillText(i+1,14,y+bH/2+3);
  // Model name, cut to 16 characters plus an ellipsis when longer than 17
  ctx2.font='600 10px Sora,sans-serif';ctx2.fillStyle='#0f172a';ctx2.textAlign='left';
  const nm=r[0].length>17?r[0].slice(0,16)+'…':r[0];
  ctx2.fillText(nm,22,y+bH/2+3);
  heatKeys.forEach((ki,j)=>{
    const v=r[ki];
    const cx=mW+j*bW;
    if(v!==null&&v!==undefined){
      // Map the score to an opacity between 0.07 and 0.92
      const norm=Math.min(v/100,1);
      const alpha=0.07+norm*0.85;
      // Cell fill
      ctx2.fillStyle=`rgba(99,102,241,${alpha})`;
      ctx2.fillRect(cx+1,y+2,bW-2,bH-4);
      // Score text — darker ink once the cell itself is dark
      ctx2.font=heatCellFont;
      ctx2.fillStyle=alpha>0.52?'#3730a3':'#475569';
      ctx2.textAlign='center';
      ctx2.fillText(v,cx+bW/2,y+bH/2+3.5);
    } else {
      // No score for this benchmark: pale cell with a dash
      ctx2.fillStyle='rgba(241,245,249,0.9)';
      ctx2.fillRect(cx+1,y+2,bW-2,bH-4);
      ctx2.font='8px JetBrains Mono';
      ctx2.fillStyle='#cbd5e1';ctx2.textAlign='center';
      ctx2.fillText('—',cx+bW/2,y+bH/2+3);
    }
  });
});
}
</script>
</body>
</html>