Spaces:
Paused
Paused
CrispStrobe committed on
Commit ·
cf1221a
1
Parent(s): 388998e
feat: integrate Artificial Analysis API and correct Mistral family sizes
Browse files- scripts/fetch-benchmarks.js +132 -291
- scripts/fetch-providers.js +10 -3
- src/App.tsx +16 -1
scripts/fetch-benchmarks.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
'use strict';
|
| 2 |
|
| 3 |
/**
|
| 4 |
-
* Fetch benchmark data from
|
| 5 |
*
|
| 6 |
* Sources:
|
| 7 |
* 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
|
|
@@ -9,6 +9,7 @@
|
|
| 9 |
* 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
|
| 10 |
* 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
|
| 11 |
* 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
|
|
|
|
| 12 |
*
|
| 13 |
* Unified field names (0-1 scale unless noted):
|
| 14 |
* mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
|
|
@@ -18,17 +19,16 @@
|
|
| 18 |
* lb_math, lb_language, lb_if, lb_data_analysis
|
| 19 |
* arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500)
|
| 20 |
* aider_pass_rate (Aider edit bench, 0-1)
|
|
|
|
|
|
|
| 21 |
*
|
| 22 |
-
* Where
|
| 23 |
* LLMStats takes priority (it stores self-reported model-card values).
|
| 24 |
*
|
| 25 |
* Usage:
|
| 26 |
* node scripts/fetch-benchmarks.js # fetch all sources
|
|
|
|
| 27 |
* node scripts/fetch-benchmarks.js livebench # refresh LiveBench only
|
| 28 |
-
* node scripts/fetch-benchmarks.js arena # refresh Chatbot Arena only
|
| 29 |
-
* node scripts/fetch-benchmarks.js aider # refresh Aider only
|
| 30 |
-
* node scripts/fetch-benchmarks.js hf # refresh HF Leaderboard only
|
| 31 |
-
* node scripts/fetch-benchmarks.js llmstats # refresh LLMStats only
|
| 32 |
*/
|
| 33 |
|
| 34 |
const fs = require('fs');
|
|
@@ -170,18 +170,15 @@ async function fetchHFLeaderboard() {
|
|
| 170 |
// GitHub tree listing of the LiveBench site repository, and the site base URL
// that is prefixed to the per-release file paths.
const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
const LB_BASE_URL = 'https://livebench.ai';

// Trailing variant markers that LiveBench appends to model names and that
// provider model names do not carry. Removing them yields a "base" name that
// can be used for matching.
const LB_SUFFIX_RE = new RegExp(
  [
    '(',
    // "-thinking-" followed by optional auto / token-budget / effort parts
    '-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?',
    // plain "-thinking", optionally with a fixed budget or mode
    '|-thinking(?:-(?:64k|32k|auto|minimal))?',
    // bare effort level
    '|-(?:high|medium|low)-effort',
    // remaining single-token variants
    '|-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking',
    ')',
    // an effort level may additionally trail any of the above
    '(?:-(?:high|medium|low)-effort)?$',
  ].join('')
);
|
| 182 |
|
| 183 |
function lbBaseName(name) {
|
| 184 |
-
// Repeatedly strip known suffixes until stable
|
| 185 |
let prev;
|
| 186 |
let cur = name;
|
| 187 |
do { prev = cur; cur = cur.replace(LB_SUFFIX_RE, ''); } while (cur !== prev);
|
|
@@ -230,12 +227,10 @@ async function fetchLiveBench() {
|
|
| 230 |
const dates = tree.tree
|
| 231 |
.filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
|
| 232 |
.map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
|
| 233 |
-
.sort();
|
| 234 |
console.log(`${dates.length} releases (${dates[0]} → ${dates[dates.length - 1]})`);
|
| 235 |
|
| 236 |
-
// Use task→group mapping from the latest categories JSON (stable across releases)
|
| 237 |
const cats = await getJson(`${LB_BASE_URL}/categories_${dates[dates.length - 1]}.json`);
|
| 238 |
-
|
| 239 |
const taskToGroup = {};
|
| 240 |
for (const [cat, tasks] of Object.entries(cats)) {
|
| 241 |
const group =
|
|
@@ -248,35 +243,22 @@ async function fetchLiveBench() {
|
|
| 248 |
if (group) for (const t of tasks) taskToGroup[t] = group;
|
| 249 |
}
|
| 250 |
|
| 251 |
-
// Fetch all releases (oldest→newest), so newer results overwrite older ones per model
|
| 252 |
-
// Map: lb_name → entry (most recent release wins)
|
| 253 |
const byName = new Map();
|
| 254 |
for (const date of dates) {
|
| 255 |
let csv;
|
| 256 |
-
try {
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
console.warn(`\n ⚠ LiveBench ${date}: ${e.message}`);
|
| 260 |
-
continue;
|
| 261 |
-
}
|
| 262 |
-
for (const entry of parseLiveBenchCsv(csv, taskToGroup)) {
|
| 263 |
-
byName.set(entry.lb_name, entry); // newer release overwrites
|
| 264 |
-
}
|
| 265 |
process.stdout.write(` LiveBench: ${date}\r`);
|
| 266 |
}
|
| 267 |
-
|
| 268 |
const entries = [...byName.values()];
|
| 269 |
console.log(` LiveBench: ${entries.length} unique models across all releases`);
|
| 270 |
return entries;
|
| 271 |
}
|
| 272 |
|
| 273 |
function mergeLiveBench(entries, lbEntries) {
|
| 274 |
-
// Build two lookups:
|
| 275 |
-
// exact: normalized lb_name → entry
|
| 276 |
-
// base: normalized base-name (suffixes stripped) → best-scoring entry among variants
|
| 277 |
const exactMap = new Map();
|
| 278 |
-
const baseMap = new Map();
|
| 279 |
-
|
| 280 |
for (const lb of lbEntries) {
|
| 281 |
exactMap.set(normName(lb.lb_name), lb);
|
| 282 |
const base = normName(lbBaseName(lb.lb_name));
|
|
@@ -285,113 +267,36 @@ function mergeLiveBench(entries, lbEntries) {
|
|
| 285 |
if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
|
| 286 |
}
|
| 287 |
}
|
| 288 |
-
|
| 289 |
-
// Track which lb entries have been used (to avoid adding them as standalone new entries)
|
| 290 |
const usedLbNames = new Set();
|
| 291 |
-
|
| 292 |
let matched = 0;
|
| 293 |
for (const e of entries) {
|
| 294 |
-
const candidates = [
|
| 295 |
-
normName(e.name || ''),
|
| 296 |
-
normName((e.slug || '').split('/').pop() || ''),
|
| 297 |
-
normName((e.hf_id || '').split('/').pop() || ''),
|
| 298 |
-
].filter(Boolean);
|
| 299 |
-
|
| 300 |
let lb = null;
|
| 301 |
-
for (const c of candidates) {
|
| 302 |
-
|
| 303 |
-
if (lb) break;
|
| 304 |
-
}
|
| 305 |
-
if (lb) {
|
| 306 |
-
Object.assign(e, lb);
|
| 307 |
-
usedLbNames.add(lb.lb_name);
|
| 308 |
-
matched++;
|
| 309 |
-
}
|
| 310 |
}
|
| 311 |
-
|
| 312 |
-
// Add standalone entries for lbEntries not matched above.
|
| 313 |
-
// Skip variants whose base was already matched (avoid duplicating e.g. all -effort variants).
|
| 314 |
-
// Use the base model name (without -high-effort etc.) as the entry name so that
|
| 315 |
-
// provider model names (which have no effort suffixes) can find this entry.
|
| 316 |
const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
|
| 317 |
const newEntries = [];
|
| 318 |
for (const lb of lbEntries) {
|
| 319 |
if (usedLbNames.has(lb.lb_name)) continue;
|
| 320 |
const base = normName(lbBaseName(lb.lb_name));
|
| 321 |
-
if (usedBases.has(base)) continue;
|
| 322 |
-
// Only add the best-scoring variant of each base group
|
| 323 |
if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
|
| 324 |
-
|
| 325 |
-
newEntries.push({ name: baseName, ...lb }); // name uses base; lb_name keeps variant
|
| 326 |
usedBases.add(base);
|
| 327 |
}
|
| 328 |
}
|
| 329 |
-
|
| 330 |
console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
|
| 331 |
return [...entries, ...newEntries];
|
| 332 |
}
|
| 333 |
|
| 334 |
-
// ─── Merge ───────────────────────────────────────────────────────────────────
|
| 335 |
-
|
| 336 |
-
/**
 * Combine LLMStats entries with Hugging Face leaderboard entries.
 *
 * Each HF entry is looked up by its normalized name and, failing that, by the
 * same name with its first word dropped (covers an org prefix embedded in the
 * model name, e.g. "Meta-Llama-..."). The lookup keys are the normalized names
 * and slug tails of the LLMStats entries. On a hit, HF values only fill fields
 * that are falsy on the LLMStats entry, while the hf_* fields are always taken
 * from HF. HF entries without a counterpart are appended as-is.
 *
 * @param {object[]} llmstats  LLMStats entries; they are shallow-copied, not mutated.
 * @param {object[]} hfEntries HF leaderboard entries.
 * @returns {object[]} Copies of the LLMStats entries followed by the HF-only entries.
 */
function mergeEntries(llmstats, hfEntries) {
  // Fields where an existing (truthy) LLMStats value wins over the HF one.
  const FILL_IF_EMPTY = ['hf_id', 'params_b', 'ifeval', 'bbh', 'gpqa', 'mmlu_pro'];
  // Fields that are unconditionally copied from the HF entry.
  const ALWAYS_COPY = ['hf_math_lvl5', 'hf_musr', 'hf_avg'];

  // normalized LLMStats name / slug tail -> position in `llmstats`
  const indexByKey = new Map();
  llmstats.forEach((entry, pos) => {
    indexByKey.set(normName(entry.name), pos);
    const slugTail = entry.slug?.split('/').pop() || '';
    if (slugTail) indexByKey.set(normName(slugTail), pos);
  });

  const combined = llmstats.map((entry) => ({ ...entry }));
  const unmatched = [];

  for (const hf of hfEntries) {
    const fullKey = normName(hf.name);
    const words = fullKey.split(' ');
    const keyWithoutFirstWord = words.length > 1 ? words.slice(1).join(' ') : fullKey;

    const pos = indexByKey.get(fullKey) ?? indexByKey.get(keyWithoutFirstWord);
    if (pos === undefined) {
      unmatched.push(hf);
      continue;
    }

    const target = combined[pos];
    for (const key of FILL_IF_EMPTY) {
      if (!target[key]) target[key] = hf[key];
    }
    for (const key of ALWAYS_COPY) {
      target[key] = hf[key];
    }
  }

  return [...combined, ...unmatched];
}
|
| 375 |
-
|
| 376 |
// ─── Chatbot Arena ───────────────────────────────────────────────────────────
|
| 377 |
|
| 378 |
async function fetchChatbotArena() {
|
| 379 |
process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
|
| 380 |
-
|
| 381 |
-
// The lmarena.ai leaderboard page renders via React Server Components.
|
| 382 |
-
// Requesting with "RSC: 1" returns a streaming text/x-component payload that
|
| 383 |
-
// embeds the full leaderboard entries (rank, ELO rating, votes) in the server
|
| 384 |
-
// response — no authentication required.
|
| 385 |
const text = await getText('https://lmarena.ai/en/leaderboard/text', {
|
| 386 |
-
headers: {
|
| 387 |
-
'User-Agent': 'Mozilla/5.0',
|
| 388 |
-
'RSC': '1',
|
| 389 |
-
'Accept': 'text/x-component',
|
| 390 |
-
},
|
| 391 |
});
|
| 392 |
-
|
| 393 |
-
// Each RSC line has the format: <hex_id>:<json_value>
|
| 394 |
-
// Find the line containing "entries":[...] with ELO ratings
|
| 395 |
let entries = null;
|
| 396 |
for (const line of text.split('\n')) {
|
| 397 |
if (!line.includes('"entries":[') || !line.includes('"rating":')) continue;
|
|
@@ -404,10 +309,8 @@ async function fetchChatbotArena() {
|
|
| 404 |
entries = JSON.parse(line.substring(start, end));
|
| 405 |
break;
|
| 406 |
}
|
| 407 |
-
|
| 408 |
if (!entries) throw new Error('Could not find entries in RSC payload');
|
| 409 |
console.log(`${entries.length} models`);
|
| 410 |
-
|
| 411 |
return entries.map((e) => ({
|
| 412 |
arena_name: e.modelDisplayName,
|
| 413 |
arena_org: e.modelOrganization,
|
|
@@ -420,30 +323,17 @@ async function fetchChatbotArena() {
|
|
| 420 |
function mergeArena(entries, arenaEntries) {
|
| 421 |
const arenaMap = new Map();
|
| 422 |
for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);
|
| 423 |
-
|
| 424 |
let matched = 0;
|
| 425 |
for (const e of entries) {
|
| 426 |
-
const candidates = [
|
| 427 |
-
normName(e.name || ''),
|
| 428 |
-
normName((e.lb_name) || ''),
|
| 429 |
-
normName((e.slug || '').split('/').pop() || ''),
|
| 430 |
-
normName((e.hf_id || '').split('/').pop() || ''),
|
| 431 |
-
];
|
| 432 |
const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
|
| 433 |
if (a) {
|
| 434 |
-
e.arena_elo
|
| 435 |
-
|
| 436 |
-
e.arena_votes = a.arena_votes;
|
| 437 |
-
arenaMap.delete(normName(a.arena_name));
|
| 438 |
-
matched++;
|
| 439 |
}
|
| 440 |
}
|
| 441 |
-
|
| 442 |
const newEntries = [];
|
| 443 |
-
for (const a of arenaMap.values()) {
|
| 444 |
-
newEntries.push({ name: a.arena_name, ...a });
|
| 445 |
-
}
|
| 446 |
-
|
| 447 |
console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
|
| 448 |
return [...entries, ...newEntries];
|
| 449 |
}
|
|
@@ -455,10 +345,7 @@ const AIDER_RAW = 'https://raw.githubusercontent.com/Aider-AI/aider/main/aider/w
|
|
| 455 |
async function fetchAider() {
|
| 456 |
process.stdout.write('Aider: fetching edit leaderboard... ');
|
| 457 |
const text = await getText(AIDER_RAW);
|
| 458 |
-
|
| 459 |
const rows = yaml.load(text);
|
| 460 |
-
|
| 461 |
-
// Multiple runs per model — keep the one with the best pass_rate_1
|
| 462 |
const best = new Map();
|
| 463 |
for (const row of rows) {
|
| 464 |
if (!row.model || row.pass_rate_1 === undefined) continue;
|
|
@@ -466,15 +353,10 @@ async function fetchAider() {
|
|
| 466 |
const existing = best.get(key);
|
| 467 |
if (!existing || row.pass_rate_1 > existing.pass_rate_1) best.set(key, row);
|
| 468 |
}
|
| 469 |
-
|
| 470 |
const entries = [];
|
| 471 |
for (const row of best.values()) {
|
| 472 |
-
entries.push({
|
| 473 |
-
aider_model: row.model,
|
| 474 |
-
aider_pass_rate: row.pass_rate_1 / 100, // normalize 0-100 → 0-1
|
| 475 |
-
});
|
| 476 |
}
|
| 477 |
-
|
| 478 |
console.log(`${entries.length} models (best run each)`);
|
| 479 |
return entries;
|
| 480 |
}
|
|
@@ -482,217 +364,176 @@ async function fetchAider() {
|
|
| 482 |
/**
 * Attach Aider pass rates to existing entries and append unmatched Aider models.
 *
 * An entry is matched when one of its normalized identifiers (name, LiveBench
 * name, slug tail, HF id tail, Arena name) equals a normalized Aider model
 * name. Each Aider row is consumed by at most one entry.
 *
 * @param {object[]} entries      Existing entries; matched ones are mutated in place.
 * @param {object[]} aiderEntries Rows carrying `aider_model` and `aider_pass_rate`.
 * @returns {object[]} `entries` followed by one new entry per unmatched Aider row.
 */
function mergeAider(entries, aiderEntries) {
  const aiderMap = new Map();
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);

  let matched = 0;
  for (const e of entries) {
    // Drop empty keys: a field the entry does not have must never act as a
    // lookup key, otherwise it could pair with a row whose name normalizes to ''.
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);

    const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
    if (!a) continue;

    e.aider_pass_rate = a.aider_pass_rate;
    aiderMap.delete(normName(a.aider_model)); // a row is used at most once
    matched++;
  }

  // Whatever is still in the map was not claimed by any existing entry.
  const newEntries = [];
  for (const a of aiderMap.values()) {
    newEntries.push({ name: a.aider_model, aider_pass_rate: a.aider_pass_rate });
  }

  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
|
| 511 |
|
| 512 |
-
// ───
|
| 513 |
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
|
| 521 |
-
aider: ['aider_model', 'aider_pass_rate'],
|
| 522 |
-
};
|
| 523 |
|
| 524 |
-
|
| 525 |
-
const
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
livebench: 'lb_name',
|
| 529 |
-
arena: 'arena_elo',
|
| 530 |
-
aider: 'aider_pass_rate',
|
| 531 |
-
};
|
| 532 |
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
|
|
|
|
|
|
|
|
|
| 547 |
}
|
| 548 |
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
const
|
| 552 |
-
const nameMap = new Map();
|
| 553 |
-
entries.forEach((e, i) => {
|
| 554 |
-
if (e.name) nameMap.set(normName(e.name), i);
|
| 555 |
-
const slugModel = (e.slug || '').split('/').pop();
|
| 556 |
-
if (slugModel) nameMap.set(normName(slugModel), i);
|
| 557 |
-
});
|
| 558 |
|
| 559 |
let matched = 0;
|
| 560 |
-
const
|
| 561 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
const target = entries[idx];
|
| 568 |
-
for (const f of LS_FIELDS) { if (ls[f] !== undefined) target[f] = ls[f]; }
|
| 569 |
-
usedIdx.add(idx);
|
| 570 |
matched++;
|
| 571 |
-
} else {
|
| 572 |
-
newEntries.push({ ...ls });
|
| 573 |
}
|
| 574 |
}
|
| 575 |
|
| 576 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
return [...entries, ...newEntries];
|
| 578 |
}
|
| 579 |
|
| 580 |
-
// Merge
|
| 581 |
-
function mergeHFInto(entries, hfEntries) {
|
| 582 |
-
const nameMap = new Map();
|
| 583 |
-
entries.forEach((e, i) => {
|
| 584 |
-
if (e.name) nameMap.set(normName(e.name), i);
|
| 585 |
-
const slugModel = (e.slug || '').split('/').pop();
|
| 586 |
-
if (slugModel) nameMap.set(normName(slugModel), i);
|
| 587 |
-
});
|
| 588 |
-
|
| 589 |
-
let matched = 0;
|
| 590 |
-
const usedIdx = new Set();
|
| 591 |
-
const newEntries = [];
|
| 592 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
for (const hf of hfEntries) {
|
| 594 |
-
const modelPart
|
| 595 |
-
const modelWords
|
| 596 |
-
const
|
| 597 |
-
const
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
const target = entries[idx];
|
| 602 |
-
if (!target.hf_id) target.hf_id = hf.hf_id;
|
| 603 |
if (!target.params_b) target.params_b = hf.params_b;
|
| 604 |
-
|
| 605 |
-
if (!target.
|
| 606 |
-
if (!target.
|
| 607 |
-
if (!target.gpqa) target.gpqa = hf.gpqa;
|
| 608 |
if (!target.mmlu_pro) target.mmlu_pro = hf.mmlu_pro;
|
| 609 |
-
// HF-exclusive fields always updated
|
| 610 |
target.hf_math_lvl5 = hf.hf_math_lvl5;
|
| 611 |
-
target.hf_musr
|
| 612 |
-
target.hf_avg
|
| 613 |
-
|
| 614 |
-
matched++;
|
| 615 |
-
} else {
|
| 616 |
-
newEntries.push({ ...hf });
|
| 617 |
-
}
|
| 618 |
}
|
| 619 |
-
|
| 620 |
-
console.log(` HF: ${matched} matched, ${newEntries.length} new entries`);
|
| 621 |
-
return [...entries, ...newEntries];
|
| 622 |
}
|
| 623 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
async function refreshSource(source) {
|
| 625 |
if (!SOURCE_FIELDS[source]) {
|
| 626 |
console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
|
| 627 |
process.exit(1);
|
| 628 |
}
|
| 629 |
-
|
| 630 |
console.log(`Refreshing benchmark source: ${source}\n`);
|
| 631 |
const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
|
| 632 |
-
const
|
| 633 |
-
|
|
|
|
|
|
|
| 634 |
let result;
|
| 635 |
-
if (source === 'llmstats')
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
} else if (source === 'livebench') {
|
| 642 |
-
const data = await fetchLiveBench();
|
| 643 |
-
result = mergeLiveBench(stripped, data);
|
| 644 |
-
} else if (source === 'arena') {
|
| 645 |
-
const data = await fetchChatbotArena();
|
| 646 |
-
result = mergeArena(stripped, data);
|
| 647 |
-
} else if (source === 'aider') {
|
| 648 |
-
const data = await fetchAider();
|
| 649 |
-
result = mergeAider(stripped, data);
|
| 650 |
-
}
|
| 651 |
-
|
| 652 |
fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
|
| 653 |
-
console.log(`\nSaved ${result.length} entries to data/benchmarks.json`);
|
| 654 |
}
|
| 655 |
|
| 656 |
// ─── Main ────────────────────────────────────────────────────────────────────
|
| 657 |
|
| 658 |
async function main() {
|
| 659 |
const source = process.argv[2]?.toLowerCase();
|
|
|
|
| 660 |
|
| 661 |
-
|
| 662 |
-
if (source) {
|
| 663 |
-
await refreshSource(source);
|
| 664 |
-
return;
|
| 665 |
-
}
|
| 666 |
-
|
| 667 |
-
// Full rebuild — all sources
|
| 668 |
-
const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries] = await Promise.all([
|
| 669 |
fetchLLMStats(),
|
| 670 |
fetchHFLeaderboard(),
|
| 671 |
fetchLiveBench(),
|
| 672 |
fetchChatbotArena(),
|
| 673 |
fetchAider(),
|
|
|
|
| 674 |
]);
|
| 675 |
|
| 676 |
const merged = mergeEntries(llmstats, hfEntries);
|
| 677 |
const withLB = mergeLiveBench(merged, lbEntries);
|
| 678 |
const withAr = mergeArena(withLB, arenaEntries);
|
| 679 |
-
const
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
const lsOnlyCount = all.filter((e) => e.slug && !e.hf_id).length;
|
| 683 |
-
const bothCount = all.filter((e) => e.slug && e.hf_id).length;
|
| 684 |
-
const lbCount = all.filter((e) => e.lb_name).length;
|
| 685 |
-
const arenaCount = all.filter((e) => e.arena_elo).length;
|
| 686 |
-
const aiderCount = all.filter((e) => e.aider_pass_rate !== undefined).length;
|
| 687 |
console.log(`\nTotal entries: ${all.length}`);
|
| 688 |
-
console.log(`
|
| 689 |
-
console.log(` With LiveBench: ${lbCount} | With Arena ELO: ${arenaCount} | With Aider: ${aiderCount}`);
|
| 690 |
|
| 691 |
fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
|
| 692 |
console.log(`Saved to data/benchmarks.json (${(fs.statSync(OUT_FILE).size / 1024).toFixed(0)} KB)`);
|
| 693 |
}
|
| 694 |
|
| 695 |
-
main().catch((err) => {
|
| 696 |
-
console.error('Fatal:', err);
|
| 697 |
-
process.exit(1);
|
| 698 |
-
});
|
|
|
|
| 1 |
'use strict';
|
| 2 |
|
| 3 |
/**
|
| 4 |
+
* Fetch benchmark data from six sources and merge into data/benchmarks.json.
|
| 5 |
*
|
| 6 |
* Sources:
|
| 7 |
* 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
|
|
|
|
| 9 |
* 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
|
| 10 |
* 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
|
| 11 |
* 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
|
| 12 |
+
* 6. Artificial Analysis (artificialanalysis.ai) — independent evaluations and speed benchmarks
|
| 13 |
*
|
| 14 |
* Unified field names (0-1 scale unless noted):
|
| 15 |
* mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
|
|
|
|
| 19 |
* lb_math, lb_language, lb_if, lb_data_analysis
|
| 20 |
* arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500)
|
| 21 |
* aider_pass_rate (Aider edit bench, 0-1)
|
| 22 |
+
* aa_id, aa_intelligence, aa_mmlu_pro, aa_gpqa, (Artificial Analysis)
|
| 23 |
+
* aa_livecodebench, aa_tokens_per_s, aa_latency_s
|
| 24 |
*
|
| 25 |
+
* Where multiple sources have data for the same benchmark,
|
| 26 |
* LLMStats takes priority (it stores self-reported model-card values).
|
| 27 |
*
|
| 28 |
* Usage:
|
| 29 |
* node scripts/fetch-benchmarks.js # fetch all sources
|
| 30 |
+
* node scripts/fetch-benchmarks.js aa # refresh Artificial Analysis only
|
| 31 |
* node scripts/fetch-benchmarks.js livebench # refresh LiveBench only
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
*/
|
| 33 |
|
| 34 |
const fs = require('fs');
|
|
|
|
| 170 |
// GitHub tree listing of the LiveBench site repository, and the site base URL
// that is prefixed to the per-release file paths.
const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
const LB_BASE_URL = 'https://livebench.ai';

// Trailing variant markers (thinking modes, effort levels, "-base", ...) that
// may end a LiveBench model name; anchored at the end of the string.
const LB_SUFFIX_RE = new RegExp(
  [
    '(',
    // "-thinking-" followed by optional auto / token-budget / effort parts
    '-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?',
    // plain "-thinking", optionally with a fixed budget or mode
    '|-thinking(?:-(?:64k|32k|auto|minimal))?',
    // bare effort level
    '|-(?:high|medium|low)-effort',
    // remaining single-token variants
    '|-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking',
    ')',
    // an effort level may additionally trail any of the above
    '(?:-(?:high|medium|low)-effort)?$',
  ].join('')
);
|
| 180 |
|
| 181 |
function lbBaseName(name) {
|
|
|
|
| 182 |
let prev;
|
| 183 |
let cur = name;
|
| 184 |
do { prev = cur; cur = cur.replace(LB_SUFFIX_RE, ''); } while (cur !== prev);
|
|
|
|
| 227 |
const dates = tree.tree
|
| 228 |
.filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
|
| 229 |
.map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
|
| 230 |
+
.sort();
|
| 231 |
console.log(`${dates.length} releases (${dates[0]} → ${dates[dates.length - 1]})`);
|
| 232 |
|
|
|
|
| 233 |
const cats = await getJson(`${LB_BASE_URL}/categories_${dates[dates.length - 1]}.json`);
|
|
|
|
| 234 |
const taskToGroup = {};
|
| 235 |
for (const [cat, tasks] of Object.entries(cats)) {
|
| 236 |
const group =
|
|
|
|
| 243 |
if (group) for (const t of tasks) taskToGroup[t] = group;
|
| 244 |
}
|
| 245 |
|
|
|
|
|
|
|
| 246 |
const byName = new Map();
|
| 247 |
for (const date of dates) {
|
| 248 |
let csv;
|
| 249 |
+
try { csv = await getText(`${LB_BASE_URL}/table_${date}.csv`); }
|
| 250 |
+
catch (e) { console.warn(`\n ⚠ LiveBench ${date}: ${e.message}`); continue; }
|
| 251 |
+
for (const entry of parseLiveBenchCsv(csv, taskToGroup)) byName.set(entry.lb_name, entry);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
process.stdout.write(` LiveBench: ${date}\r`);
|
| 253 |
}
|
|
|
|
| 254 |
const entries = [...byName.values()];
|
| 255 |
console.log(` LiveBench: ${entries.length} unique models across all releases`);
|
| 256 |
return entries;
|
| 257 |
}
|
| 258 |
|
| 259 |
function mergeLiveBench(entries, lbEntries) {
|
|
|
|
|
|
|
|
|
|
| 260 |
const exactMap = new Map();
|
| 261 |
+
const baseMap = new Map();
|
|
|
|
| 262 |
for (const lb of lbEntries) {
|
| 263 |
exactMap.set(normName(lb.lb_name), lb);
|
| 264 |
const base = normName(lbBaseName(lb.lb_name));
|
|
|
|
| 267 |
if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
|
| 268 |
}
|
| 269 |
}
|
|
|
|
|
|
|
| 270 |
const usedLbNames = new Set();
|
|
|
|
| 271 |
let matched = 0;
|
| 272 |
for (const e of entries) {
|
| 273 |
+
const candidates = [normName(e.name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || '')].filter(Boolean);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
let lb = null;
|
| 275 |
+
for (const c of candidates) { lb = exactMap.get(c) || baseMap.get(c); if (lb) break; }
|
| 276 |
+
if (lb) { Object.assign(e, lb); usedLbNames.add(lb.lb_name); matched++; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
|
| 279 |
const newEntries = [];
|
| 280 |
for (const lb of lbEntries) {
|
| 281 |
if (usedLbNames.has(lb.lb_name)) continue;
|
| 282 |
const base = normName(lbBaseName(lb.lb_name));
|
| 283 |
+
if (usedBases.has(base)) continue;
|
|
|
|
| 284 |
if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
|
| 285 |
+
newEntries.push({ name: lbBaseName(lb.lb_name), ...lb });
|
|
|
|
| 286 |
usedBases.add(base);
|
| 287 |
}
|
| 288 |
}
|
|
|
|
| 289 |
console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
|
| 290 |
return [...entries, ...newEntries];
|
| 291 |
}
|
| 292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
// ─── Chatbot Arena ───────────────────────────────────────────────────────────
|
| 294 |
|
| 295 |
async function fetchChatbotArena() {
|
| 296 |
process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
const text = await getText('https://lmarena.ai/en/leaderboard/text', {
|
| 298 |
+
headers: { 'User-Agent': 'Mozilla/5.0', 'RSC': '1', 'Accept': 'text/x-component' },
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
});
|
|
|
|
|
|
|
|
|
|
| 300 |
let entries = null;
|
| 301 |
for (const line of text.split('\n')) {
|
| 302 |
if (!line.includes('"entries":[') || !line.includes('"rating":')) continue;
|
|
|
|
| 309 |
entries = JSON.parse(line.substring(start, end));
|
| 310 |
break;
|
| 311 |
}
|
|
|
|
| 312 |
if (!entries) throw new Error('Could not find entries in RSC payload');
|
| 313 |
console.log(`${entries.length} models`);
|
|
|
|
| 314 |
return entries.map((e) => ({
|
| 315 |
arena_name: e.modelDisplayName,
|
| 316 |
arena_org: e.modelOrganization,
|
|
|
|
| 323 |
/**
 * Attach Chatbot Arena results to existing entries and append unmatched Arena models.
 *
 * An entry is matched when one of its normalized identifiers (name, LiveBench
 * name, slug tail, HF id tail) equals a normalized Arena display name. Matched
 * entries receive `arena_elo`, `arena_rank` and `arena_votes`; each Arena row
 * is consumed by at most one entry.
 *
 * @param {object[]} entries      Existing entries; matched ones are mutated in place.
 * @param {object[]} arenaEntries Arena rows carrying `arena_name` and the arena_* values.
 * @returns {object[]} `entries` followed by one new entry per unmatched Arena row.
 */
function mergeArena(entries, arenaEntries) {
  const arenaMap = new Map();
  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);

  let matched = 0;
  for (const e of entries) {
    // Drop empty keys: a field the entry does not have must never act as a
    // lookup key, otherwise it could pair with a row whose name normalizes to ''.
    // This mirrors the candidate filtering done by the other merge functions.
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
    ].filter(Boolean);

    const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
    if (!a) continue;

    e.arena_elo = a.arena_elo;
    e.arena_rank = a.arena_rank;
    e.arena_votes = a.arena_votes;
    arenaMap.delete(normName(a.arena_name)); // a row is used at most once
    matched++;
  }

  // Whatever is still in the map was not claimed by any existing entry.
  const newEntries = [];
  for (const a of arenaMap.values()) newEntries.push({ name: a.arena_name, ...a });

  console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
|
|
|
|
| 345 |
async function fetchAider() {
|
| 346 |
process.stdout.write('Aider: fetching edit leaderboard... ');
|
| 347 |
const text = await getText(AIDER_RAW);
|
|
|
|
| 348 |
const rows = yaml.load(text);
|
|
|
|
|
|
|
| 349 |
const best = new Map();
|
| 350 |
for (const row of rows) {
|
| 351 |
if (!row.model || row.pass_rate_1 === undefined) continue;
|
|
|
|
| 353 |
const existing = best.get(key);
|
| 354 |
if (!existing || row.pass_rate_1 > existing.pass_rate_1) best.set(key, row);
|
| 355 |
}
|
|
|
|
| 356 |
const entries = [];
|
| 357 |
for (const row of best.values()) {
|
| 358 |
+
entries.push({ aider_model: row.model, aider_pass_rate: row.pass_rate_1 / 100 });
|
|
|
|
|
|
|
|
|
|
| 359 |
}
|
|
|
|
| 360 |
console.log(`${entries.length} models (best run each)`);
|
| 361 |
return entries;
|
| 362 |
}
|
|
|
|
| 364 |
/**
 * Attach Aider pass rates to existing entries and append unmatched Aider models.
 *
 * An entry is matched when one of its normalized identifiers (name, LiveBench
 * name, slug tail, HF id tail, Arena name) equals a normalized Aider model
 * name. Each Aider row is consumed by at most one entry.
 *
 * @param {object[]} entries      Existing entries; matched ones are mutated in place.
 * @param {object[]} aiderEntries Rows carrying `aider_model` and `aider_pass_rate`.
 * @returns {object[]} `entries` followed by one new entry per unmatched Aider row.
 */
function mergeAider(entries, aiderEntries) {
  const aiderMap = new Map();
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);

  let matched = 0;
  for (const e of entries) {
    // Drop empty keys: a field the entry does not have must never act as a
    // lookup key, otherwise it could pair with a row whose name normalizes to ''.
    // This mirrors the candidate filtering done by the other merge functions.
    const candidates = [
      normName(e.name || ''),
      normName(e.lb_name || ''),
      normName((e.slug || '').split('/').pop() || ''),
      normName((e.hf_id || '').split('/').pop() || ''),
      normName(e.arena_name || ''),
    ].filter(Boolean);

    const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
    if (!a) continue;

    e.aider_pass_rate = a.aider_pass_rate;
    aiderMap.delete(normName(a.aider_model)); // a row is used at most once
    matched++;
  }

  // Whatever is still in the map was not claimed by any existing entry.
  const newEntries = [];
  for (const a of aiderMap.values()) {
    newEntries.push({ name: a.aider_model, aider_pass_rate: a.aider_pass_rate });
  }

  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
|
| 378 |
|
| 379 |
+
// ─── Artificial Analysis ───────────────────────────────────────────────────
|
| 380 |
|
| 381 |
+
/**
 * Fetch model evaluations and speed figures from the Artificial Analysis API.
 *
 * The API key is read from the ARTIFICIAL_ANALYSIS_API_KEY environment
 * variable. Without a key the source is skipped and an empty list is returned,
 * so the remaining sources can still be processed.
 *
 * @returns {Promise<object[]>} One record per model with aa_* fields; fields the
 *   API does not provide are left undefined.
 * @throws {Error} When the response does not contain a `data` array.
 */
async function fetchArtificialAnalysis() {
  const apiKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
  if (!apiKey) {
    console.log('Artificial Analysis: skipping (no API key found)');
    return [];
  }

  process.stdout.write('Artificial Analysis: fetching benchmarks... ');
  const res = await getJson('https://artificialanalysis.ai/api/v2/data/llms/models', {
    headers: { 'x-api-key': apiKey },
  });

  // `data` is iterated below, so anything but an array (including a null or
  // non-object body) is reported as an invalid response instead of surfacing
  // later as an unrelated TypeError.
  if (!Array.isArray(res?.data)) throw new Error('Invalid response from Artificial Analysis API');
  console.log(`${res.data.length} models`);

  return res.data.map((m) => {
    const ev = m.evaluations || {}; // tolerate models without an evaluations object
    return {
      aa_id: m.id,
      aa_name: m.name,
      aa_slug: m.slug,
      aa_intelligence: ev.artificial_analysis_intelligence_index, // typically 0-100
      aa_mmlu_pro: ev.mmlu_pro, // 0-1
      aa_gpqa: ev.gpqa, // 0-1
      aa_livecodebench: ev.livecodebench, // 0-1
      aa_tokens_per_s: m.median_output_tokens_per_second,
      aa_latency_s: m.median_time_to_first_token_seconds,
    };
  });
}
|
| 411 |
|
| 412 |
+
/**
 * Fold Artificial Analysis records into the existing entries.
 *
 * A record is paired with the first entry that has a normalized identifier
 * (name, LiveBench name, slug tail, HF id tail, Arena name) equal to the
 * record's normalized `aa_name`. All fields of the record are copied onto that
 * entry. Records nobody claims become standalone entries named after `aa_name`.
 *
 * @param {object[]} entries   Existing entries; matched ones are mutated in place.
 * @param {object[]} aaEntries Artificial Analysis records carrying `aa_name`.
 * @returns {object[]} `entries` followed by one new entry per unmatched record.
 */
function mergeArtificialAnalysis(entries, aaEntries) {
  // normalized AA name -> record; a record is removed once an entry claims it
  const pending = new Map(aaEntries.map((rec) => [normName(rec.aa_name), rec]));

  let matched = 0;
  for (const e of entries) {
    const slugTail = (e.slug || '').split('/').pop() || '';
    const hfTail = (e.hf_id || '').split('/').pop() || '';
    const keys = [e.name || '', e.lb_name || '', slugTail, hfTail, e.arena_name || '']
      .map((raw) => normName(raw))
      .filter(Boolean);

    // The first identifier that resolves to a pending record wins.
    let hit;
    for (const key of keys) {
      hit = pending.get(key);
      if (hit) break;
    }
    if (!hit) continue;

    Object.assign(e, hit);
    pending.delete(normName(hit.aa_name));
    matched += 1;
  }

  const newEntries = Array.from(pending.values(), (rec) => ({ name: rec.aa_name, ...rec }));

  console.log(` AA: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
|
| 442 |
|
| 443 |
+
// ─── Merge ───────────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
+
/**
 * Combine LLMStats entries with HF Leaderboard entries.
 *
 * LLMStats rows are shallow-copied and serve as the base. An HF row is
 * matched to a base row by its normalized name, or by that name with its
 * first word removed, compared against each LLMStats name and last slug
 * segment. On a match the HF values only fill fields the base row holds no
 * truthy value for, while the HF-specific scores are always copied over.
 * HF rows without a match are appended unchanged.
 *
 * @param {Array<object>} llmstats - Entries from LLMStats.
 * @param {Array<object>} hfEntries - Entries from the HF Leaderboard.
 * @returns {Array<object>} Merged LLMStats rows followed by HF-only rows.
 */
function mergeEntries(llmstats, hfEntries) {
  // Normalized name / slug tail → position in `llmstats`.
  const positionByKey = new Map();
  llmstats.forEach((row, pos) => {
    positionByKey.set(normName(row.name), pos);
    const slugTail = row.slug?.split('/').pop() || '';
    if (slugTail) {
      positionByKey.set(normName(slugTail), pos);
    }
  });

  const combined = llmstats.map((row) => ({ ...row }));
  const unmatched = [];

  // Fields taken from HF only when the base row has nothing truthy there.
  const fillWhenEmpty = ['hf_id', 'params_b', 'ifeval', 'bbh', 'gpqa', 'mmlu_pro'];
  // Fields that always come from HF.
  const alwaysFromHF = ['hf_math_lvl5', 'hf_musr', 'hf_avg'];

  for (const hf of hfEntries) {
    const fullKey = normName(hf.name);
    const words = fullKey.split(' ');
    const keyWithoutLead = words.length > 1 ? words.slice(1).join(' ') : fullKey;
    const pos = positionByKey.get(fullKey) ?? positionByKey.get(keyWithoutLead);

    if (pos === undefined) {
      unmatched.push(hf);
      continue;
    }

    const row = combined[pos];
    for (const field of fillWhenEmpty) {
      if (!row[field]) row[field] = hf[field];
    }
    for (const field of alwaysFromHF) {
      row[field] = hf[field];
    }
  }

  return combined.concat(unmatched);
}
|
| 474 |
|
| 475 |
+
// ─── Refresh ─────────────────────────────────────────────────────────────────
|
| 476 |
+
|
| 477 |
+
// Per source: the entry keys that refreshSource() deletes before merging that
// source's freshly fetched data back in. Every key a source writes has to be
// listed, otherwise values from an earlier fetch survive a refresh.
const SOURCE_FIELDS = {
  llmstats: ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
  hf: ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
  livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
  arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
  aider: ['aider_model', 'aider_pass_rate'],
  // aa_name and aa_slug are written by fetchArtificialAnalysis() as well.
  aa: ['aa_id', 'aa_name', 'aa_slug', 'aa_intelligence', 'aa_mmlu_pro', 'aa_gpqa', 'aa_livecodebench', 'aa_tokens_per_s', 'aa_latency_s'],
};

// Per source: the key whose presence (value !== undefined) marks an entry as
// carrying data from that source. refreshSource() keeps an entry only if it
// still has such a key from some source other than the one being refreshed.
const SOURCE_ID_FIELD = {
  llmstats: 'slug',
  hf: 'hf_id',
  livebench: 'lb_name',
  arena: 'arena_elo',
  aider: 'aider_pass_rate',
  // Identity rather than a score: an AA record may lack an intelligence index
  // and should still count as AA data when another source is refreshed.
  aa: 'aa_id',
};
|
| 489 |
+
|
| 490 |
/**
 * Re-fetch a single benchmark source and merge it into the saved data file.
 *
 * The existing file is read, entries that carry no data from any other
 * source are dropped, the fields owned by `source` are removed from the
 * remaining entries, and the freshly fetched data is merged back in before
 * the file is rewritten.
 *
 * If the fetch yields an empty array (for example Artificial Analysis
 * without an API key) the file is left untouched, so a skipped or failed
 * fetch does not erase data stored by an earlier run.
 *
 * @param {string} source - One of the keys of SOURCE_FIELDS.
 * @returns {Promise<void>}
 */
async function refreshSource(source) {
  // Own-property check so inherited names such as "constructor" are rejected.
  if (!Object.prototype.hasOwnProperty.call(SOURCE_FIELDS, source)) {
    console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
    process.exit(1);
  }

  // Wrapped in arrows so only the selected source's functions are resolved.
  const pipelines = {
    llmstats: { fetch: () => fetchLLMStats(), merge: (base, fresh) => mergeLLMStatsInto(base, fresh) },
    hf: { fetch: () => fetchHFLeaderboard(), merge: (base, fresh) => mergeHFInto(base, fresh) },
    livebench: { fetch: () => fetchLiveBench(), merge: (base, fresh) => mergeLiveBench(base, fresh) },
    arena: { fetch: () => fetchChatbotArena(), merge: (base, fresh) => mergeArena(base, fresh) },
    aider: { fetch: () => fetchAider(), merge: (base, fresh) => mergeAider(base, fresh) },
    aa: { fetch: () => fetchArtificialAnalysis(), merge: (base, fresh) => mergeArtificialAnalysis(base, fresh) },
  };

  console.log(`Refreshing benchmark source: ${source}\n`);
  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));

  const { fetch, merge } = pipelines[source];
  const fresh = await fetch();
  if (Array.isArray(fresh) && fresh.length === 0) {
    console.warn(`No entries fetched for "${source}"; keeping existing data unchanged`);
    return;
  }

  // Keep only entries that still carry data from another source, then clear
  // this source's fields so values from the previous fetch cannot linger.
  const otherIdFields = Object.values(SOURCE_ID_FIELD).filter((f) => f !== SOURCE_ID_FIELD[source]);
  const stripped = existing
    .filter((e) => otherIdFields.some((f) => e[f] !== undefined))
    .map((e) => {
      const copy = { ...e };
      for (const f of SOURCE_FIELDS[source]) delete copy[f];
      return copy;
    });

  const result = merge(stripped, fresh);
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
}
|
| 510 |
|
| 511 |
// ─── Main ────────────────────────────────────────────────────────────────────
|
| 512 |
|
| 513 |
/**
 * Script entry point.
 *
 * With a source name as first CLI argument only that source is refreshed.
 * Without one, all sources are fetched in parallel, merged in a fixed order
 * (LLMStats + HF, LiveBench, Chatbot Arena, Aider, Artificial Analysis) and
 * the combined list is written to OUT_FILE.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const requested = process.argv[2]?.toLowerCase();
  if (requested) {
    await refreshSource(requested);
    return;
  }

  const fetched = await Promise.all([
    fetchLLMStats(),
    fetchHFLeaderboard(),
    fetchLiveBench(),
    fetchChatbotArena(),
    fetchAider(),
    fetchArtificialAnalysis(),
  ]);
  const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries] = fetched;

  // Each step takes the previous step's output as its base.
  let all = mergeEntries(llmstats, hfEntries);
  all = mergeLiveBench(all, lbEntries);
  all = mergeArena(all, arenaEntries);
  all = mergeAider(all, aiderEntries);
  all = mergeArtificialAnalysis(all, aaEntries);

  const count = (predicate) => all.filter(predicate).length;
  const withLiveBench = count((e) => e.lb_name);
  const withArena = count((e) => e.arena_elo);
  const withAider = count((e) => e.aider_pass_rate !== undefined);
  const withAA = count((e) => e.aa_intelligence !== undefined);

  console.log(`\nTotal entries: ${all.length}`);
  console.log(` With LiveBench: ${withLiveBench} | Arena: ${withArena} | Aider: ${withAider} | AA: ${withAA}`);

  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
  const sizeKb = (fs.statSync(OUT_FILE).size / 1024).toFixed(0);
  console.log(`Saved to data/benchmarks.json (${sizeKb} KB)`);
}
|
| 538 |
|
| 539 |
+
// Run the script; any rejection from main() is reported and ends the process with exit code 1.
main().catch((err) => { console.error('Fatal:', err); process.exit(1); });
|
|
|
|
|
|
|
|
|
scripts/fetch-providers.js
CHANGED
|
@@ -219,22 +219,29 @@ const MANUAL_HF_ID_MAP = {
|
|
| 219 |
'flux 1 1 pro ultra': 'black-forest-labs/FLUX.1-pro',
|
| 220 |
'flux 1 fill pro': 'black-forest-labs/FLUX.1-pro',
|
| 221 |
'flux 1 kontext max': 'black-forest-labs/FLUX.1-pro',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
};
|
| 223 |
|
| 224 |
const MANUAL_SIZE_MAP = {
|
| 225 |
'BAAI/bge-m3': 0.57,
|
| 226 |
-
// FLUX
|
| 227 |
'black-forest-labs/FLUX.1-schnell': 12,
|
| 228 |
'black-forest-labs/FLUX.1-dev': 12,
|
| 229 |
'black-forest-labs/FLUX.1-pro': 12,
|
| 230 |
-
// FLUX.2 family (32B flagship architecture with Mistral-3 24B backbone)
|
| 231 |
'black-forest-labs/FLUX.2-dev': 32,
|
| 232 |
'black-forest-labs/FLUX.2-pro': 32,
|
| 233 |
'black-forest-labs/FLUX.2-flex': 32,
|
| 234 |
'black-forest-labs/FLUX.2-max': 32,
|
| 235 |
-
// FLUX.2 Klein (Optimized smaller versions)
|
| 236 |
'black-forest-labs/FLUX.2-klein-4B': 4,
|
| 237 |
'black-forest-labs/FLUX.2-klein-9B': 9,
|
|
|
|
|
|
|
|
|
|
| 238 |
};
|
| 239 |
|
| 240 |
// Propagate capabilities and size from benchmarks, OpenRouter, or HF Hub to all other providers' models.
|
|
|
|
| 219 |
'flux 1 1 pro ultra': 'black-forest-labs/FLUX.1-pro',
|
| 220 |
'flux 1 fill pro': 'black-forest-labs/FLUX.1-pro',
|
| 221 |
'flux 1 kontext max': 'black-forest-labs/FLUX.1-pro',
|
| 222 |
+
// Mistral mappings
|
| 223 |
+
'mistral large 2407': 'mistralai/Mistral-Large-Instruct-2407',
|
| 224 |
+
'mistral large latest': 'mistralai/Mistral-Large-Instruct-2407',
|
| 225 |
+
'mistral large 2': 'mistralai/Mistral-Large-Instruct-2407',
|
| 226 |
+
'mistral large 2411': 'mistralai/Mistral-Large-Instruct-2411',
|
| 227 |
+
'mistral large 3': 'mistralai/Mistral-Large-Instruct-2411',
|
| 228 |
};
|
| 229 |
|
| 230 |
const MANUAL_SIZE_MAP = {
|
| 231 |
'BAAI/bge-m3': 0.57,
|
| 232 |
+
// FLUX family
|
| 233 |
'black-forest-labs/FLUX.1-schnell': 12,
|
| 234 |
'black-forest-labs/FLUX.1-dev': 12,
|
| 235 |
'black-forest-labs/FLUX.1-pro': 12,
|
|
|
|
| 236 |
'black-forest-labs/FLUX.2-dev': 32,
|
| 237 |
'black-forest-labs/FLUX.2-pro': 32,
|
| 238 |
'black-forest-labs/FLUX.2-flex': 32,
|
| 239 |
'black-forest-labs/FLUX.2-max': 32,
|
|
|
|
| 240 |
'black-forest-labs/FLUX.2-klein-4B': 4,
|
| 241 |
'black-forest-labs/FLUX.2-klein-9B': 9,
|
| 242 |
+
// Mistral family
|
| 243 |
+
'mistralai/Mistral-Large-Instruct-2407': 123,
|
| 244 |
+
'mistralai/Mistral-Large-Instruct-2411': 675, // 41B active
|
| 245 |
};
|
| 246 |
|
| 247 |
// Propagate capabilities and size from benchmarks, OpenRouter, or HF Hub to all other providers' models.
|
src/App.tsx
CHANGED
|
@@ -74,6 +74,14 @@ interface BenchmarkEntry {
|
|
| 74 |
arena_votes?: number;
|
| 75 |
// Aider code editing benchmark (aider.chat)
|
| 76 |
aider_pass_rate?: number; // 0-1, first-pass success on 133 coding tasks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
}
|
| 78 |
|
| 79 |
const normalizeName = (s: string) =>
|
|
@@ -328,7 +336,9 @@ function App() {
|
|
| 328 |
case 'lb_if':
|
| 329 |
case 'lb_data_analysis':
|
| 330 |
case 'arena_elo':
|
| 331 |
-
case 'aider_pass_rate':
|
|
|
|
|
|
|
| 332 |
const bA = findBenchmark(a.name);
|
| 333 |
const bB = findBenchmark(b.name);
|
| 334 |
aValue = bA?.[sortConfig.key as keyof BenchmarkEntry] as number ?? -1;
|
|
@@ -476,6 +486,8 @@ function App() {
|
|
| 476 |
{showBenchmarks && <>
|
| 477 |
<th onClick={() => requestSort('arena_elo')} className="sortable" title="Chatbot Arena ELO (human preference votes)">Arena ELO {getSortIcon('arena_elo')}</th>
|
| 478 |
<th onClick={() => requestSort('aider_pass_rate')} className="sortable" title="Aider code editing benchmark (pass rate, 133 tasks)">Aider {getSortIcon('aider_pass_rate')}</th>
|
|
|
|
|
|
|
| 479 |
<th onClick={() => requestSort('lb_global')} className="sortable" title="LiveBench overall average (contamination-free)">LB {getSortIcon('lb_global')}</th>
|
| 480 |
<th onClick={() => requestSort('lb_math')} className="sortable" title="LiveBench Mathematics">LB-Math {getSortIcon('lb_math')}</th>
|
| 481 |
<th onClick={() => requestSort('lb_coding')} className="sortable" title="LiveBench Coding + Agentic Coding">LB-Code {getSortIcon('lb_coding')}</th>
|
|
@@ -555,6 +567,8 @@ function App() {
|
|
| 555 |
return <>
|
| 556 |
<td className="benchmark-cell">{bm?.arena_elo !== undefined ? Math.round(bm.arena_elo) : '–'}</td>
|
| 557 |
<td className="benchmark-cell">{fmt(bm?.aider_pass_rate)}</td>
|
|
|
|
|
|
|
| 558 |
<td className="benchmark-cell">{fmt(bm?.lb_global)}</td>
|
| 559 |
<td className="benchmark-cell">{fmt(bm?.lb_math)}</td>
|
| 560 |
<td className="benchmark-cell">{fmt(bm?.lb_coding)}</td>
|
|
@@ -578,6 +592,7 @@ function App() {
|
|
| 578 |
|
| 579 |
<footer>
|
| 580 |
<p>* All prices normalized to USD for comparison using 1 EUR = {EXCHANGE_RATE_EUR_TO_USD} USD.</p>
|
|
|
|
| 581 |
<p>Sorted by input price by default.</p>
|
| 582 |
</footer>
|
| 583 |
</div>
|
|
|
|
| 74 |
arena_votes?: number;
|
| 75 |
// Aider code editing benchmark (aider.chat)
|
| 76 |
aider_pass_rate?: number; // 0-1, first-pass success on 133 coding tasks
|
| 77 |
+
// Artificial Analysis (artificialanalysis.ai)
|
| 78 |
+
aa_id?: string;
|
| 79 |
+
aa_intelligence?: number; // 0-100 intelligence index
|
| 80 |
+
aa_mmlu_pro?: number;
|
| 81 |
+
aa_gpqa?: number;
|
| 82 |
+
aa_livecodebench?: number;
|
| 83 |
+
aa_tokens_per_s?: number;
|
| 84 |
+
aa_latency_s?: number;
|
| 85 |
}
|
| 86 |
|
| 87 |
const normalizeName = (s: string) =>
|
|
|
|
| 336 |
case 'lb_if':
|
| 337 |
case 'lb_data_analysis':
|
| 338 |
case 'arena_elo':
|
| 339 |
+
case 'aider_pass_rate':
|
| 340 |
+
case 'aa_intelligence':
|
| 341 |
+
case 'aa_tokens_per_s': {
|
| 342 |
const bA = findBenchmark(a.name);
|
| 343 |
const bB = findBenchmark(b.name);
|
| 344 |
aValue = bA?.[sortConfig.key as keyof BenchmarkEntry] as number ?? -1;
|
|
|
|
| 486 |
{showBenchmarks && <>
|
| 487 |
<th onClick={() => requestSort('arena_elo')} className="sortable" title="Chatbot Arena ELO (human preference votes)">Arena ELO {getSortIcon('arena_elo')}</th>
|
| 488 |
<th onClick={() => requestSort('aider_pass_rate')} className="sortable" title="Aider code editing benchmark (pass rate, 133 tasks)">Aider {getSortIcon('aider_pass_rate')}</th>
|
| 489 |
+
<th onClick={() => requestSort('aa_intelligence')} className="sortable" title="Artificial Analysis Intelligence Index (0-100)">AA Intel {getSortIcon('aa_intelligence')}</th>
|
| 490 |
+
<th onClick={() => requestSort('aa_tokens_per_s')} className="sortable" title="Artificial Analysis Median Speed (Tokens per Second)">AA Speed {getSortIcon('aa_tokens_per_s')}</th>
|
| 491 |
<th onClick={() => requestSort('lb_global')} className="sortable" title="LiveBench overall average (contamination-free)">LB {getSortIcon('lb_global')}</th>
|
| 492 |
<th onClick={() => requestSort('lb_math')} className="sortable" title="LiveBench Mathematics">LB-Math {getSortIcon('lb_math')}</th>
|
| 493 |
<th onClick={() => requestSort('lb_coding')} className="sortable" title="LiveBench Coding + Agentic Coding">LB-Code {getSortIcon('lb_coding')}</th>
|
|
|
|
| 567 |
return <>
|
| 568 |
<td className="benchmark-cell">{bm?.arena_elo !== undefined ? Math.round(bm.arena_elo) : '–'}</td>
|
| 569 |
<td className="benchmark-cell">{fmt(bm?.aider_pass_rate)}</td>
|
| 570 |
+
<td className="benchmark-cell">{bm?.aa_intelligence !== undefined ? Math.round(bm.aa_intelligence) : '–'}</td>
|
| 571 |
+
<td className="benchmark-cell">{bm?.aa_tokens_per_s !== undefined ? Math.round(bm.aa_tokens_per_s) : '–'}</td>
|
| 572 |
<td className="benchmark-cell">{fmt(bm?.lb_global)}</td>
|
| 573 |
<td className="benchmark-cell">{fmt(bm?.lb_math)}</td>
|
| 574 |
<td className="benchmark-cell">{fmt(bm?.lb_coding)}</td>
|
|
|
|
| 592 |
|
| 593 |
<footer>
|
| 594 |
<p>* All prices normalized to USD for comparison using 1 EUR = {EXCHANGE_RATE_EUR_TO_USD} USD.</p>
|
| 595 |
+
<p>Benchmark data from LLMStats, HF Leaderboard, LiveBench, Chatbot Arena, Aider, and <a href="https://artificialanalysis.ai/" target="_blank" rel="noopener noreferrer">Artificial Analysis</a>.</p>
|
| 596 |
<p>Sorted by input price by default.</p>
|
| 597 |
</footer>
|
| 598 |
</div>
|