CrispStrobe committed on
Commit
cf1221a
·
1 Parent(s): 388998e

feat: integrate Artificial Analysis API and correct Mistral family sizes

Browse files
Files changed (3) hide show
  1. scripts/fetch-benchmarks.js +132 -291
  2. scripts/fetch-providers.js +10 -3
  3. src/App.tsx +16 -1
scripts/fetch-benchmarks.js CHANGED
@@ -1,7 +1,7 @@
1
  'use strict';
2
 
3
  /**
4
- * Fetch benchmark data from five sources and merge into data/benchmarks.json.
5
  *
6
  * Sources:
7
  * 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
@@ -9,6 +9,7 @@
9
  * 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
10
  * 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
11
  * 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
 
12
  *
13
  * Unified field names (0-1 scale unless noted):
14
  * mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
@@ -18,17 +19,16 @@
18
  * lb_math, lb_language, lb_if, lb_data_analysis
19
  * arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500)
20
  * aider_pass_rate (Aider edit bench, 0-1)
 
 
21
  *
22
- * Where both sources have data for the same benchmark (gpqa, mmlu_pro, ifeval, bbh),
23
  * LLMStats takes priority (it stores self-reported model-card values).
24
  *
25
  * Usage:
26
  * node scripts/fetch-benchmarks.js # fetch all sources
 
27
  * node scripts/fetch-benchmarks.js livebench # refresh LiveBench only
28
- * node scripts/fetch-benchmarks.js arena # refresh Chatbot Arena only
29
- * node scripts/fetch-benchmarks.js aider # refresh Aider only
30
- * node scripts/fetch-benchmarks.js hf # refresh HF Leaderboard only
31
- * node scripts/fetch-benchmarks.js llmstats # refresh LLMStats only
32
  */
33
 
34
  const fs = require('fs');
@@ -170,18 +170,15 @@ async function fetchHFLeaderboard() {
170
  const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
171
  const LB_BASE_URL = 'https://livebench.ai';
172
 
173
- // Suffixes LiveBench appends to model names that providers don't use.
174
- // We strip these to produce a "base" name for matching.
175
  const LB_SUFFIX_RE = new RegExp(
176
  '(-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?|' +
177
  '-thinking(?:-(?:64k|32k|auto|minimal))?|' +
178
  '-(?:high|medium|low)-effort|' +
179
  '-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking)' +
180
- '(?:-(?:high|medium|low)-effort)?$' // handle double-suffix like -thinking-64k-high-effort
181
  );
182
 
183
  function lbBaseName(name) {
184
- // Repeatedly strip known suffixes until stable
185
  let prev;
186
  let cur = name;
187
  do { prev = cur; cur = cur.replace(LB_SUFFIX_RE, ''); } while (cur !== prev);
@@ -230,12 +227,10 @@ async function fetchLiveBench() {
230
  const dates = tree.tree
231
  .filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
232
  .map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
233
- .sort(); // oldest first
234
  console.log(`${dates.length} releases (${dates[0]} → ${dates[dates.length - 1]})`);
235
 
236
- // Use task→group mapping from the latest categories JSON (stable across releases)
237
  const cats = await getJson(`${LB_BASE_URL}/categories_${dates[dates.length - 1]}.json`);
238
-
239
  const taskToGroup = {};
240
  for (const [cat, tasks] of Object.entries(cats)) {
241
  const group =
@@ -248,35 +243,22 @@ async function fetchLiveBench() {
248
  if (group) for (const t of tasks) taskToGroup[t] = group;
249
  }
250
 
251
- // Fetch all releases (oldest→newest), so newer results overwrite older ones per model
252
- // Map: lb_name → entry (most recent release wins)
253
  const byName = new Map();
254
  for (const date of dates) {
255
  let csv;
256
- try {
257
- csv = await getText(`${LB_BASE_URL}/table_${date}.csv`);
258
- } catch (e) {
259
- console.warn(`\n ⚠ LiveBench ${date}: ${e.message}`);
260
- continue;
261
- }
262
- for (const entry of parseLiveBenchCsv(csv, taskToGroup)) {
263
- byName.set(entry.lb_name, entry); // newer release overwrites
264
- }
265
  process.stdout.write(` LiveBench: ${date}\r`);
266
  }
267
-
268
  const entries = [...byName.values()];
269
  console.log(` LiveBench: ${entries.length} unique models across all releases`);
270
  return entries;
271
  }
272
 
273
  function mergeLiveBench(entries, lbEntries) {
274
- // Build two lookups:
275
- // exact: normalized lb_name → entry
276
- // base: normalized base-name (suffixes stripped) → best-scoring entry among variants
277
  const exactMap = new Map();
278
- const baseMap = new Map(); // base → best lb entry by lb_global
279
-
280
  for (const lb of lbEntries) {
281
  exactMap.set(normName(lb.lb_name), lb);
282
  const base = normName(lbBaseName(lb.lb_name));
@@ -285,113 +267,36 @@ function mergeLiveBench(entries, lbEntries) {
285
  if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
286
  }
287
  }
288
-
289
- // Track which lb entries have been used (to avoid adding them as standalone new entries)
290
  const usedLbNames = new Set();
291
-
292
  let matched = 0;
293
  for (const e of entries) {
294
- const candidates = [
295
- normName(e.name || ''),
296
- normName((e.slug || '').split('/').pop() || ''),
297
- normName((e.hf_id || '').split('/').pop() || ''),
298
- ].filter(Boolean);
299
-
300
  let lb = null;
301
- for (const c of candidates) {
302
- lb = exactMap.get(c) || baseMap.get(c);
303
- if (lb) break;
304
- }
305
- if (lb) {
306
- Object.assign(e, lb);
307
- usedLbNames.add(lb.lb_name);
308
- matched++;
309
- }
310
  }
311
-
312
- // Add standalone entries for lbEntries not matched above.
313
- // Skip variants whose base was already matched (avoid duplicating e.g. all -effort variants).
314
- // Use the base model name (without -high-effort etc.) as the entry name so that
315
- // provider model names (which have no effort suffixes) can find this entry.
316
  const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
317
  const newEntries = [];
318
  for (const lb of lbEntries) {
319
  if (usedLbNames.has(lb.lb_name)) continue;
320
  const base = normName(lbBaseName(lb.lb_name));
321
- if (usedBases.has(base)) continue; // a variant of a matched model — skip
322
- // Only add the best-scoring variant of each base group
323
  if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
324
- const baseName = lbBaseName(lb.lb_name); // e.g. "claude-opus-4-5-20251101"
325
- newEntries.push({ name: baseName, ...lb }); // name uses base; lb_name keeps variant
326
  usedBases.add(base);
327
  }
328
  }
329
-
330
  console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
331
  return [...entries, ...newEntries];
332
  }
333
 
334
- // ─── Merge ───────────────────────────────────────────────────────────────────
335
-
336
- function mergeEntries(llmstats, hfEntries) {
337
- // Build lookup: normalized LLMStats name/slug → entry index
338
- const lsIdx = new Map();
339
- llmstats.forEach((e, i) => {
340
- lsIdx.set(normName(e.name), i);
341
- const slugModel = e.slug?.split('/').pop() || '';
342
- if (slugModel) lsIdx.set(normName(slugModel), i);
343
- });
344
-
345
- const merged = llmstats.map((e) => ({ ...e }));
346
- const hfOnly = [];
347
-
348
- for (const hf of hfEntries) {
349
- // Try matching by the model name part of the HF ID
350
- const modelPart = normName(hf.name);
351
- // Also try stripping a leading word (org prefix embedded in model name like "Meta-Llama-...")
352
- const modelWords = modelPart.split(' ');
353
- const modelNoPrefix = modelWords.length > 1 ? modelWords.slice(1).join(' ') : modelPart;
354
-
355
- const idx = lsIdx.get(modelPart) ?? lsIdx.get(modelNoPrefix);
356
- if (idx !== undefined) {
357
- // Merge HF fields into LLMStats entry (LLMStats wins for shared benchmarks)
358
- const target = merged[idx];
359
- if (!target.hf_id) target.hf_id = hf.hf_id;
360
- if (!target.params_b) target.params_b = hf.params_b;
361
- if (!target.ifeval) target.ifeval = hf.ifeval;
362
- if (!target.bbh) target.bbh = hf.bbh;
363
- if (!target.gpqa) target.gpqa = hf.gpqa;
364
- if (!target.mmlu_pro) target.mmlu_pro = hf.mmlu_pro;
365
- target.hf_math_lvl5 = hf.hf_math_lvl5;
366
- target.hf_musr = hf.hf_musr;
367
- target.hf_avg = hf.hf_avg;
368
- } else {
369
- hfOnly.push(hf);
370
- }
371
- }
372
-
373
- return [...merged, ...hfOnly];
374
- }
375
-
376
  // ─── Chatbot Arena ───────────────────────────────────────────────────────────
377
 
378
  async function fetchChatbotArena() {
379
  process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
380
-
381
- // The lmarena.ai leaderboard page renders via React Server Components.
382
- // Requesting with "RSC: 1" returns a streaming text/x-component payload that
383
- // embeds the full leaderboard entries (rank, ELO rating, votes) in the server
384
- // response — no authentication required.
385
  const text = await getText('https://lmarena.ai/en/leaderboard/text', {
386
- headers: {
387
- 'User-Agent': 'Mozilla/5.0',
388
- 'RSC': '1',
389
- 'Accept': 'text/x-component',
390
- },
391
  });
392
-
393
- // Each RSC line has the format: <hex_id>:<json_value>
394
- // Find the line containing "entries":[...] with ELO ratings
395
  let entries = null;
396
  for (const line of text.split('\n')) {
397
  if (!line.includes('"entries":[') || !line.includes('"rating":')) continue;
@@ -404,10 +309,8 @@ async function fetchChatbotArena() {
404
  entries = JSON.parse(line.substring(start, end));
405
  break;
406
  }
407
-
408
  if (!entries) throw new Error('Could not find entries in RSC payload');
409
  console.log(`${entries.length} models`);
410
-
411
  return entries.map((e) => ({
412
  arena_name: e.modelDisplayName,
413
  arena_org: e.modelOrganization,
@@ -420,30 +323,17 @@ async function fetchChatbotArena() {
420
  function mergeArena(entries, arenaEntries) {
421
  const arenaMap = new Map();
422
  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);
423
-
424
  let matched = 0;
425
  for (const e of entries) {
426
- const candidates = [
427
- normName(e.name || ''),
428
- normName((e.lb_name) || ''),
429
- normName((e.slug || '').split('/').pop() || ''),
430
- normName((e.hf_id || '').split('/').pop() || ''),
431
- ];
432
  const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
433
  if (a) {
434
- e.arena_elo = a.arena_elo;
435
- e.arena_rank = a.arena_rank;
436
- e.arena_votes = a.arena_votes;
437
- arenaMap.delete(normName(a.arena_name));
438
- matched++;
439
  }
440
  }
441
-
442
  const newEntries = [];
443
- for (const a of arenaMap.values()) {
444
- newEntries.push({ name: a.arena_name, ...a });
445
- }
446
-
447
  console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
448
  return [...entries, ...newEntries];
449
  }
@@ -455,10 +345,7 @@ const AIDER_RAW = 'https://raw.githubusercontent.com/Aider-AI/aider/main/aider/w
455
  async function fetchAider() {
456
  process.stdout.write('Aider: fetching edit leaderboard... ');
457
  const text = await getText(AIDER_RAW);
458
-
459
  const rows = yaml.load(text);
460
-
461
- // Multiple runs per model — keep the one with the best pass_rate_1
462
  const best = new Map();
463
  for (const row of rows) {
464
  if (!row.model || row.pass_rate_1 === undefined) continue;
@@ -466,15 +353,10 @@ async function fetchAider() {
466
  const existing = best.get(key);
467
  if (!existing || row.pass_rate_1 > existing.pass_rate_1) best.set(key, row);
468
  }
469
-
470
  const entries = [];
471
  for (const row of best.values()) {
472
- entries.push({
473
- aider_model: row.model,
474
- aider_pass_rate: row.pass_rate_1 / 100, // normalize 0-100 → 0-1
475
- });
476
  }
477
-
478
  console.log(`${entries.length} models (best run each)`);
479
  return entries;
480
  }
@@ -482,217 +364,176 @@ async function fetchAider() {
482
  function mergeAider(entries, aiderEntries) {
483
  const aiderMap = new Map();
484
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);
485
-
486
  let matched = 0;
487
  for (const e of entries) {
488
- const candidates = [
489
- normName(e.name || ''),
490
- normName((e.lb_name) || ''),
491
- normName((e.slug || '').split('/').pop() || ''),
492
- normName((e.hf_id || '').split('/').pop() || ''),
493
- normName((e.arena_name) || ''),
494
- ];
495
  const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
496
- if (a) {
497
- e.aider_pass_rate = a.aider_pass_rate;
498
- aiderMap.delete(normName(a.aider_model));
499
- matched++;
500
- }
501
  }
502
-
503
  const newEntries = [];
504
- for (const a of aiderMap.values()) {
505
- newEntries.push({ name: a.aider_model, aider_pass_rate: a.aider_pass_rate });
506
- }
507
-
508
  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
509
  return [...entries, ...newEntries];
510
  }
511
 
512
- // ─── Per-source partial refresh ──────────────────────────────────────────────
513
 
514
- // Fields owned by each source. Stripping these fields + removing source-only
515
- // entries allows re-running just one source without losing other sources' data.
516
- const SOURCE_FIELDS = {
517
- llmstats: ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
518
- hf: ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
519
- livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
520
- arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
521
- aider: ['aider_model', 'aider_pass_rate'],
522
- };
523
 
524
- // A unique field that only this source contributes (used to detect source-only entries).
525
- const SOURCE_ID_FIELD = {
526
- llmstats: 'slug',
527
- hf: 'hf_id',
528
- livebench: 'lb_name',
529
- arena: 'arena_elo',
530
- aider: 'aider_pass_rate',
531
- };
532
 
533
- const ALL_ID_FIELDS = Object.values(SOURCE_ID_FIELD);
534
-
535
- function stripSourceFields(entries, source) {
536
- const fields = SOURCE_FIELDS[source];
537
- const ownId = SOURCE_ID_FIELD[source];
538
- const otherId = ALL_ID_FIELDS.filter((id) => id !== ownId);
539
- return entries
540
- // Drop entries that only belong to this source (no other source data)
541
- .filter((e) => otherId.some((id) => e[id] !== undefined))
542
- .map((e) => {
543
- const stripped = { ...e };
544
- for (const f of fields) delete stripped[f];
545
- return stripped;
546
- });
 
 
 
547
  }
548
 
549
- // Merge freshly-fetched LLMStats data into an existing array of entries.
550
- function mergeLLMStatsInto(entries, llmstats) {
551
- const LS_FIELDS = SOURCE_FIELDS.llmstats;
552
- const nameMap = new Map();
553
- entries.forEach((e, i) => {
554
- if (e.name) nameMap.set(normName(e.name), i);
555
- const slugModel = (e.slug || '').split('/').pop();
556
- if (slugModel) nameMap.set(normName(slugModel), i);
557
- });
558
 
559
  let matched = 0;
560
- const usedIdx = new Set();
561
- const newEntries = [];
 
 
 
 
 
 
562
 
563
- for (const ls of llmstats) {
564
- const candidates = [normName(ls.name || ''), normName((ls.slug || '').split('/').pop())].filter(Boolean);
565
- const idx = candidates.map((c) => nameMap.get(c)).find((n) => n !== undefined);
566
- if (idx !== undefined && !usedIdx.has(idx)) {
567
- const target = entries[idx];
568
- for (const f of LS_FIELDS) { if (ls[f] !== undefined) target[f] = ls[f]; }
569
- usedIdx.add(idx);
570
  matched++;
571
- } else {
572
- newEntries.push({ ...ls });
573
  }
574
  }
575
 
576
- console.log(` LLMStats: ${matched} matched, ${newEntries.length} new entries`);
 
 
 
 
 
577
  return [...entries, ...newEntries];
578
  }
579
 
580
- // Merge freshly-fetched HF data into an existing array of entries.
581
- function mergeHFInto(entries, hfEntries) {
582
- const nameMap = new Map();
583
- entries.forEach((e, i) => {
584
- if (e.name) nameMap.set(normName(e.name), i);
585
- const slugModel = (e.slug || '').split('/').pop();
586
- if (slugModel) nameMap.set(normName(slugModel), i);
587
- });
588
-
589
- let matched = 0;
590
- const usedIdx = new Set();
591
- const newEntries = [];
592
 
 
 
 
 
 
 
 
 
 
593
  for (const hf of hfEntries) {
594
- const modelPart = normName(hf.name);
595
- const modelWords = modelPart.split(' ');
596
- const noPrefix = modelWords.length > 1 ? modelWords.slice(1).join(' ') : modelPart;
597
- const candidates = [modelPart, noPrefix].filter(Boolean);
598
- const idx = candidates.map((c) => nameMap.get(c)).find((n) => n !== undefined);
599
-
600
- if (idx !== undefined && !usedIdx.has(idx)) {
601
- const target = entries[idx];
602
- if (!target.hf_id) target.hf_id = hf.hf_id;
603
  if (!target.params_b) target.params_b = hf.params_b;
604
- // LLMStats takes priority for shared benchmarks
605
- if (!target.ifeval) target.ifeval = hf.ifeval;
606
- if (!target.bbh) target.bbh = hf.bbh;
607
- if (!target.gpqa) target.gpqa = hf.gpqa;
608
  if (!target.mmlu_pro) target.mmlu_pro = hf.mmlu_pro;
609
- // HF-exclusive fields always updated
610
  target.hf_math_lvl5 = hf.hf_math_lvl5;
611
- target.hf_musr = hf.hf_musr;
612
- target.hf_avg = hf.hf_avg;
613
- usedIdx.add(idx);
614
- matched++;
615
- } else {
616
- newEntries.push({ ...hf });
617
- }
618
  }
619
-
620
- console.log(` HF: ${matched} matched, ${newEntries.length} new entries`);
621
- return [...entries, ...newEntries];
622
  }
623
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  async function refreshSource(source) {
625
  if (!SOURCE_FIELDS[source]) {
626
  console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
627
  process.exit(1);
628
  }
629
-
630
  console.log(`Refreshing benchmark source: ${source}\n`);
631
  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));
632
- const stripped = stripSourceFields(existing, source);
633
-
 
 
634
  let result;
635
- if (source === 'llmstats') {
636
- const data = await fetchLLMStats();
637
- result = mergeLLMStatsInto(stripped, data);
638
- } else if (source === 'hf') {
639
- const data = await fetchHFLeaderboard();
640
- result = mergeHFInto(stripped, data);
641
- } else if (source === 'livebench') {
642
- const data = await fetchLiveBench();
643
- result = mergeLiveBench(stripped, data);
644
- } else if (source === 'arena') {
645
- const data = await fetchChatbotArena();
646
- result = mergeArena(stripped, data);
647
- } else if (source === 'aider') {
648
- const data = await fetchAider();
649
- result = mergeAider(stripped, data);
650
- }
651
-
652
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
653
- console.log(`\nSaved ${result.length} entries to data/benchmarks.json`);
654
  }
655
 
656
  // ─── Main ────────────────────────────────────────────────────────────────────
657
 
658
  async function main() {
659
  const source = process.argv[2]?.toLowerCase();
 
660
 
661
- // Per-source refresh mode
662
- if (source) {
663
- await refreshSource(source);
664
- return;
665
- }
666
-
667
- // Full rebuild — all sources
668
- const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries] = await Promise.all([
669
  fetchLLMStats(),
670
  fetchHFLeaderboard(),
671
  fetchLiveBench(),
672
  fetchChatbotArena(),
673
  fetchAider(),
 
674
  ]);
675
 
676
  const merged = mergeEntries(llmstats, hfEntries);
677
  const withLB = mergeLiveBench(merged, lbEntries);
678
  const withAr = mergeArena(withLB, arenaEntries);
679
- const all = mergeAider(withAr, aiderEntries);
680
-
681
- const hfOnlyCount = all.filter((e) => e.hf_id && !e.slug).length;
682
- const lsOnlyCount = all.filter((e) => e.slug && !e.hf_id).length;
683
- const bothCount = all.filter((e) => e.slug && e.hf_id).length;
684
- const lbCount = all.filter((e) => e.lb_name).length;
685
- const arenaCount = all.filter((e) => e.arena_elo).length;
686
- const aiderCount = all.filter((e) => e.aider_pass_rate !== undefined).length;
687
  console.log(`\nTotal entries: ${all.length}`);
688
- console.log(` LLMStats only: ${lsOnlyCount} | HF only: ${hfOnlyCount} | Both: ${bothCount}`);
689
- console.log(` With LiveBench: ${lbCount} | With Arena ELO: ${arenaCount} | With Aider: ${aiderCount}`);
690
 
691
  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
692
  console.log(`Saved to data/benchmarks.json (${(fs.statSync(OUT_FILE).size / 1024).toFixed(0)} KB)`);
693
  }
694
 
695
- main().catch((err) => {
696
- console.error('Fatal:', err);
697
- process.exit(1);
698
- });
 
1
  'use strict';
2
 
3
  /**
4
+ * Fetch benchmark data from six sources and merge into data/benchmarks.json.
5
  *
6
  * Sources:
7
  * 1. AchilleasDrakou/LLMStats on GitHub (71 curated models, self-reported benchmarks)
 
9
  * 3. LiveBench (livebench.ai) — contamination-free, monthly, 70+ frontier models
10
  * 4. Chatbot Arena (lmarena.ai) — 316 models with real ELO ratings from human votes
11
  * 5. Aider (aider.chat) — code editing benchmark, 133 tasks per model
12
+ * 6. Artificial Analysis (artificialanalysis.ai) — independent evaluations and speed benchmarks
13
  *
14
  * Unified field names (0-1 scale unless noted):
15
  * mmlu, mmlu_pro, gpqa, human_eval, math, gsm8k, mmmu,
 
19
  * lb_math, lb_language, lb_if, lb_data_analysis
20
  * arena_elo, arena_rank, arena_votes (Chatbot Arena; elo is raw ELO ~800-1500)
21
  * aider_pass_rate (Aider edit bench, 0-1)
22
+ * aa_id, aa_intelligence, aa_mmlu_pro, aa_gpqa, (Artificial Analysis)
23
+ * aa_livecodebench, aa_tokens_per_s, aa_latency_s
24
  *
25
+ * Where multiple sources have data for the same benchmark,
26
  * LLMStats takes priority (it stores self-reported model-card values).
27
  *
28
  * Usage:
29
  * node scripts/fetch-benchmarks.js # fetch all sources
30
+ * node scripts/fetch-benchmarks.js aa # refresh Artificial Analysis only
31
  * node scripts/fetch-benchmarks.js livebench # refresh LiveBench only
 
 
 
 
32
  */
33
 
34
  const fs = require('fs');
 
170
  const LB_GITHUB_TREE = 'https://api.github.com/repos/LiveBench/livebench.github.io/git/trees/main?recursive=1';
171
  const LB_BASE_URL = 'https://livebench.ai';
172
 
 
 
173
  const LB_SUFFIX_RE = new RegExp(
174
  '(-thinking-(?:auto-)?(?:\\d+k-)?(?:(?:high|medium|low)-effort)?|' +
175
  '-thinking(?:-(?:64k|32k|auto|minimal))?|' +
176
  '-(?:high|medium|low)-effort|' +
177
  '-base|-non-?reasoning|-(?:high|low|min)thinking|-nothinking)' +
178
+ '(?:-(?:high|medium|low)-effort)?$'
179
  );
180
 
181
  function lbBaseName(name) {
 
182
  let prev;
183
  let cur = name;
184
  do { prev = cur; cur = cur.replace(LB_SUFFIX_RE, ''); } while (cur !== prev);
 
227
  const dates = tree.tree
228
  .filter((f) => f.path.startsWith('public/table_') && f.path.endsWith('.csv'))
229
  .map((f) => f.path.replace('public/table_', '').replace('.csv', ''))
230
+ .sort();
231
  console.log(`${dates.length} releases (${dates[0]} → ${dates[dates.length - 1]})`);
232
 
 
233
  const cats = await getJson(`${LB_BASE_URL}/categories_${dates[dates.length - 1]}.json`);
 
234
  const taskToGroup = {};
235
  for (const [cat, tasks] of Object.entries(cats)) {
236
  const group =
 
243
  if (group) for (const t of tasks) taskToGroup[t] = group;
244
  }
245
 
 
 
246
  const byName = new Map();
247
  for (const date of dates) {
248
  let csv;
249
+ try { csv = await getText(`${LB_BASE_URL}/table_${date}.csv`); }
250
+ catch (e) { console.warn(`\n ⚠ LiveBench ${date}: ${e.message}`); continue; }
251
+ for (const entry of parseLiveBenchCsv(csv, taskToGroup)) byName.set(entry.lb_name, entry);
 
 
 
 
 
 
252
  process.stdout.write(` LiveBench: ${date}\r`);
253
  }
 
254
  const entries = [...byName.values()];
255
  console.log(` LiveBench: ${entries.length} unique models across all releases`);
256
  return entries;
257
  }
258
 
259
  function mergeLiveBench(entries, lbEntries) {
 
 
 
260
  const exactMap = new Map();
261
+ const baseMap = new Map();
 
262
  for (const lb of lbEntries) {
263
  exactMap.set(normName(lb.lb_name), lb);
264
  const base = normName(lbBaseName(lb.lb_name));
 
267
  if (!prev || (lb.lb_global || 0) > (prev.lb_global || 0)) baseMap.set(base, lb);
268
  }
269
  }
 
 
270
  const usedLbNames = new Set();
 
271
  let matched = 0;
272
  for (const e of entries) {
273
+ const candidates = [normName(e.name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || '')].filter(Boolean);
 
 
 
 
 
274
  let lb = null;
275
+ for (const c of candidates) { lb = exactMap.get(c) || baseMap.get(c); if (lb) break; }
276
+ if (lb) { Object.assign(e, lb); usedLbNames.add(lb.lb_name); matched++; }
 
 
 
 
 
 
 
277
  }
 
 
 
 
 
278
  const usedBases = new Set([...usedLbNames].map((n) => normName(lbBaseName(n))));
279
  const newEntries = [];
280
  for (const lb of lbEntries) {
281
  if (usedLbNames.has(lb.lb_name)) continue;
282
  const base = normName(lbBaseName(lb.lb_name));
283
+ if (usedBases.has(base)) continue;
 
284
  if (baseMap.get(base) === lb || exactMap.get(normName(lb.lb_name)) === lb) {
285
+ newEntries.push({ name: lbBaseName(lb.lb_name), ...lb });
 
286
  usedBases.add(base);
287
  }
288
  }
 
289
  console.log(` LiveBench: ${matched} matched, ${newEntries.length} new entries`);
290
  return [...entries, ...newEntries];
291
  }
292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  // ─── Chatbot Arena ───────────────────────────────────────────────────────────
294
 
295
  async function fetchChatbotArena() {
296
  process.stdout.write('Chatbot Arena: fetching RSC leaderboard... ');
 
 
 
 
 
297
  const text = await getText('https://lmarena.ai/en/leaderboard/text', {
298
+ headers: { 'User-Agent': 'Mozilla/5.0', 'RSC': '1', 'Accept': 'text/x-component' },
 
 
 
 
299
  });
 
 
 
300
  let entries = null;
301
  for (const line of text.split('\n')) {
302
  if (!line.includes('"entries":[') || !line.includes('"rating":')) continue;
 
309
  entries = JSON.parse(line.substring(start, end));
310
  break;
311
  }
 
312
  if (!entries) throw new Error('Could not find entries in RSC payload');
313
  console.log(`${entries.length} models`);
 
314
  return entries.map((e) => ({
315
  arena_name: e.modelDisplayName,
316
  arena_org: e.modelOrganization,
 
323
  function mergeArena(entries, arenaEntries) {
324
  const arenaMap = new Map();
325
  for (const a of arenaEntries) arenaMap.set(normName(a.arena_name), a);
 
326
  let matched = 0;
327
  for (const e of entries) {
328
+ const candidates = [normName(e.name || ''), normName(e.lb_name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || '')];
 
 
 
 
 
329
  const a = candidates.map((c) => arenaMap.get(c)).find(Boolean);
330
  if (a) {
331
+ e.arena_elo = a.arena_elo; e.arena_rank = a.arena_rank; e.arena_votes = a.arena_votes;
332
+ arenaMap.delete(normName(a.arena_name)); matched++;
 
 
 
333
  }
334
  }
 
335
  const newEntries = [];
336
+ for (const a of arenaMap.values()) newEntries.push({ name: a.arena_name, ...a });
 
 
 
337
  console.log(` Arena: ${matched} matched, ${newEntries.length} new entries`);
338
  return [...entries, ...newEntries];
339
  }
 
345
  async function fetchAider() {
346
  process.stdout.write('Aider: fetching edit leaderboard... ');
347
  const text = await getText(AIDER_RAW);
 
348
  const rows = yaml.load(text);
 
 
349
  const best = new Map();
350
  for (const row of rows) {
351
  if (!row.model || row.pass_rate_1 === undefined) continue;
 
353
  const existing = best.get(key);
354
  if (!existing || row.pass_rate_1 > existing.pass_rate_1) best.set(key, row);
355
  }
 
356
  const entries = [];
357
  for (const row of best.values()) {
358
+ entries.push({ aider_model: row.model, aider_pass_rate: row.pass_rate_1 / 100 });
 
 
 
359
  }
 
360
  console.log(`${entries.length} models (best run each)`);
361
  return entries;
362
  }
 
364
  function mergeAider(entries, aiderEntries) {
365
  const aiderMap = new Map();
366
  for (const a of aiderEntries) aiderMap.set(normName(a.aider_model), a);
 
367
  let matched = 0;
368
  for (const e of entries) {
369
+ const candidates = [normName(e.name || ''), normName(e.lb_name || ''), normName((e.slug || '').split('/').pop() || ''), normName((e.hf_id || '').split('/').pop() || ''), normName(e.arena_name || '')];
 
 
 
 
 
 
370
  const a = candidates.map((c) => aiderMap.get(c)).find(Boolean);
371
+ if (a) { e.aider_pass_rate = a.aider_pass_rate; aiderMap.delete(normName(a.aider_model)); matched++; }
 
 
 
 
372
  }
 
373
  const newEntries = [];
374
+ for (const a of aiderMap.values()) newEntries.push({ name: a.aider_model, aider_pass_rate: a.aider_pass_rate });
 
 
 
375
  console.log(` Aider: ${matched} matched, ${newEntries.length} new entries`);
376
  return [...entries, ...newEntries];
377
  }
378
 
379
+ // ─── Artificial Analysis ───────────────────────────────────────────────────
380
 
381
/**
 * Fetch model evaluations and speed metrics from the Artificial Analysis API.
 *
 * The API key is read from the ARTIFICIAL_ANALYSIS_API_KEY environment
 * variable. When it is unset the source is skipped and an empty list is
 * returned instead of failing.
 *
 * @returns {Promise<object[]>} One record per model with the `aa_*` fields.
 * @throws {Error} If the response does not carry a `data` array.
 */
async function fetchArtificialAnalysis() {
  const apiKey = process.env.ARTIFICIAL_ANALYSIS_API_KEY;
  if (!apiKey) {
    console.log('Artificial Analysis: skipping (no API key found)');
    return [];
  }

  process.stdout.write('Artificial Analysis: fetching benchmarks... ');
  const res = await getJson('https://artificialanalysis.ai/api/v2/data/llms/models', {
    headers: { 'x-api-key': apiKey },
  });

  // Require an actual array: a bare truthiness check would let a string or
  // object payload through and fail later in `.map` with an opaque TypeError,
  // and a null response would crash on the property access itself.
  if (!Array.isArray(res?.data)) throw new Error('Invalid response from Artificial Analysis API');
  console.log(`${res.data.length} models`);

  return res.data.map((m) => {
    // Models without an `evaluations` object still yield a record; its scores are undefined.
    const ev = m.evaluations || {};
    return {
      aa_id: m.id,
      aa_name: m.name,
      aa_slug: m.slug,
      aa_intelligence: ev.artificial_analysis_intelligence_index, // typically 0-100
      aa_mmlu_pro: ev.mmlu_pro, // 0-1
      aa_gpqa: ev.gpqa, // 0-1
      aa_livecodebench: ev.livecodebench, // 0-1
      aa_tokens_per_s: m.median_output_tokens_per_second,
      aa_latency_s: m.median_time_to_first_token_seconds,
    };
  });
}
411
 
412
/**
 * Merge Artificial Analysis records into existing entries and append the
 * records that matched nothing.
 *
 * An entry is looked up by its normalized `name`, `lb_name`, last `slug`
 * segment, last `hf_id` segment and `arena_name`, in that order; empty
 * normalized keys are ignored. On a match every field of the AA record is
 * copied onto the entry and the record is removed from the lookup so it is
 * used at most once.
 *
 * @param {object[]} entries - Benchmark entries merged so far (mutated).
 * @param {object[]} aaEntries - Records carrying `aa_name` and the other `aa_*` fields.
 * @returns {object[]} `entries` followed by one new entry per unmatched AA record.
 */
function mergeArtificialAnalysis(entries, aaEntries) {
  // Keyed by normalized AA model name; a later duplicate key replaces the earlier record.
  const unmatched = new Map();
  for (const record of aaEntries) {
    unmatched.set(normName(record.aa_name), record);
  }

  let matched = 0;
  for (const entry of entries) {
    const keys = [
      entry.name || '',
      entry.lb_name || '',
      (entry.slug || '').split('/').pop() || '',
      (entry.hf_id || '').split('/').pop() || '',
      entry.arena_name || '',
    ]
      .map((raw) => normName(raw))
      .filter(Boolean);

    // First key that resolves to a record wins.
    let hit;
    for (const key of keys) {
      hit = unmatched.get(key);
      if (hit) break;
    }
    if (!hit) continue;

    Object.assign(entry, hit);
    unmatched.delete(normName(hit.aa_name));
    matched += 1;
  }

  // Leftover records become standalone entries named after the AA model.
  const newEntries = [...unmatched.values()].map((record) => ({ name: record.aa_name, ...record }));

  console.log(` AA: ${matched} matched, ${newEntries.length} new entries`);
  return [...entries, ...newEntries];
}
442
 
443
+ // ─── Merge ───────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
444
 
445
/**
 * Combine LLMStats entries with HF leaderboard entries.
 *
 * HF entries are matched to LLMStats entries by normalized name, first as-is
 * and then with the leading word dropped. On a match, shared fields are taken
 * from HF only where the LLMStats copy has a falsy value, while the HF-only
 * fields are always overwritten. Unmatched HF entries are appended unchanged.
 *
 * @param {object[]} llmstats - LLMStats entries (shallow-copied, not mutated).
 * @param {object[]} hfEntries - HF leaderboard entries.
 * @returns {object[]} Merged LLMStats copies followed by the unmatched HF entries.
 */
function mergeEntries(llmstats, hfEntries) {
  // Order matters: it fixes the order in which new keys are added to a merged entry.
  const fillWhenFalsy = ['hf_id', 'params_b', 'ifeval', 'bbh', 'gpqa', 'mmlu_pro'];
  const alwaysFromHf = ['hf_math_lvl5', 'hf_musr', 'hf_avg'];

  // Normalized name and normalized last slug segment both point at the entry's position.
  const positionByKey = new Map();
  llmstats.forEach((entry, position) => {
    positionByKey.set(normName(entry.name), position);
    const slugTail = entry.slug?.split('/').pop() || '';
    if (slugTail) positionByKey.set(normName(slugTail), position);
  });

  const merged = llmstats.map((entry) => ({ ...entry }));
  const unmatchedHf = [];

  for (const hf of hfEntries) {
    const fullKey = normName(hf.name);
    const [, ...tailWords] = fullKey.split(' ');
    const keyWithoutFirstWord = tailWords.length > 0 ? tailWords.join(' ') : fullKey;

    const position = positionByKey.get(fullKey) ?? positionByKey.get(keyWithoutFirstWord);
    if (position === undefined) {
      unmatchedHf.push(hf);
      continue;
    }

    const target = merged[position];
    for (const field of fillWhenFalsy) {
      if (!target[field]) target[field] = hf[field];
    }
    for (const field of alwaysFromHf) {
      target[field] = hf[field];
    }
  }

  return [...merged, ...unmatchedHf];
}
474
 
475
+ // ─── Refresh ─────────────────────────────────────────────────────────────────
476
+
477
// Fields owned by each source. refreshSource() deletes exactly these keys from
// every stored entry before re-merging fresh data for that source, so every
// key a source's merge writes must be listed here or stale values survive a
// refresh. `aa_name` is listed because the Artificial Analysis merge copies it
// onto matched entries.
const SOURCE_FIELDS = {
  llmstats: ['slug', 'mmlu', 'mmlu_pro', 'gpqa', 'human_eval', 'math', 'gsm8k', 'mmmu', 'hellaswag', 'ifeval', 'arc', 'drop', 'mbpp', 'mgsm', 'bbh'],
  hf: ['hf_id', 'params_b', 'hf_math_lvl5', 'hf_musr', 'hf_avg'],
  livebench: ['lb_name', 'lb_global', 'lb_reasoning', 'lb_coding', 'lb_math', 'lb_language', 'lb_if', 'lb_data_analysis'],
  arena: ['arena_name', 'arena_org', 'arena_elo', 'arena_rank', 'arena_votes'],
  aider: ['aider_model', 'aider_pass_rate'],
  aa: ['aa_name', 'aa_id', 'aa_intelligence', 'aa_mmlu_pro', 'aa_gpqa', 'aa_livecodebench', 'aa_tokens_per_s', 'aa_latency_s'],
};

// One marker field per source: refreshSource() keeps a stored entry only if it
// carries the marker of at least one source other than the one being refreshed.
const SOURCE_ID_FIELD = {
  llmstats: 'slug',
  hf: 'hf_id',
  livebench: 'lb_name',
  arena: 'arena_elo',
  aider: 'aider_pass_rate',
  aa: 'aa_intelligence',
};
489
+
490
/**
 * Re-fetch a single benchmark source and merge it into the stored data file.
 *
 * Reads OUT_FILE, drops entries that carry no marker field from any other
 * source, removes the refreshed source's own fields from the remaining
 * entries, merges freshly fetched data for that source, and writes the result
 * back to OUT_FILE.
 *
 * Exits the process with code 1 when `source` is not a key of SOURCE_FIELDS.
 *
 * @param {string} source One of the keys of SOURCE_FIELDS.
 * @returns {Promise<void>}
 */
async function refreshSource(source) {
  // Own-property check: a plain `SOURCE_FIELDS[source]` lookup is also truthy
  // for inherited keys such as "constructor", which would skip this message and
  // fail later with an unrelated TypeError.
  if (!Object.prototype.hasOwnProperty.call(SOURCE_FIELDS, source)) {
    console.error(`Unknown source "${source}". Valid: ${Object.keys(SOURCE_FIELDS).join(', ')}`);
    process.exit(1);
  }

  console.log(`Refreshing benchmark source: ${source}\n`);
  const existing = JSON.parse(fs.readFileSync(OUT_FILE, 'utf8'));

  const ownIdField = SOURCE_ID_FIELD[source];
  const otherIdFields = Object.values(SOURCE_ID_FIELD).filter((field) => field !== ownIdField);

  // Keep only entries that some other source also contributed to, and strip the
  // fields owned by the source being refreshed so no stale values remain.
  const stripped = existing
    .filter((entry) => otherIdFields.some((field) => entry[field] !== undefined))
    .map((entry) => {
      const copy = { ...entry };
      for (const field of SOURCE_FIELDS[source]) delete copy[field];
      return copy;
    });

  let result;
  switch (source) {
    case 'llmstats':
      result = mergeLLMStatsInto(stripped, await fetchLLMStats());
      break;
    case 'hf':
      result = mergeHFInto(stripped, await fetchHFLeaderboard());
      break;
    case 'livebench':
      result = mergeLiveBench(stripped, await fetchLiveBench());
      break;
    case 'arena':
      result = mergeArena(stripped, await fetchChatbotArena());
      break;
    case 'aider':
      result = mergeAider(stripped, await fetchAider());
      break;
    case 'aa':
      result = mergeArtificialAnalysis(stripped, await fetchArtificialAnalysis());
      break;
  }

  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
}
510
 
511
  // ─── Main ────────────────────────────────────────────────────────────────────
512
 
513
/**
 * Script entry point.
 *
 * With a command-line argument, refreshes only that source and returns.
 * Without one, fetches every source in parallel, merges them in a fixed order
 * (LLMStats + HF, then LiveBench, Arena, Aider, Artificial Analysis), prints
 * coverage counts and writes the result to OUT_FILE.
 */
async function main() {
  const requested = process.argv[2]?.toLowerCase();
  if (requested) {
    await refreshSource(requested);
    return;
  }

  const fetched = await Promise.all([
    fetchLLMStats(),
    fetchHFLeaderboard(),
    fetchLiveBench(),
    fetchChatbotArena(),
    fetchAider(),
    fetchArtificialAnalysis(),
  ]);
  const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries] = fetched;

  // Each step takes the previous step's output, so the order matters.
  let all = mergeEntries(llmstats, hfEntries);
  all = mergeLiveBench(all, lbEntries);
  all = mergeArena(all, arenaEntries);
  all = mergeAider(all, aiderEntries);
  all = mergeArtificialAnalysis(all, aaEntries);

  const countWhere = (predicate) => all.filter(predicate).length;
  const withLiveBench = countWhere((e) => e.lb_name);
  const withArena = countWhere((e) => e.arena_elo);
  const withAider = countWhere((e) => e.aider_pass_rate !== undefined);
  const withAA = countWhere((e) => e.aa_intelligence !== undefined);

  console.log(`\nTotal entries: ${all.length}`);
  console.log(` With LiveBench: ${withLiveBench} | Arena: ${withArena} | Aider: ${withAider} | AA: ${withAA}`);

  fs.writeFileSync(OUT_FILE, JSON.stringify(all, null, 2));
  const sizeKb = (fs.statSync(OUT_FILE).size / 1024).toFixed(0);
  console.log(`Saved to data/benchmarks.json (${sizeKb} KB)`);
}
538
 
539
// Run the script; report any unhandled failure and exit with a non-zero code.
main().catch((fatal) => {
  console.error('Fatal:', fatal);
  process.exit(1);
});
 
 
 
scripts/fetch-providers.js CHANGED
@@ -219,22 +219,29 @@ const MANUAL_HF_ID_MAP = {
219
  'flux 1 1 pro ultra': 'black-forest-labs/FLUX.1-pro',
220
  'flux 1 fill pro': 'black-forest-labs/FLUX.1-pro',
221
  'flux 1 kontext max': 'black-forest-labs/FLUX.1-pro',
 
 
 
 
 
 
222
  };
223
 
224
  const MANUAL_SIZE_MAP = {
225
  'BAAI/bge-m3': 0.57,
226
- // FLUX.1 family (Original 12B architecture)
227
  'black-forest-labs/FLUX.1-schnell': 12,
228
  'black-forest-labs/FLUX.1-dev': 12,
229
  'black-forest-labs/FLUX.1-pro': 12,
230
- // FLUX.2 family (32B flagship architecture with Mistral-3 24B backbone)
231
  'black-forest-labs/FLUX.2-dev': 32,
232
  'black-forest-labs/FLUX.2-pro': 32,
233
  'black-forest-labs/FLUX.2-flex': 32,
234
  'black-forest-labs/FLUX.2-max': 32,
235
- // FLUX.2 Klein (Optimized smaller versions)
236
  'black-forest-labs/FLUX.2-klein-4B': 4,
237
  'black-forest-labs/FLUX.2-klein-9B': 9,
 
 
 
238
  };
239
 
240
  // Propagate capabilities and size from benchmarks, OpenRouter, or HF Hub to all other providers' models.
 
219
  'flux 1 1 pro ultra': 'black-forest-labs/FLUX.1-pro',
220
  'flux 1 fill pro': 'black-forest-labs/FLUX.1-pro',
221
  'flux 1 kontext max': 'black-forest-labs/FLUX.1-pro',
222
+ // Mistral mappings
223
+ 'mistral large 2407': 'mistralai/Mistral-Large-Instruct-2407',
224
+ 'mistral large latest': 'mistralai/Mistral-Large-Instruct-2407',
225
+ 'mistral large 2': 'mistralai/Mistral-Large-Instruct-2407',
226
+ 'mistral large 2411': 'mistralai/Mistral-Large-Instruct-2411',
227
+ 'mistral large 3': 'mistralai/Mistral-Large-Instruct-2411',
228
  };
229
 
230
  const MANUAL_SIZE_MAP = {
231
  'BAAI/bge-m3': 0.57,
232
+ // FLUX family
233
  'black-forest-labs/FLUX.1-schnell': 12,
234
  'black-forest-labs/FLUX.1-dev': 12,
235
  'black-forest-labs/FLUX.1-pro': 12,
 
236
  'black-forest-labs/FLUX.2-dev': 32,
237
  'black-forest-labs/FLUX.2-pro': 32,
238
  'black-forest-labs/FLUX.2-flex': 32,
239
  'black-forest-labs/FLUX.2-max': 32,
 
240
  'black-forest-labs/FLUX.2-klein-4B': 4,
241
  'black-forest-labs/FLUX.2-klein-9B': 9,
242
+ // Mistral family
243
+ 'mistralai/Mistral-Large-Instruct-2407': 123,
244
+ 'mistralai/Mistral-Large-Instruct-2411': 675, // 41B active
245
  };
246
 
247
  // Propagate capabilities and size from benchmarks, OpenRouter, or HF Hub to all other providers' models.
src/App.tsx CHANGED
@@ -74,6 +74,14 @@ interface BenchmarkEntry {
74
  arena_votes?: number;
75
  // Aider code editing benchmark (aider.chat)
76
  aider_pass_rate?: number; // 0-1, first-pass success on 133 coding tasks
 
 
 
 
 
 
 
 
77
  }
78
 
79
  const normalizeName = (s: string) =>
@@ -328,7 +336,9 @@ function App() {
328
  case 'lb_if':
329
  case 'lb_data_analysis':
330
  case 'arena_elo':
331
- case 'aider_pass_rate': {
 
 
332
  const bA = findBenchmark(a.name);
333
  const bB = findBenchmark(b.name);
334
  aValue = bA?.[sortConfig.key as keyof BenchmarkEntry] as number ?? -1;
@@ -476,6 +486,8 @@ function App() {
476
  {showBenchmarks && <>
477
  <th onClick={() => requestSort('arena_elo')} className="sortable" title="Chatbot Arena ELO (human preference votes)">Arena ELO {getSortIcon('arena_elo')}</th>
478
  <th onClick={() => requestSort('aider_pass_rate')} className="sortable" title="Aider code editing benchmark (pass rate, 133 tasks)">Aider {getSortIcon('aider_pass_rate')}</th>
 
 
479
  <th onClick={() => requestSort('lb_global')} className="sortable" title="LiveBench overall average (contamination-free)">LB {getSortIcon('lb_global')}</th>
480
  <th onClick={() => requestSort('lb_math')} className="sortable" title="LiveBench Mathematics">LB-Math {getSortIcon('lb_math')}</th>
481
  <th onClick={() => requestSort('lb_coding')} className="sortable" title="LiveBench Coding + Agentic Coding">LB-Code {getSortIcon('lb_coding')}</th>
@@ -555,6 +567,8 @@ function App() {
555
  return <>
556
  <td className="benchmark-cell">{bm?.arena_elo !== undefined ? Math.round(bm.arena_elo) : '–'}</td>
557
  <td className="benchmark-cell">{fmt(bm?.aider_pass_rate)}</td>
 
 
558
  <td className="benchmark-cell">{fmt(bm?.lb_global)}</td>
559
  <td className="benchmark-cell">{fmt(bm?.lb_math)}</td>
560
  <td className="benchmark-cell">{fmt(bm?.lb_coding)}</td>
@@ -578,6 +592,7 @@ function App() {
578
 
579
  <footer>
580
  <p>* All prices normalized to USD for comparison using 1 EUR = {EXCHANGE_RATE_EUR_TO_USD} USD.</p>
 
581
  <p>Sorted by input price by default.</p>
582
  </footer>
583
  </div>
 
74
  arena_votes?: number;
75
  // Aider code editing benchmark (aider.chat)
76
  aider_pass_rate?: number; // 0-1, first-pass success on 133 coding tasks
77
+ // Artificial Analysis (artificialanalysis.ai)
78
+ aa_id?: string;
79
+ aa_intelligence?: number; // 0-100 intelligence index
80
+ aa_mmlu_pro?: number;
81
+ aa_gpqa?: number;
82
+ aa_livecodebench?: number;
83
+ aa_tokens_per_s?: number;
84
+ aa_latency_s?: number;
85
  }
86
 
87
  const normalizeName = (s: string) =>
 
336
  case 'lb_if':
337
  case 'lb_data_analysis':
338
  case 'arena_elo':
339
+ case 'aider_pass_rate':
340
+ case 'aa_intelligence':
341
+ case 'aa_tokens_per_s': {
342
  const bA = findBenchmark(a.name);
343
  const bB = findBenchmark(b.name);
344
  aValue = bA?.[sortConfig.key as keyof BenchmarkEntry] as number ?? -1;
 
486
  {showBenchmarks && <>
487
  <th onClick={() => requestSort('arena_elo')} className="sortable" title="Chatbot Arena ELO (human preference votes)">Arena ELO {getSortIcon('arena_elo')}</th>
488
  <th onClick={() => requestSort('aider_pass_rate')} className="sortable" title="Aider code editing benchmark (pass rate, 133 tasks)">Aider {getSortIcon('aider_pass_rate')}</th>
489
+ <th onClick={() => requestSort('aa_intelligence')} className="sortable" title="Artificial Analysis Intelligence Index (0-100)">AA Intel {getSortIcon('aa_intelligence')}</th>
490
+ <th onClick={() => requestSort('aa_tokens_per_s')} className="sortable" title="Artificial Analysis Median Speed (Tokens per Second)">AA Speed {getSortIcon('aa_tokens_per_s')}</th>
491
  <th onClick={() => requestSort('lb_global')} className="sortable" title="LiveBench overall average (contamination-free)">LB {getSortIcon('lb_global')}</th>
492
  <th onClick={() => requestSort('lb_math')} className="sortable" title="LiveBench Mathematics">LB-Math {getSortIcon('lb_math')}</th>
493
  <th onClick={() => requestSort('lb_coding')} className="sortable" title="LiveBench Coding + Agentic Coding">LB-Code {getSortIcon('lb_coding')}</th>
 
567
  return <>
568
  <td className="benchmark-cell">{bm?.arena_elo !== undefined ? Math.round(bm.arena_elo) : '–'}</td>
569
  <td className="benchmark-cell">{fmt(bm?.aider_pass_rate)}</td>
570
+ <td className="benchmark-cell">{bm?.aa_intelligence !== undefined ? Math.round(bm.aa_intelligence) : '–'}</td>
571
+ <td className="benchmark-cell">{bm?.aa_tokens_per_s !== undefined ? Math.round(bm.aa_tokens_per_s) : '–'}</td>
572
  <td className="benchmark-cell">{fmt(bm?.lb_global)}</td>
573
  <td className="benchmark-cell">{fmt(bm?.lb_math)}</td>
574
  <td className="benchmark-cell">{fmt(bm?.lb_coding)}</td>
 
592
 
593
  <footer>
594
  <p>* All prices normalized to USD for comparison using 1 EUR = {EXCHANGE_RATE_EUR_TO_USD} USD.</p>
595
+ <p>Benchmark data from LLMStats, HF Leaderboard, LiveBench, Chatbot Arena, Aider, and <a href="https://artificialanalysis.ai/" target="_blank" rel="noopener noreferrer">Artificial Analysis</a>.</p>
596
  <p>Sorted by input price by default.</p>
597
  </footer>
598
  </div>