CrispStrobe committed on
Commit
5afa393
·
1 Parent(s): 062b038

feat: extract MTEB benchmarks from Hugging Face README metadata

Browse files
Files changed (2) hide show
  1. data/benchmarks.json +16 -16
  2. scripts/fetch-benchmarks.js +79 -2
data/benchmarks.json CHANGED
@@ -72956,11 +72956,11 @@
72956
  {
72957
  "hf_id": "BAAI/bge-large-en-v1.5",
72958
  "name": "bge-large-en-v1.5",
72959
- "mteb_avg": 46.8,
72960
- "mteb_retrieval": 38.8,
72961
  "sources": {
72962
- "mteb_avg": "mteb",
72963
- "mteb_retrieval": "mteb"
72964
  }
72965
  },
72966
  {
@@ -72986,20 +72986,20 @@
72986
  {
72987
  "hf_id": "intfloat/e5-mistral-7b-instruct",
72988
  "name": "e5-mistral-7b-instruct",
72989
- "mteb_avg": 62.08,
72990
- "mteb_retrieval": 55.06,
72991
  "sources": {
72992
- "mteb_avg": "mteb",
72993
- "mteb_retrieval": "mteb"
72994
  }
72995
  },
72996
  {
72997
  "hf_id": "BAAI/bge-multilingual-gemma2",
72998
- "mteb_avg": 70.3,
72999
- "mteb_retrieval": 67.5,
73000
  "sources": {
73001
- "mteb_avg": "manual",
73002
- "mteb_retrieval": "manual"
73003
  }
73004
  },
73005
  {
@@ -73013,11 +73013,11 @@
73013
  },
73014
  {
73015
  "hf_id": "BAAI/bge-en-icl",
73016
- "mteb_avg": 64.9,
73017
- "mteb_retrieval": 58.2,
73018
  "sources": {
73019
- "mteb_avg": "manual",
73020
- "mteb_retrieval": "manual"
73021
  }
73022
  }
73023
  ]
 
72956
  {
72957
  "hf_id": "BAAI/bge-large-en-v1.5",
72958
  "name": "bge-large-en-v1.5",
72959
+ "mteb_avg": 57.25,
72960
+ "mteb_retrieval": 48.93,
72961
  "sources": {
72962
+ "mteb_avg": "hf-readme",
72963
+ "mteb_retrieval": "hf-readme"
72964
  }
72965
  },
72966
  {
 
72986
  {
72987
  "hf_id": "intfloat/e5-mistral-7b-instruct",
72988
  "name": "e5-mistral-7b-instruct",
72989
+ "mteb_avg": 70.58,
72990
+ "mteb_retrieval": 58.58,
72991
  "sources": {
72992
+ "mteb_avg": "hf-readme",
72993
+ "mteb_retrieval": "hf-readme"
72994
  }
72995
  },
72996
  {
72997
  "hf_id": "BAAI/bge-multilingual-gemma2",
72998
+ "mteb_avg": 69.07,
72999
+ "mteb_retrieval": 63.75,
73000
  "sources": {
73001
+ "mteb_avg": "hf-readme",
73002
+ "mteb_retrieval": "hf-readme"
73003
  }
73004
  },
73005
  {
 
73013
  },
73014
  {
73015
  "hf_id": "BAAI/bge-en-icl",
73016
+ "mteb_avg": 71.68,
73017
+ "mteb_retrieval": 62.16,
73018
  "sources": {
73019
+ "mteb_avg": "hf-readme",
73020
+ "mteb_retrieval": "hf-readme"
73021
  }
73022
  }
73023
  ]
scripts/fetch-benchmarks.js CHANGED
@@ -667,13 +667,88 @@ async function refreshSource(source) {
667
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
668
  }
669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  // ─── Main ────────────────────────────────────────────────────────────────────
671
 
672
  async function main() {
673
  const source = process.argv[2]?.toLowerCase();
674
  if (source) { await refreshSource(source); return; }
675
 
676
- const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries] = await Promise.all([
677
  fetchLLMStats(),
678
  fetchHFLeaderboard(),
679
  fetchLiveBench(),
@@ -681,6 +756,7 @@ async function main() {
681
  fetchAider(),
682
  fetchArtificialAnalysis(),
683
  fetchMTEB(),
 
684
  ]);
685
 
686
  const merged = mergeEntries(llmstats, hfEntries);
@@ -688,7 +764,8 @@ async function main() {
688
  const withAr = mergeArena(withLB, arenaEntries);
689
  const withAi = mergeAider(withAr, aiderEntries);
690
  const withAA = mergeArtificialAnalysis(withAi, aaEntries);
691
- const all = mergeMTEB(withAA, mtebEntries);
 
692
 
693
  console.log(`\nTotal entries: ${all.length}`);
694
  console.log(` With LiveBench: ${all.filter(e => e.lb_name).length} | Arena: ${all.filter(e => e.arena_elo).length} | Aider: ${all.filter(e => e.aider_pass_rate !== undefined).length} | AA: ${all.filter(e => e.aa_intelligence !== undefined).length} | MTEB: ${all.filter(e => e.mteb_avg !== undefined).length}`);
 
667
  fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
668
  }
669
 
670
+ // ─── HF README Evaluation ──────────────────────────────────────────────────
671
+
672
// Metric types accepted as a result's score, in order of preference.
// `main_score` comes first so that it wins over `ndcg_at_10` / `accuracy`
// whenever a result lists several of them.
const HF_README_METRIC_PRIORITY = ['main_score', 'ndcg_at_10', 'accuracy'];

/**
 * Returns the YAML front matter of a README, or null when there is none.
 *
 * The closing delimiter must be a `---` on a line of its own; a `---` that
 * merely appears inside a YAML value does not terminate the block.
 *
 * @param {unknown} text - Raw README contents.
 * @returns {string|null} The YAML text between the delimiters, or null.
 */
function hfReadmeFrontMatter(text) {
  if (typeof text !== 'string') return null;
  const match = /^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/.exec(text);
  return match ? match[1] : null;
}

/**
 * Averages the MTEB-like results found in parsed README metadata.
 *
 * A result is considered when its dataset type or name mentions "mteb", or
 * when its task type mentions "retrieval". Values <= 1.0 are treated as
 * fractions and scaled to the 0-100 range. Non-finite values are ignored so a
 * single NaN cannot poison the averages.
 *
 * @param {object|null|undefined} meta - Parsed front matter.
 * @returns {{mteb_avg: number, mteb_retrieval: (number|undefined)}|null}
 *   Averages rounded to two decimals, or null when nothing usable was found.
 */
function hfReadmeMtebScores(meta) {
  const rawIndex = meta?.['model-index'];
  if (!rawIndex) return null;
  const modelIndex = Array.isArray(rawIndex) ? rawIndex : [rawIndex];

  let total = 0;
  let count = 0;
  let retTotal = 0;
  let retCount = 0;

  for (const entry of modelIndex) {
    const results = Array.isArray(entry?.results) ? entry.results : [];
    for (const res of results) {
      const datasetType = String(res?.dataset?.type ?? '').toLowerCase();
      const datasetName = String(res?.dataset?.name ?? '').toLowerCase();
      const taskType = String(res?.task?.type ?? '').toLowerCase();

      const isMTEB = datasetType.includes('mteb') ||
        datasetName.includes('mteb') ||
        taskType.includes('retrieval');
      if (!isMTEB) continue;

      const metrics = Array.isArray(res.metrics) ? res.metrics : [];
      let metric;
      for (const wanted of HF_README_METRIC_PRIORITY) {
        metric = metrics.find((m) => m?.type === wanted && Number.isFinite(m.value));
        if (metric) break;
      }
      if (!metric) continue;

      const norm = metric.value <= 1.0 ? metric.value * 100 : metric.value;
      total += norm;
      count++;

      if (taskType.includes('retrieval') || taskType.includes('search')) {
        retTotal += norm;
        retCount++;
      }
    }
  }

  if (count === 0) return null;
  return {
    mteb_avg: Math.round(total / count * 100) / 100,
    mteb_retrieval: retCount > 0 ? Math.round(retTotal / retCount * 100) / 100 : undefined,
  };
}

/**
 * Derives MTEB averages from the `model-index` metadata in Hugging Face READMEs.
 *
 * Reads data/providers.json, collects every distinct `hf_id`, downloads each
 * model's README.md in batches of 10, and averages the matching results.
 * Models whose README is missing, has no front matter, or fails to parse are
 * skipped (best effort).
 *
 * @returns {Promise<Array<object>>} One entry per enriched model with `hf_id`,
 *   `name`, `mteb_avg`, optional `mteb_retrieval`, and `sources`. Returns an
 *   empty array when providers.json does not exist.
 */
async function fetchHFReadmeBenchmarks() {
  const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
  if (!fs.existsSync(providersPath)) return [];

  const parsed = JSON.parse(fs.readFileSync(providersPath, 'utf8'));
  // Tolerate a missing `providers` list or a provider without `models`: this
  // runs outside the per-model try/catch, so a throw here would reject the
  // whole fetch instead of skipping one model.
  const providers = Array.isArray(parsed?.providers) ? parsed.providers : [];
  const hfIds = new Set();
  for (const provider of providers) {
    for (const model of provider?.models ?? []) {
      if (model?.hf_id) hfIds.add(model.hf_id);
    }
  }

  process.stdout.write(`HF README: checking ${hfIds.size} models... `);
  const results = [];

  const BATCH = 10;
  const ids = Array.from(hfIds);
  for (let i = 0; i < ids.length; i += BATCH) {
    const batch = ids.slice(i, i + BATCH);
    const rows = await Promise.all(batch.map(async (hfId) => {
      try {
        const url = `https://huggingface.co/${hfId}/raw/main/README.md`;
        // getText is defined elsewhere in this file; presumably it resolves to
        // the response body as a string - verify against its definition.
        const text = await getText(url, { retries: 1 });
        const yamlText = hfReadmeFrontMatter(text);
        if (yamlText === null) return null;

        // NOTE(review): yaml.load is applied to remote, untrusted content.
        // js-yaml >= 4 uses a safe schema by default; confirm the installed
        // version, or switch to an explicitly safe loader.
        const scores = hfReadmeMtebScores(yaml.load(yamlText));
        if (!scores) return null;

        return {
          hf_id: hfId,
          name: hfId.split('/').pop(),
          mteb_avg: scores.mteb_avg,
          mteb_retrieval: scores.mteb_retrieval,
          sources: {
            mteb_avg: 'hf-readme',
            mteb_retrieval: scores.mteb_retrieval !== undefined ? 'hf-readme' : undefined,
          },
        };
      } catch (e) {
        // Deliberate best effort: a missing README or malformed YAML for one
        // model must not abort the whole run.
        return null;
      }
    }));
    rows.forEach((r) => { if (r) results.push(r); });
    process.stdout.write(` HF README: ${Math.min(i + BATCH, ids.length)}/${ids.length}\r`);
  }

  console.log(`\n HF README: ${results.length} models enriched from metadata`);
  return results;
}
744
+
745
  // ─── Main ────────────────────────────────────────────────────────────────────
746
 
747
  async function main() {
748
  const source = process.argv[2]?.toLowerCase();
749
  if (source) { await refreshSource(source); return; }
750
 
751
+ const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries, readmeEntries] = await Promise.all([
752
  fetchLLMStats(),
753
  fetchHFLeaderboard(),
754
  fetchLiveBench(),
 
756
  fetchAider(),
757
  fetchArtificialAnalysis(),
758
  fetchMTEB(),
759
+ fetchHFReadmeBenchmarks(),
760
  ]);
761
 
762
  const merged = mergeEntries(llmstats, hfEntries);
 
764
  const withAr = mergeArena(withLB, arenaEntries);
765
  const withAi = mergeAider(withAr, aiderEntries);
766
  const withAA = mergeArtificialAnalysis(withAi, aaEntries);
767
+ const withMTEB = mergeMTEB(withAA, mtebEntries);
768
+ const all = mergeMTEB(withMTEB, readmeEntries);
769
 
770
  console.log(`\nTotal entries: ${all.length}`);
771
  console.log(` With LiveBench: ${all.filter(e => e.lb_name).length} | Arena: ${all.filter(e => e.arena_elo).length} | Aider: ${all.filter(e => e.aider_pass_rate !== undefined).length} | AA: ${all.filter(e => e.aa_intelligence !== undefined).length} | MTEB: ${all.filter(e => e.mteb_avg !== undefined).length}`);