Spaces:
Running
Running
CrispStrobe committed on
Commit Β·
5afa393
1
Parent(s): 062b038
feat: extract MTEB benchmarks from Hugging Face README metadata
Browse files- data/benchmarks.json +16 -16
- scripts/fetch-benchmarks.js +79 -2
data/benchmarks.json
CHANGED
|
@@ -72956,11 +72956,11 @@
|
|
| 72956 |
{
|
| 72957 |
"hf_id": "BAAI/bge-large-en-v1.5",
|
| 72958 |
"name": "bge-large-en-v1.5",
|
| 72959 |
-
"mteb_avg":
|
| 72960 |
-
"mteb_retrieval":
|
| 72961 |
"sources": {
|
| 72962 |
-
"mteb_avg": "
|
| 72963 |
-
"mteb_retrieval": "
|
| 72964 |
}
|
| 72965 |
},
|
| 72966 |
{
|
|
@@ -72986,20 +72986,20 @@
|
|
| 72986 |
{
|
| 72987 |
"hf_id": "intfloat/e5-mistral-7b-instruct",
|
| 72988 |
"name": "e5-mistral-7b-instruct",
|
| 72989 |
-
"mteb_avg":
|
| 72990 |
-
"mteb_retrieval":
|
| 72991 |
"sources": {
|
| 72992 |
-
"mteb_avg": "
|
| 72993 |
-
"mteb_retrieval": "
|
| 72994 |
}
|
| 72995 |
},
|
| 72996 |
{
|
| 72997 |
"hf_id": "BAAI/bge-multilingual-gemma2",
|
| 72998 |
-
"mteb_avg":
|
| 72999 |
-
"mteb_retrieval":
|
| 73000 |
"sources": {
|
| 73001 |
-
"mteb_avg": "
|
| 73002 |
-
"mteb_retrieval": "
|
| 73003 |
}
|
| 73004 |
},
|
| 73005 |
{
|
|
@@ -73013,11 +73013,11 @@
|
|
| 73013 |
},
|
| 73014 |
{
|
| 73015 |
"hf_id": "BAAI/bge-en-icl",
|
| 73016 |
-
"mteb_avg":
|
| 73017 |
-
"mteb_retrieval":
|
| 73018 |
"sources": {
|
| 73019 |
-
"mteb_avg": "
|
| 73020 |
-
"mteb_retrieval": "
|
| 73021 |
}
|
| 73022 |
}
|
| 73023 |
]
|
|
|
|
| 72956 |
{
|
| 72957 |
"hf_id": "BAAI/bge-large-en-v1.5",
|
| 72958 |
"name": "bge-large-en-v1.5",
|
| 72959 |
+
"mteb_avg": 57.25,
|
| 72960 |
+
"mteb_retrieval": 48.93,
|
| 72961 |
"sources": {
|
| 72962 |
+
"mteb_avg": "hf-readme",
|
| 72963 |
+
"mteb_retrieval": "hf-readme"
|
| 72964 |
}
|
| 72965 |
},
|
| 72966 |
{
|
|
|
|
| 72986 |
{
|
| 72987 |
"hf_id": "intfloat/e5-mistral-7b-instruct",
|
| 72988 |
"name": "e5-mistral-7b-instruct",
|
| 72989 |
+
"mteb_avg": 70.58,
|
| 72990 |
+
"mteb_retrieval": 58.58,
|
| 72991 |
"sources": {
|
| 72992 |
+
"mteb_avg": "hf-readme",
|
| 72993 |
+
"mteb_retrieval": "hf-readme"
|
| 72994 |
}
|
| 72995 |
},
|
| 72996 |
{
|
| 72997 |
"hf_id": "BAAI/bge-multilingual-gemma2",
|
| 72998 |
+
"mteb_avg": 69.07,
|
| 72999 |
+
"mteb_retrieval": 63.75,
|
| 73000 |
"sources": {
|
| 73001 |
+
"mteb_avg": "hf-readme",
|
| 73002 |
+
"mteb_retrieval": "hf-readme"
|
| 73003 |
}
|
| 73004 |
},
|
| 73005 |
{
|
|
|
|
| 73013 |
},
|
| 73014 |
{
|
| 73015 |
"hf_id": "BAAI/bge-en-icl",
|
| 73016 |
+
"mteb_avg": 71.68,
|
| 73017 |
+
"mteb_retrieval": 62.16,
|
| 73018 |
"sources": {
|
| 73019 |
+
"mteb_avg": "hf-readme",
|
| 73020 |
+
"mteb_retrieval": "hf-readme"
|
| 73021 |
}
|
| 73022 |
}
|
| 73023 |
]
|
scripts/fetch-benchmarks.js
CHANGED
|
@@ -667,13 +667,88 @@ async function refreshSource(source) {
|
|
| 667 |
fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
|
| 668 |
}
|
| 669 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
// βββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 671 |
|
| 672 |
async function main() {
|
| 673 |
const source = process.argv[2]?.toLowerCase();
|
| 674 |
if (source) { await refreshSource(source); return; }
|
| 675 |
|
| 676 |
-
const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries] = await Promise.all([
|
| 677 |
fetchLLMStats(),
|
| 678 |
fetchHFLeaderboard(),
|
| 679 |
fetchLiveBench(),
|
|
@@ -681,6 +756,7 @@ async function main() {
|
|
| 681 |
fetchAider(),
|
| 682 |
fetchArtificialAnalysis(),
|
| 683 |
fetchMTEB(),
|
|
|
|
| 684 |
]);
|
| 685 |
|
| 686 |
const merged = mergeEntries(llmstats, hfEntries);
|
|
@@ -688,7 +764,8 @@ async function main() {
|
|
| 688 |
const withAr = mergeArena(withLB, arenaEntries);
|
| 689 |
const withAi = mergeAider(withAr, aiderEntries);
|
| 690 |
const withAA = mergeArtificialAnalysis(withAi, aaEntries);
|
| 691 |
-
const
|
|
|
|
| 692 |
|
| 693 |
console.log(`\nTotal entries: ${all.length}`);
|
| 694 |
console.log(` With LiveBench: ${all.filter(e => e.lb_name).length} | Arena: ${all.filter(e => e.arena_elo).length} | Aider: ${all.filter(e => e.aider_pass_rate !== undefined).length} | AA: ${all.filter(e => e.aa_intelligence !== undefined).length} | MTEB: ${all.filter(e => e.mteb_avg !== undefined).length}`);
|
|
|
|
| 667 |
fs.writeFileSync(OUT_FILE, JSON.stringify(result, null, 2));
|
| 668 |
}
|
| 669 |
|
| 670 |
+
// βββ HF README Evaluation ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 671 |
+
|
| 672 |
+
async function fetchHFReadmeBenchmarks() {
|
| 673 |
+
const providersPath = path.join(__dirname, '..', 'data', 'providers.json');
|
| 674 |
+
if (!fs.existsSync(providersPath)) return [];
|
| 675 |
+
|
| 676 |
+
const providers = JSON.parse(fs.readFileSync(providersPath, 'utf8')).providers;
|
| 677 |
+
const hfIds = new Set();
|
| 678 |
+
providers.forEach(p => p.models.forEach(m => { if (m.hf_id) hfIds.add(m.hf_id); }));
|
| 679 |
+
|
| 680 |
+
process.stdout.write(`HF README: checking ${hfIds.size} models... `);
|
| 681 |
+
const results = [];
|
| 682 |
+
|
| 683 |
+
const BATCH = 10;
|
| 684 |
+
const ids = Array.from(hfIds);
|
| 685 |
+
for (let i = 0; i < ids.length; i += BATCH) {
|
| 686 |
+
const batch = ids.slice(i, i + BATCH);
|
| 687 |
+
const rows = await Promise.all(batch.map(async (hfId) => {
|
| 688 |
+
try {
|
| 689 |
+
const url = `https://huggingface.co/${hfId}/raw/main/README.md`;
|
| 690 |
+
const text = await getText(url, { retries: 1 });
|
| 691 |
+
if (!text.startsWith('---')) return null;
|
| 692 |
+
|
| 693 |
+
const endYaml = text.indexOf('---', 3);
|
| 694 |
+
if (endYaml === -1) return null;
|
| 695 |
+
|
| 696 |
+
const yamlText = text.substring(3, endYaml);
|
| 697 |
+
const meta = yaml.load(yamlText);
|
| 698 |
+
if (!meta || !meta['model-index']) return null;
|
| 699 |
+
|
| 700 |
+
let total = 0, count = 0, retTotal = 0, retCount = 0;
|
| 701 |
+
const modelIndex = Array.isArray(meta['model-index']) ? meta['model-index'] : [meta['model-index']];
|
| 702 |
+
modelIndex.forEach(mi => {
|
| 703 |
+
(mi.results || []).forEach(res => {
|
| 704 |
+
const isMTEB = (res.dataset?.type || '').toLowerCase().includes('mteb') ||
|
| 705 |
+
(res.dataset?.name || '').toLowerCase().includes('mteb') ||
|
| 706 |
+
(res.task?.type || '').toLowerCase().includes('retrieval');
|
| 707 |
+
if (!isMTEB) return;
|
| 708 |
+
|
| 709 |
+
const mainMetric = (res.metrics || []).find(m => m.type === 'main_score' || m.type === 'ndcg_at_10' || m.type === 'accuracy');
|
| 710 |
+
if (mainMetric && typeof mainMetric.value === 'number') {
|
| 711 |
+
const val = mainMetric.value;
|
| 712 |
+
const norm = val <= 1.0 ? val * 100 : val;
|
| 713 |
+
total += norm; count++;
|
| 714 |
+
|
| 715 |
+
const taskType = (res.task?.type || '').toLowerCase();
|
| 716 |
+
if (taskType.includes('retrieval') || taskType.includes('search')) {
|
| 717 |
+
retTotal += norm; retCount++;
|
| 718 |
+
}
|
| 719 |
+
}
|
| 720 |
+
});
|
| 721 |
+
});
|
| 722 |
+
|
| 723 |
+
if (count > 0) {
|
| 724 |
+
return {
|
| 725 |
+
hf_id: hfId,
|
| 726 |
+
name: hfId.split('/').pop(),
|
| 727 |
+
mteb_avg: Math.round(total / count * 100) / 100,
|
| 728 |
+
mteb_retrieval: retCount > 0 ? Math.round(retTotal / retCount * 100) / 100 : undefined,
|
| 729 |
+
sources: { mteb_avg: 'hf-readme', mteb_retrieval: retCount > 0 ? 'hf-readme' : undefined }
|
| 730 |
+
};
|
| 731 |
+
}
|
| 732 |
+
} catch (e) {
|
| 733 |
+
return null;
|
| 734 |
+
}
|
| 735 |
+
return null;
|
| 736 |
+
}));
|
| 737 |
+
rows.forEach(r => { if (r) results.push(r); });
|
| 738 |
+
process.stdout.write(` HF README: ${Math.min(i + BATCH, ids.length)}/${ids.length}\r`);
|
| 739 |
+
}
|
| 740 |
+
|
| 741 |
+
console.log(`\n HF README: ${results.length} models enriched from metadata`);
|
| 742 |
+
return results;
|
| 743 |
+
}
|
| 744 |
+
|
| 745 |
// βββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 746 |
|
| 747 |
async function main() {
|
| 748 |
const source = process.argv[2]?.toLowerCase();
|
| 749 |
if (source) { await refreshSource(source); return; }
|
| 750 |
|
| 751 |
+
const [llmstats, hfEntries, lbEntries, arenaEntries, aiderEntries, aaEntries, mtebEntries, readmeEntries] = await Promise.all([
|
| 752 |
fetchLLMStats(),
|
| 753 |
fetchHFLeaderboard(),
|
| 754 |
fetchLiveBench(),
|
|
|
|
| 756 |
fetchAider(),
|
| 757 |
fetchArtificialAnalysis(),
|
| 758 |
fetchMTEB(),
|
| 759 |
+
fetchHFReadmeBenchmarks(),
|
| 760 |
]);
|
| 761 |
|
| 762 |
const merged = mergeEntries(llmstats, hfEntries);
|
|
|
|
| 764 |
const withAr = mergeArena(withLB, arenaEntries);
|
| 765 |
const withAi = mergeAider(withAr, aiderEntries);
|
| 766 |
const withAA = mergeArtificialAnalysis(withAi, aaEntries);
|
| 767 |
+
const withMTEB = mergeMTEB(withAA, mtebEntries);
|
| 768 |
+
const all = mergeMTEB(withMTEB, readmeEntries);
|
| 769 |
|
| 770 |
console.log(`\nTotal entries: ${all.length}`);
|
| 771 |
console.log(` With LiveBench: ${all.filter(e => e.lb_name).length} | Arena: ${all.filter(e => e.arena_elo).length} | Aider: ${all.filter(e => e.aider_pass_rate !== undefined).length} | AA: ${all.filter(e => e.aa_intelligence !== undefined).length} | MTEB: ${all.filter(e => e.mteb_avg !== undefined).length}`);
|