SimbaBench / server.js
elmadany's picture
Update server.js
4d3a843 verified
raw
history blame
6.3 kB
const express = require('express');
// Express application instance shared by the routes below.
const app = express();

// Port the HTTP server listens on.
const PORT = 7860;

// Hub access token and dataset repo id are both read from the environment
// (the repo id lives in the SIMBA_DATA variable).
const { HF_TOKEN, SIMBA_DATA: REPO_ID } = process.env;

// Processed payload served by /api/data; stays null until the first successful load.
let CACHED_DATA = null;

// Serve static assets out of the "public" directory.
app.use(express.static('public'));
// --- Fetch Helper ---
/**
 * Downloads a JSON Lines file from the configured dataset repo (REPO_ID, `main`
 * revision) and parses it into an array of records.
 *
 * The helper is best-effort: an HTTP error status yields an empty array and
 * unparseable lines are skipped, but both are now logged so that a failed
 * download can be told apart from a genuinely empty file. Rejections from
 * `fetch` itself (e.g. network errors) propagate to the caller.
 *
 * @param {string} filename - Path of the file inside the repo.
 * @returns {Promise<Array>} Parsed records in file order; blank lines, invalid
 *   JSON and falsy JSON values (e.g. `null`) are dropped. Empty on HTTP failure.
 */
async function fetchHFFile(filename) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${filename}`;
  console.log(`Downloading: ${filename}...`);

  // Only authenticate when a token is configured. Without this guard the
  // request carries the literal header "Bearer undefined", which a server may
  // reject as an invalid credential even where anonymous access would work.
  const headers = HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};
  const res = await fetch(url, { headers });
  if (!res.ok) {
    console.warn(`Download failed for ${filename}: HTTP ${res.status}`);
    return [];
  }

  const text = await res.text();
  const records = [];
  let malformed = 0;
  for (const line of text.split('\n')) {
    if (!line.trim()) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (e) {
      malformed += 1;
      continue;
    }
    // Falsy values are not usable records; they are dropped as before.
    if (parsed) records.push(parsed);
  }
  if (malformed > 0) {
    console.warn(`Skipped ${malformed} malformed line(s) in ${filename}`);
  }
  return records;
}
// --- List Files Helper ---
/**
 * Lists the `.jsonl` files found in a folder of the configured dataset repo
 * (REPO_ID, `main` revision) via the repo tree endpoint.
 *
 * Best-effort: an HTTP error status or an unexpected (non-array) response body
 * yields an empty list, with a warning logged. Rejections from `fetch` itself
 * propagate to the caller.
 *
 * @param {string} folder - Folder path inside the repo.
 * @returns {Promise<string[]>} The `path` of every entry ending in `.jsonl`.
 */
async function listHFFiles(folder) {
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;

  // Only authenticate when a token is configured; otherwise the header would
  // literally read "Bearer undefined".
  const headers = HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};
  const res = await fetch(url, { headers });
  if (!res.ok) {
    console.warn(`Listing failed for ${folder}: HTTP ${res.status}`);
    return [];
  }

  const entries = await res.json();
  // Guard against a body that is not a list (e.g. an error object), which
  // would otherwise throw on `.filter`.
  if (!Array.isArray(entries)) {
    console.warn(`Listing for ${folder} returned an unexpected response`);
    return [];
  }

  const paths = [];
  for (const entry of entries) {
    const path = entry ? entry.path : undefined;
    // Entries without a string path cannot be downloaded; skip them.
    if (typeof path === 'string' && path.endsWith('.jsonl')) paths.push(path);
  }
  return paths;
}
// --- Processor ---

/** Family label used for languages with no (or incomplete) metadata. */
const UNKNOWN_FAMILY = 'Unknown';

/** Promise of the sync currently running, or null while no sync is in progress. */
let dataSyncInFlight = null;

/**
 * Downloads the result files, aggregates them and stores the outcome in
 * CACHED_DATA with the shape:
 *
 *   {
 *     metadata: { families, models, tts_models },   // sorted name lists
 *     asr: { by_family, by_model },
 *     tts: { [model]: records[] },                  // each record gains `iso`
 *     slid: records[]
 *   }
 *
 * Calls made while a sync is already running share that sync instead of
 * starting a second full download; once it settles, the next call starts a
 * fresh one.
 *
 * @returns {Promise<void>} Settles when CACHED_DATA has been replaced; rejects
 *   if a download or the processing throws.
 */
function loadAndProcessData() {
  if (dataSyncInFlight === null) {
    dataSyncInFlight = syncLeaderboardData().finally(() => {
      dataSyncInFlight = null;
    });
  }
  return dataSyncInFlight;
}

/** Runs one full download-and-aggregate pass and publishes it to CACHED_DATA. */
async function syncLeaderboardData() {
  console.log("Starting Data Sync...");

  // 1-2. Metadata, TTS, SLID and the folder listing do not depend on each
  // other, so they are requested together.
  const [metaList, ttsData, slidData, files] = await Promise.all([
    fetchHFFile("final_results/metadata.jsonl"),
    fetchHFFile("final_results/tts_results.jsonl"),
    fetchHFFile("final_results/slid_results.jsonl"),
    listHFFiles("final_results"),
  ]);
  const langMap = indexLanguageMetadata(metaList);

  // 3. Load ASR: every listed file that is not one of the files fetched above.
  // Kept sequential so the number of simultaneous requests stays bounded no
  // matter how many result files the folder holds.
  const asrRecords = [];
  for (const file of files) {
    if (file.includes('metadata') || file.includes('tts_') || file.includes('slid_')) continue;
    // Appended one by one: spreading a very large array into push() can
    // exceed the engine's argument limit.
    for (const record of await fetchHFFile(file)) asrRecords.push(record);
  }

  // 4-6. Aggregate and format ASR.
  const { scoresByFamily, rowsByModel } = collectAsrScores(asrRecords, langMap);
  const asrByFamily = formatAsrByFamily(scoresByFamily);
  const asrByModel = formatAsrByModel(rowsByModel);

  // 7. Format TTS.
  const ttsByModel = groupTtsByModel(ttsData, langMap);

  CACHED_DATA = {
    metadata: {
      families: [...scoresByFamily.keys()].sort(),
      models: [...rowsByModel.keys()].sort(),
      tts_models: [...ttsByModel.keys()].sort()
    },
    asr: { by_family: asrByFamily, by_model: asrByModel },
    tts: Object.fromEntries(ttsByModel),
    slid: slidData
  };
  console.log("Data Ready.");
}

/** Converts a fractional error rate to a percentage; a missing rate counts as 0. */
function toPercent(rate) {
  return (rate || 0) * 100;
}

/** Indexes metadata records by their `lang_code_key` (compared as a string). */
function indexLanguageMetadata(metaList) {
  const langMap = new Map();
  for (const entry of metaList) langMap.set(String(entry.lang_code_key), entry);
  return langMap;
}

/**
 * Resolves display name, family and ISO code for a language code. Any field
 * the metadata does not provide falls back to the code itself (name, iso) or
 * to UNKNOWN_FAMILY. The name is always a string so it can be used as a
 * column suffix and sorted safely.
 */
function describeLanguage(langMap, langCode) {
  const meta = langMap.get(String(langCode));
  return {
    name: String(meta?.name ?? langCode),
    family: meta?.family ?? UNKNOWN_FAMILY,
    iso: meta?.lang_code_key ?? langCode
  };
}

/**
 * Single pass over the ASR records (records whose task is not 'asr' are
 * ignored) producing:
 *  - scoresByFamily: family -> model -> language name -> { wer, cer, iso };
 *    a later record for the same cell replaces the earlier one;
 *  - rowsByModel: model -> one row per record, in record order.
 * Maps are used so that names coming from the data can never reach the
 * object prototype chain.
 */
function collectAsrScores(asrRecords, langMap) {
  const scoresByFamily = new Map();
  const rowsByModel = new Map();
  for (const record of asrRecords) {
    if (record.task !== 'asr') continue;
    const lang = describeLanguage(langMap, record.lang_code);
    const model = record.model_name;
    const wer = toPercent(record.datasets_avg_wer);
    const cer = toPercent(record.datasets_avg_cer);

    if (!scoresByFamily.has(lang.family)) scoresByFamily.set(lang.family, new Map());
    const scoresByModel = scoresByFamily.get(lang.family);
    if (!scoresByModel.has(model)) scoresByModel.set(model, new Map());
    scoresByModel.get(model).set(lang.name, { wer, cer, iso: lang.iso });

    if (!rowsByModel.has(model)) rowsByModel.set(model, []);
    rowsByModel.get(model).push({
      Language: lang.name,
      ISO: lang.iso,
      Family: lang.family,
      WER: wer,
      CER: cer
    });
  }
  return { scoresByFamily, rowsByModel };
}

/**
 * Builds one leaderboard row for a model within a family: averages over the
 * languages the model has scores for, followed by WER_<name>/CER_<name>
 * columns in the order of `languages`.
 */
function buildFamilyRow(model, scores, languages) {
  const row = { Model: model, Avg_WER: 0, Avg_CER: 0 };
  let werSum = 0;
  let cerSum = 0;
  let scored = 0;
  for (const { name } of languages) {
    const score = scores.get(name);
    if (!score) continue;
    werSum += score.wer;
    cerSum += score.cer;
    scored += 1;
    row[`WER_${name}`] = score.wer;
    row[`CER_${name}`] = score.cer;
  }
  if (scored > 0) {
    row.Avg_WER = werSum / scored;
    row.Avg_CER = cerSum / scored;
  }
  return row;
}

/**
 * Formats the per-family view: for every family (sorted by name) the list of
 * languages as { name, iso } sorted by name, and one row per model sorted by
 * ascending average WER.
 */
function formatAsrByFamily(scoresByFamily) {
  const entries = [...scoresByFamily.keys()].sort().map((family) => {
    const scoresByModel = scoresByFamily.get(family);

    // Union of the languages any model in this family was scored on.
    const isoByName = new Map();
    for (const scores of scoresByModel.values()) {
      for (const [name, score] of scores) isoByName.set(name, score.iso);
    }
    const languages = [...isoByName.keys()].sort().map((name) => ({
      name,
      iso: isoByName.get(name)
    }));

    const rows = [...scoresByModel].map(([model, scores]) => buildFamilyRow(model, scores, languages));
    rows.sort((a, b) => a.Avg_WER - b.Avg_WER);
    return [family, { data: rows, languages }];
  });
  return Object.fromEntries(entries);
}

/** Formats the per-model view: each model's rows sorted by language name. */
function formatAsrByModel(rowsByModel) {
  const entries = [...rowsByModel].map(([model, rows]) => {
    rows.sort((a, b) => a.Language.localeCompare(b.Language));
    return [model, rows];
  });
  return Object.fromEntries(entries);
}

/**
 * Groups TTS records by their `model` field, adding an `iso` field taken from
 * the metadata when the language is known and from the record's own
 * `lang_code` otherwise.
 */
function groupTtsByModel(ttsData, langMap) {
  const grouped = new Map();
  for (const record of ttsData) {
    const iso = langMap.get(String(record.lang_code))?.lang_code_key ?? record.lang_code;
    if (!grouped.has(record.model)) grouped.set(record.model, []);
    grouped.get(record.model).push({ ...record, iso });
  }
  return grouped;
}
// GET /api/data: respond with the cached payload, loading it first when the
// cache is still empty. A failed load is reported as HTTP 500 with the error
// message in the body.
app.get('/api/data', async (request, response) => {
  if (CACHED_DATA) {
    response.json(CACHED_DATA);
    return;
  }

  try {
    await loadAndProcessData();
  } catch (err) {
    response.status(500).json({ error: err.message });
    return;
  }

  response.json(CACHED_DATA);
});
// Start the HTTP server on all interfaces and warm the cache in the background.
app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server running on ${PORT}`);
  // The warm-up is fire-and-forget, so its rejection must be handled here:
  // an unhandled rejection would otherwise take the whole process down on
  // current Node versions. The /api/data handler attempts the load again on
  // demand while the cache is still empty, so logging is sufficient.
  loadAndProcessData().catch((err) => {
    console.error(`Initial data sync failed: ${err?.message ?? err}`);
  });
});