// SimbaBench / server.js
// Last commit: "Update server.js" (07445a4, verified), by elmadany
// Express application: serves the static frontend and the JSON data endpoint.
const express = require('express');
const app = express();
// Port handed to app.listen() at the bottom of this file.
const PORT = 7860;
// Optional access token; when set, it is sent as a Bearer Authorization
// header on every request made to huggingface.co.
const HF_TOKEN = process.env.HF_TOKEN;
// Dataset repo id, read from the SIMBA_DATA environment variable and
// interpolated into the huggingface.co dataset URLs.
const REPO_ID = process.env.SIMBA_DATA;
// Payload assembled by loadAndProcessData() and returned by GET /api/data.
// Stays null until a load has completed.
let CACHED_DATA = null;
// Serve static files from the 'public' directory.
app.use(express.static('public'));
// --- Fetch Helper ---
/**
 * Download a JSON Lines file from the dataset repo (`main` revision) and
 * parse it line by line.
 *
 * This is best-effort: an HTTP error or a network failure is logged and
 * yields an empty array instead of a rejection. Blank lines are ignored.
 * Lines that are not valid JSON are skipped, and a single warning with the
 * number of skipped lines is logged so corrupt files do not go unnoticed.
 * Lines that parse to a falsy value (e.g. `null`) are dropped as well.
 *
 * @param {string} path - File path inside the dataset repo.
 * @returns {Promise<Array>} Parsed values, in file order.
 */
async function fetchHFFile(path) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${path}`;
  console.log(`[Loading] ${path}...`);
  const headers = {};
  if (HF_TOKEN) {
    headers["Authorization"] = `Bearer ${HF_TOKEN}`;
  }
  try {
    const res = await fetch(url, { headers });
    if (!res.ok) {
      console.error(`[Error] Failed to fetch ${path}: ${res.status} ${res.statusText}`);
      return [];
    }
    const text = await res.text();
    const records = [];
    let malformedCount = 0;
    for (const line of text.split('\n')) {
      if (!line.trim()) continue;
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch (e) {
        // Keep going: one bad line should not discard the rest of the file.
        malformedCount += 1;
        continue;
      }
      // Falsy JSON values (null, 0, "", false) are not usable records.
      if (parsed) records.push(parsed);
    }
    if (malformedCount > 0) {
      console.warn(`[Warn] Skipped ${malformedCount} malformed line(s) in ${path}`);
    }
    return records;
  } catch (err) {
    console.error(`[Error] Network error fetching ${path}:`, err);
    return [];
  }
}
// --- List Files Helper ---
/**
 * Ask the Hugging Face tree API for the entries of `folder` (on `main`) in
 * the dataset repo and return the paths of those ending in `.jsonl`.
 *
 * Any failure (non-OK status, network error, unexpected response body) is
 * logged and results in an empty array.
 *
 * @param {string} folder - Folder path inside the dataset repo.
 * @returns {Promise<string[]>} Paths of the `.jsonl` entries.
 */
async function listHFFiles(folder) {
  const treeUrl = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;
  // Only authenticate when a token has been configured.
  const authHeaders = HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};
  try {
    const response = await fetch(treeUrl, { headers: authHeaders });
    if (response.ok) {
      const entries = await response.json();
      // Keep only the .jsonl entries, then project them to their paths.
      const jsonlEntries = entries.filter(({ path }) => path.endsWith('.jsonl'));
      return jsonlEntries.map(({ path }) => path);
    }
    console.error(`[Error] Failed to list files: ${response.status}`);
    return [];
  } catch (err) {
    console.error("[Error] Network error listing files:", err);
    return [];
  }
}
// --- Processor ---
/**
 * Download every result file under `final_results/` in the dataset repo,
 * reshape the records into the structure the frontend consumes, and store
 * the result in CACHED_DATA.
 *
 * Resulting payload:
 *   {
 *     metadata: { families: [...], models: [...], tts_models: [...] },
 *     asr:  { by_family: { <family>: { data: [row...], languages: [{name, iso}...] } },
 *             by_model:  { <model>:  [{ Language, ISO, Family, WER, CER }...] } },
 *     tts:  { <model>: [record...] },
 *     slid: [{ Language, ISO, <model>: f1_score, ... }...]
 *   }
 *
 * NOTE(review): fetchHFFile/listHFFiles return [] on failure, so a sync that
 * fails on the network still overwrites CACHED_DATA with an empty payload.
 *
 * @returns {Promise<void>}
 */
async function loadAndProcessData() {
  console.log("--- Starting Data Sync ---");

  // 1. Language metadata, indexed by language code.
  const metaList = await fetchHFFile("final_results/metadata.jsonl");
  const langMap = {};
  metaList.forEach(m => { langMap[m.lang_code_key] = m; });

  // 2. Every other .jsonl file in the folder contributes result records.
  const files = await listHFFiles("final_results");
  const allRecords = [];
  for (const file of files) {
    if (file.endsWith('metadata.jsonl')) continue;
    const records = await fetchHFFile(file);
    // Append one by one: push(...records) passes each record as a call
    // argument and throws RangeError once a file is large enough.
    for (const record of records) {
      allRecords.push(record);
    }
  }
  console.log(`[Processing] Loaded ${allRecords.length} total records. Bucketing by task...`);

  // 3. Split by task, then build one view per task.
  const { asr: asrRecords, tts: ttsRecords, slid: slidRecords } = bucketRecordsByTask(allRecords);
  console.log(`[Counts] ASR: ${asrRecords.length}, TTS: ${ttsRecords.length}, SLID: ${slidRecords.length}`);

  const slidFinal = buildSlidRows(slidRecords, langMap);
  const tts = groupTtsByModel(ttsRecords, langMap);
  const asr = buildAsrViews(asrRecords, langMap);

  CACHED_DATA = {
    metadata: {
      families: Array.from(asr.families).sort(),
      models: Array.from(asr.models).sort(),
      tts_models: Array.from(tts.models).sort()
    },
    asr: { by_family: asr.byFamily, by_model: asr.byModel },
    tts: tts.grouped,
    slid: slidFinal
  };
  console.log("--- Data Ready ---");
}

// Split records into asr/tts/slid buckets using `task`, falling back to `data_type`.
function bucketRecordsByTask(records) {
  const buckets = { asr: [], tts: [], slid: [] };
  for (const r of records) {
    const identifier = r.task || r.data_type;
    if (!identifier) continue;
    // String() guards against a non-string identifier, which has no toLowerCase().
    const task = String(identifier).toLowerCase().trim();
    if (task === 'tts') buckets.tts.push(r);
    else if (task === 'slid') buckets.slid.push(r);
    else if (task === 'asr') buckets.asr.push(r);
    // Records for any other task are ignored.
  }
  return buckets;
}

// Build one SLID row per language code, with one f1_score column per model.
function buildSlidRows(slidRecords, langMap) {
  const rowByCode = {};
  slidRecords.forEach(r => {
    const code = r.lang_code;
    const meta = langMap[code] || { name: code, family: 'Unknown' };
    if (!rowByCode[code]) {
      rowByCode[code] = { Language: meta.name, ISO: code };
    }
    const modelName = r.model_name || r.model || "Unknown Model";
    rowByCode[code][modelName] = r.f1_score;
  });
  return Object.values(rowByCode);
}

// Group TTS records by model, adding the iso/Language/language keys the frontend reads.
function groupTtsByModel(ttsRecords, langMap) {
  const grouped = {};
  const models = new Set();
  ttsRecords.forEach(r => {
    const modelName = r.model || r.model_name || "Unknown Model";
    models.add(modelName);
    if (!grouped[modelName]) grouped[modelName] = [];
    const meta = langMap[r.lang_code] || { name: r.language || r.lang_code, lang_code_key: r.lang_code };
    // Records are annotated in place with a uniform set of keys.
    r.iso = r.lang_code;
    r.Language = meta.name; // capitalised key, used by the dropdown
    r.language = meta.name; // lowercase key, used for table lookups
    grouped[modelName].push(r);
  });
  return { grouped, models };
}

// Build the by-family and by-model ASR views; WER/CER are scaled by 100.
function buildAsrViews(asrRecords, langMap) {
  const families = new Set();
  const models = new Set();
  const familyData = {};          // family -> model -> language name -> { wer, cer, iso }
  const rowsByModel = new Map();  // model -> rows, in order of first appearance

  asrRecords.forEach(r => {
    const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
    const fam = meta.family;
    // Same fallback as SLID/TTS, so a record without a model name does not
    // surface as an `undefined` model.
    const mod = r.model_name || r.model || "Unknown Model";
    const wer = (r.datasets_avg_wer || 0) * 100;
    const cer = (r.datasets_avg_cer || 0) * 100;

    families.add(fam);
    models.add(mod);

    if (!familyData[fam]) familyData[fam] = {};
    if (!familyData[fam][mod]) familyData[fam][mod] = {};
    // A later record for the same family/model/language replaces the earlier one.
    familyData[fam][mod][meta.name] = { wer, cer, iso: meta.lang_code_key };

    // The by-model view keeps every record, collected in the same pass
    // rather than rescanning all records once per model.
    if (!rowsByModel.has(mod)) rowsByModel.set(mod, []);
    rowsByModel.get(mod).push({
      Language: meta.name,
      ISO: meta.lang_code_key,
      Family: meta.family,
      WER: wer,
      CER: cer
    });
  });

  const byModel = {};
  rowsByModel.forEach((rows, mod) => {
    // String() keeps the comparator from throwing on a non-string Language.
    rows.sort((a, b) => String(a.Language).localeCompare(String(b.Language)));
    byModel[mod] = rows;
  });

  return { families, models, byFamily: buildAsrFamilyTables(families, familyData), byModel };
}

// For each family, build one row per model (per-language WER/CER plus averages), best Avg_WER first.
function buildAsrFamilyTables(families, familyData) {
  const asrByFamily = {};
  Array.from(families).sort().forEach(fam => {
    const modData = familyData[fam];

    // Union of the languages seen across all models of this family.
    const isoByLanguage = new Map();
    Object.values(modData).forEach(scores => {
      Object.keys(scores).forEach(langName => {
        isoByLanguage.set(langName, scores[langName].iso);
      });
    });
    const languages = Array.from(isoByLanguage.keys()).sort().map(name => ({
      name: name,
      iso: isoByLanguage.get(name)
    }));

    const rows = Object.keys(modData).map(modName => {
      const scores = modData[modName];
      let sumWer = 0;
      let sumCer = 0;
      let count = 0;
      const row = { Model: modName, Avg_WER: 0, Avg_CER: 0 };
      languages.forEach(l => {
        const score = scores[l.name];
        if (score) {
          sumWer += score.wer;
          sumCer += score.cer;
          count += 1;
          row[`WER_${l.name}`] = score.wer;
          row[`CER_${l.name}`] = score.cer;
        }
      });
      // Averages cover only the languages this model was evaluated on.
      if (count) {
        row.Avg_WER = sumWer / count;
        row.Avg_CER = sumCer / count;
      }
      return row;
    });

    rows.sort((a, b) => a.Avg_WER - b.Avg_WER);
    asrByFamily[fam] = { data: rows, languages: languages };
  });
  return asrByFamily;
}
// Load currently in progress on behalf of /api/data requests; null when idle.
// Lets concurrent requests share one download instead of each starting their own.
let pendingDataLoad = null;

/**
 * GET /api/data — respond with the cached payload as JSON.
 *
 * If the cache is still empty, a load is awaited first. When that load
 * rejects, the error is logged and the client receives a 500 with the
 * error message.
 */
app.get('/api/data', async (req, res) => {
  if (!CACHED_DATA) {
    if (!pendingDataLoad) {
      pendingDataLoad = loadAndProcessData().finally(() => {
        // Clear the marker so a later request can retry after a failure.
        pendingDataLoad = null;
      });
    }
    try {
      await pendingDataLoad;
    } catch (e) {
      console.error("[Error] Data load failed:", e);
      return res.status(500).json({ error: e.message });
    }
  }
  res.json(CACHED_DATA);
});
// Start the HTTP server on all interfaces and warm the data cache.
app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server running on ${PORT}`);
  // The load is started without being awaited, so a rejection has to be
  // handled here: left unhandled it would terminate the process. After a
  // failure the cache stays empty and GET /api/data attempts the load again.
  loadAndProcessData().catch((err) => {
    console.error("[Error] Initial data load failed:", err);
  });
});