SimbaBench / server.js
elmadany's picture
Create server.js
6226dc5 verified
raw
history blame
5.95 kB
const path = require('path');
const express = require('express');
const app = express();
const PORT = 7860;

// Environment variables (configured in the HF Space settings).
//   HF_TOKEN   - bearer token sent with Hugging Face requests.
//   SIMBA_DATA - dataset repository id interpolated into the request URLs.
const HF_TOKEN = process.env.HF_TOKEN;
const REPO_ID = process.env.SIMBA_DATA;
if (!REPO_ID) {
  // Surface the misconfiguration at startup instead of only as failed downloads.
  console.warn('SIMBA_DATA is not set; dataset URLs will be built with "undefined" as the repository id.');
}

// In-memory cache of the processed payload; null until loadAndProcessData() completes.
let CACHED_DATA = null;

// Serve static files from the 'public' folder that sits next to this script.
// A bare relative path would be resolved against the directory the process was
// launched from, so the assets would disappear when started from elsewhere.
app.use(express.static(path.join(__dirname, 'public')));
/**
 * Download a JSONL file from the configured Hugging Face dataset repository
 * and parse it into an array of records.
 *
 * Blank lines are ignored. Lines that are not valid JSON are skipped and
 * counted in a warning; lines that parse to a falsy value (e.g. `null`) are
 * dropped. A non-OK HTTP response is logged and yields an empty array.
 * Errors thrown by `fetch` itself are not caught here and propagate to the caller.
 *
 * @param {string} filename - Path of the file inside the repository (main branch).
 * @returns {Promise<Array>} Parsed records, in file order.
 */
async function fetchHFFile(filename) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${filename}`;
  console.log(`Downloading: ${filename}...`);

  // Only send credentials when a token is configured; interpolating an unset
  // token would send the literal header "Bearer undefined".
  const headers = HF_TOKEN ? { Authorization: `Bearer ${HF_TOKEN}` } : {};
  const res = await fetch(url, { headers });
  if (!res.ok) {
    console.warn(`Failed to fetch ${filename}: ${res.statusText}`);
    return [];
  }

  const text = await res.text();
  const records = [];
  let skipped = 0;
  for (const line of text.split('\n')) {
    if (!line.trim()) continue;
    try {
      const record = JSON.parse(line);
      if (record) records.push(record);
    } catch {
      // Best effort: one corrupt line must not discard the rest of the file.
      skipped += 1;
    }
  }
  if (skipped > 0) {
    console.warn(`Skipped ${skipped} malformed line(s) in ${filename}`);
  }
  return records;
}
/**
 * List the `.jsonl` files the Hugging Face tree API reports for a folder of
 * the configured dataset repository (main branch).
 *
 * A non-OK response or a body that is not an array is logged and yields an
 * empty array. Entries without a string `path` are ignored.
 * Errors thrown by `fetch` itself are not caught here and propagate to the caller.
 *
 * @param {string} folder - Folder path inside the repository.
 * @returns {Promise<string[]>} Paths ending in `.jsonl`, in the order returned by the API.
 */
async function listHFFiles(folder) {
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;

  // Only send credentials when a token is configured; interpolating an unset
  // token would send the literal header "Bearer undefined".
  const headers = HF_TOKEN ? { Authorization: `Bearer ${HF_TOKEN}` } : {};
  const res = await fetch(url, { headers });
  if (!res.ok) {
    console.warn(`Failed to list ${folder}: ${res.statusText}`);
    return [];
  }

  const entries = await res.json();
  if (!Array.isArray(entries)) {
    console.warn(`Unexpected listing response for ${folder}; expected an array.`);
    return [];
  }
  return entries
    .map((entry) => entry?.path)
    .filter((entryPath) => typeof entryPath === 'string' && entryPath.endsWith('.jsonl'));
}
/**
 * Reduce raw result records to the ASR rows used by the leaderboard views.
 * Records whose `task` is not 'asr' are ignored.
 *
 * @param {Array<object>} records - Raw records from the result files.
 * @param {object} langMap - Metadata records keyed by `lang_code_key`.
 * @returns {Array<{model: *, name: *, family: *, wer: number, cer: number}>}
 */
function normalizeAsrRecords(records, langMap) {
  const rows = [];
  for (const record of records) {
    if (record.task !== 'asr') continue;
    const meta = langMap[record.lang_code];
    rows.push({
      model: record.model_name,
      // Fall back step by step so a record with incomplete metadata still gets
      // a usable label instead of `undefined`.
      name: meta?.name ?? record.lang_code ?? 'Unknown',
      family: meta?.family ?? 'Unknown',
      // Scores are scaled by 100 for display.
      // NOTE(review): a missing score is reported as 0, which is
      // indistinguishable from a perfect score — confirm this is intended.
      wer: (record.datasets_avg_wer || 0) * 100,
      cer: (record.datasets_avg_cer || 0) * 100,
    });
  }
  return rows;
}

/**
 * Build the per-family view: for every family, one row per model holding its
 * average WER/CER over the family's languages plus `WER_<lang>` / `CER_<lang>`
 * columns; rows are ordered by ascending average WER.
 *
 * @param {Array<object>} asrRows - Output of normalizeAsrRecords().
 * @returns {object} `{ [family]: { data: rows, languages: sortedNames } }`
 */
function buildAsrByFamily(asrRows) {
  // family -> model -> language -> { wer, cer }. Null-prototype dictionaries so
  // data-supplied keys (e.g. "constructor") cannot collide with inherited
  // members. For duplicate (model, language) pairs the last record wins.
  const familyData = Object.create(null);
  for (const { family, model, name, wer, cer } of asrRows) {
    const byModel = (familyData[family] ??= Object.create(null));
    const byLang = (byModel[model] ??= Object.create(null));
    byLang[name] = { wer, cer };
  }

  const asrByFamily = Object.create(null);
  for (const family of Object.keys(familyData).sort()) {
    const byModel = familyData[family];
    const langs = [
      ...new Set(Object.values(byModel).flatMap((scores) => Object.keys(scores))),
    ].sort();

    const rows = Object.entries(byModel).map(([modelName, scores]) => {
      let werSum = 0;
      let cerSum = 0;
      let count = 0;
      for (const lang of langs) {
        if (!scores[lang]) continue;
        werSum += scores[lang].wer;
        cerSum += scores[lang].cer;
        count += 1;
      }
      const row = {
        Model: modelName,
        Avg_WER: count ? werSum / count : 0,
        Avg_CER: count ? cerSum / count : 0,
      };
      // Per-language columns are added only for languages the model was scored on.
      for (const lang of langs) {
        if (!scores[lang]) continue;
        row[`WER_${lang}`] = scores[lang].wer;
        row[`CER_${lang}`] = scores[lang].cer;
      }
      return row;
    });

    rows.sort((a, b) => a.Avg_WER - b.Avg_WER);
    asrByFamily[family] = { data: rows, languages: langs };
  }
  return asrByFamily;
}

/**
 * Build the per-model view: for every model, one row per ASR record, ordered
 * by language name.
 *
 * @param {Array<object>} asrRows - Output of normalizeAsrRecords().
 * @returns {object} `{ [model]: [{ Language, Family, WER, CER }, ...] }`
 */
function buildAsrByModel(asrRows) {
  // Group in a single pass rather than rescanning every record once per model.
  const grouped = new Map();
  for (const { model, name, family, wer, cer } of asrRows) {
    if (!grouped.has(model)) grouped.set(model, []);
    grouped.get(model).push({ Language: name, Family: family, WER: wer, CER: cer });
  }

  const asrByModel = Object.create(null);
  for (const [model, rows] of grouped) {
    // String() keeps the comparator from throwing on a non-string name.
    rows.sort((a, b) => String(a.Language).localeCompare(String(b.Language)));
    asrByModel[model] = rows;
  }
  return asrByModel;
}

/**
 * Download the metadata, TTS, SLID and ASR result files from the
 * "final_results" folder, reshape them into the payload served by the API,
 * and store that payload in the module-level `CACHED_DATA`.
 *
 * Failed downloads are reported by the helpers as empty lists, so the cache
 * is still written in that case. Errors thrown by the helpers propagate to
 * the caller and leave `CACHED_DATA` untouched.
 *
 * @returns {Promise<void>}
 */
async function loadAndProcessData() {
  console.log("Starting Data Sync...");

  // 1. These four requests do not depend on each other, so issue them together.
  const [metaList, ttsData, slidData, files] = await Promise.all([
    fetchHFFile("final_results/metadata.jsonl"),
    fetchHFFile("final_results/tts_results.jsonl"),
    fetchHFFile("final_results/slid_results.jsonl"),
    listHFFiles("final_results"),
  ]);

  const langMap = Object.create(null);
  for (const meta of metaList) {
    langMap[meta.lang_code_key] = meta;
  }

  // 2. Load the ASR files. They are fetched one at a time because the number
  //    of files is not bounded here. Records are appended individually:
  //    spreading a very large array into push() can exceed the engine's
  //    argument limit.
  const asrRecords = [];
  for (const file of files) {
    // Skip metadata/tts/slid if they appear in the listing.
    if (file.includes('metadata') || file.includes('tts_') || file.includes('slid_')) continue;
    for (const record of await fetchHFFile(file)) {
      asrRecords.push(record);
    }
  }

  // 3. Shape the ASR views.
  const asrRows = normalizeAsrRecords(asrRecords, langMap);
  const families = new Set(asrRows.map((row) => row.family));
  const models = new Set(asrRows.map((row) => row.model));

  // 4. Group TTS records by model, keeping file order within each group.
  const ttsGrouped = Object.create(null);
  const ttsModels = new Set();
  for (const record of ttsData) {
    ttsModels.add(record.model);
    (ttsGrouped[record.model] ??= []).push(record);
  }

  CACHED_DATA = {
    metadata: {
      families: Array.from(families).sort(),
      models: Array.from(models).sort(),
      tts_models: Array.from(ttsModels).sort(),
    },
    asr: { by_family: buildAsrByFamily(asrRows), by_model: buildAsrByModel(asrRows) },
    tts: ttsGrouped,
    slid: slidData,
  };
  console.log("Data Processing Complete.");
}
// GET /api/data — respond with the cached payload as JSON.
// When the cache has not been populated yet, a load is attempted first; if that
// load throws, the response is a 500 carrying the error message.
app.get('/api/data', async (request, response) => {
  if (CACHED_DATA) {
    response.json(CACHED_DATA);
    return;
  }

  try {
    await loadAndProcessData();
  } catch (loadError) {
    return response.status(500).json({ error: loadError.message });
  }
  response.json(CACHED_DATA);
});
// Start the HTTP server on all interfaces, then warm the cache.
app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server listening on port ${PORT}`);
  // Initial load runs in the background; nothing awaits this promise, so its
  // rejection has to be handled here. Left unhandled, a failed first load
  // (e.g. a network error thrown by fetch) would surface as an unhandled
  // rejection, which terminates the process under Node's default settings.
  // The /api/data route attempts the load again while the cache is still empty.
  loadAndProcessData().catch((err) => {
    console.error(`Initial data load failed: ${err.message}`);
  });
});