const express = require('express');

const app = express();
const PORT = 7860;
const HF_TOKEN = process.env.HF_TOKEN;
const REPO_ID = process.env.SIMBA_DATA;

const UNKNOWN_MODEL = 'Unknown Model';
const UNKNOWN_FAMILY = 'Unknown';

// Processed payload served by /api/data; stays null until a load completes.
// NOTE(review): fetch failures are logged and treated as empty input, so a load
// that runs while the network is down still caches an (empty) payload that is
// served until the process restarts. Consider a retry policy if that matters.
let CACHED_DATA = null;

// Promise of the load currently running, so concurrent callers share one sync.
let loadInFlight = null;

app.use(express.static('public'));

// --- Shared Helpers ---

/**
 * Builds the request headers for Hugging Face calls.
 * @returns {Object} Headers with a Bearer token when HF_TOKEN is set, else {}.
 */
function buildAuthHeaders() {
  const headers = {};
  if (HF_TOKEN) {
    headers['Authorization'] = `Bearer ${HF_TOKEN}`;
  }
  return headers;
}

/**
 * Converts a fractional score to a percentage; missing values count as 0.
 * @param {number} [value]
 * @returns {number}
 */
function toPercent(value) {
  return (value || 0) * 100;
}

// --- Fetch Helper ---

/**
 * Downloads one JSONL file from the dataset repo and parses it line by line.
 * Blank lines, lines that are not valid JSON, and lines that do not parse to an
 * object are skipped.
 * @param {string} path - File path relative to the repo root.
 * @returns {Promise<Object[]>} Parsed records; [] on HTTP or network failure.
 */
async function fetchHFFile(path) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${path}`;
  console.log(`[Loading] ${path}...`);

  try {
    const res = await fetch(url, { headers: buildAuthHeaders() });
    if (!res.ok) {
      console.error(`[Error] Failed to fetch ${path}: ${res.status} ${res.statusText}`);
      return [];
    }

    const text = await res.text();
    const records = [];
    for (const line of text.split('\n')) {
      if (!line.trim()) continue;
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch (e) {
        // Deliberate best-effort: one malformed line must not discard the file.
        continue;
      }
      if (parsed !== null && typeof parsed === 'object') {
        records.push(parsed);
      }
    }
    return records;
  } catch (err) {
    console.error(`[Error] Network error fetching ${path}:`, err);
    return [];
  }
}

// --- List Files Helper ---

/**
 * Lists the .jsonl files in a folder of the dataset repo.
 * @param {string} folder - Folder path relative to the repo root.
 * @returns {Promise<string[]>} File paths; [] on HTTP or network failure, or
 *   when the response body is not an array.
 */
async function listHFFiles(folder) {
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;

  try {
    const res = await fetch(url, { headers: buildAuthHeaders() });
    if (!res.ok) {
      console.error(`[Error] Failed to list files: ${res.status}`);
      return [];
    }

    const files = await res.json();
    if (!Array.isArray(files)) {
      console.error('[Error] Failed to list files: unexpected response shape');
      return [];
    }

    // Filter only .jsonl files; entries without a string path are ignored
    // instead of failing the whole listing.
    return files
      .map((f) => (f ? f.path : undefined))
      .filter((p) => typeof p === 'string' && p.endsWith('.jsonl'));
  } catch (err) {
    console.error('[Error] Network error listing files:', err);
    return [];
  }
}

// --- Processor Helpers ---

/**
 * Splits records into ASR / TTS / SLID buckets using `task`, falling back to
 * `data_type`. Records whose identifier is missing, not a string, or not one of
 * the three known tasks are dropped.
 * @param {Object[]} records
 * @returns {{asr: Object[], tts: Object[], slid: Object[]}}
 */
function bucketByTask(records) {
  const buckets = { asr: [], tts: [], slid: [] };
  for (const r of records) {
    const identifier = r.task || r.data_type;
    if (typeof identifier !== 'string') continue;
    const task = identifier.toLowerCase().trim();
    if (task === 'tts' || task === 'slid' || task === 'asr') {
      buckets[task].push(r);
    }
  }
  return buckets;
}

/**
 * Builds one SLID row per language code, with one column per model holding that
 * model's f1_score.
 * @param {Object[]} slidRecords
 * @param {Object} langMap - Metadata records keyed by lang_code_key.
 * @returns {Object[]}
 */
function buildSlidTable(slidRecords, langMap) {
  // Null-prototype object: keys come from the data and must not collide with
  // inherited members such as "constructor".
  const byCode = Object.create(null);
  for (const r of slidRecords) {
    const code = r.lang_code;
    const meta = langMap[code] || { name: code, family: UNKNOWN_FAMILY };
    if (!byCode[code]) {
      byCode[code] = { Language: meta.name, ISO: code };
    }
    const modelName = r.model_name || r.model || UNKNOWN_MODEL;
    byCode[code][modelName] = r.f1_score;
  }
  return Object.values(byCode);
}

/**
 * Groups TTS records by model. Each record is mutated in place to carry `iso`,
 * `Language` and `language`, which the frontend reads.
 * @param {Object[]} ttsRecords
 * @param {Object} langMap - Metadata records keyed by lang_code_key.
 * @returns {Object} Model name -> array of records.
 */
function groupTtsByModel(ttsRecords, langMap) {
  const grouped = Object.create(null);
  for (const r of ttsRecords) {
    const modelName = r.model || r.model_name || UNKNOWN_MODEL;
    const meta = langMap[r.lang_code] || {
      name: r.language || r.lang_code,
      lang_code_key: r.lang_code,
    };

    // Standardize structure for frontend
    r.iso = r.lang_code;
    r.Language = meta.name; // Capital 'Language' key for the dropdown
    r.language = meta.name; // Lowercase 'language' key for table lookups

    if (!grouped[modelName]) grouped[modelName] = [];
    grouped[modelName].push(r);
  }
  return grouped;
}

/**
 * Resolves the fields every ASR view needs from one record.
 * @param {Object} r - ASR record.
 * @param {Object} langMap - Metadata records keyed by lang_code_key.
 * @returns {{model: string, name: *, family: string, iso: *, wer: number, cer: number}}
 */
function describeAsrRecord(r, langMap) {
  const meta = langMap[r.lang_code] || {
    name: r.lang_code,
    family: UNKNOWN_FAMILY,
    lang_code_key: r.lang_code,
  };
  return {
    model: r.model_name || r.model || UNKNOWN_MODEL,
    name: meta.name,
    family: meta.family || UNKNOWN_FAMILY,
    iso: meta.lang_code_key,
    wer: toPercent(r.datasets_avg_wer),
    cer: toPercent(r.datasets_avg_cer),
  };
}

/**
 * Builds the per-family ASR view: for each family, one row per model with
 * average and per-language WER/CER, rows sorted by ascending Avg_WER.
 * @param {Object[]} asrRecords
 * @param {Object} langMap - Metadata records keyed by lang_code_key.
 * @returns {Object} Family -> { data: rows, languages: [{name, iso}] }.
 */
function buildAsrByFamily(asrRecords, langMap) {
  // family -> model -> language name -> { wer, cer, iso }
  const familyData = Object.create(null);
  for (const r of asrRecords) {
    const { model, name, family, iso, wer, cer } = describeAsrRecord(r, langMap);
    if (!familyData[family]) familyData[family] = Object.create(null);
    if (!familyData[family][model]) familyData[family][model] = Object.create(null);
    familyData[family][model][name] = { wer, cer, iso };
  }

  const byFamily = Object.create(null);
  for (const family of Object.keys(familyData).sort()) {
    const modelScores = familyData[family];

    // Union of languages seen for any model in this family.
    const isoByLanguage = new Map();
    for (const scores of Object.values(modelScores)) {
      for (const langName of Object.keys(scores)) {
        isoByLanguage.set(langName, scores[langName].iso);
      }
    }
    const languages = Array.from(isoByLanguage.keys())
      .sort()
      .map((name) => ({ name: name, iso: isoByLanguage.get(name) }));

    const rows = Object.keys(modelScores).map((modName) => {
      const scores = modelScores[modName];
      const row = { Model: modName, Avg_WER: 0, Avg_CER: 0 };
      let sumWer = 0;
      let sumCer = 0;
      let count = 0;
      for (const { name } of languages) {
        const score = scores[name];
        if (!score) continue; // this model has no result for the language
        sumWer += score.wer;
        sumCer += score.cer;
        count++;
        row[`WER_${name}`] = score.wer;
        row[`CER_${name}`] = score.cer;
      }
      if (count) {
        row.Avg_WER = sumWer / count;
        row.Avg_CER = sumCer / count;
      }
      return row;
    });
    rows.sort((a, b) => a.Avg_WER - b.Avg_WER);

    byFamily[family] = { data: rows, languages: languages };
  }
  return byFamily;
}

/**
 * Builds the per-model ASR view: one row per record, sorted by language name.
 * @param {Object[]} asrRecords
 * @param {Object} langMap - Metadata records keyed by lang_code_key.
 * @returns {Object} Model name -> rows.
 */
function buildAsrByModel(asrRecords, langMap) {
  const byModel = Object.create(null);
  for (const r of asrRecords) {
    const { model, name, family, iso, wer, cer } = describeAsrRecord(r, langMap);
    if (!byModel[model]) byModel[model] = [];
    byModel[model].push({ Language: name, ISO: iso, Family: family, WER: wer, CER: cer });
  }
  for (const rows of Object.values(byModel)) {
    // String() keeps the sort from throwing when a language name is missing.
    rows.sort((a, b) => String(a.Language).localeCompare(String(b.Language)));
  }
  return byModel;
}

// --- Processor ---

/**
 * Fetches metadata and every result file under final_results, reshapes the
 * records into the ASR / TTS / SLID views, and stores the result in CACHED_DATA.
 * @returns {Promise<void>}
 */
async function loadAndProcessData() {
  console.log("--- Starting Data Sync ---");
  if (!REPO_ID) {
    console.warn('[Warning] SIMBA_DATA is not set; dataset URLs will contain "undefined" as the repo id.');
  }

  // 1. Fetch Metadata
  const metaList = await fetchHFFile("final_results/metadata.jsonl");
  const langMap = Object.create(null);
  for (const m of metaList) {
    langMap[m.lang_code_key] = m;
  }

  // 2. Scan ALL files
  const files = await listHFFiles("final_results");
  const allRecords = [];
  for (const file of files) {
    if (file.endsWith('metadata.jsonl')) continue;
    const records = await fetchHFFile(file);
    // Push one by one: spreading a very large array into push() passes every
    // element as a call argument and can throw a RangeError.
    for (const record of records) {
      allRecords.push(record);
    }
  }

  console.log(`[Processing] Loaded ${allRecords.length} total records. Bucketing by task...`);

  // 3. Bucket Records by Task (Checking both 'task' and 'data_type')
  const { asr: asrRecords, tts: ttsRecords, slid: slidRecords } = bucketByTask(allRecords);
  console.log(`[Counts] ASR: ${asrRecords.length}, TTS: ${ttsRecords.length}, SLID: ${slidRecords.length}`);

  const slidFinal = buildSlidTable(slidRecords, langMap);
  const ttsGrouped = groupTtsByModel(ttsRecords, langMap);
  const asrByFamily = buildAsrByFamily(asrRecords, langMap);
  const asrByModel = buildAsrByModel(asrRecords, langMap);

  CACHED_DATA = {
    metadata: {
      families: Object.keys(asrByFamily).sort(),
      models: Object.keys(asrByModel).sort(),
      tts_models: Object.keys(ttsGrouped).sort(),
    },
    asr: { by_family: asrByFamily, by_model: asrByModel },
    tts: ttsGrouped,
    slid: slidFinal,
  };
  console.log("--- Data Ready ---");
}

/**
 * Returns the cached payload, running a load first when none exists. Callers
 * that arrive while a load is running wait on that same load instead of
 * starting another one.
 * @returns {Promise<Object>} The cached payload.
 */
async function ensureDataLoaded() {
  if (CACHED_DATA) return CACHED_DATA;
  if (!loadInFlight) {
    loadInFlight = loadAndProcessData().finally(() => {
      loadInFlight = null;
    });
  }
  await loadInFlight;
  return CACHED_DATA;
}

// --- Routes ---

app.get('/api/data', async (req, res) => {
  try {
    const data = await ensureDataLoaded();
    res.json(data);
  } catch (e) {
    res.status(500).json({ error: e.message });
  }
});

app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server running on ${PORT}`);
  // Warm the cache at startup; a failure is logged rather than left as an
  // unhandled rejection, and the next /api/data request retries the load.
  ensureDataLoaded().catch((err) => {
    console.error('[Error] Initial data sync failed:', err);
  });
});