const express = require('express');

const app = express();
const PORT = 7860;

// Environment Variables from HF Space Settings
const HF_TOKEN = process.env.HF_TOKEN;
const REPO_ID = process.env.SIMBA_DATA;

// In-memory cache, populated by loadAndProcessData().
let CACHED_DATA = null;

// Promise of the sync currently running (if any), so that concurrent callers
// share one download instead of each starting their own.
let loadInFlight = null;

// Serve static files from 'public' folder
app.use(express.static('public'));

/**
 * Build the request headers for Hugging Face calls.
 * The Authorization header is only included when HF_TOKEN is set, so an
 * unset variable no longer results in a literal "Bearer undefined" header.
 *
 * @returns {Object<string, string>} Headers object for fetch().
 */
function hfHeaders() {
  return HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};
}

// --- Helper: Download File from HF ---
/**
 * Download a JSONL file from the dataset repo and parse it line by line.
 *
 * Blank lines are ignored. Lines that are not valid JSON are skipped (and
 * counted in a warning); lines that parse to a falsy value are dropped too.
 *
 * @param {string} filename - Path of the file inside the dataset repo.
 * @returns {Promise<Array>} Parsed records; an empty array when the HTTP
 *   response is not OK.
 */
async function fetchHFFile(filename) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${filename}`;
  console.log(`Downloading: ${filename}...`);

  const res = await fetch(url, { headers: hfHeaders() });
  if (!res.ok) {
    console.warn(`Failed to fetch ${filename}: ${res.statusText}`);
    return [];
  }

  const text = await res.text();

  // Parse JSONL
  const records = [];
  let skipped = 0;
  for (const line of text.split('\n')) {
    if (!line.trim()) continue;
    try {
      const parsed = JSON.parse(line);
      if (parsed) records.push(parsed);
    } catch (e) {
      // Best effort: one malformed line must not discard the whole file.
      skipped += 1;
    }
  }
  if (skipped > 0) {
    console.warn(`Skipped ${skipped} malformed line(s) in ${filename}`);
  }
  return records;
}

// --- Helper: List Files in Folder ---
/**
 * List the `.jsonl` files of a folder in the dataset repo.
 *
 * @param {string} folder - Folder path inside the dataset repo.
 * @returns {Promise<string[]>} Paths ending in `.jsonl`; an empty array when
 *   the request fails or the response body is not an array.
 */
async function listHFFiles(folder) {
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;
  const res = await fetch(url, { headers: hfHeaders() });
  if (!res.ok) return [];

  const files = await res.json();
  if (!Array.isArray(files)) return [];

  return files
    .map(f => f?.path)
    .filter(path => typeof path === 'string' && path.endsWith('.jsonl'));
}

/** True when a listed file is not one of the metadata / TTS / SLID files. */
function isAsrResultFile(file) {
  return !(file.includes('metadata') || file.includes('tts_') || file.includes('slid_'));
}

/** Look up a record's language metadata, falling back to its raw code. */
function resolveLangMeta(langMap, record) {
  return langMap[record.lang_code] || { name: record.lang_code, family: 'Unknown' };
}

/** Scale a score by 100; a missing/falsy score counts as 0. */
function toPercent(value) {
  return (value || 0) * 100;
}

/**
 * Index ASR scores as { Family: { Model: { Lang: {wer, cer} } } } and collect
 * the distinct families and models seen.
 *
 * The dictionaries are null-prototype objects: their keys come from remote
 * data, and a key such as "constructor" must not resolve to an inherited
 * Object.prototype member.
 */
function indexAsrScores(asrOnly, langMap) {
  const families = new Set();
  const models = new Set();
  const familyData = Object.create(null);

  for (const r of asrOnly) {
    const meta = resolveLangMeta(langMap, r);
    const fam = meta.family;
    const mod = r.model_name;
    families.add(fam);
    models.add(mod);

    familyData[fam] ??= Object.create(null);
    familyData[fam][mod] ??= Object.create(null);
    // A later record for the same family/model/language overwrites an earlier one.
    familyData[fam][mod][meta.name] = {
      wer: toPercent(r.datasets_avg_wer),
      cer: toPercent(r.datasets_avg_cer)
    };
  }

  return { families, models, familyData };
}

/**
 * Build the per-family view: for each family, one row per model holding the
 * average WER/CER over the languages that model has scores for, plus the
 * individual `WER_<lang>` / `CER_<lang>` values. Rows are sorted by Avg_WER.
 */
function buildAsrByFamily(families, familyData) {
  const asrByFamily = Object.create(null);

  for (const fam of Array.from(families).sort()) {
    const modData = familyData[fam];

    // Find all languages for this family
    const langsSet = new Set();
    for (const scores of Object.values(modData)) {
      for (const lang of Object.keys(scores)) langsSet.add(lang);
    }
    const langs = Array.from(langsSet).sort();

    // Build Rows
    const rows = Object.keys(modData).map(modName => {
      const scores = modData[modName];
      const langScores = {};
      let werSum = 0;
      let cerSum = 0;
      let count = 0;

      for (const lang of langs) {
        const score = scores[lang];
        if (!score) continue;
        werSum += score.wer;
        cerSum += score.cer;
        count += 1;
        langScores[`WER_${lang}`] = score.wer;
        langScores[`CER_${lang}`] = score.cer;
      }

      return {
        Model: modName,
        Avg_WER: count ? werSum / count : 0,
        Avg_CER: count ? cerSum / count : 0,
        ...langScores
      };
    });

    // Sort by WER
    rows.sort((a, b) => a.Avg_WER - b.Avg_WER);
    asrByFamily[fam] = { data: rows, languages: langs };
  }

  return asrByFamily;
}

/**
 * Build the per-model view: for each model, one row per ASR record, sorted by
 * language name. Records are grouped in a single pass.
 */
function buildAsrByModel(asrOnly, langMap) {
  const asrByModel = Object.create(null);

  for (const r of asrOnly) {
    const meta = resolveLangMeta(langMap, r);
    (asrByModel[r.model_name] ??= []).push({
      Language: meta.name,
      Family: meta.family,
      WER: toPercent(r.datasets_avg_wer),
      CER: toPercent(r.datasets_avg_cer)
    });
  }

  for (const rows of Object.values(asrByModel)) {
    // String() keeps the sort from throwing when a language name is missing.
    rows.sort((a, b) => String(a.Language).localeCompare(String(b.Language)));
  }

  return asrByModel;
}

/** Group TTS records by their `model` field and collect the model names. */
function groupTtsByModel(ttsData) {
  const ttsGrouped = Object.create(null);
  const ttsModels = new Set();

  for (const r of ttsData) {
    ttsModels.add(r.model);
    (ttsGrouped[r.model] ??= []).push(r);
  }

  return { ttsGrouped, ttsModels };
}

// --- Main Processor ---
/**
 * Download all result files, aggregate them and store the result in
 * CACHED_DATA.
 *
 * Files that fail to download contribute no records (fetchHFFile returns []
 * for them), so the cache is still populated in that case. A rejected fetch
 * (e.g. a network error) propagates to the caller and leaves CACHED_DATA
 * untouched.
 *
 * @returns {Promise<void>}
 */
async function loadAndProcessData() {
  console.log("Starting Data Sync...");

  // 1-3. Load metadata, TTS, SLID and the folder listing. These requests are
  // independent of each other, so they are issued together.
  const [metaList, ttsData, slidData, files] = await Promise.all([
    fetchHFFile("final_results/metadata.jsonl"),
    fetchHFFile("final_results/tts_results.jsonl"),
    fetchHFFile("final_results/slid_results.jsonl"),
    listHFFiles("final_results")
  ]);

  const langMap = Object.create(null);
  for (const m of metaList) {
    langMap[m.lang_code_key] = m;
  }

  // Load ASR files, skipping metadata/tts/slid if they appear in the list.
  // Promise.all keeps results in listing order; flat() avoids spreading a
  // very large array into push() arguments.
  const asrFiles = files.filter(isAsrResultFile);
  const asrRecords = (await Promise.all(asrFiles.map(file => fetchHFFile(file)))).flat();
  const asrOnly = asrRecords.filter(r => r.task === 'asr');

  // 4. Process ASR Data
  const { families, models, familyData } = indexAsrScores(asrOnly, langMap);

  // 5. Format ASR By Family
  const asrByFamily = buildAsrByFamily(families, familyData);

  // 6. Format ASR By Model
  const asrByModel = buildAsrByModel(asrOnly, langMap);

  // 7. Format TTS
  const { ttsGrouped, ttsModels } = groupTtsByModel(ttsData);

  CACHED_DATA = {
    metadata: {
      families: Array.from(families).sort(),
      models: Array.from(models).sort(),
      tts_models: Array.from(ttsModels).sort()
    },
    asr: { by_family: asrByFamily, by_model: asrByModel },
    tts: ttsGrouped,
    slid: slidData
  };

  console.log("Data Processing Complete.");
}

/**
 * Make sure CACHED_DATA is populated, starting a sync only if none is already
 * running. The in-flight promise is cleared once it settles so that a failed
 * sync can be retried by a later call.
 *
 * @returns {Promise<void>} Rejects when the sync fails.
 */
function ensureDataLoaded() {
  if (CACHED_DATA) return Promise.resolve();
  if (!loadInFlight) {
    loadInFlight = loadAndProcessData().finally(() => {
      loadInFlight = null;
    });
  }
  return loadInFlight;
}

// API Route
app.get('/api/data', async (req, res) => {
  try {
    await ensureDataLoaded();
  } catch (e) {
    return res.status(500).json({ error: e.message });
  }
  res.json(CACHED_DATA);
});

// Start
app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server listening on port ${PORT}`);
  // Initial load. The rejection is handled here so a failed startup sync does
  // not surface as an unhandled promise rejection; the next /api/data request
  // retries the load.
  ensureDataLoaded().catch(e => {
    console.error(`Initial data load failed: ${e.message}`);
  });
});