// Hugging Face Space backend: Express server that downloads benchmark result
// files (ASR / TTS / SLID) from a HF dataset repo, aggregates them, and serves
// the processed payload to the front-end.
const express = require('express');
const app = express();
const PORT = 7860; // NOTE(review): presumably the port the HF Space exposes — confirm Space config
// Credentials/config come from the Space's environment secrets.
const HF_TOKEN = process.env.HF_TOKEN;   // bearer token for authenticated dataset downloads
const REPO_ID = process.env.SIMBA_DATA;  // dataset repo id, used as "datasets/<REPO_ID>" in URLs
// In-memory cache of the fully processed payload; filled by loadAndProcessData().
let CACHED_DATA = null;
app.use(express.static('public')); // serve the front-end assets from ./public
// --- Fetch Helper ---
/**
 * Download a JSONL file from the dataset repo and parse it line by line.
 * @param {string} filename - path inside the repo, e.g. "final_results/metadata.jsonl"
 * @returns {Promise<object[]>} parsed records; [] on any HTTP error.
 *   Blank and unparseable lines are silently skipped.
 */
async function fetchHFFile(filename) {
  // BUG FIX: the URL and log line contained the literal text "$(unknown)"
  // instead of interpolating the filename, so every request resolved the
  // same bogus path. `${filename}` is the intended interpolation.
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${filename}`;
  console.log(`Downloading: ${filename}...`);
  const res = await fetch(url, { headers: { "Authorization": `Bearer ${HF_TOKEN}` } });
  if (!res.ok) return [];
  const text = await res.text();
  // JSONL: one JSON object per non-empty line; drop lines that fail to parse.
  return text.split('\n').filter(line => line.trim()).map(line => {
    try { return JSON.parse(line); } catch (e) { return null; }
  }).filter(x => x);
}
// --- List Files Helper ---
/**
 * List the repo paths of every .jsonl entry under `folder` via the
 * HF dataset tree API. Returns [] when the API call fails.
 * @param {string} folder - folder inside the dataset repo
 * @returns {Promise<string[]>}
 */
async function listHFFiles(folder) {
  const treeUrl = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;
  const response = await fetch(treeUrl, { headers: { "Authorization": `Bearer ${HF_TOKEN}` } });
  if (!response.ok) {
    return [];
  }
  const entries = await response.json();
  const jsonlPaths = [];
  for (const entry of entries) {
    if (entry.path.endsWith('.jsonl')) {
      jsonlPaths.push(entry.path);
    }
  }
  return jsonlPaths;
}
// --- Processor ---

// Resolve language metadata for a raw lang code, falling back to a stub so
// codes missing from metadata.jsonl still render (family "Unknown").
function resolveLangMeta(langMap, code) {
  return langMap[code] || { name: code, family: 'Unknown', lang_code_key: code };
}

// Build the per-family leaderboard tables from the
// family -> model -> language-name -> {wer, cer, iso} index.
function buildAsrByFamily(families, familyData) {
  const asrByFamily = {};
  Array.from(families).sort().forEach(fam => {
    const modData = familyData[fam];
    // Union of languages (with ISO codes) seen for any model in this family.
    const langMapForFamily = new Map();
    Object.values(modData).forEach(scores => {
      Object.keys(scores).forEach(langName => {
        langMapForFamily.set(langName, scores[langName].iso);
      });
    });
    const languages = Array.from(langMapForFamily.keys()).sort().map(name => ({
      name,
      iso: langMapForFamily.get(name),
    }));
    const rows = Object.keys(modData).map(modName => {
      const scores = modData[modName];
      let sumWer = 0, sumCer = 0, n = 0;
      // Per-language columns only exist where the model has a score.
      const row = { Model: modName, Avg_WER: 0, Avg_CER: 0 };
      languages.forEach(l => {
        const s = scores[l.name];
        if (s) {
          sumWer += s.wer;
          sumCer += s.cer;
          n++;
          row[`WER_${l.name}`] = s.wer;
          row[`CER_${l.name}`] = s.cer;
        }
      });
      if (n) {
        row.Avg_WER = sumWer / n;
        row.Avg_CER = sumCer / n;
      }
      return row;
    });
    rows.sort((a, b) => a.Avg_WER - b.Avg_WER); // best (lowest WER) first
    asrByFamily[fam] = { data: rows, languages };
  });
  return asrByFamily;
}

// Build the per-model language tables, sorted alphabetically by language.
function buildAsrByModel(models, asrRecords, langMap) {
  const asrByModel = {};
  Array.from(models).forEach(mod => {
    const rows = asrRecords
      .filter(r => r.model_name === mod && r.task === 'asr')
      .map(r => {
        const meta = resolveLangMeta(langMap, r.lang_code);
        return {
          Language: meta.name,
          ISO: meta.lang_code_key,
          Family: meta.family,
          WER: (r.datasets_avg_wer || 0) * 100, // fractions -> percentages
          CER: (r.datasets_avg_cer || 0) * 100,
        };
      });
    rows.sort((a, b) => a.Language.localeCompare(b.Language));
    asrByModel[mod] = rows;
  });
  return asrByModel;
}

// Group TTS records by model, annotating each record with its ISO code.
function groupTtsByModel(ttsData, langMap) {
  const ttsGrouped = {};
  const ttsModels = new Set();
  ttsData.forEach(r => {
    ttsModels.add(r.model);
    if (!ttsGrouped[r.model]) ttsGrouped[r.model] = [];
    const meta = langMap[r.lang_code] || { lang_code_key: r.lang_code };
    r.iso = meta.lang_code_key; // NOTE: mutates the record in place (as before)
    ttsGrouped[r.model].push(r);
  });
  return { ttsGrouped, ttsModels };
}

/**
 * Download every result file from the dataset repo, join ASR/TTS records with
 * language metadata, and publish the aggregate payload into CACHED_DATA.
 * Previously each file was awaited sequentially; independent downloads now run
 * in parallel via Promise.all (same results, faster sync).
 */
async function loadAndProcessData() {
  console.log("Starting Data Sync...");
  // 1-3. Metadata, TTS, SLID and the file listing are independent downloads.
  const [metaList, ttsData, slidData, files] = await Promise.all([
    fetchHFFile("final_results/metadata.jsonl"),
    fetchHFFile("final_results/tts_results.jsonl"),
    fetchHFFile("final_results/slid_results.jsonl"),
    listHFFiles("final_results"),
  ]);
  // Map code key to full metadata object.
  const langMap = {};
  metaList.forEach(m => { langMap[m.lang_code_key] = m; });
  // Every other .jsonl in final_results holds ASR results; fetch in parallel.
  const asrFiles = files.filter(f =>
    !(f.includes('metadata') || f.includes('tts_') || f.includes('slid_')));
  const asrChunks = await Promise.all(asrFiles.map(f => fetchHFFile(f)));
  const asrRecords = asrChunks.flat();
  // 4. Index ASR scores: family -> model -> language name.
  const families = new Set();
  const models = new Set();
  const familyData = {};
  asrRecords.forEach(r => {
    if (r.task !== 'asr') return;
    const meta = resolveLangMeta(langMap, r.lang_code);
    const fam = meta.family;
    const mod = r.model_name;
    families.add(fam);
    models.add(mod);
    if (!familyData[fam]) familyData[fam] = {};
    if (!familyData[fam][mod]) familyData[fam][mod] = {};
    familyData[fam][mod][meta.name] = {
      wer: (r.datasets_avg_wer || 0) * 100,
      cer: (r.datasets_avg_cer || 0) * 100,
      iso: meta.lang_code_key,
    };
  });
  // 5-7. Shape the payload for the front-end.
  const { ttsGrouped, ttsModels } = groupTtsByModel(ttsData, langMap);
  CACHED_DATA = {
    metadata: {
      families: Array.from(families).sort(),
      models: Array.from(models).sort(),
      tts_models: Array.from(ttsModels).sort(),
    },
    asr: {
      by_family: buildAsrByFamily(families, familyData),
      by_model: buildAsrByModel(models, asrRecords, langMap),
    },
    tts: ttsGrouped,
    slid: slidData,
  };
  console.log("Data Ready.");
}
// Serve the processed payload. If the startup sync failed or has not finished,
// rebuild the cache lazily on first request; report a 500 if that also fails.
app.get('/api/data', async (req, res) => {
  if (!CACHED_DATA) {
    // NOTE(review): concurrent first requests can each trigger a sync; looks
    // acceptable for this app, but confirm if the dataset is large.
    try { await loadAndProcessData(); }
    catch (e) { return res.status(500).json({error: e.message}); }
  }
  res.json(CACHED_DATA);
});
app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server running on ${PORT}`);
  // Warm the cache in the background. BUG FIX: this promise was floating —
  // a rejected initial sync became an unhandled rejection (fatal in recent
  // Node). Log the failure instead; /api/data retries the sync lazily.
  loadAndProcessData().catch(err => console.error("Initial data sync failed:", err));
});