Spaces:
Running
Running
// Express server for the SIMBA leaderboard Space.
// NOTE(review): relies on the global fetch API (Node 18+) — no node-fetch import here.
const express = require('express');
const app = express();
// Hugging Face Spaces route traffic to port 7860 by convention.
const PORT = 7860;
// Optional HF access token; without it, dataset requests go unauthenticated
// (fine for public datasets).
const HF_TOKEN = process.env.HF_TOKEN;
// Dataset repo id ("owner/name") holding the results — presumably set via
// Space secrets/variables; verify in the Space settings.
const REPO_ID = process.env.SIMBA_DATA;
// In-memory cache of the fully processed leaderboard payload (null until loaded).
let CACHED_DATA = null;
// Serve the static frontend.
app.use(express.static('public'));
// --- Fetch Helper ---
// Downloads a JSONL file from the dataset repo and parses it line by line.
// Malformed lines are skipped; any fetch/network failure yields an empty array.
async function fetchHFFile(path) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${path}`;
  console.log(`[Loading] ${path}...`);
  const headers = HF_TOKEN ? { Authorization: `Bearer ${HF_TOKEN}` } : {};
  try {
    const res = await fetch(url, { headers });
    if (!res.ok) {
      console.error(`[Error] Failed to fetch ${path}: ${res.status} ${res.statusText}`);
      return [];
    }
    const body = await res.text();
    const records = [];
    for (const rawLine of body.split('\n')) {
      if (!rawLine.trim()) continue;
      try {
        records.push(JSON.parse(rawLine));
      } catch (e) {
        // Skip unparseable lines silently, matching the lenient JSONL read.
      }
    }
    return records;
  } catch (err) {
    console.error(`[Error] Network error fetching ${path}:`, err);
    return [];
  }
}
// --- List Files Helper ---
// Lists a folder in the dataset repo via the HF tree API and returns the
// paths of all .jsonl entries. Returns [] on any HTTP or network failure.
async function listHFFiles(folder) {
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;
  const headers = HF_TOKEN ? { Authorization: `Bearer ${HF_TOKEN}` } : {};
  try {
    const res = await fetch(url, { headers });
    if (!res.ok) {
      console.error(`[Error] Failed to list files: ${res.status}`);
      return [];
    }
    const entries = await res.json();
    const jsonlPaths = [];
    for (const entry of entries) {
      // Keep only the .jsonl data shards.
      if (entry.path.endsWith('.jsonl')) jsonlPaths.push(entry.path);
    }
    return jsonlPaths;
  } catch (err) {
    console.error("[Error] Network error listing files:", err);
    return [];
  }
}
// --- Processor ---
// Fetches all result shards from the dataset repo, buckets the records by
// task (TTS / SLID / ASR), reshapes each bucket for the frontend, and stores
// the final payload in CACHED_DATA.
//
// Fixes vs. previous version:
//  - data shards are fetched in parallel (Promise.all) instead of one-by-one;
//    record order is preserved because Promise.all keeps input order;
//  - asrByModel is built in a single pass instead of rescanning every ASR
//    record once per model (was O(models * records)).
async function loadAndProcessData() {
  console.log("--- Starting Data Sync ---");

  // 1. Fetch metadata and the shard listing in parallel (independent requests).
  const [metaList, files] = await Promise.all([
    fetchHFFile("final_results/metadata.jsonl"),
    listHFFiles("final_results"),
  ]);
  // lang_code -> { name, family, lang_code_key, ... }
  const langMap = {};
  metaList.forEach((m) => { langMap[m.lang_code_key] = m; });

  // 2. Fetch ALL data shards concurrently; flatten in listing order.
  const dataFiles = files.filter((f) => !f.endsWith('metadata.jsonl'));
  const perFileRecords = await Promise.all(dataFiles.map((f) => fetchHFFile(f)));
  const allRecords = perFileRecords.flat();
  console.log(`[Processing] Loaded ${allRecords.length} total records. Bucketing by task...`);

  // 3. Bucket records by task (records may carry 'task' OR 'data_type').
  const ttsRecords = [];
  const slidRecords = [];
  const asrRecords = [];
  allRecords.forEach((r) => {
    const identifier = r.task || r.data_type;
    if (!identifier) return; // untagged record — drop it
    const task = identifier.toLowerCase().trim();
    if (task === 'tts') ttsRecords.push(r);
    else if (task === 'slid') slidRecords.push(r);
    else if (task === 'asr') asrRecords.push(r);
  });
  console.log(`[Counts] ASR: ${asrRecords.length}, TTS: ${ttsRecords.length}, SLID: ${slidRecords.length}`);

  // --- PROCESS SLID ---
  // One row per language; each model's F1 becomes a column keyed by model name.
  const slidMap = {};
  slidRecords.forEach((r) => {
    const code = r.lang_code;
    const meta = langMap[code] || { name: code, family: 'Unknown' };
    if (!slidMap[code]) {
      slidMap[code] = { Language: meta.name, ISO: code };
    }
    const modelName = r.model_name || r.model || "Unknown Model";
    slidMap[code][modelName] = r.f1_score;
  });
  const slidFinal = Object.values(slidMap);

  // --- PROCESS TTS ---
  // Group raw records per model; annotate each record in place so the
  // frontend finds both 'Language' (dropdown) and 'language' (table) keys.
  const ttsGrouped = {};
  const ttsModels = new Set();
  ttsRecords.forEach((r) => {
    const modelName = r.model || r.model_name || "Unknown Model";
    ttsModels.add(modelName);
    if (!ttsGrouped[modelName]) ttsGrouped[modelName] = [];
    const meta = langMap[r.lang_code] || { name: r.language || r.lang_code, lang_code_key: r.lang_code };
    r.iso = r.lang_code;
    r.Language = meta.name; // Ensure capital 'Language' key exists for dropdown
    r.language = meta.name; // Ensure lowercase 'language' key exists for table lookups
    ttsGrouped[modelName].push(r);
  });

  // --- PROCESS ASR ---
  // familyData[family][model][languageName] = { wer, cer, iso } (percent scale).
  const families = new Set();
  const models = new Set();
  const familyData = {};
  asrRecords.forEach((r) => {
    const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
    const fam = meta.family;
    const mod = r.model_name || r.model;
    families.add(fam);
    models.add(mod);
    if (!familyData[fam]) familyData[fam] = {};
    if (!familyData[fam][mod]) familyData[fam][mod] = {};
    familyData[fam][mod][meta.name] = {
      wer: (r.datasets_avg_wer || 0) * 100,
      cer: (r.datasets_avg_cer || 0) * 100,
      iso: meta.lang_code_key
    };
  });

  // Format ASR by family: per family, the union of languages seen across its
  // models, plus one row per model with per-language and average WER/CER.
  const asrByFamily = {};
  Array.from(families).sort().forEach((fam) => {
    const modData = familyData[fam];
    const langMapForFamily = new Map();
    Object.values(modData).forEach((m) => {
      Object.keys(m).forEach((langName) => {
        langMapForFamily.set(langName, m[langName].iso);
      });
    });
    const languages = Array.from(langMapForFamily.keys()).sort().map((name) => ({
      name: name,
      iso: langMapForFamily.get(name)
    }));
    const rows = Object.keys(modData).map((modName) => {
      const scores = modData[modName];
      // Average only over the languages this model actually has scores for.
      let sW = 0, sC = 0, n = 0;
      languages.forEach((l) => {
        if (scores[l.name]) { sW += scores[l.name].wer; sC += scores[l.name].cer; n++; }
      });
      const row = { Model: modName, Avg_WER: n ? sW / n : 0, Avg_CER: n ? sC / n : 0 };
      languages.forEach((l) => {
        if (scores[l.name]) {
          row[`WER_${l.name}`] = scores[l.name].wer;
          row[`CER_${l.name}`] = scores[l.name].cer;
        }
      });
      return row;
    });
    rows.sort((a, b) => a.Avg_WER - b.Avg_WER);
    asrByFamily[fam] = { data: rows, languages: languages };
  });

  // Format ASR by model: single pass over asrRecords (previously rescanned
  // the whole array for each model).
  const asrByModel = {};
  asrRecords.forEach((r) => {
    const mod = r.model_name || r.model;
    const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
    if (!asrByModel[mod]) asrByModel[mod] = [];
    asrByModel[mod].push({
      Language: meta.name,
      ISO: meta.lang_code_key,
      Family: meta.family,
      WER: (r.datasets_avg_wer || 0) * 100,
      CER: (r.datasets_avg_cer || 0) * 100
    });
  });
  Object.values(asrByModel).forEach((rows) => {
    rows.sort((a, b) => a.Language.localeCompare(b.Language));
  });

  CACHED_DATA = {
    metadata: {
      families: Array.from(families).sort(),
      models: Array.from(models).sort(),
      tts_models: Array.from(ttsModels).sort()
    },
    asr: { by_family: asrByFamily, by_model: asrByModel },
    tts: ttsGrouped,
    slid: slidFinal
  };
  console.log("--- Data Ready ---");
}
// API endpoint: returns the cached payload, performing a lazy load on the
// first request if the startup sync has not completed (or failed) yet.
app.get('/api/data', async (req, res) => {
  if (CACHED_DATA === null) {
    try {
      await loadAndProcessData();
    } catch (e) {
      res.status(500).json({ error: e.message });
      return;
    }
  }
  res.json(CACHED_DATA);
});
// Start the server, then warm the cache in the background.
// Fix: the original left loadAndProcessData() as a floating promise — a
// rejection during the initial sync would surface as an unhandled promise
// rejection (fatal on modern Node). Log it instead; /api/data retries lazily.
app.listen(PORT, '0.0.0.0', () => {
  console.log(`Server running on ${PORT}`);
  loadAndProcessData().catch((err) => {
    console.error("[Error] Initial data load failed:", err);
  });
});