File size: 5,947 Bytes
6226dc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
const express = require('express');
// Express application instance; routes and middleware below are registered on it.
const app = express();

// Port handed to app.listen() when the server starts.
const PORT = 7860;

// Environment Variables from HF Space Settings:
//   HF_TOKEN   -> bearer token used for Hugging Face requests
//   SIMBA_DATA -> dataset repository id, exposed in this file as REPO_ID
const { HF_TOKEN, SIMBA_DATA: REPO_ID } = process.env;

// In-memory cache: null until loadAndProcessData() stores a processed payload.
let CACHED_DATA = null;

// Serve static files from 'public' folder
const staticAssets = express.static('public');
app.use(staticAssets);

// --- Helper: Download File from HF ---
/**
 * Downloads one JSONL file from the dataset repository (REPO_ID, `main`
 * revision) and parses it line by line.
 *
 * Blank lines are ignored. Lines that are not valid JSON, and lines whose
 * parsed value is falsy, are dropped; a warning reports how many lines
 * failed to parse.
 *
 * @param {string} filename - Path of the file inside the repository.
 * @returns {Promise<Array>} Parsed records in file order, or an empty array
 *   when the response status is not OK.
 */
async function fetchHFFile(filename) {
    const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${filename}`;
    console.log(`Downloading: ${filename}...`);

    // Only send credentials when a token is configured. Interpolating an
    // unset token would produce the literal header "Bearer undefined".
    // NOTE(review): the Hub presumably rejects such a header even for public
    // datasets - confirm.
    const headers = HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};
    const res = await fetch(url, { headers });
    if (!res.ok) {
        console.warn(`Failed to fetch ${filename}: ${res.statusText}`);
        return [];
    }

    const text = await res.text();

    // Parse JSONL
    const records = [];
    let malformed = 0;
    for (const line of text.split('\n')) {
        if (!line.trim()) continue;
        try {
            const parsed = JSON.parse(line);
            // Falsy values (null, 0, "", false) are not usable records.
            if (parsed) records.push(parsed);
        } catch (e) {
            malformed += 1;
        }
    }
    if (malformed > 0) {
        console.warn(`Skipped ${malformed} malformed line(s) in ${filename}`);
    }
    return records;
}

// --- Helper: List Files in Folder ---
/**
 * Lists the `.jsonl` files the Hub tree API reports for a folder of the
 * dataset repository (REPO_ID, `main` revision).
 *
 * @param {string} folder - Folder path inside the repository.
 * @returns {Promise<string[]>} The `path` of every listed entry ending in
 *   `.jsonl`. Empty when the response status is not OK or the payload is
 *   not an array.
 */
async function listHFFiles(folder) {
    const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;

    // Only send credentials when a token is configured. Interpolating an
    // unset token would produce the literal header "Bearer undefined".
    const headers = HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};
    const res = await fetch(url, { headers });
    if (!res.ok) return [];

    const files = await res.json();
    // A non-array body (for example an error object) has nothing to list.
    if (!Array.isArray(files)) return [];

    return files
        .map(entry => entry?.path)
        // Entries without a string path are skipped instead of throwing.
        .filter(path => typeof path === 'string' && path.endsWith('.jsonl'));
}

// --- Main Processor ---
/**
 * Downloads the result files under `final_results/`, aggregates them and
 * stores the outcome in the module-level CACHED_DATA.
 *
 * Stored shape:
 *   {
 *     metadata: { families, models, tts_models },   // sorted lists
 *     asr: {
 *       by_family: { [family]: { data: rows, languages } },
 *       by_model:  { [model]: [{ Language, Family, WER, CER }] }
 *     },
 *     tts:  { [model]: records },
 *     slid: records
 *   }
 *
 * WER/CER values are the records' `datasets_avg_wer` / `datasets_avg_cer`
 * multiplied by 100 (a missing value counts as 0).
 *
 * @returns {Promise<void>} Resolves once CACHED_DATA has been replaced.
 *   Rejects if one of the download helpers rejects; CACHED_DATA is then
 *   left unchanged.
 */
async function loadAndProcessData() {
    console.log("Starting Data Sync...");

    // 1-3. Metadata, TTS, SLID and the folder listing are independent of each
    // other, so they are requested concurrently.
    const [metaList, ttsData, slidData, files] = await Promise.all([
        fetchHFFile("final_results/metadata.jsonl"),
        fetchHFFile("final_results/tts_results.jsonl"),
        fetchHFFile("final_results/slid_results.jsonl"),
        listHFFiles("final_results")
    ]);

    // Null-prototype lookup table: a language code such as "constructor"
    // must not resolve to an inherited Object.prototype member.
    const langMap = Object.create(null);
    for (const m of metaList) langMap[m.lang_code_key] = m;

    // ASR files: the number of files is not known up front, so they are
    // still downloaded one at a time.
    const asrRecords = [];
    for (const file of files) {
        // Skip metadata/tts/slid if they appear in list
        if (file.includes('metadata') || file.includes('tts_') || file.includes('slid_')) continue;
        const records = await fetchHFFile(file);
        // Append one by one: push(...records) passes every record as a call
        // argument, which can exceed the engine's argument limit on big files.
        for (const rec of records) asrRecords.push(rec);
    }

    // 4. Process ASR Data in a single pass over the records.
    const families = new Set();
    const familyData = Object.create(null); // { Family: { Model: { Lang: {wer, cer} } } }
    const rowsByModel = new Map();          // model -> by-model rows, in record order

    for (const r of asrRecords) {
        if (r.task !== 'asr') continue;

        // Enrich. Fall back to the language code / 'Unknown' both when the
        // language has no metadata entry and when the entry lacks the field.
        // The name is coerced to a string because it is used as an object key
        // and compared with localeCompare below.
        const meta = langMap[r.lang_code];
        const langName = String(meta?.name ?? r.lang_code ?? 'Unknown');
        const fam = meta?.family ?? 'Unknown';
        const mod = r.model_name;
        const wer = (r.datasets_avg_wer || 0) * 100;
        const cer = (r.datasets_avg_cer || 0) * 100;

        families.add(fam);

        // Init structure; a later record for the same family/model/language
        // replaces the earlier score.
        familyData[fam] ??= Object.create(null);
        familyData[fam][mod] ??= Object.create(null);
        familyData[fam][mod][langName] = { wer, cer };

        if (!rowsByModel.has(mod)) rowsByModel.set(mod, []);
        rowsByModel.get(mod).push({ Language: langName, Family: fam, WER: wer, CER: cer });
    }

    // 5. Format ASR By Family
    const sortedFamilies = Array.from(families).sort();
    const asrByFamily = {};
    for (const fam of sortedFamilies) {
        const modData = familyData[fam];

        // Find all languages for this family
        const langsSet = new Set();
        for (const scores of Object.values(modData)) {
            for (const l of Object.keys(scores)) langsSet.add(l);
        }
        const langs = Array.from(langsSet).sort();

        // Build Rows: one per model, averaged over the languages it covers.
        const rows = Object.keys(modData).map(modName => {
            const scores = modData[modName];
            const perLang = {};
            let sumWer = 0;
            let sumCer = 0;
            let count = 0;

            for (const l of langs) {
                const score = scores[l];
                if (!score) continue;
                sumWer += score.wer;
                sumCer += score.cer;
                count += 1;
                perLang[`WER_${l}`] = score.wer;
                perLang[`CER_${l}`] = score.cer;
            }

            return {
                Model: modName,
                Avg_WER: count ? sumWer / count : 0,
                Avg_CER: count ? sumCer / count : 0,
                ...perLang
            };
        });

        // Sort by WER
        rows.sort((a, b) => a.Avg_WER - b.Avg_WER);
        asrByFamily[fam] = { data: rows, languages: langs };
    }

    // 6. Format ASR By Model (rows were grouped during the pass above).
    const asrByModel = {};
    for (const [mod, rows] of rowsByModel) {
        rows.sort((a, b) => a.Language.localeCompare(b.Language));
        asrByModel[mod] = rows;
    }

    // 7. Format TTS
    const ttsGrouped = {};
    const ttsModels = new Set();
    for (const r of ttsData) {
        ttsModels.add(r.model);
        // Own-property check so a model named like an inherited member
        // (e.g. "constructor") still gets its own list.
        if (!Object.hasOwn(ttsGrouped, r.model)) ttsGrouped[r.model] = [];
        ttsGrouped[r.model].push(r);
    }

    CACHED_DATA = {
        metadata: {
            families: sortedFamilies,
            models: Array.from(rowsByModel.keys()).sort(),
            tts_models: Array.from(ttsModels).sort()
        },
        asr: { by_family: asrByFamily, by_model: asrByModel },
        tts: ttsGrouped,
        slid: slidData
    };
    console.log("Data Processing Complete.");
}

// API Route
// GET /api/data: responds with the cached payload, loading it first when the
// cache is still empty. A failed load is reported as HTTP 500 with the error
// message in the body.
app.get('/api/data', async (request, response) => {
    const cacheIsEmpty = !CACHED_DATA;
    if (cacheIsEmpty) {
        try {
            await loadAndProcessData();
        } catch (err) {
            response.status(500).json({ error: err.message });
            return;
        }
    }
    response.json(CACHED_DATA);
});

// Start
// Bind on all interfaces and warm the cache once the server is listening.
app.listen(PORT, '0.0.0.0', () => {
    console.log(`Server listening on port ${PORT}`);
    // Initial load. The promise must not be left floating: a rejected load
    // would otherwise surface as an unhandled rejection. On failure the cache
    // stays empty and the /api/data handler attempts the load again on the
    // next request.
    loadAndProcessData().catch((err) => {
        console.error(`Initial data load failed: ${err?.message ?? err}`);
    });
});