elmadany committed on
Commit
d77393e
·
verified ·
1 Parent(s): 2b8ba04

Update server.js

Browse files
Files changed (1) hide show
  1. server.js +83 -75
server.js CHANGED
@@ -10,83 +10,114 @@ let CACHED_DATA = null;
10
  app.use(express.static('public'));
11
 
12
  // --- Fetch Helper ---
13
- async function fetchHFFile(filename) {
14
- const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${filename}`;
15
- console.log(`Downloading: ${filename}...`);
16
- const res = await fetch(url, { headers: { "Authorization": `Bearer ${HF_TOKEN}` } });
17
- if (!res.ok) return [];
18
- const text = await res.text();
19
- return text.split('\n').filter(line => line.trim()).map(line => {
20
- try { return JSON.parse(line); } catch(e) { return null; }
21
- }).filter(x => x);
 
 
 
 
 
 
 
 
 
 
 
22
  }
23
 
24
  // --- List Files Helper ---
25
  async function listHFFiles(folder) {
26
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;
27
- const res = await fetch(url, { headers: { "Authorization": `Bearer ${HF_TOKEN}` } });
28
- if (!res.ok) return [];
29
- const files = await res.json();
30
- return files.filter(f => f.path.endsWith('.jsonl')).map(f => f.path);
 
 
 
 
 
 
 
 
 
31
  }
32
 
33
  // --- Processor ---
34
  async function loadAndProcessData() {
35
- console.log("Starting Data Sync...");
36
 
37
- // 1. Metadata
38
  const metaList = await fetchHFFile("final_results/metadata.jsonl");
39
  const langMap = {};
40
- // Map code key to full metadata object
41
  metaList.forEach(m => { langMap[m.lang_code_key] = m; });
42
 
43
- // 2. Load TTS & SLID
44
- const ttsData = await fetchHFFile("final_results/tts_results.jsonl");
45
- const slidRecords = await fetchHFFile("final_results/slid_results.jsonl");
 
 
 
 
 
 
 
 
 
 
46
 
47
- // --- NEW SLID PROCESSING LOGIC ---
48
- // Pivot from Long Format -> Wide Format for Frontend
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  const slidMap = {};
50
  slidRecords.forEach(r => {
51
- // r = { task: "SLID", lang_code: "bin", model_name: "MMS...", f1_score: 6.25 }
52
  const code = r.lang_code;
53
-
54
- // Find language name from metadata, fallback to code if missing
55
  const meta = langMap[code] || { name: code, family: 'Unknown' };
56
 
57
  if (!slidMap[code]) {
58
- slidMap[code] = {
59
- Language: meta.name,
60
- ISO: code
61
- // Model columns will be added dynamically below
62
- };
63
  }
64
-
65
- // Assign score to the model key (e.g., row["MMS-LID-1024"] = 6.25)
66
  slidMap[code][r.model_name] = r.f1_score;
67
  });
68
- // Convert Map back to Array
69
  const slidFinal = Object.values(slidMap);
70
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- // 3. Load ASR Files
73
- const files = await listHFFiles("final_results");
74
- let asrRecords = [];
75
- for (const file of files) {
76
- if (file.includes('metadata') || file.includes('tts_') || file.includes('slid_')) continue;
77
- const records = await fetchHFFile(file);
78
- asrRecords.push(...records);
79
- }
80
-
81
- // 4. Process ASR
82
  const families = new Set();
83
  const models = new Set();
84
  const familyData = {};
85
 
86
  asrRecords.forEach(r => {
87
- if (r.task !== 'asr') return;
88
-
89
- // Use metadata to get Family and ISO
90
  const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
91
  const fam = meta.family;
92
  const mod = r.model_name;
@@ -97,7 +128,6 @@ async function loadAndProcessData() {
97
  if (!familyData[fam]) familyData[fam] = {};
98
  if (!familyData[fam][mod]) familyData[fam][mod] = {};
99
 
100
- // Store by Language Name
101
  familyData[fam][mod][meta.name] = {
102
  wer: (r.datasets_avg_wer || 0) * 100,
103
  cer: (r.datasets_avg_cer || 0) * 100,
@@ -105,17 +135,14 @@ async function loadAndProcessData() {
105
  };
106
  });
107
 
108
- // 5. Format ASR By Family
109
  const asrByFamily = {};
110
  Array.from(families).sort().forEach(fam => {
111
  const modData = familyData[fam];
112
-
113
- // Collect Languages with their ISO codes
114
  const langMapForFamily = new Map();
115
  Object.values(modData).forEach(m => {
116
  Object.keys(m).forEach(langName => {
117
- const iso = m[langName].iso;
118
- langMapForFamily.set(langName, iso);
119
  });
120
  });
121
 
@@ -127,17 +154,10 @@ async function loadAndProcessData() {
127
  const rows = Object.keys(modData).map(modName => {
128
  const scores = modData[modName];
129
  let sW=0, sC=0, n=0;
130
-
131
  languages.forEach(l => {
132
  if(scores[l.name]) { sW += scores[l.name].wer; sC += scores[l.name].cer; n++; }
133
  });
134
-
135
- const row = {
136
- Model: modName,
137
- Avg_WER: n ? sW/n : 0,
138
- Avg_CER: n ? sC/n : 0
139
- };
140
-
141
  languages.forEach(l => {
142
  if(scores[l.name]) {
143
  row[`WER_${l.name}`] = scores[l.name].wer;
@@ -146,17 +166,16 @@ async function loadAndProcessData() {
146
  });
147
  return row;
148
  });
149
-
150
  rows.sort((a,b) => a.Avg_WER - b.Avg_WER);
151
  asrByFamily[fam] = { data: rows, languages: languages };
152
  });
153
 
154
- // 6. Format ASR By Model
155
  const asrByModel = {};
156
  Array.from(models).forEach(mod => {
157
  const rows = [];
158
  asrRecords.forEach(r => {
159
- if(r.model_name === mod && r.task === 'asr') {
160
  const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
161
  rows.push({
162
  Language: meta.name,
@@ -171,17 +190,6 @@ async function loadAndProcessData() {
171
  asrByModel[mod] = rows;
172
  });
173
 
174
- // 7. Format TTS
175
- const ttsGrouped = {};
176
- const ttsModels = new Set();
177
- ttsData.forEach(r => {
178
- ttsModels.add(r.model);
179
- if(!ttsGrouped[r.model]) ttsGrouped[r.model] = [];
180
- const meta = langMap[r.lang_code] || { lang_code_key: r.lang_code };
181
- r.iso = meta.lang_code_key;
182
- ttsGrouped[r.model].push(r);
183
- });
184
-
185
  CACHED_DATA = {
186
  metadata: {
187
  families: Array.from(families).sort(),
@@ -190,9 +198,9 @@ async function loadAndProcessData() {
190
  },
191
  asr: { by_family: asrByFamily, by_model: asrByModel },
192
  tts: ttsGrouped,
193
- slid: slidFinal // Pass the pivoted data
194
  };
195
- console.log("Data Ready.");
196
  }
197
 
198
  app.get('/api/data', async (req, res) => {
 
10
  app.use(express.static('public'));
11
 
12
  // --- Fetch Helper ---
13
/**
 * Download a JSON Lines file from the dataset repo and parse it into records.
 *
 * Best-effort: an HTTP error status or a thrown fetch/read error is logged and
 * results in an empty array, so callers always receive an array.
 *
 * @param {string} path - File path inside the repo, relative to the `main`
 *   revision (e.g. "final_results/metadata.jsonl").
 * @returns {Promise<Array>} Parsed truthy JSON values, one per non-blank line,
 *   in file order; `[]` on any failure.
 */
async function fetchHFFile(path) {
  const url = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${path}`;
  console.log(`[Loading] ${path}...`);

  // Only attach credentials when a token is configured; otherwise the header
  // would be sent as the literal string "Bearer undefined".
  const headers = HF_TOKEN ? { "Authorization": `Bearer ${HF_TOKEN}` } : {};

  try {
    const res = await fetch(url, { headers });
    if (!res.ok) {
      console.error(`[Error] Failed to fetch ${path}: ${res.status} ${res.statusText}`);
      return [];
    }

    const text = await res.text();
    const records = [];
    let malformed = 0;

    for (const line of text.split('\n')) {
      if (!line.trim()) continue; // skip blank lines (including a trailing newline)
      try {
        const parsed = JSON.parse(line);
        // Keep only truthy values, as before: a bare `null`/`0`/`false` line is not a record.
        if (parsed) records.push(parsed);
      } catch (e) {
        malformed += 1;
      }
    }

    // Surface dropped lines instead of discarding them silently.
    if (malformed > 0) {
      console.warn(`[Warn] Skipped ${malformed} malformed line(s) in ${path}`);
    }
    return records;
  } catch (err) {
    console.error(`[Error] Network error fetching ${path}:`, err);
    return [];
  }
}
34
 
35
  // --- List Files Helper ---
36
  async function listHFFiles(folder) {
37
  const url = `https://huggingface.co/api/datasets/${REPO_ID}/tree/main/${folder}`;
38
+ try {
39
+ const res = await fetch(url, { headers: { "Authorization": `Bearer ${HF_TOKEN}` } });
40
+ if (!res.ok) {
41
+ console.error(`[Error] Failed to list files: ${res.status}`);
42
+ return [];
43
+ }
44
+ const files = await res.json();
45
+ // Filter only .jsonl files
46
+ return files.filter(f => f.path.endsWith('.jsonl')).map(f => f.path);
47
+ } catch (err) {
48
+ console.error("[Error] Network error listing files:", err);
49
+ return [];
50
+ }
51
  }
52
 
53
  // --- Processor ---
54
  async function loadAndProcessData() {
55
+ console.log("--- Starting Data Sync ---");
56
 
57
+ // 1. Fetch Metadata (Crucial for language names)
58
  const metaList = await fetchHFFile("final_results/metadata.jsonl");
59
  const langMap = {};
 
60
  metaList.forEach(m => { langMap[m.lang_code_key] = m; });
61
 
62
+ // 2. Scan ALL files in final_results
63
+ const files = await listHFFiles("final_results");
64
+
65
+ let allRecords = [];
66
+
67
+ // Load every file found (except metadata which we already loaded)
68
+ for (const file of files) {
69
+ if (file.endsWith('metadata.jsonl')) continue;
70
+ const records = await fetchHFFile(file);
71
+ allRecords.push(...records);
72
+ }
73
+
74
+ console.log(`[Processing] Loaded ${allRecords.length} total records. Sorting by task...`);
75
 
76
+ // 3. Bucket Records by Task
77
+ const ttsRecords = [];
78
+ const slidRecords = [];
79
+ const asrRecords = [];
80
+
81
+ allRecords.forEach(r => {
82
+ if (!r.task) return;
83
+ const task = r.task.toLowerCase().trim();
84
+
85
+ if (task === 'tts') ttsRecords.push(r);
86
+ else if (task === 'slid') slidRecords.push(r);
87
+ else if (task === 'asr') asrRecords.push(r);
88
+ });
89
+
90
+ // --- PROCESS SLID ---
91
  const slidMap = {};
92
  slidRecords.forEach(r => {
 
93
  const code = r.lang_code;
 
 
94
  const meta = langMap[code] || { name: code, family: 'Unknown' };
95
 
96
  if (!slidMap[code]) {
97
+ slidMap[code] = { Language: meta.name, ISO: code };
 
 
 
 
98
  }
 
 
99
  slidMap[code][r.model_name] = r.f1_score;
100
  });
 
101
  const slidFinal = Object.values(slidMap);
102
 
103
+ // --- PROCESS TTS ---
104
+ const ttsGrouped = {};
105
+ const ttsModels = new Set();
106
+ ttsRecords.forEach(r => {
107
+ ttsModels.add(r.model);
108
+ if(!ttsGrouped[r.model]) ttsGrouped[r.model] = [];
109
+ const meta = langMap[r.lang_code] || { lang_code_key: r.lang_code };
110
+ r.iso = meta.lang_code_key;
111
+ r.Language = meta.name; // Ensure Language name is attached
112
+ ttsGrouped[r.model].push(r);
113
+ });
114
 
115
+ // --- PROCESS ASR ---
 
 
 
 
 
 
 
 
 
116
  const families = new Set();
117
  const models = new Set();
118
  const familyData = {};
119
 
120
  asrRecords.forEach(r => {
 
 
 
121
  const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
122
  const fam = meta.family;
123
  const mod = r.model_name;
 
128
  if (!familyData[fam]) familyData[fam] = {};
129
  if (!familyData[fam][mod]) familyData[fam][mod] = {};
130
 
 
131
  familyData[fam][mod][meta.name] = {
132
  wer: (r.datasets_avg_wer || 0) * 100,
133
  cer: (r.datasets_avg_cer || 0) * 100,
 
135
  };
136
  });
137
 
138
+ // Format ASR By Family
139
  const asrByFamily = {};
140
  Array.from(families).sort().forEach(fam => {
141
  const modData = familyData[fam];
 
 
142
  const langMapForFamily = new Map();
143
  Object.values(modData).forEach(m => {
144
  Object.keys(m).forEach(langName => {
145
+ langMapForFamily.set(langName, m[langName].iso);
 
146
  });
147
  });
148
 
 
154
  const rows = Object.keys(modData).map(modName => {
155
  const scores = modData[modName];
156
  let sW=0, sC=0, n=0;
 
157
  languages.forEach(l => {
158
  if(scores[l.name]) { sW += scores[l.name].wer; sC += scores[l.name].cer; n++; }
159
  });
160
+ const row = { Model: modName, Avg_WER: n ? sW/n : 0, Avg_CER: n ? sC/n : 0 };
 
 
 
 
 
 
161
  languages.forEach(l => {
162
  if(scores[l.name]) {
163
  row[`WER_${l.name}`] = scores[l.name].wer;
 
166
  });
167
  return row;
168
  });
 
169
  rows.sort((a,b) => a.Avg_WER - b.Avg_WER);
170
  asrByFamily[fam] = { data: rows, languages: languages };
171
  });
172
 
173
+ // Format ASR By Model
174
  const asrByModel = {};
175
  Array.from(models).forEach(mod => {
176
  const rows = [];
177
  asrRecords.forEach(r => {
178
+ if(r.model_name === mod) {
179
  const meta = langMap[r.lang_code] || { name: r.lang_code, family: 'Unknown', lang_code_key: r.lang_code };
180
  rows.push({
181
  Language: meta.name,
 
190
  asrByModel[mod] = rows;
191
  });
192
 
 
 
 
 
 
 
 
 
 
 
 
193
  CACHED_DATA = {
194
  metadata: {
195
  families: Array.from(families).sort(),
 
198
  },
199
  asr: { by_family: asrByFamily, by_model: asrByModel },
200
  tts: ttsGrouped,
201
+ slid: slidFinal
202
  };
203
+ console.log("--- Data Ready ---");
204
  }
205
 
206
  app.get('/api/data', async (req, res) => {