melakio commited on
Commit
b414ccb
·
verified ·
1 Parent(s): 4011b1e

Upload generate_top_1000.js with huggingface_hub

Browse files
Files changed (1) hide show
  1. generate_top_1000.js +176 -0
generate_top_1000.js ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * generate_top_1000.js
 *
 * Builds ../src/data/quran_top_1000.json from the word-level parquet data:
 *   1. Strip harakat (diacritics) from each word so vocalised variants group
 *      under one de-vocalised "stem".
 *   2. Rank stems by total occurrence count and keep the top 1000.
 *   3. For each stem, use the most frequent vocalised variant as the
 *      representative grapheme / phonetic / English gloss.
 *   4. Batch-translate the English glosses to French via translate-google,
 *      sending indexed "idx | text" lines so results re-align reliably.
 *   5. Merge hand-curated fields (root, meaning_context, manual translation)
 *      scraped from the existing quran_vocabulary.ts so custom work is kept.
 */
const duckdb = require('duckdb');
const fs = require('fs');
const path = require('path');
const translate = require('translate-google');

// Arabic harakat + dagger alef, written as Unicode escapes so the source
// survives re-encoding: fatha, fathatan, damma, dammatan, kasra, kasratan,
// shadda, sukun, superscript alef.
const HARAKAT = '\u064E\u064B\u064F\u064C\u0650\u064D\u0651\u0652\u0670';

const db = new duckdb.Database(':memory:');

db.all(`
  SELECT regexp_replace(word_ar, '[${HARAKAT}]', '', 'g') as clean_ar,
         word_ar,
         word_en,
         word_tr as phonetic,
         count(*) as cnt
  FROM 'data/*.parquet'
  GROUP BY clean_ar, word_ar, word_en, phonetic
`, async (err, rows) => {
  if (err) throw err;

  // Aggregate variants by de-vocalised stem.
  const stems = {};
  for (const row of rows) {
    if (!stems[row.clean_ar]) {
      stems[row.clean_ar] = { totalCnt: 0, variants: [] };
    }
    // duckdb returns counts as BigInt; coerce to Number for arithmetic/JSON.
    stems[row.clean_ar].totalCnt += Number(row.cnt);
    stems[row.clean_ar].variants.push({
      word_ar: row.word_ar,
      en: row.word_en,
      phonetic: row.phonetic,
      cnt: Number(row.cnt)
    });
  }

  // Top 1000 stems by total frequency, descending.
  const sortedStems = Object.keys(stems)
    .sort((a, b) => stems[b].totalCnt - stems[a].totalCnt)
    .slice(0, 1000);

  const top1000 = [];

  for (let i = 0; i < sortedStems.length; i++) {
    const clean = sortedStems[i];
    const variants = stems[clean].variants.sort((a, b) => b.cnt - a.cnt);

    // Copy the most frequent variant so the special-case below does not
    // mutate the entry still referenced from `stems`.
    const rep = { ...variants[0] };

    // "Allah" gets a canonical spelling/phonetic regardless of which
    // vocalised variant happens to be most frequent in the data.
    // ('الله' and 'اللّٰه', canonicalised to 'اللَّهُ' / "Allāhu")
    if (clean === '\u0627\u0644\u0644\u0647' ||
        clean === '\u0627\u0644\u0644\u0651\u0670\u0647') {
      rep.word_ar = '\u0627\u0644\u0644\u0651\u064E\u0647\u064F';
      rep.phonetic = 'All\u0101hu';
      rep.en = 'Allah';
    }

    top1000.push({
      id: `q_word_${i + 1}`,
      grapheme: rep.word_ar,
      phonetic_fr: rep.phonetic,
      translation_en: rep.en,
      translation_fr: "",
      frequency: stems[clean].totalCnt,
      // Bucket tag: top_10, top_20, ... top_1000.
      tags: ["quran_freq", `top_${Math.ceil((i + 1) / 10) * 10}`],
      type: "word",
      position: "isolated"
    });
  }

  console.log(`Extracted top 1000 words. Starting robust translation...`);

  // Translate English to French in text blocks with IDs to guarantee
  // perfectly mapped alignments.
  const chunkSize = 50;
  for (let i = 0; i < top1000.length; i += chunkSize) {
    const chunk = top1000.slice(i, i + chunkSize);

    // Build an indexed string: "0 | house\n1 | dog"
    const blockString = chunk.map((w, idx) => {
      // Parenthesised glosses confuse the translator — drop them.
      const cleanEn = w.translation_en.replace(/\(.*?\)/g, '').trim();
      return `${idx} | ${cleanEn}`;
    }).join('\n');

    try {
      const frText = await translate(blockString, { to: 'fr' });

      // Parse back the translated lines and re-align via the leading index.
      for (let line of frText.split('\n')) {
        line = line.trim();
        const parts = line.split('|');
        if (parts.length >= 2) {
          let translatedFr = parts[1].trim();
          translatedFr = translatedFr.charAt(0).toUpperCase() + translatedFr.slice(1);
          // Match the index back to the chunk.
          const originalIdx = Number.parseInt(parts[0].trim(), 10);
          if (!Number.isNaN(originalIdx) && chunk[originalIdx]) {
            chunk[originalIdx].translation_fr = translatedFr;
          }
        }
      }
      // Catch misses due to weird google formatting: fall back to English.
      for (const w of chunk) {
        if (!w.translation_fr) {
          w.translation_fr = w.translation_en;
        }
      }
      console.log(`Translated snippet ${i} to ${i + chunkSize}`);
    } catch (errTranslate) {
      console.error("Translation error at chunk", i, errTranslate);
      for (const w of chunk) {
        w.translation_fr = w.translation_en;
      }
    }
    // Throttle so the translation endpoint does not rate-limit us.
    await new Promise(r => setTimeout(r, 1000));
  }

  // Attempt to merge existing TS metadata to not lose custom user work.
  // Missing file just means there is nothing to merge.
  const tsPath = path.join(__dirname, '../src/data/quran_vocabulary.ts');
  const tsFile = fs.existsSync(tsPath) ? fs.readFileSync(tsPath, 'utf8') : '';

  // Keyed by de-vocalised grapheme.
  const stripHarakats = str => str.replace(new RegExp(`[${HARAKAT}]+`, 'g'), '');
  const existingDataMap = {};

  // Regex-scrape properties like root: "A-L-H", meaning_context: "..."
  // because evaluating the full TS file would require transpiling it.
  const blockRegex = /{[\s\S]*?grapheme:\s*"([^"]+)"[\s\S]*?}/g;
  let match;
  while ((match = blockRegex.exec(tsFile)) !== null) {
    const block = match[0];
    const grapheme = match[1];
    const stripped = stripHarakats(grapheme);
    const customProps = {};

    const rootMatch = block.match(/root:\s*"([^"]+)"/);
    if (rootMatch) customProps.root = rootMatch[1];

    const meaningMatch = block.match(/meaning_context:\s*"([^"]+)"/);
    if (meaningMatch) customProps.meaning_context = meaningMatch[1];

    // Manual translation, if the user wrote one.
    const transMatch = block.match(/translation:\s*"([^"]+)"/);
    if (transMatch) customProps.translation = transMatch[1];

    existingDataMap[stripped] = customProps;
  }

  const finalTop1000 = top1000.map(w => {
    const stripped = stripHarakats(w.grapheme);
    const existing = existingDataMap[stripped];

    let newTranslation = w.translation_fr;
    // If the old entry had a manual French translation, prefer it
    // (skip parenthesised/annotated ones and entries that merely echo
    // the English gloss).
    if (existing && existing.translation && !existing.translation.includes("(") && w.translation_en !== existing.translation) {
      newTranslation = existing.translation;
    }

    return {
      id: w.id,
      grapheme: w.grapheme,
      phonetic_fr: w.phonetic_fr.charAt(0).toUpperCase() + w.phonetic_fr.slice(1),
      translation: newTranslation,
      translation_en: w.translation_en,
      type: w.type,
      position: w.position,
      tags: w.tags,
      frequency: w.frequency,
      // Carry over hand-curated fields only when present.
      ...(existing && existing.root ? { root: existing.root } : {}),
      ...(existing && existing.meaning_context ? { meaning_context: existing.meaning_context } : {})
    };
  });

  const finalData = {
    _comment: "Ce JSON contient les 1000 mots / racines les plus frequentes extraites depuis la BDD audio (DuckDb). Format concu pour permettre 1:1 d'audio avec les graphemes. Methodologie : 1. Extraire les mots sans harakat. 2. Trouver la version harakat+audio la plus recurente pour deduire la phonetique. 3. Traductions ajoutees par Lots. 4. Les racines et definitions existantes dans vocab.ts ont ete conservees. Pour toute mise a jour du dictionnaire, il suffit de modifier ou ajouter un objet ici.",
    words: finalTop1000
  };

  const outPath = path.join(__dirname, '../src/data/quran_top_1000.json');
  fs.writeFileSync(outPath, JSON.stringify(finalData, null, 2), 'utf8');

  console.log(`Successfully generated safely merged top 1000 words at ${outPath}`);
});