Spaces:
Sleeping
Sleeping
Upload generate_top_1000.js with huggingface_hub
Browse files- generate_top_1000.js +176 -0
generate_top_1000.js
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// generate_top_1000.js
// Extracts the 1000 most frequent (harakat-stripped) word stems from the
// parquet corpus, translates their English glosses to French in batches,
// merges hand-curated metadata from src/data/quran_vocabulary.ts, and writes
// src/data/quran_top_1000.json.

const duckdb = require('duckdb');
const fs = require('fs');
const path = require('path');
const translate = require('translate-google');

// In-memory DuckDB instance; the parquet files are read directly by the query.
const db = new duckdb.Database(':memory:');
db.all(`
  SELECT regexp_replace(word_ar, '[ูููููููููฐ]', '', 'g') as clean_ar,
         word_ar,
         word_en,
         word_tr as phonetic,
         count(*) as cnt
  FROM 'data/*.parquet'
  GROUP BY clean_ar, word_ar, word_en, phonetic
`, async (err, rows) => {
  if (err) throw err;

  // Group every vocalized variant under its harakat-stripped stem,
  // accumulating the total corpus frequency per stem.
  const stems = {};
  for (const row of rows) {
    if (!stems[row.clean_ar]) {
      stems[row.clean_ar] = { totalCnt: 0, variants: [] };
    }
    // cnt comes back from DuckDB as BigInt; normalize to Number for JSON.
    stems[row.clean_ar].totalCnt += Number(row.cnt);
    stems[row.clean_ar].variants.push({
      word_ar: row.word_ar,
      en: row.word_en,
      phonetic: row.phonetic,
      cnt: Number(row.cnt)
    });
  }

  // Top 1000 stems by total frequency.
  const sortedStems = Object.keys(stems)
    .sort((a, b) => stems[b].totalCnt - stems[a].totalCnt)
    .slice(0, 1000);

  const top1000 = [];

  for (let i = 0; i < sortedStems.length; i++) {
    const clean = sortedStems[i];
    // Most frequent vocalized variant represents the stem.
    const variants = stems[clean].variants.sort((a, b) => b.cnt - a.cnt);

    let rep = variants[0];

    // Special-case the divine name so it always renders fully vocalized.
    // Copy instead of mutating the shared variant object held in `stems`.
    if (clean === 'ุงููู' || clean === 'ุงููููฐู') {
      rep = { ...rep, word_ar: 'ุงูููููู', phonetic: 'Allฤhu', en: 'Allah' };
    }

    top1000.push({
      id: `q_word_${i + 1}`,
      grapheme: rep.word_ar,
      phonetic_fr: rep.phonetic,
      translation_en: rep.en,
      translation_fr: "",
      frequency: stems[clean].totalCnt,
      tags: ["quran_freq", `top_${Math.ceil((i + 1) / 10) * 10}`],
      type: "word",
      position: "isolated"
    });
  }

  console.log(`Extracted top 1000 words. Starting robust translation...`);

  // Translate English to French in text blocks with IDs to guarantee
  // perfectly mapped alignments even if Google reorders or drops lines.
  const chunkSize = 50;
  for (let i = 0; i < top1000.length; i += chunkSize) {
    // slice() copies the array but not the elements, so writes to
    // chunk[j] update the corresponding top1000 entries.
    const chunk = top1000.slice(i, i + chunkSize);

    // Build an indexed string: "0 | house \n 1 | dog"
    const blockString = chunk.map((w, idx) => {
      // Drop parenthetical annotations before translating.
      const cleanEn = w.translation_en.replace(/\(.*?\)/g, '').trim();
      return `${idx} | ${cleanEn}`;
    }).join('\n');

    try {
      const frText = await translate(blockString, { to: 'fr' });

      // Parse back the translated lines and map each one to its index.
      const lines = frText.split('\n');
      for (let line of lines) {
        line = line.trim();
        const parts = line.split('|');
        if (parts.length >= 2) {
          let translatedFr = parts[1].trim();
          translatedFr = translatedFr.charAt(0).toUpperCase() + translatedFr.slice(1);
          // Match the index back to the chunk.
          const originalIdx = Number.parseInt(parts[0].trim(), 10);
          if (!Number.isNaN(originalIdx) && chunk[originalIdx]) {
            chunk[originalIdx].translation_fr = translatedFr;
          }
        }
      }
      // Catch misses due to weird Google formatting: fall back to English.
      for (let j = 0; j < chunk.length; j++) {
        if (!chunk[j].translation_fr) {
          chunk[j].translation_fr = chunk[j].translation_en;
        }
      }
      console.log(`Translated snippet ${i} to ${i + chunkSize}`);
    } catch (errTranslate) {
      console.error("Translation error at chunk", i, errTranslate);
      // Whole chunk falls back to the English gloss.
      for (let j = 0; j < chunk.length; j++) {
        chunk[j].translation_fr = chunk[j].translation_en;
      }
    }
    // Rate-limit between translation requests.
    await new Promise(r => setTimeout(r, 1000));
  }

  // Attempt to merge existing TS metadata to not lose custom user work.
  const tsFile = fs.readFileSync(path.join(__dirname, '../src/data/quran_vocabulary.ts'), 'utf8');

  // Strip Arabic diacritics so graphemes can be matched across vocalizations.
  const stripHarakats = str => str.replace(/[ูููููููููฐ]+/g, '');
  const existingDataMap = {};

  // Regex-rip properties like root: "A-L-H", meaning_context: "..." from the
  // TS file, because it's hard to eval the full TS file without transpiling.
  const blockRegex = /{[\s\S]*?grapheme:\s*"([^"]+)"[\s\S]*?}/g;
  let match;
  while ((match = blockRegex.exec(tsFile)) !== null) {
    const block = match[0];
    const grapheme = match[1];
    const stripped = stripHarakats(grapheme);
    const customProps = {};

    const rootMatch = block.match(/root:\s*"([^"]+)"/);
    if (rootMatch) customProps.root = rootMatch[1];

    const meaningMatch = block.match(/meaning_context:\s*"([^"]+)"/);
    if (meaningMatch) customProps.meaning_context = meaningMatch[1];

    // Manually curated translation, if any.
    const transMatch = block.match(/translation:\s*"([^"]+)"/);
    if (transMatch) customProps.translation = transMatch[1];

    existingDataMap[stripped] = customProps;
  }

  const finalTop1000 = top1000.map(w => {
    const stripped = stripHarakats(w.grapheme);
    const existing = existingDataMap[stripped];

    let newTranslation = w.translation_fr;
    // If the old entry had a manual translation (no parenthetical noise) that
    // differs from the raw English gloss, prefer it over the machine one.
    if (existing && existing.translation && !existing.translation.includes("(") && w.translation_en !== existing.translation) {
      newTranslation = existing.translation;
    }

    return {
      id: w.id,
      grapheme: w.grapheme,
      phonetic_fr: w.phonetic_fr.charAt(0).toUpperCase() + w.phonetic_fr.slice(1),
      translation: newTranslation,
      translation_en: w.translation_en,
      type: w.type,
      position: w.position,
      tags: w.tags,
      frequency: w.frequency,
      // Only carry over curated fields that actually exist.
      ...(existing && existing.root ? { root: existing.root } : {}),
      ...(existing && existing.meaning_context ? { meaning_context: existing.meaning_context } : {})
    };
  });

  const finalData = {
    _comment: "Ce JSON contient les 1000 mots / racines les plus frequentes extraites depuis la BDD audio (DuckDb). Format concu pour permettre 1:1 d'audio avec les graphemes. Methodologie : 1. Extraire les mots sans harakat. 2. Trouver la version harakat+audio la plus recurente pour deduire la phonetique. 3. Traductions ajoutees par Lots. 4. Les racines et definitions existantes dans vocab.ts ont ete conservees. Pour toute mise a jour du dictionnaire, il suffit de modifier ou ajouter un objet ici.",
    words: finalTop1000
  };

  const outPath = path.join(__dirname, '../src/data/quran_top_1000.json');
  fs.writeFileSync(outPath, JSON.stringify(finalData, null, 2), 'utf8');

  console.log(`Successfully generated safely merged top 1000 words at ${outPath}`);
});
|