Spaces:
Sleeping
Sleeping
Upload generate_top_1000.js with huggingface_hub
Browse files- generate_top_1000.js +176 -0
generate_top_1000.js
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// generate_top_1000.js
// Extracts the 1000 most frequent (harakat-stripped) word stems from the
// parquet corpus, translates their English glosses to French in batches,
// merges hand-curated metadata from src/data/quran_vocabulary.ts, and writes
// src/data/quran_top_1000.json.

const duckdb = require('duckdb');
const fs = require('fs');
const path = require('path');
const translate = require('translate-google');

// In-memory DuckDB instance; the parquet files are read directly by the query.
const db = new duckdb.Database(':memory:');
db.all(`
  SELECT regexp_replace(word_ar, '[ูููููููููฐ]', '', 'g') as clean_ar,
         word_ar,
         word_en,
         word_tr as phonetic,
         count(*) as cnt
  FROM 'data/*.parquet'
  GROUP BY clean_ar, word_ar, word_en, phonetic
`, async (err, rows) => {
  if (err) throw err;

  // Group every vocalized variant under its harakat-stripped stem,
  // accumulating the total corpus frequency per stem.
  const stems = {};
  for (const row of rows) {
    if (!stems[row.clean_ar]) {
      stems[row.clean_ar] = { totalCnt: 0, variants: [] };
    }
    // cnt comes back from DuckDB as BigInt; normalize to Number for JSON.
    stems[row.clean_ar].totalCnt += Number(row.cnt);
    stems[row.clean_ar].variants.push({
      word_ar: row.word_ar,
      en: row.word_en,
      phonetic: row.phonetic,
      cnt: Number(row.cnt)
    });
  }

  // Top 1000 stems by total frequency.
  const sortedStems = Object.keys(stems)
    .sort((a, b) => stems[b].totalCnt - stems[a].totalCnt)
    .slice(0, 1000);

  const top1000 = [];

  for (let i = 0; i < sortedStems.length; i++) {
    const clean = sortedStems[i];
    // Most frequent vocalized variant represents the stem.
    const variants = stems[clean].variants.sort((a, b) => b.cnt - a.cnt);

    let rep = variants[0];

    // Special-case the divine name so it always renders fully vocalized.
    // Copy instead of mutating the shared variant object held in `stems`.
    if (clean === 'ุงููู' || clean === 'ุงููููฐู') {
      rep = { ...rep, word_ar: 'ุงูููููู', phonetic: 'Allฤhu', en: 'Allah' };
    }

    top1000.push({
      id: `q_word_${i + 1}`,
      grapheme: rep.word_ar,
      phonetic_fr: rep.phonetic,
      translation_en: rep.en,
      translation_fr: "",
      frequency: stems[clean].totalCnt,
      tags: ["quran_freq", `top_${Math.ceil((i + 1) / 10) * 10}`],
      type: "word",
      position: "isolated"
    });
  }

  console.log(`Extracted top 1000 words. Starting robust translation...`);

  // Translate English to French in text blocks with IDs to guarantee
  // perfectly mapped alignments even if Google reorders or drops lines.
  const chunkSize = 50;
  for (let i = 0; i < top1000.length; i += chunkSize) {
    // slice() copies the array but not the elements, so writes to
    // chunk[j] update the corresponding top1000 entries.
    const chunk = top1000.slice(i, i + chunkSize);

    // Build an indexed string: "0 | house \n 1 | dog"
    const blockString = chunk.map((w, idx) => {
      // Drop parenthetical annotations before translating.
      const cleanEn = w.translation_en.replace(/\(.*?\)/g, '').trim();
      return `${idx} | ${cleanEn}`;
    }).join('\n');

    try {
      const frText = await translate(blockString, { to: 'fr' });

      // Parse back the translated lines and map each one to its index.
      const lines = frText.split('\n');
      for (let line of lines) {
        line = line.trim();
        const parts = line.split('|');
        if (parts.length >= 2) {
          let translatedFr = parts[1].trim();
          translatedFr = translatedFr.charAt(0).toUpperCase() + translatedFr.slice(1);
          // Match the index back to the chunk.
          const originalIdx = Number.parseInt(parts[0].trim(), 10);
          if (!Number.isNaN(originalIdx) && chunk[originalIdx]) {
            chunk[originalIdx].translation_fr = translatedFr;
          }
        }
      }
      // Catch misses due to weird Google formatting: fall back to English.
      for (let j = 0; j < chunk.length; j++) {
        if (!chunk[j].translation_fr) {
          chunk[j].translation_fr = chunk[j].translation_en;
        }
      }
      console.log(`Translated snippet ${i} to ${i + chunkSize}`);
    } catch (errTranslate) {
      console.error("Translation error at chunk", i, errTranslate);
      // Whole chunk falls back to the English gloss.
      for (let j = 0; j < chunk.length; j++) {
        chunk[j].translation_fr = chunk[j].translation_en;
      }
    }
    // Rate-limit between translation requests.
    await new Promise(r => setTimeout(r, 1000));
  }

  // Attempt to merge existing TS metadata to not lose custom user work.
  const tsFile = fs.readFileSync(path.join(__dirname, '../src/data/quran_vocabulary.ts'), 'utf8');

  // Strip Arabic diacritics so graphemes can be matched across vocalizations.
  const stripHarakats = str => str.replace(/[ูููููููููฐ]+/g, '');
  const existingDataMap = {};

  // Regex-rip properties like root: "A-L-H", meaning_context: "..." from the
  // TS file, because it's hard to eval the full TS file without transpiling.
  const blockRegex = /{[\s\S]*?grapheme:\s*"([^"]+)"[\s\S]*?}/g;
  let match;
  while ((match = blockRegex.exec(tsFile)) !== null) {
    const block = match[0];
    const grapheme = match[1];
    const stripped = stripHarakats(grapheme);
    const customProps = {};

    const rootMatch = block.match(/root:\s*"([^"]+)"/);
    if (rootMatch) customProps.root = rootMatch[1];

    const meaningMatch = block.match(/meaning_context:\s*"([^"]+)"/);
    if (meaningMatch) customProps.meaning_context = meaningMatch[1];

    // Manually curated translation, if any.
    const transMatch = block.match(/translation:\s*"([^"]+)"/);
    if (transMatch) customProps.translation = transMatch[1];

    existingDataMap[stripped] = customProps;
  }

  const finalTop1000 = top1000.map(w => {
    const stripped = stripHarakats(w.grapheme);
    const existing = existingDataMap[stripped];

    let newTranslation = w.translation_fr;
    // If the old entry had a manual translation (no parenthetical noise) that
    // differs from the raw English gloss, prefer it over the machine one.
    if (existing && existing.translation && !existing.translation.includes("(") && w.translation_en !== existing.translation) {
      newTranslation = existing.translation;
    }

    return {
      id: w.id,
      grapheme: w.grapheme,
      phonetic_fr: w.phonetic_fr.charAt(0).toUpperCase() + w.phonetic_fr.slice(1),
      translation: newTranslation,
      translation_en: w.translation_en,
      type: w.type,
      position: w.position,
      tags: w.tags,
      frequency: w.frequency,
      // Only carry over curated fields that actually exist.
      ...(existing && existing.root ? { root: existing.root } : {}),
      ...(existing && existing.meaning_context ? { meaning_context: existing.meaning_context } : {})
    };
  });

  const finalData = {
    _comment: "Ce JSON contient les 1000 mots / racines les plus frequentes extraites depuis la BDD audio (DuckDb). Format concu pour permettre 1:1 d'audio avec les graphemes. Methodologie : 1. Extraire les mots sans harakat. 2. Trouver la version harakat+audio la plus recurente pour deduire la phonetique. 3. Traductions ajoutees par Lots. 4. Les racines et definitions existantes dans vocab.ts ont ete conservees. Pour toute mise a jour du dictionnaire, il suffit de modifier ou ajouter un objet ici.",
    words: finalTop1000
  };

  const outPath = path.join(__dirname, '../src/data/quran_top_1000.json');
  fs.writeFileSync(outPath, JSON.stringify(finalData, null, 2), 'utf8');

  console.log(`Successfully generated safely merged top 1000 words at ${outPath}`);
});
|