// arabic-tts / update_phonetics.js
// melakio: Upload update_phonetics.js with huggingface_hub
// Commit 81ec9f5 (verified)
const duckdb = require('duckdb');
const fs = require('fs');
const path = require('path');
// Vocabulary source file whose `phonetic_fr` fields are refreshed in place.
const vocabPath = path.join(__dirname, '../src/data/quran_vocabulary.ts');
const vocabContent = fs.readFileSync(vocabPath, 'utf8');
const db = new duckdb.Database(':memory:');

// Graphemes whose transliteration is fixed by hand; these win over the dataset.
const MANUAL_OVERRIDES = new Map([['اللَّهُ', 'Allāhu']]);

/**
 * Builds a lookup from Arabic word to its preferred transliteration.
 *
 * The first usable transliteration seen for a word wins, so `rows` must be
 * sorted by descending frequency for the result to be "the most frequent one".
 * A Map is used instead of a plain object so that words colliding with
 * Object.prototype members (e.g. "constructor") cannot produce bogus hits.
 *
 * @param {Array<{word_ar: string, word_tr: string}>} rows - query result rows
 * @returns {Map<string, string>} word_ar -> word_tr
 */
function buildTransliterationIndex(rows) {
  const index = new Map();
  for (const row of rows) {
    const transliteration = row.word_tr;
    // Skip empty / missing transliterations so a later, usable one can be picked.
    if (typeof transliteration !== 'string' || transliteration === '') continue;
    if (!index.has(row.word_ar)) {
      index.set(row.word_ar, transliteration);
    }
  }
  return index;
}

/**
 * Upper-cases the first UTF-16 code unit of `text`, leaving the rest untouched.
 *
 * @param {string} text
 * @returns {string}
 */
function capitalizeFirst(text) {
  return text.charAt(0).toUpperCase() + text.slice(1);
}

/**
 * Escapes backslashes and double quotes so `text` can be embedded in a
 * double-quoted string literal without terminating or corrupting it.
 *
 * @param {string} text
 * @returns {string}
 */
function escapeForDoubleQuotes(text) {
  return text.replace(/["\\]/g, '\\$&');
}

/**
 * Rewrites the `phonetic_fr` field of every `{ ... }` block in `content`
 * whose `grapheme` has a known transliteration.
 *
 * Blocks are matched non-greedily from `{` to the next `}`, so this assumes
 * vocabulary entries contain no nested braces. Only the first `phonetic_fr`
 * of each block is considered. Each change is logged to the console.
 *
 * @param {string} content - text of the vocabulary file
 * @param {Map<string, string>} index - grapheme -> transliteration lookup
 * @returns {{updatedContent: string, modifications: number}} the rewritten
 *   text and the number of `phonetic_fr` values that actually changed
 */
function updatePhonetics(content, index) {
  let modifications = 0;

  const updatedContent = content.replace(/{[\s\S]*?}/g, (block) => {
    const graphemeMatch = block.match(/grapheme:\s*"([^"]+)"/);
    if (!graphemeMatch) return block;

    const grapheme = graphemeMatch[1];
    const transliteration = MANUAL_OVERRIDES.has(grapheme)
      ? MANUAL_OVERRIDES.get(grapheme)
      : index.get(grapheme);
    if (!transliteration) return block;

    const newPhonetic = escapeForDoubleQuotes(capitalizeFirst(transliteration));

    // The value pattern tolerates backslash escapes so that a previously
    // written escaped quote does not truncate the match on a later run.
    return block.replace(/phonetic_fr:\s*"((?:[^"\\]|\\.)+)"/, (fieldText, oldPhonetic) => {
      if (oldPhonetic === newPhonetic) return fieldText;
      modifications++;
      console.log(`Update phonetic for ${grapheme}: ${oldPhonetic} -> ${newPhonetic}`);
      return `phonetic_fr: "${newPhonetic}"`;
    });
  });

  return { updatedContent, modifications };
}

// Count every (word, transliteration) pair in the parquet data, most frequent
// first. The extra sort keys make the choice between equally frequent
// transliterations reproducible from run to run.
db.all(`
SELECT word_ar, word_tr, count(*) as cnt
FROM 'data/*.parquet'
GROUP BY word_ar, word_tr
ORDER BY cnt DESC, word_ar, word_tr
`, (err, rows) => {
  if (err) throw err;

  const index = buildTransliterationIndex(rows);
  const { updatedContent, modifications } = updatePhonetics(vocabContent, index);

  // Only touch the file on disk when something actually changed.
  if (modifications > 0) {
    fs.writeFileSync(vocabPath, updatedContent, 'utf8');
    console.log(`Vocabulary phonetic updated. Total altered phonetics: ${modifications}`);
  } else {
    console.log('No modifications needed.');
  }
});