Spaces:
Running
Running
const duckdb = require('duckdb');
const fs = require('fs');
const path = require('path');

// Maintenance script: rewrites the `phonetic_fr` value of every vocabulary
// entry in quran_vocabulary.ts with the most frequent transliteration
// (`word_tr`) recorded for the entry's Arabic `grapheme` (`word_ar`) in the
// parquet data set. The file is only written back when something changed.

const vocabPath = path.join(__dirname, '../src/data/quran_vocabulary.ts');
const vocabContent = fs.readFileSync(vocabPath, 'utf8');

const db = new duckdb.Database(':memory:');

// One row per (word_ar, word_tr) pair, most frequent first. The extra sort
// keys make the winner deterministic when two transliterations of the same
// word have the same count, so repeated runs do not flip-flop the output.
// NOTE(review): unlike vocabPath, the parquet glob is not anchored at
// __dirname; presumably it is resolved against the working directory, so the
// script has to be launched from the directory that contains `data/` - confirm.
const FREQUENCY_QUERY = `
SELECT word_ar, word_tr, count(*) as cnt
FROM 'data/*.parquet'
GROUP BY word_ar, word_tr
ORDER BY cnt DESC, word_ar, word_tr
`;

// Hand-maintained transliterations that always win over the data set.
const MANUAL_OVERRIDES = new Map([['اللَّهُ', 'Allāhu']]);

/**
 * Builds a lookup from Arabic word to its preferred transliteration.
 *
 * @param {Array<{word_ar: string, word_tr: string}>} rows - Query rows,
 *   already ordered from most to least frequent.
 * @returns {Map<string, string>} First non-empty `word_tr` seen per `word_ar`.
 */
function buildTransliterationIndex(rows) {
  // A Map (not a plain object) so that data-driven keys can never collide
  // with inherited properties such as "constructor" or "toString".
  const index = new Map();
  for (const { word_ar: wordAr, word_tr: wordTr } of rows) {
    // Empty/missing transliterations are skipped so that the next most
    // frequent usable one is picked instead.
    if (!wordTr) continue;
    if (!index.has(wordAr)) {
      index.set(wordAr, wordTr);
    }
  }
  return index;
}

/**
 * Upper-cases the first character of a non-empty string.
 * Works on code points so a surrogate pair is never split.
 *
 * @param {string} text - Non-empty input.
 * @returns {string} `text` with its first character upper-cased.
 */
function capitalizeFirst(text) {
  const first = String.fromCodePoint(text.codePointAt(0));
  return first.toUpperCase() + text.slice(first.length);
}

/**
 * Escapes a value so it can be embedded in a double-quoted TS/JS literal.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with backslashes and double quotes escaped.
 */
function escapeForDoubleQuotes(value) {
  return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
}

/**
 * Resolves the transliteration to use for a grapheme.
 *
 * @param {string} grapheme - Arabic word as written in the vocabulary file.
 * @param {Map<string, string>} index - Result of buildTransliterationIndex.
 * @returns {string|undefined} Transliteration, or undefined when unknown.
 */
function lookupTransliteration(grapheme, index) {
  if (MANUAL_OVERRIDES.has(grapheme)) {
    return MANUAL_OVERRIDES.get(grapheme);
  }
  return index.get(grapheme);
}

/**
 * Rewrites the `phonetic_fr` field of every `{ ... }` block that has a
 * `grapheme` with a known transliteration.
 *
 * @param {string} content - Full text of the vocabulary source file.
 * @param {Map<string, string>} index - Result of buildTransliterationIndex.
 * @returns {{content: string, modifications: number}} Updated text and the
 *   number of `phonetic_fr` values that were actually changed.
 */
function applyTransliterations(content, index) {
  let modifications = 0;

  // Blocks are matched lazily from a `{` to the nearest `}`; only the first
  // grapheme / phonetic_fr pair inside each match is considered.
  const updated = content.replace(/{[\s\S]*?}/g, (block) => {
    const graphemeMatch = block.match(/grapheme:\s*"([^"]+)"/);
    if (!graphemeMatch) return block;

    const grapheme = graphemeMatch[1];
    const transliteration = lookupTransliteration(grapheme, index);
    if (!transliteration) return block;

    // Escaped so an unexpected quote or backslash in the data cannot produce
    // a syntactically broken vocabulary file.
    const newTr = escapeForDoubleQuotes(capitalizeFirst(transliteration));

    return block.replace(/phonetic_fr:\s*"([^"]+)"/, (phMatch, oldPhG) => {
      if (oldPhG === newTr) return phMatch;
      modifications++;
      console.log(`Update phonetic for ${grapheme}: ${oldPhG} -> ${newTr}`);
      return `phonetic_fr: "${newTr}"`;
    });
  });

  return { content: updated, modifications };
}

db.all(FREQUENCY_QUERY, (err, rows) => {
  // Rethrown as-is: an uncaught error aborts the script with a non-zero exit.
  if (err) throw err;

  const index = buildTransliterationIndex(rows);
  const result = applyTransliterations(vocabContent, index);

  if (result.modifications > 0) {
    fs.writeFileSync(vocabPath, result.content, 'utf8');
    console.log(`Vocabulary phonetic updated. Total altered phonetics: ${result.modifications}`);
  } else {
    console.log('No modifications needed.');
  }
});