Spaces:

melakio
/

arabic-tts

Running

App Files Files Community

arabic-tts / update_phonetics.js

melakio's picture

Upload update_phonetics.js with huggingface_hub

81ec9f5 verified about 2 months ago

history blame contribute delete

2.05 kB

	const duckdb = require('duckdb');
	const fs = require('fs');
	const path = require('path');

	// Location of the vocabulary module to rewrite, resolved relative to this
	// script's own directory rather than the current working directory.
	const vocabPath = path.join(__dirname, '../src/data/quran_vocabulary.ts');

	// Entire vocabulary source, read synchronously before the query is issued.
	const vocabContent = fs.readFileSync(vocabPath, { encoding: 'utf8' });

	// In-memory DuckDB database; nothing is persisted to disk by this handle.
	const db = new duckdb.Database(':memory:');

	/*
	 * Query every (Arabic word, transliteration) pair from the parquet files,
	 * most frequent pairs first, then rewrite the `phonetic_fr` field of each
	 * vocabulary entry whose `grapheme` has a known transliteration.
	 *
	 * The vocabulary file is written back only when at least one phonetic
	 * actually changed. A query error is rethrown from the callback.
	 */
	db.all(`
	SELECT word_ar, word_tr, count(*) as cnt
	FROM 'data/*.parquet'
	GROUP BY word_ar, word_tr
	ORDER BY cnt DESC
	`, (err, rows) => {
		if (err) throw err;

		// Arabic diacritics are combining marks: the same visible word can be
		// stored with its marks in a different order. NFC puts them in canonical
		// order so equivalent spellings compare equal. Used for lookups only;
		// the text written back to the file is never normalized.
		const toLookupKey = (text) => text.normalize('NFC');

		// Manual override for Allah.
		const allahKey = toLookupKey('اللَّهُ');
		const allahTranslit = 'Allāhu';

		// Characters that cannot be placed verbatim inside a double-quoted
		// string literal without corrupting the generated source.
		const unsafeForLiteral = /["\\\r\n]/;

		// Rows arrive ordered by descending count, so the first usable
		// transliteration seen for a word is its most frequent one. A Map is
		// used so that no key can collide with an inherited object property.
		const translitByGrapheme = new Map();
		for (const { word_ar: wordAr, word_tr: wordTr } of rows) {
			if (typeof wordAr !== 'string') continue;
			if (typeof wordTr !== 'string' || wordTr === '') continue;
			const key = toLookupKey(wordAr);
			if (!translitByGrapheme.has(key)) {
				translitByGrapheme.set(key, wordTr);
			}
		}

		let modifications = 0;

		// Process the file one `{ ... }` block at a time: read the block's
		// grapheme first, then replace the phonetic_fr found in the same block.
		const updatedContent = vocabContent.replace(/{[\s\S]*?}/g, (block) => {
			const graphemeMatch = block.match(/grapheme:\s*"([^"]+)"/);
			if (!graphemeMatch) return block;

			const grapheme = graphemeMatch[1];
			const key = toLookupKey(grapheme);
			const translit = key === allahKey ? allahTranslit : translitByGrapheme.get(key);
			if (!translit) return block;

			// Uppercase the first letter.
			const newTr = translit.charAt(0).toUpperCase() + translit.slice(1);

			if (unsafeForLiteral.test(newTr)) {
				console.warn(`Skipping ${grapheme}: transliteration contains characters unsafe for a string literal`);
				return block;
			}

			// `[^"]*` (not `+`) so that an empty phonetic_fr: "" is filled in too.
			return block.replace(/phonetic_fr:\s*"([^"]*)"/, (phMatch, oldPh) => {
				if (oldPh === newTr) return phMatch;
				modifications++;
				console.log(`Update phonetic for ${grapheme}: ${oldPh} -> ${newTr}`);
				return `phonetic_fr: "${newTr}"`;
			});
		});

		if (modifications > 0) {
			fs.writeFileSync(vocabPath, updatedContent, 'utf8');
			console.log(`Vocabulary phonetic updated. Total altered phonetics: ${modifications}`);
		} else {
			console.log('No modifications needed.');
		}
	});