"""
Sofelia TTS — Palestinian Arabic text frontend.

espeak-ng's Arabic G2P underspecifies dialect pronunciation. This module fixes
the systematic issues before phonemization:

  - Arabic punctuation (،؛؟) -> Latin (,;?) so the model gets pause/question cues
  - word-final ة read as /t/ for out-of-lexicon words -> rewritten to a bare fatha
  - a small Palestinian pronunciation lexicon (ar_lexicon.json) for words espeak
    mis-reads or strips vowels from (هيك، وين، تنين، قلت، معلش، ...)

Then, after G2P, phonemes outside Kokoro's 178-token vocab are remapped:
ħ->ʰ, ʕ->ʁ, ˤ->ᵊ, dental diacritic and [] -> dropped.
"""
import json
import re
from pathlib import Path

# Post-G2P substitutions, applied one after another with str.replace.
# An empty replacement drops the symbol from the phoneme string.
PHONEME_FIXUPS = {
    "ħ": "ʰ",
    "ʕ": "ʁ",
    "ˤ": "ᵊ",
    "̪": "",
    "[": "",
    "]": "",
}

# Translation table mapping Arabic comma / semicolon / question mark to Latin.
PUNCT_MAP = str.maketrans({"،": ",", "؛": ";", "؟": "?"})

# Captures a run of characters in the range ؠ–ي that ends in ة, provided the
# next character is not in that range (i.e. the ة is word-final).
_MARBUTA_WORD = re.compile(r"([ؠ-يـ]+ة)(?![ؠ-ي])")

# The lexicon file is looked up next to this module.
_HERE = Path(__file__).resolve().parent
_LEXICON_PATH = _HERE / "ar_lexicon.json"

# NOTE(review): not referenced in the visible part of SOURCE — presumably
# memoizes per-word ة decisions; confirm against the missing code.
_marbuta_cache = {}

# Both are filled in lazily by _load_lexicon().
_lexicon = None
_lexicon_re = None


def _load_lexicon():
    """Load the pronunciation lexicon on first use and prepare its regex.

    Reads ar_lexicon.json (UTF-8, a JSON object) when the file exists and
    keeps every entry whose key does not start with "_"; when the file is
    missing the lexicon is an empty dict. For a non-empty lexicon the keys
    are regex-escaped and joined into one alternation.

    NOTE(review): the end of this function is missing from SOURCE, so its
    return value (if any) cannot be documented here.
    """
    global _lexicon, _lexicon_re
    if _lexicon is None:
        if _LEXICON_PATH.exists():
            entries = json.loads(_LEXICON_PATH.read_text(encoding="utf-8"))
            _lexicon = {k: v for k, v in entries.items() if not k.startswith("_")}
        else:
            _lexicon = {}
        if _lexicon:
            # Longest keys first, so a longer entry is tried before any
            # shorter entry that is a prefix of it.
            alts = "|".join(re.escape(w) for w in sorted(_lexicon, key=len, reverse=True))
            # NOTE(review): SOURCE is damaged on the next line. Everything
            # between `(?` and ` Kokoro-compatible` is missing: the rest of
            # this regex and of _load_lexicon, the whole of normalize_arabic
            # (called below), and the `def` line plus docstring opening of the
            # function whose tail follows. Restore the span from version
            # control rather than reconstructing it by guesswork.
            _lexicon_re = re.compile(rf"(? Kokoro-compatible phoneme string (frontend + G2P + vocab fixups)."""
    # Tail of the function whose header was lost: normalize the text, run the
    # caller-supplied g2p (its result unpacks to two values, the first being
    # the phoneme string), apply PHONEME_FIXUPS in order, and trim whitespace.
    ps, _ = g2p(normalize_arabic(text, g2p))
    for old, new in PHONEME_FIXUPS.items():
        ps = ps.replace(old, new)
    return ps.strip()