| """ |
| Sofelia TTS — Palestinian Arabic text frontend. |
| |
| espeak-ng's Arabic G2P underspecifies dialect pronunciation. This module fixes |
| the systematic issues before phonemization: |
| - Arabic punctuation (،؛؟) -> Latin (,;?) so the model gets pause/question cues |
| - word-final ة read as /t/ for out-of-lexicon words -> rewritten to a bare fatha |
| - a small Palestinian pronunciation lexicon (ar_lexicon.json) for words espeak |
| mis-reads or strips vowels from (ูููุ ูููุ ุชูููุ ููุชุ ู
ุนูุดุ ...) |
| Then, after G2P, phonemes outside Kokoro's 178-token vocab are remapped: |
| ฤง->สฐ, ส->ส, หค->แต, dental diacritic and []-> dropped. |
| """ |
|
|
| import json |
| import re |
| from pathlib import Path |
|
|
# Post-G2P substitutions for symbols the downstream phoneme vocabulary lacks.
# They are applied in insertion order with str.replace; an empty value
# deletes the symbol.
PHONEME_FIXUPS = {
    "\u0127": "\u02b0",  # ħ -> ʰ
    # NOTE(review): the next two entries are kept exactly as they appear in
    # the mis-encoded source, because bytes were lost and the intended
    # symbols cannot be recovered with certainty. As written they never match
    # IPA output. The first is presumably U+0295 (ʕ) -> U+0294 (ʔ); the second
    # has key U+02E4 (ˤ) and an unknown replacement. Restore both from a clean
    # UTF-8 copy of this file.
    "ส": "ส",
    "หค": "แต",
    "\u032a": "",  # combining bridge below (dental diacritic) -> dropped
    "[": "",
    "]": "",
}


# Arabic comma / semicolon / question mark -> Latin equivalents. Written as
# escapes so the three keys stay distinct regardless of file encoding.
PUNCT_MAP = str.maketrans({"\u060c": ",", "\u061b": ";", "\u061f": "?"})


# NOTE(review): this pattern is reproduced as it appears in the source, where
# the Arabic literals are mis-encoded. The intent looks like "a run of Arabic
# letters/marks ending in taa marbuta (U+0629) that is not followed by another
# Arabic letter", but the exact range endpoints cannot be recovered here.
# Restore it from a clean UTF-8 copy before relying on it.
_MARBUTA_WORD = re.compile(r"([ุ -ูู]+ุฉ)(?![ุ -ู])")

# The lexicon file is looked up next to this module.
_HERE = Path(__file__).resolve().parent
_LEXICON_PATH = _HERE / "ar_lexicon.json"


# word -> rewritten word, filled by normalize_taa_marbuta.
_marbuta_cache = {}
# Filled lazily by _load_lexicon(); None means "not loaded yet".
_lexicon = None
_lexicon_re = None
|
|
|
|
def _load_lexicon():
    """Load the pronunciation lexicon once and return it with its matcher.

    On the first call the JSON object at ``_LEXICON_PATH`` is read; a missing
    file yields an empty lexicon. Keys starting with ``_`` are discarded, and
    so is an empty key. Later calls return the cached module-level objects.

    Returns:
        A ``(lexicon, pattern)`` tuple. ``lexicon`` maps a source word to its
        replacement text. ``pattern`` is a compiled regex matching any lexicon
        key, or ``None`` when the lexicon is empty.

    Raises:
        json.JSONDecodeError: if the lexicon file is not valid JSON. Nothing
            is cached in that case, so the next call tries again.
    """
    global _lexicon, _lexicon_re
    if _lexicon is not None:
        return _lexicon, _lexicon_re

    entries = {}
    if _LEXICON_PATH.exists():
        entries = json.loads(_LEXICON_PATH.read_text(encoding="utf-8"))

    # An empty key would become an empty regex alternative that matches at
    # every position, so it is dropped together with the "_"-prefixed keys.
    lexicon = {k: v for k, v in entries.items() if k and not k.startswith("_")}

    pattern = None
    if lexicon:
        # Longest keys first: regex alternation takes the first alternative
        # that matches, so a long entry must precede any entry it starts with.
        alts = "|".join(re.escape(w) for w in sorted(lexicon, key=len, reverse=True))
        # NOTE(review): the boundary character classes are reproduced as they
        # appear in the source, where the Arabic literals are mis-encoded.
        # They look like "not adjacent to an Arabic letter or mark", but the
        # exact range cannot be recovered here; restore from a clean copy.
        pattern = re.compile(rf"(?<![ุ -ูู])({alts})(?![ุ -ูู])")

    # Publish both globals together so a failure above never leaves a
    # half-initialised cache behind.
    _lexicon, _lexicon_re = lexicon, pattern
    return _lexicon, _lexicon_re
|
|
|
|
def normalize_taa_marbuta(text, g2p):
    """Rewrite words whose final taa marbuta the G2P would pronounce as /t/.

    Each match of ``_MARBUTA_WORD`` in *text* is phonemized on its own with
    *g2p*. If the resulting phonemes (ignoring surrounding whitespace and
    trailing dots) end in ``t``, the word's last character is replaced by a
    fatha (U+064E); otherwise the word is kept as is. Decisions are memoised
    in ``_marbuta_cache``, which is keyed by the word only and therefore
    shared by every *g2p* callable passed in.

    Args:
        text: Input string.
        g2p: Callable taking a string and returning a 2-tuple whose first
            item is the phoneme string.

    Returns:
        *text* with the affected words rewritten.
    """
    def repl(m):
        word = m.group(1)
        cached = _marbuta_cache.get(word)
        if cached is not None:
            return cached
        try:
            ph, _ = g2p(word)
            broken = ph.strip().rstrip(".").endswith("t")
        except Exception:
            # Best effort: a G2P failure leaves the word untouched. The
            # outcome is deliberately not cached, so one transient failure
            # does not disable the fix for this word for the whole process.
            return word
        fixed = word[:-1] + "\u064e" if broken else word
        _marbuta_cache[word] = fixed
        return fixed

    return _MARBUTA_WORD.sub(repl, text)
|
|
|
|
def normalize_arabic(text, g2p):
    """Full normalization: punctuation + lexicon + taa-marbuta fix.

    Steps, in order:
      1. translate characters through ``PUNCT_MAP``;
      2. replace every lexicon match with its lexicon value (skipped when
         the lexicon is empty);
      3. apply ``normalize_taa_marbuta``.

    Args:
        text: Input string.
        g2p: Callable forwarded unchanged to ``normalize_taa_marbuta``.

    Returns:
        The normalized string.
    """
    text = text.translate(PUNCT_MAP)
    lex, lex_re = _load_lexicon()
    if lex_re is not None:
        text = lex_re.sub(lambda m: lex[m.group(1)], text)
    return normalize_taa_marbuta(text, g2p)
|
|
|
|
def text_to_phonemes(text, g2p):
    """text -> Kokoro-compatible phoneme string (frontend + G2P + vocab fixups).

    Args:
        text: Raw input string.
        g2p: Callable taking a string and returning a 2-tuple whose first
            item is the phoneme string. It is called on the normalized text,
            and is also passed to ``normalize_arabic``.

    Returns:
        The phoneme string after every ``PHONEME_FIXUPS`` replacement has
        been applied in dictionary order, with surrounding whitespace
        stripped.
    """
    ps, _ = g2p(normalize_arabic(text, g2p))
    for old, new in PHONEME_FIXUPS.items():
        ps = ps.replace(old, new)
    return ps.strip()
|
|