"""
Sofelia TTS — Palestinian Arabic text frontend.

espeak-ng's Arabic G2P underspecifies dialect pronunciation. This module fixes
the systematic issues before phonemization:
  - Arabic punctuation (،؛؟) -> Latin (,;?) so the model gets pause/question cues
  - word-final ة read as /t/ for out-of-lexicon words -> rewritten to a bare fatha
  - a small Palestinian pronunciation lexicon (ar_lexicon.json) for words espeak
    mis-reads or strips vowels from (هيك، وين، تنين، قلت، معلش، ...)
Then, after G2P, phonemes outside Kokoro's 178-token vocab are remapped:
  ħ->ʰ, ʕ->ʁ, ˤ->ᵊ; the dental diacritic and square brackets ([ ]) are dropped.
"""

import json
import re
from pathlib import Path

# Post-G2P character replacements applied by text_to_phonemes(): every
# occurrence of a key in the phoneme string is replaced by its value. Per the
# module docstring these are symbols outside Kokoro's vocabulary; an empty
# value means the symbol is dropped (the combining dental diacritic and the
# square brackets).
PHONEME_FIXUPS = {
    "ħ": "ʰ",
    "ʕ": "ʁ",
    "ˤ": "ᵊ",
    "̪": "",
    "[": "",
    "]": "",
}

# Translation table for str.translate(): Arabic comma, semicolon and question
# mark -> their Latin counterparts. Used by normalize_arabic().
PUNCT_MAP = str.maketrans({"،": ",", "؛": ";", "؟": "?"})

# A run of Arabic letters (plus tatweel) ending in ة, where the ة is not
# followed by another Arabic letter, i.e. a word-final taa marbuta. Group 1 is
# the whole word. Used by normalize_taa_marbuta().
# NOTE(review): the lookahead only excludes letters, so a ة followed by a
# diacritic (e.g. tanween) still appears to count as word-final — confirm
# that is intended.
_MARBUTA_WORD = re.compile(r"([ؠ-يـ]+ة)(?![ؠ-ي])")
# The lexicon file is looked up next to this module; _load_lexicon() treats a
# missing file as an empty lexicon.
_HERE = Path(__file__).resolve().parent
_LEXICON_PATH = _HERE / "ar_lexicon.json"

# word -> word as it should be passed to G2P (rewritten or unchanged); filled
# lazily by normalize_taa_marbuta(). Keyed by word only and never evicted.
_marbuta_cache = {}
# Lazily populated by _load_lexicon(); None means "not loaded yet".
_lexicon = None
# Compiled alternation over the lexicon keys; stays None for an empty lexicon.
_lexicon_re = None


def _load_lexicon():
    """Load the pronunciation lexicon once and return ``(lexicon, regex)``.

    The lexicon is read from ``_LEXICON_PATH`` (a JSON object mapping a word
    to its replacement spelling). Keys starting with ``"_"`` are treated as
    metadata and skipped, as are empty keys. A missing file yields an empty
    lexicon.

    Returns:
        A ``(dict, pattern)`` pair. ``pattern`` matches any lexicon key that
        is not adjacent to an Arabic letter on either side, with the matched
        key in group 1; it is ``None`` when the lexicon is empty.

    Raises:
        json.JSONDecodeError: If the lexicon file is not valid JSON. Nothing
            is cached in that case, so the next call retries.
    """
    global _lexicon, _lexicon_re
    if _lexicon is None:
        # EAFP: read directly instead of exists()-then-read, which can race
        # with the file being removed in between.
        try:
            raw = _LEXICON_PATH.read_text(encoding="utf-8")
        except FileNotFoundError:
            entries = {}
        else:
            entries = json.loads(raw)

        # An empty key would add an empty alternative to the regex and match
        # at every non-Arabic boundary, so it is dropped with the "_" keys.
        lexicon = {
            k: v for k, v in entries.items() if k and not k.startswith("_")
        }

        pattern = None
        if lexicon:
            # Longest keys first so a longer entry wins over its own prefix.
            alts = "|".join(
                re.escape(w) for w in sorted(lexicon, key=len, reverse=True)
            )
            pattern = re.compile(rf"(?<![ؠ-يـ])({alts})(?![ؠ-يـ])")

        # Publish the regex before the lexicon: ``_lexicon`` is the "loaded"
        # flag, so nobody can observe a loaded lexicon with a missing regex.
        _lexicon_re = pattern
        _lexicon = lexicon
    return _lexicon, _lexicon_re


def normalize_taa_marbuta(text, g2p):
    """Rewrite word-final ة as a fatha in words the G2P reads with a final /t/.

    Every word matched by ``_MARBUTA_WORD`` is phonemized on its own. If the
    resulting phonemes end in ``"t"`` (ignoring surrounding whitespace and
    trailing ``"."``), the final ة is replaced by a fatha; otherwise the word
    is kept. Decisions are memoised per word in ``_marbuta_cache``.

    NOTE: the cache is keyed by word only, so it assumes every caller passes
    an equivalent ``g2p``.

    Args:
        text: Input text.
        g2p: Callable taking a string and returning a 2-tuple whose first
            item is the phoneme string.

    Returns:
        ``text`` with the affected words rewritten; everything else is
        unchanged.
    """
    def repl(m):
        word = m.group(1)
        cached = _marbuta_cache.get(word)
        if cached is not None:
            return cached
        try:
            ph, _ = g2p(word)
            broken = ph.strip().rstrip(".").endswith("t")
        except Exception:
            # Best effort: leave the word untouched for this call, but do NOT
            # cache the outcome, so a transient G2P failure does not
            # permanently disable the fix for this word.
            return word
        if broken:
            fixed = word[:-1] + "\u064e"  # U+064E ARABIC FATHA
        else:
            fixed = word
        _marbuta_cache[word] = fixed
        return fixed

    return _MARBUTA_WORD.sub(repl, text)


def normalize_arabic(text, g2p):
    """Run the full text frontend: punctuation, lexicon, then taa-marbuta.

    Arabic punctuation is mapped to Latin first, lexicon entries are then
    substituted (when a lexicon is available), and the result is handed to
    normalize_taa_marbuta() together with ``g2p``.
    """
    latin_punct = text.translate(PUNCT_MAP)
    lexicon, pattern = _load_lexicon()

    # No lexicon entries: skip straight to the taa-marbuta pass.
    if pattern is None:
        return normalize_taa_marbuta(latin_punct, g2p)

    def lookup(match):
        # Group 1 is the matched lexicon key.
        return lexicon[match.group(1)]

    substituted = pattern.sub(lookup, latin_punct)
    return normalize_taa_marbuta(substituted, g2p)


def text_to_phonemes(text, g2p):
    """Convert text to a Kokoro-compatible phoneme string.

    The text goes through normalize_arabic(), then ``g2p`` (which must return
    a 2-tuple with the phoneme string first), and finally every key of
    PHONEME_FIXUPS is replaced by its value. The result is stripped of
    leading and trailing whitespace.
    """
    normalized = normalize_arabic(text, g2p)
    phonemes, _unused = g2p(normalized)

    # Apply the vocabulary fixups one symbol at a time, in table order.
    for symbol, replacement in PHONEME_FIXUPS.items():
        phonemes = phonemes.replace(symbol, replacement)

    return phonemes.strip()