import re, unicodedata

# Symbol inventory for the text front-end. The concatenation order below
# determines every symbol's integer ID (its index in `symbols`), so characters
# must not be reordered, inserted, or removed without also changing the IDs.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols: pad first, then punctuation, ASCII letters, IPA letters.
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> index in `symbols`. When a character occurs more than once (the
# apostrophe appears twice in `_letters_ipa`), the last occurrence's index is
# the one kept.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
    
class TextCleaner:
    """
    Convert a text string into a list of integer symbol IDs.

    • Normalises text to NFC so pre-composed IPA glyphs match `symbols`.
    • Splits on event tokens first (e.g. <evt_gasp>), then per-character.
    • Tokens missing from the lookup table map to `unk_id` instead of
      printing: the ID of "<unk>" when the table defines it, otherwise 0.
    • Line breaks outside an event token are dropped, because the `.` in the
      tokeniser pattern does not match them.
    """
    _EVENT_RE = re.compile(r"<[^>]+>|.")   # match <evt_xxx> or single char

    def __init__(self):
        # `dicts` is the module-level symbol -> ID table. Event tokens and
        # "<unk>" are only recognised if that table contains them.
        self.lookup = dicts
        # Use the dedicated "<unk>" entry when present; fall back to ID 0 so
        # behaviour is unchanged for tables that do not define "<unk>".
        self.unk_id = self.lookup.get("<unk>", 0)

    def __call__(self, text: str):
        """Return the list of symbol IDs for *text*, one per token.

        The text is NFC-normalised, then split into event tokens
        (`<...>`) and single characters. Each token is looked up in
        `self.lookup`; tokens not found yield `self.unk_id`.
        """
        normalized = unicodedata.normalize("NFC", text)
        lookup, unk_id = self.lookup, self.unk_id
        return [lookup.get(tok, unk_id) for tok in self._EVENT_RE.findall(normalized)]
        
tc = TextCleaner()
miss = {}   # unknown token -> number of times it was seen

# Scan the second "|"-separated field of every line in the training list and
# record each token that has no entry in the cleaner's lookup table.
with open("/home/ubuntu/styletts2-ft/data/train_list.txt", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue                           # blank line: nothing to check
        # Tokenise exactly as TextCleaner.__call__ does (NFC + _EVENT_RE).
        # The IDs it returns cannot be used here: an unknown token and a
        # known symbol that shares the fallback ID are indistinguishable.
        text = unicodedata.normalize("NFC", line.split("|")[1])
        for tok in tc._EVENT_RE.findall(text):
            if tok not in tc.lookup:
                miss[tok] = miss.get(tok, 0) + 1
print("Unknown chars left:", list(miss))