"""Symbol table, text-to-ID cleaner, and a training-list coverage audit.

Running this file as a script scans the training list and prints every token
that has no entry in the symbol table.
"""
import re
import unicodedata

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Symbol -> integer ID. If a symbol occurs more than once in `symbols`, the
# later index wins; this is kept on purpose so existing IDs do not shift.
dicts = {sym: idx for idx, sym in enumerate(symbols)}

# Training list scanned when this file is run as a script.
TRAIN_LIST_PATH = "/home/ubuntu/styletts2-ft/data/train_list.txt"


class TextCleaner:
    """Convert text into a list of symbol IDs.

    * Text is normalised to NFC before tokenising, so composed glyphs are
      looked up as single characters.
    * An angle-bracketed span (``<`` ... ``>``) is kept as one token;
      everything else is split per character. Newlines are dropped because
      ``.`` does not match them.
    * Tokens missing from ``lookup`` map to ``unk_id`` instead of raising.
    """

    # One angle-bracketed span, or else any single character except "\n".
    _EVENT_RE = re.compile(r"<[^>]+>|.")

    def __init__(self):
        # NOTE(review): the symbol table built in this file contains only
        # single characters, so any angle-bracketed token resolves to
        # `unk_id` unless `dicts` is extended elsewhere -- confirm.
        self.lookup = dicts
        # 0 is also the ID of the pad symbol "$", so an unknown token cannot
        # be told apart from padding by its ID alone.
        self.unk_id = 0

    def tokenize(self, text: str) -> list:
        """Return the tokens of *text* after NFC normalisation."""
        return self._EVENT_RE.findall(unicodedata.normalize("NFC", text))

    def __call__(self, text: str) -> list:
        """Return one symbol ID per token of *text* (``unk_id`` if unknown)."""
        return [self.lookup.get(tok, self.unk_id) for tok in self.tokenize(text)]


def find_unknown_tokens(lines, cleaner=None) -> dict:
    """Count tokens of a training list that are missing from the symbol table.

    Args:
        lines: Iterable of ``|``-separated records. Only the second field is
            inspected; records with fewer than two fields (for example blank
            lines) are skipped.
        cleaner: The ``TextCleaner`` whose ``lookup`` defines the known
            tokens. A fresh ``TextCleaner`` is used when omitted.

    Returns:
        A dict mapping each unknown token to its number of occurrences, in
        first-seen order.
    """
    if cleaner is None:
        cleaner = TextCleaner()
    unknown = {}
    for line in lines:
        fields = line.split("|")
        if len(fields) < 2:
            continue
        for tok in cleaner.tokenize(fields[1]):
            # Membership is tested on the token itself: the ID is useless
            # here because `unk_id` collides with the pad symbol's ID.
            if tok not in cleaner.lookup:
                unknown[tok] = unknown.get(tok, 0) + 1
    return unknown


tc = TextCleaner()
miss = {}

if __name__ == "__main__":
    with open(TRAIN_LIST_PATH, encoding="utf-8") as f:
        miss.update(find_unknown_tokens(f, tc))
    print("Unknown chars left:", list(miss))