| import re, unicodedata | |
# Symbol inventory for the text front-end. The position of a symbol in
# `symbols` is its integer id, so the ORDER of the pieces below matters:
# padding first, then punctuation, ASCII letters, and IPA letters.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Map each symbol to its index in `symbols`.
# NOTE: the ASCII apostrophe occurs twice in `_letters_ipa`; as with the
# original index loop, the LAST occurrence's index is the one stored, so
# `len(dicts)` is smaller than `len(symbols)`. The duplicate is kept on
# purpose: removing it would shift every later id.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
class TextCleaner:
    """Convert raw text into a list of integer symbol ids.

    Processing steps, in order:

    * The text is normalised to NFC, so a base letter followed by a
      combining mark is matched as its pre-composed form where one exists.
    * The text is tokenised with ``_EVENT_RE``: a ``<...>`` span (for
      example ``<evt_gasp>``) is one token; any other character is its own
      token.
    * Each token is looked up in the symbol table. Tokens that are missing
      map to ``unk_id`` instead of being printed or raising.

    Newline characters outside a ``<...>`` span match neither regex
    alternative and are therefore dropped from the output.

    Args:
        lookup: Optional token -> id mapping. When omitted, the
            module-level ``dicts`` table is used.

    Attributes:
        lookup: The token -> id mapping in use.
        unk_id: Id returned for tokens absent from ``lookup``. It is the id
            of the ``"<unk>"`` entry when the table has one, else ``0``.
    """

    # Event token first, otherwise any single non-newline character.
    _EVENT_RE = re.compile(r"<[^>]+>|.")  # match <evt_xxx> or single char

    def __init__(self, lookup=None):
        # Event tokens and "<unk>" are only recognised if the table
        # actually contains them; nothing is added here.
        self.lookup = dicts if lookup is None else lookup
        # Prefer a dedicated "<unk>" entry when the table defines one;
        # otherwise keep the historical fallback of id 0.
        self.unk_id = self.lookup.get("<unk>", 0)

    def __call__(self, text: str) -> list:
        """Return the ids for ``text``, one per token, in order."""
        text = unicodedata.normalize("NFC", text)
        fallback = self.unk_id
        find_id = self.lookup.get
        return [find_id(tok, fallback) for tok in self._EVENT_RE.findall(text)]
# Scan the training list and report every token the cleaner cannot map.
tc = TextCleaner()
miss = {}  # unknown token -> number of occurrences
with open("/home/ubuntu/styletts2-ft/data/train_list.txt", encoding="utf-8") as f:
    for line in f:
        fields = line.split("|")
        if len(fields) < 2:
            # Blank or malformed row: there is no second field to check.
            continue
        # TextCleaner.__call__ silently maps unknown tokens to `unk_id`, so
        # its output cannot reveal them. Tokenise the same way it does
        # (NFC, then `_EVENT_RE`) and test membership in the table instead.
        text = unicodedata.normalize("NFC", fields[1])
        for tok in tc._EVENT_RE.findall(text):
            if tok not in tc.lookup:
                miss[tok] = miss.get(tok, 0) + 1
print("Unknown chars left:", list(miss))