# IPA Phonemizer: https://github.com/bootphon/phonemizer import string PAD = "P" PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation))) LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ' LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz' PHONEME_MASK = "M" PHONEME_SEPARATOR = " " UNKNOWN='U' # Export all symbols: symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN] assert len(symbols) == len(set(symbols)) # no duplicates class CharacterIndexer: def __init__(self): self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)} def __call__(self, text): return [self.word_index_dictionary[char] if char in self.word_index_dictionary else self.word_index_dictionary[UNKNOWN] for char in text]