# Uploaded by: fadi77
# Commit message: Update models/mlm_only_non_diacritics/char_indexer.py
# Commit: dc672a9 (verified)
# IPA Phonemizer: https://github.com/bootphon/phonemizer
import string
# Reserved single-character tokens. They are upper-case / whitespace, so they
# do not collide with the lower-case Latin letters defined below.
PAD = "P"
PHONEME_MASK = "M"
PHONEME_SEPARATOR = " "
UNKNOWN = "U"

# ASCII punctuation merged with additional marks (typographic quotes, inverted
# marks, Arabic punctuation, ...), de-duplicated and sorted by code point so
# the resulting order is deterministic.
PUNCTUATION = "".join(
    sorted(set(string.punctuation).union(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ'))
)

# IPA letters, stress/length marks and modifiers.
LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'

# Plain lower-case Latin alphabet.
LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'

# Full symbol table. A symbol's position in this list is its integer id, so
# the order below (pad, punctuation, IPA, Latin, mask, separator, unknown)
# must not change.
symbols = [
    PAD,
    *PUNCTUATION,
    *LETTERS_IPA,
    *LATIN_LETTERS,
    PHONEME_MASK,
    PHONEME_SEPARATOR,
    UNKNOWN,
]

# Every symbol must be unique, otherwise two positions would share one key.
assert len(set(symbols)) == len(symbols)
class CharacterIndexer:
    """Translate text into a list of integer ids, one per character.

    The id of a character is its position in the module-level ``symbols``
    list. Characters that are not in that list are mapped to the id of the
    ``UNKNOWN`` symbol.
    """

    def __init__(self):
        # Pair every symbol with its position in ``symbols``.
        self.word_index_dictionary = dict(zip(symbols, range(len(symbols))))

    def __call__(self, text):
        """Return the ids for every character of ``text``, in order."""
        indices = []
        for char in text:
            if char in self.word_index_dictionary:
                indices.append(self.word_index_dictionary[char])
            else:
                # Out-of-vocabulary character: fall back to the UNKNOWN id.
                indices.append(self.word_index_dictionary[UNKNOWN])
        return indices