Update models/mlm_only_non_diacritics/char_indexer.py

dc672a9 verified 9 months ago

1.07 kB

	# IPA Phonemizer: https://github.com/bootphon/phonemizer

	import string

	# Reserved single-character markers. They are upper-case so they cannot
	# collide with the lower-case Latin letters below.
	PAD = "P"
	PHONEME_MASK = "M"
	PHONEME_SEPARATOR = " "
	UNKNOWN = 'U'

	# Punctuation: the extra marks listed here merged with ASCII punctuation,
	# de-duplicated and sorted so the resulting order is deterministic.
	PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ') | set(string.punctuation)))

	# IPA letters, stress/length marks and intonation arrows.
	# The order of this string is significant: it determines symbol indices.
	LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'

	# Plain lower-case a-z.
	LATIN_LETTERS = string.ascii_lowercase

	# Export all symbols. A symbol's position in this list is its index, so the
	# ordering (pad, punctuation, IPA, Latin, mask, separator, unknown) must stay.
	symbols = [
	    PAD,
	    *PUNCTUATION,
	    *LETTERS_IPA,
	    *LATIN_LETTERS,
	    PHONEME_MASK,
	    PHONEME_SEPARATOR,
	    UNKNOWN,
	]

	# Every symbol must be unique, otherwise two positions would share one key.
	assert len(set(symbols)) == len(symbols)  # no duplicates

	class CharacterIndexer:
	    """Convert text into a list of integer symbol indices.

	    The lookup table maps every entry of the module-level ``symbols`` list
	    to its position in that list.
	    """

	    def __init__(self):
	        # Attribute name kept as-is for callers, although the keys are
	        # single characters rather than words.
	        self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}

	    def __call__(self, text):
	        """Return the index of each character in *text*.

	        Args:
	            text: A string (or any iterable of single characters).

	        Returns:
	            A list with one integer per character, in input order. A
	            character missing from the table gets the index of ``UNKNOWN``.
	        """
	        lookup = self.word_index_dictionary
	        # Resolve the fallback once rather than on every unknown character.
	        unknown_index = lookup[UNKNOWN]
	        # One dict lookup per character instead of a membership test plus a
	        # second indexing operation.
	        return [lookup.get(char, unknown_index) for char in text]