fadi77
/

pl-bert

fadi77 committed on Apr 10, 2025

Commit

8f5d729

verified ·

1 Parent(s): 130f5cf

Character indexer to mlm_only_non_diacritics

Files changed (1) hide show

models/mlm_only_non_diacritics/char_indexer.py ADDED Viewed

# IPA Phonemizer: https://github.com/bootphon/phonemizer
import string

# Vocabulary definition for the character indexer.
#
# Every entry of `symbols` is a single character; its position in the list is
# the integer index produced by `CharacterIndexer`. The three reserved symbols
# (PAD, PHONEME_MASK, UNKNOWN) are uppercase ASCII letters, so an uppercase
# 'P', 'M' or 'U' appearing in the input resolves to the corresponding reserved
# index. NOTE(review): presumably input is lowercased/phonemized before it
# reaches the indexer -- confirm against the caller.
PAD = "P"
# Sorted, de-duplicated union of extra (incl. Arabic) punctuation and ASCII punctuation.
PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
PHONEME_MASK = "M"
PHONEME_SEPARATOR = " "
# NOTE: 'U' is the 'unknown' character here, as in English PL-BERT. It does not
# collide with any symbol above because LATIN_LETTERS is lowercase-only. The
# previous wording of this note named '¤' as the unknown character, but the
# value actually assigned below is 'U'.
UNKNOWN = 'U'

# Export all symbols:
symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]
# No duplicates allowed: a repeated symbol would silently shadow an earlier index.
# The message is only built when the check fails.
assert len(symbols) == len(set(symbols)), (
    "duplicate symbols in vocabulary: "
    + repr(sorted({s for s in symbols if symbols.count(s) > 1}))
)


class CharacterIndexer:
    """Convert text into a list of integer indices, one per character.

    The index of a character is its position in the module-level ``symbols``
    list. Characters that are not in ``symbols`` map to the index of
    ``UNKNOWN``.
    """

    def __init__(self):
        # symbol -> index lookup table, built once per instance.
        self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}

    def __call__(self, text):
        """Return the list of symbol indices for each character in ``text``.

        Args:
            text: An iterable of single-character strings (typically a ``str``).

        Returns:
            A list of ints with one entry per element of ``text``; elements
            missing from the vocabulary yield the ``UNKNOWN`` index.
        """
        lookup = self.word_index_dictionary
        unknown_index = lookup[UNKNOWN]
        # Single dict lookup per character instead of membership test + index.
        return [lookup.get(char, unknown_index) for char in text]