fadi77 committed on
Commit
8f5d729
·
verified ·
1 Parent(s): 130f5cf

Character indexer to mlm_only_non_diacritics

Browse files
models/mlm_only_non_diacritics/char_indexer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IPA Phonemizer: https://github.com/bootphon/phonemizer
2
+
3
+ import string
4
+
5
# Symbol inventory for the character indexer.
#
# PAD, PHONEME_MASK and UNKNOWN are single-character special tokens. They are
# upper-case letters, while LATIN_LETTERS contains only lower-case letters, so
# they do not collide with any regular symbol (verified by the assertion at
# the end of this section).
PAD = "P"
# Extra punctuation plus ASCII punctuation, de-duplicated and sorted so that
# the resulting order (and therefore every symbol index) is deterministic.
PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
PHONEME_MASK = "M"
PHONEME_SEPARATOR = " "
# NOTE: a character such as '¤' would be a valid 'unknown' character because
# it is different from all the characters above it. In English PL-BERT, 'U'
# was used as the unknown character, which was not ideal as it is part of the
# English alphabet. This module still uses 'U'; it does not collide with any
# symbol above only because LATIN_LETTERS is lower-case.
UNKNOWN = 'U'

# Export all symbols. The position of a symbol in this list is its index, so
# the concatenation order must not change.
symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]

assert len(symbols) == len(set(symbols))  # no duplicates
18
+
19
class CharacterIndexer:
    """Convert text into a list of integer symbol indices.

    The index of a character is its position in the module-level ``symbols``
    list. Characters that are not in ``symbols`` are mapped to the index of
    the ``UNKNOWN`` symbol.
    """

    def __init__(self):
        # symbol -> position in ``symbols``
        self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}

    def __call__(self, text):
        """Return the symbol index of every character in ``text``.

        Args:
            text: String (or any iterable of single characters) to index.

        Returns:
            A list of ints, one per character, in input order.
        """
        index_of = self.word_index_dictionary
        # Resolve the fallback once instead of once per unknown character.
        unknown_index = index_of[UNKNOWN]
        return [index_of.get(char, unknown_index) for char in text]