Character indexer to mlm_only_non_diacritics
Browse files
models/mlm_only_non_diacritics/char_indexer.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
| 2 |
+
|
| 3 |
+
import string
|
| 4 |
+
|
| 5 |
+
# Padding symbol. Placed first in `symbols`, so it gets index 0.
PAD = "P"
# De-duplicated, sorted union of the extra punctuation marks listed here and
# the ASCII punctuation from `string.punctuation`.
PUNCTUATION = ''.join(sorted(set(';:,.!?¡¿—…"«»“”‘’،؛؟٫٬٪﴾﴿ـ' + string.punctuation)))
# IPA letters, modifiers and intonation marks accepted as phoneme characters.
LETTERS_IPA = 'ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘̩ᵻ'
# Lowercase only: the uppercase special symbols ("P", "M", "U") rely on this
# to stay distinct from the letters.
LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
PHONEME_MASK = "M"
PHONEME_SEPARATOR = " "
# NOTE: the unknown symbol used here is 'U', the same character English
# PL-BERT used. It is distinct from every symbol above only because
# LATIN_LETTERS contains no uppercase letters. A character outside all of the
# sets above, such as '¤', would not depend on that, but switching to it would
# change the exported `symbols` list.
# TODO(review): an earlier version of this note described '¤' as the unknown
# character - confirm which value is intended.
UNKNOWN = 'U'

# Export all symbols:
# The position of a symbol in this list is its index, so the order of the
# concatenation below is significant.
symbols = [PAD] + list(PUNCTUATION) + list(LETTERS_IPA) + list(LATIN_LETTERS) + [PHONEME_MASK] + [PHONEME_SEPARATOR] + [UNKNOWN]

assert len(symbols) == len(set(symbols))  # no duplicates
|
| 18 |
+
|
| 19 |
+
class CharacterIndexer:
    """Convert text into a list of integer symbol indices.

    The index of a symbol is its position in the module-level ``symbols``
    list. Characters that are not in that list are given the index of
    ``UNKNOWN``.
    """

    def __init__(self):
        # symbol -> position in `symbols`
        self.word_index_dictionary = {symbol: i for i, symbol in enumerate(symbols)}

    def __call__(self, text):
        """Return one index per character of ``text``, in order."""
        table = self.word_index_dictionary
        unknown_index = table[UNKNOWN]
        # Out-of-vocabulary characters fall back to the unknown index.
        return [table.get(char, unknown_index) for char in text]
|