| | |
| |
|
# Symbol inventory used to turn text into integer indices.
# The order of the characters below defines each symbol's index, so it must
# not be changed: any reordering would shift every index that follows.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Full symbol list: the pad symbol first (index 0), then punctuation,
# ASCII letters, and IPA symbols, each in the order written above.
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Map every symbol to its position in `symbols`.
# A symbol that occurs more than once (the apostrophe appears twice in
# `_letters_ipa`) keeps the index of its LAST occurrence, so `len(dicts)`
# can be smaller than `len(symbols)`.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
| |
|
| |
|
class TextCleaner:
    """Callable that converts a string into a list of integer symbol indices.

    Each character of the input is looked up in ``word_index_dictionary``
    (the module-level ``dicts`` mapping). Characters that are not in the
    mapping are skipped, so the returned list may be shorter than the input.
    """

    def __init__(self, dummy=None):
        """Bind the module-level symbol-to-index mapping.

        Args:
            dummy: Accepted and ignored; kept so existing callers that pass
                a positional argument keep working.
        """
        self.word_index_dictionary = dicts
        # Prints the number of entries in the mapping on construction.
        print(len(dicts))

    def __call__(self, text):
        """Return the index of every known character in ``text``.

        Args:
            text: Iterable of single characters (normally a ``str``).

        Returns:
            list: Indices of the characters found in the mapping, in input
            order. Unknown characters are omitted.
        """
        lookup = self.word_index_dictionary
        indexes = []
        unknown = []
        for char in text:
            try:
                indexes.append(lookup[char])
            except KeyError:
                # Best effort: drop the character, but remember it so the
                # diagnostic below can say what was dropped.
                if char not in unknown:
                    unknown.append(char)
        if unknown:
            # One message per call (not one per unknown character), naming
            # the offending characters alongside the text they came from.
            print("TextCleaner: skipped unknown characters", unknown, "in text:", text)
        return indexes
| |
|