| import numpy as np | |
# Separator characters: each one ends the word being collected. Every separator
# except the plain space is also emitted as a one-character token of its own
# (this includes digits and the newline).
s = " `1234567890-=~!@#$%^&*()_+[;,{:<];.}:>\\'/|\"?\n–№…«»→"


def split(text):
    """Split *text* into a flat list of word and separator tokens.

    Characters are consumed left to right. Characters that are neither in
    ``s`` nor whitespace accumulate into the current word. A character from
    ``s`` closes the current word and, unless it is a space, is appended as
    its own token. Whitespace that is not listed in ``s`` (tab, carriage
    return, non-breaking space, ...) also closes the current word and is
    dropped, so ``"a\\tb"`` yields two tokens and CRLF line endings do not
    leave a stray ``"\\r"`` glued to the preceding word.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty strings in order of appearance; empty input
        gives an empty list.
    """
    tokens = []
    word = []  # characters of the word currently being collected
    # The appended space guarantees the last word is flushed.
    for ch in text + " ":
        is_separator = ch in s
        if is_separator or ch.isspace():
            if word:
                tokens.append("".join(word))
                word = []
            # Only listed separators are kept, and the space never is.
            if is_separator and ch != " ":
                tokens.append(ch)
        else:
            word.append(ch)
    return tokens
def tokenize_2str(text: str):
    """Turn *text* into string tokens, factoring out a trailing "es".

    The text is first passed through ``split``. Any resulting piece that ends
    in ``"es"`` is replaced by two tokens: the piece without those two
    characters, followed by the marker ``"<es>"``. All other pieces are kept
    unchanged. A piece that is exactly ``"es"`` therefore produces an empty
    string followed by ``"<es>"``.

    Returns:
        The list of string tokens in order.
    """
    pieces = []
    for word in split(text):
        if word.endswith("es"):
            # Stem first, then the suffix marker.
            pieces.extend((word[:-2], "<es>"))
        else:
            pieces.append(word)
    return pieces
# Vocabulary tables. ``ind2text[k]`` is the token with id ``k`` and
# ``text2ind[token]`` is its id; both start with the same three special tokens
# and are extended together by ``fit_on_text``.
ind2text = ["<NULL>", "<UNK>", "<es>"]
text2ind = {"<NULL>": 0, "<UNK>": 1, "<es>": 2}


def fit_on_text(text: str):
    """Add every token of *text* that is not yet known to the vocabulary.

    The text is tokenized with ``tokenize_2str``. Each unseen token is
    appended to ``ind2text`` and mapped to its new position in ``text2ind``,
    so ids are assigned in order of first appearance. Both module-level
    tables are modified in place.

    Args:
        text: The string whose tokens should be learned.
    """
    for token in tokenize_2str(text):
        # Membership is tested on the dict (constant time) rather than by
        # scanning the list; the two tables hold the same tokens.
        if token not in text2ind:
            text2ind[token] = len(ind2text)
            ind2text.append(token)
def fit_on_texts(texts):
    """Call ``fit_on_text`` on each element of *texts*, in iteration order."""
    for document in texts:
        fit_on_text(document)
def tokenize(text: str):
    """Convert *text* into a NumPy array of vocabulary ids.

    The text is tokenized with ``tokenize_2str`` and each token is looked up
    in ``text2ind``. Tokens missing from the vocabulary are mapped to the id
    of ``'<UNK>'``.

    Args:
        text: The string to encode.

    Returns:
        A one-dimensional integer ``numpy.ndarray`` with one id per token;
        empty (but still integer-typed) when the text has no tokens.
    """
    unknown_id = text2ind['<UNK>']
    # A single dict lookup per token replaces the linear scan of ind2text.
    ids = [text2ind.get(token, unknown_id) for token in tokenize_2str(text)]
    # Pin the dtype: without it an empty list would become a float64 array.
    return np.array(ids, dtype=int)