Spaces:
Sleeping
Sleeping
| import re | |
| TOKEN_PATTERN = re.compile(r"\n|[A-Za-z0-9_']+|[^\w\s]") | |
| class WordTokenizer: | |
| def __init__(self): | |
| self.special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"] | |
| self.stoi = {} | |
| self.itos = {} | |
| def pad_id(self): | |
| return self.stoi["<pad>"] | |
| def bos_id(self): | |
| return self.stoi["<bos>"] | |
| def eos_id(self): | |
| return self.stoi["<eos>"] | |
| def vocab_size(self): | |
| return len(self.stoi) | |
| def tokenize(self, text: str): | |
| return TOKEN_PATTERN.findall(text) | |
| def fit(self, text: str): | |
| vocab = self.special_tokens + sorted(set(self.tokenize(text))) | |
| self.stoi = {token: idx for idx, token in enumerate(vocab)} | |
| self.itos = {idx: token for token, idx in self.stoi.items()} | |
| return self | |
| def encode(self, text: str, add_bos: bool = False, add_eos: bool = False): | |
| tokens = self.tokenize(text) | |
| ids = [self.stoi.get(token, self.stoi["<unk>"]) for token in tokens] | |
| if add_bos: | |
| ids = [self.bos_id] + ids | |
| if add_eos: | |
| ids = ids + [self.eos_id] | |
| return ids | |
| def decode(self, ids): | |
| tokens = [] | |
| for idx in ids: | |
| token = self.itos.get(int(idx), "<unk>") | |
| if token in self.special_tokens: | |
| continue | |
| tokens.append(token) | |
| text = "" | |
| for token in tokens: | |
| if token == "\n": | |
| text = text.rstrip() + "\n" | |
| elif token in {".", ",", "!", "?", ":", ";"}: | |
| text = text.rstrip() + token + " " | |
| else: | |
| text += token + " " | |
| return text.strip() | |
| def state_dict(self): | |
| return {"stoi": self.stoi} | |
| def from_state_dict(cls, state): | |
| tok = cls() | |
| tok.stoi = dict(state["stoi"]) | |
| tok.itos = {idx: token for token, idx in tok.stoi.items()} | |
| return tok | |