Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| import re | |
| from collections import Counter | |
class SimpleTokenizer:
    """Regex word tokenizer with a frequency-filtered vocabulary.

    Special token ids are fixed: <pad>=0, <unk>=1, <bos>=2, <eos>=3.
    A vocab loaded from disk is *frozen*: rebuilding it would invalidate
    any token ids already serialized elsewhere, so ``build_vocab`` refuses.
    """

    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"
    BOS_TOKEN = "<bos>"
    EOS_TOKEN = "<eos>"

    def __init__(self, vocab_path=None):
        # token -> id
        self.vocab = {}
        # id -> token (inverse of self.vocab)
        self.inv_vocab = {}
        self.vocab_path = vocab_path
        # Set to True once a vocab is loaded from disk; guards rebuilds.
        self.frozen = False
        if vocab_path and os.path.exists(vocab_path):
            self.load_vocab(vocab_path)

    def tokenize(self, text):
        """Lowercase *text* and return its word tokens (\\b\\w+\\b runs)."""
        return re.findall(r"\b\w+\b", text.lower())

    def build_vocab(self, texts, min_freq=2):
        """Build the vocabulary from an iterable of raw texts.

        Words seen fewer than *min_freq* times are dropped; they map to
        <unk> at encode time.

        Raises:
            RuntimeError: if the vocab was loaded from disk (frozen).
        """
        if self.frozen:
            raise RuntimeError(
                "Vocab was loaded from disk and is frozen; cannot rebuild."
            )
        counter = Counter()
        for text in texts:
            counter.update(self.tokenize(text))
        # Special tokens always occupy the first four ids.
        self.vocab = {
            self.PAD_TOKEN: 0,
            self.UNK_TOKEN: 1,
            self.BOS_TOKEN: 2,
            self.EOS_TOKEN: 3,
        }
        for word, freq in counter.items():
            if freq >= min_freq:
                self.vocab[word] = len(self.vocab)
        # inverse vocab: index -> token
        self.inv_vocab = {i: token for token, i in self.vocab.items()}
        print(f"[TOKENIZER] Built vocab of size {len(self.vocab)}")

    def encode(self, text):
        """Map *text* to a list of token ids; OOV words become <unk>."""
        # Hoist the <unk> id lookup out of the comprehension.
        unk_id = self.vocab[self.UNK_TOKEN]
        return [self.vocab.get(t, unk_id) for t in self.tokenize(text)]

    def pad(self, ids, max_len):
        """Right-pad *ids* with <pad> up to *max_len*; truncate if longer."""
        if len(ids) < max_len:
            return ids + [self.vocab[self.PAD_TOKEN]] * (max_len - len(ids))
        return ids[:max_len]

    def decode(self, ids):
        """Map token ids back to a space-joined string (<unk> if unknown)."""
        return " ".join(self.inv_vocab.get(i, self.UNK_TOKEN) for i in ids)

    def vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

    def save_vocab(self, path):
        """Write the vocab as JSON to *path*, creating parent dirs as needed."""
        parent = os.path.dirname(path)
        # A bare filename has no parent; os.makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        print(f"[TOKENIZER] Vocab exported to {path}")

    def load_vocab(self, path):
        """Load a JSON vocab from *path* and freeze the tokenizer."""
        with open(path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        # JSON object keys are strings; convert ids back to ints here.
        self.inv_vocab = {int(i): token for token, i in self.vocab.items()}
        self.frozen = True
        print(f"[TOKENIZER] Vocab loaded from {path}")