import json
import os
import re
from collections import Counter


class SimpleTokenizer:
    """Word-level tokenizer with a frequency-filtered vocabulary.

    Tokenization is a simple lowercase ``\\b\\w+\\b`` regex split. The vocab
    reserves indices 0-3 for special tokens and assigns the remaining ids to
    corpus words whose frequency meets ``min_freq``. The vocab can be saved
    to / loaded from a JSON file mapping token -> id.
    """

    # Special tokens.
    # BUGFIX(review): the original had all four constants set to "" (angle
    # brackets almost certainly stripped during extraction), which collapsed
    # the init dict {"":0, "":1, "":2, "":3} into a single entry {"": 3} —
    # PAD/UNK/BOS/EOS all shared id 3 and padding inserted 3 instead of 0.
    # Restored to distinct marker strings.
    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"
    BOS_TOKEN = "<bos>"
    EOS_TOKEN = "<eos>"

    def __init__(self, vocab_path=None):
        """Create a tokenizer, optionally loading a vocab from ``vocab_path``.

        If ``vocab_path`` exists, the vocab is loaded immediately and the
        tokenizer is marked frozen (see ``load_vocab``).
        """
        self.vocab = {}        # token -> id
        self.inv_vocab = {}    # id -> token
        self.vocab_path = vocab_path
        self.frozen = False    # set True once a vocab is loaded from disk
        if vocab_path and os.path.exists(vocab_path):
            self.load_vocab(vocab_path)

    def tokenize(self, text):
        """Split ``text`` into lowercase word tokens (regex ``\\b\\w+\\b``)."""
        return re.findall(r"\b\w+\b", text.lower())

    def build_vocab(self, texts, min_freq=2):
        """Build the vocab from an iterable of ``texts``.

        Words occurring fewer than ``min_freq`` times are dropped; at encode
        time they map to UNK. Ids 0-3 are the special tokens; word ids follow
        in first-seen corpus order.
        """
        counter = Counter()
        for text in texts:
            # Counter.update with an iterable counts each element — same
            # result as the manual per-word increment loop, done in C.
            counter.update(self.tokenize(text))
        # Initialize with special tokens at fixed, well-known indices.
        self.vocab = {
            self.PAD_TOKEN: 0,
            self.UNK_TOKEN: 1,
            self.BOS_TOKEN: 2,
            self.EOS_TOKEN: 3,
        }
        next_id = 4
        for word, freq in counter.items():
            if freq >= min_freq:
                self.vocab[word] = next_id
                next_id += 1
        # Inverse vocab: index -> token, for decode().
        self.inv_vocab = {idx: token for token, idx in self.vocab.items()}
        print(f"[TOKENIZER] Built vocab of size {len(self.vocab)}")

    def encode(self, text):
        """Return the list of token ids for ``text`` (unknown words -> UNK)."""
        tokens = self.tokenize(text)
        unk_id = self.vocab[self.UNK_TOKEN]
        return [self.vocab.get(t, unk_id) for t in tokens]

    def pad(self, ids, max_len):
        """Pad ``ids`` with PAD up to ``max_len``, or truncate to ``max_len``."""
        if len(ids) < max_len:
            return ids + [self.vocab[self.PAD_TOKEN]] * (max_len - len(ids))
        return ids[:max_len]

    def decode(self, ids):
        """Map ids back to tokens, joining with spaces (unknown ids -> UNK)."""
        return " ".join(self.inv_vocab.get(i, self.UNK_TOKEN) for i in ids)

    @property
    def vocab_size(self):
        """Number of entries in the vocab, including special tokens."""
        return len(self.vocab)

    def save_vocab(self, path):
        """Write the token -> id mapping to ``path`` as indented JSON."""
        # BUGFIX(review): os.makedirs("") raises FileNotFoundError when the
        # path is a bare filename (dirname is empty) — only create the parent
        # directory when there actually is one.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        print(f"[TOKENIZER] Vocab exported to {path}")

    def load_vocab(self, path):
        """Load a token -> id mapping from JSON and freeze the tokenizer.

        JSON object keys are strings, so ids are coerced back to int when
        building the inverse map.
        """
        with open(path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.inv_vocab = {int(idx): token for token, idx in self.vocab.items()}
        self.frozen = True
        print(f"[TOKENIZER] Vocab loaded from {path}")