# mvi-ai-engine / language / tokenizer.py
# Author: Musombi — "Update language/tokenizer.py" (commit 4a1b32e, verified)
import json
import os
import re
from collections import Counter
class SimpleTokenizer:
    """Word-level tokenizer with a frequency-filtered vocabulary.

    Tokens are lowercased runs of word characters (regex word boundaries).
    Four special tokens always occupy ids 0-3. The vocabulary can be built
    from a corpus, saved to JSON, and reloaded later.
    """

    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"
    BOS_TOKEN = "<bos>"
    EOS_TOKEN = "<eos>"

    def __init__(self, vocab_path=None):
        # token -> id mapping; empty until build_vocab()/load_vocab() runs
        self.vocab = {}
        # id -> token mapping, kept in sync with self.vocab
        self.inv_vocab = {}
        self.vocab_path = vocab_path
        # True once a vocab has been loaded from disk (informational flag)
        self.frozen = False
        if vocab_path and os.path.exists(vocab_path):
            self.load_vocab(vocab_path)

    def tokenize(self, text):
        """Split *text* into lowercase word tokens."""
        return re.findall(r"\b\w+\b", text.lower())

    def build_vocab(self, texts, min_freq=2):
        """Build the vocabulary from an iterable of raw *texts*.

        Words occurring fewer than *min_freq* times are dropped (they will
        map to <unk> at encode time). Special tokens always get ids 0-3.
        """
        counter = Counter()
        for text in texts:
            counter.update(self.tokenize(text))
        # Special tokens first so their ids are stable across corpora.
        self.vocab = {
            self.PAD_TOKEN: 0,
            self.UNK_TOKEN: 1,
            self.BOS_TOKEN: 2,
            self.EOS_TOKEN: 3,
        }
        idx = len(self.vocab)
        for word, freq in counter.items():
            if freq >= min_freq:
                self.vocab[word] = idx
                idx += 1
        # inverse vocab: index -> token
        self.inv_vocab = {i: token for token, i in self.vocab.items()}
        print(f"[TOKENIZER] Built vocab of size {len(self.vocab)}")

    def encode(self, text, *, add_special_tokens=False):
        """Convert *text* to a list of token ids.

        Unknown words map to the <unk> id. When *add_special_tokens* is
        True, the sequence is wrapped in the <bos>/<eos> ids (previously
        these special tokens were reserved but never emitted).
        """
        unk = self.vocab[self.UNK_TOKEN]  # hoisted: one lookup, not one per token
        ids = [self.vocab.get(t, unk) for t in self.tokenize(text)]
        if add_special_tokens:
            ids = [self.vocab[self.BOS_TOKEN]] + ids + [self.vocab[self.EOS_TOKEN]]
        return ids

    def pad(self, ids, max_len):
        """Right-pad *ids* with the <pad> id up to *max_len*, or truncate."""
        if len(ids) < max_len:
            return ids + [self.vocab[self.PAD_TOKEN]] * (max_len - len(ids))
        return ids[:max_len]

    def decode(self, ids):
        """Map ids back to tokens joined by spaces; unknown ids become <unk>."""
        return " ".join(self.inv_vocab.get(i, self.UNK_TOKEN) for i in ids)

    @property
    def vocab_size(self):
        """Number of vocabulary entries, including the four special tokens."""
        return len(self.vocab)

    def save_vocab(self, path):
        """Write the vocabulary to *path* as indented JSON."""
        # Bug fix: os.path.dirname() is "" for a bare filename, and
        # os.makedirs("") raises FileNotFoundError — only create directories
        # when the path actually has a directory component.
        dirname = os.path.dirname(path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        print(f"[TOKENIZER] Vocab exported to {path}")

    def load_vocab(self, path):
        """Load a JSON vocabulary from *path* and mark the tokenizer frozen."""
        with open(path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        # JSON object keys come back as strings; decode() needs int ids.
        self.inv_vocab = {int(idx): token for token, idx in self.vocab.items()}
        self.frozen = True
        print(f"[TOKENIZER] Vocab loaded from {path}")