# mvi-ai-engine / language / tokenizer.py
# Author: Musombi — "Update language/tokenizer.py" (commit 4a1b32e, verified)
import json
import os
import re
from collections import Counter
class SimpleTokenizer:
    """Word-level tokenizer with a frequency-filtered vocabulary.

    Tokens are lowercased runs of word characters (regex word boundaries).
    Four special tokens always occupy ids 0-3. The vocabulary can be built
    from a corpus, saved to JSON, and reloaded later.
    """

    PAD_TOKEN = "<pad>"
    UNK_TOKEN = "<unk>"
    BOS_TOKEN = "<bos>"
    EOS_TOKEN = "<eos>"

    def __init__(self, vocab_path=None):
        # token -> id mapping; empty until build_vocab()/load_vocab() runs
        self.vocab = {}
        # id -> token mapping, kept in sync with self.vocab
        self.inv_vocab = {}
        self.vocab_path = vocab_path
        # True once a vocab has been loaded from disk (informational flag)
        self.frozen = False
        if vocab_path and os.path.exists(vocab_path):
            self.load_vocab(vocab_path)

    def tokenize(self, text):
        """Split *text* into lowercase word tokens."""
        return re.findall(r"\b\w+\b", text.lower())

    def build_vocab(self, texts, min_freq=2):
        """Build the vocabulary from an iterable of raw *texts*.

        Words occurring fewer than *min_freq* times are dropped (they will
        map to <unk> at encode time). Special tokens always get ids 0-3.
        """
        counter = Counter()
        for text in texts:
            counter.update(self.tokenize(text))
        # Special tokens first so their ids are stable across corpora.
        self.vocab = {
            self.PAD_TOKEN: 0,
            self.UNK_TOKEN: 1,
            self.BOS_TOKEN: 2,
            self.EOS_TOKEN: 3,
        }
        idx = len(self.vocab)
        for word, freq in counter.items():
            if freq >= min_freq:
                self.vocab[word] = idx
                idx += 1
        # inverse vocab: index -> token
        self.inv_vocab = {i: token for token, i in self.vocab.items()}
        print(f"[TOKENIZER] Built vocab of size {len(self.vocab)}")

    def encode(self, text, *, add_special_tokens=False):
        """Convert *text* to a list of token ids.

        Unknown words map to the <unk> id. When *add_special_tokens* is
        True, the sequence is wrapped in the <bos>/<eos> ids (previously
        these special tokens were reserved but never emitted).
        """
        unk = self.vocab[self.UNK_TOKEN]  # hoisted: one lookup, not one per token
        ids = [self.vocab.get(t, unk) for t in self.tokenize(text)]
        if add_special_tokens:
            ids = [self.vocab[self.BOS_TOKEN]] + ids + [self.vocab[self.EOS_TOKEN]]
        return ids

    def pad(self, ids, max_len):
        """Right-pad *ids* with the <pad> id up to *max_len*, or truncate."""
        if len(ids) < max_len:
            return ids + [self.vocab[self.PAD_TOKEN]] * (max_len - len(ids))
        return ids[:max_len]

    def decode(self, ids):
        """Map ids back to tokens joined by spaces; unknown ids become <unk>."""
        return " ".join(self.inv_vocab.get(i, self.UNK_TOKEN) for i in ids)

    @property
    def vocab_size(self):
        """Number of vocabulary entries, including the four special tokens."""
        return len(self.vocab)

    def save_vocab(self, path):
        """Write the vocabulary to *path* as indented JSON."""
        # Bug fix: os.path.dirname() is "" for a bare filename, and
        # os.makedirs("") raises FileNotFoundError — only create directories
        # when the path actually has a directory component.
        dirname = os.path.dirname(path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        print(f"[TOKENIZER] Vocab exported to {path}")

    def load_vocab(self, path):
        """Load a JSON vocabulary from *path* and mark the tokenizer frozen."""
        with open(path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        # JSON object keys come back as strings; decode() needs int ids.
        self.inv_vocab = {int(idx): token for token, idx in self.vocab.items()}
        self.frozen = True
        print(f"[TOKENIZER] Vocab loaded from {path}")