Hoodrobot
/

TinkyBrain-31M

speech-prosthetic

small-language-model

Model card Files Files and versions

TinkyBrain-31M / tokenizer.py

Hoodrobot's picture

Upload tokenizer.py with huggingface_hub

c16617e verified about 2 months ago

history blame contribute delete

4.17 kB

	"""
	Simple word-level tokenizer for the AAC Micro Brain.
	No BPE complexity needed — the vocabulary is everyday conversational English.
	"""

	import json
	import re
	from collections import Counter


	# Reserved token ids. They take the lowest indices of the vocabulary.
	PAD, BOS, EOS, SEP, UNK = range(5)

	# Surface form of every reserved token, mapped to its id.
	SPECIAL_TOKENS = {
	    "<pad>": PAD,  # padding
	    "<bos>": BOS,  # beginning of sequence
	    "<eos>": EOS,  # end of sequence
	    "<sep>": SEP,  # separator between phrases in a flow
	    "<unk>": UNK,  # unknown word
	}


	class Tokenizer:
	    """Word-level tokenizer with a size-capped vocabulary.

	    Text is lowercased and split into word tokens (runs of ``a-z`` and
	    apostrophes) and single punctuation tokens (``. , ! ? ; :``). Any other
	    character is dropped. The vocabulary starts with the entries of
	    ``SPECIAL_TOKENS``; words learned by :meth:`fit` receive the next free
	    ids in order of decreasing frequency.
	    """

	    # Either a word (letters/apostrophes) or one punctuation mark. The "|"
	    # is regex alternation and must stay unescaped: r"\|" would require a
	    # literal pipe character, so the pattern would match nothing in
	    # ordinary text.
	    _TOKEN_RE = re.compile(r"[a-z']+|[.,!?;:]")

	    def __init__(self, vocab_size=8192):
	        """Create an unfitted tokenizer that knows only the special tokens.

	        Args:
	            vocab_size: Maximum number of vocabulary entries, special tokens
	                included.
	        """
	        self.vocab_size = vocab_size
	        self.word2idx = dict(SPECIAL_TOKENS)
	        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}
	        self.fitted = False

	    def _tokenize_text(self, text):
	        """Return the lowercase word and punctuation tokens found in *text*.

	        Punctuation marks are returned as separate tokens, not attached to
	        the preceding word.
	        """
	        return self._TOKEN_RE.findall(text.lower())

	    def fit(self, texts):
	        """Build the vocabulary from an iterable of texts.

	        Tokens are added from most to least frequent until the vocabulary
	        holds ``vocab_size`` entries. A token that already has an id keeps
	        it, so calling ``fit`` again never remaps a word or leaves stale
	        entries behind in ``idx2word``.

	        Args:
	            texts: Iterable of strings.
	        """
	        counts = Counter()
	        for text in texts:
	            counts.update(self._tokenize_text(text))

	        for word, _ in counts.most_common():
	            if len(self.word2idx) >= self.vocab_size:
	                break
	            if word in self.word2idx:
	                continue
	            idx = len(self.word2idx)
	            self.word2idx[word] = idx
	            self.idx2word[idx] = word

	        self.fitted = True
	        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

	    def encode(self, text):
	        """Convert text to a list of token ids; unknown tokens map to UNK."""
	        return [self.word2idx.get(token, UNK) for token in self._tokenize_text(text)]

	    def decode(self, ids):
	        """Convert token ids back to a space-joined string.

	        PAD, BOS, EOS and SEP ids are skipped; ids missing from the
	        vocabulary are rendered as ``"<unk>"``.
	        """
	        skipped = (PAD, BOS, EOS, SEP)
	        return " ".join(self.idx2word.get(i, "<unk>") for i in ids if i not in skipped)

	    def encode_sequence(self, phrases, max_len=128):
	        """Encode a conversation flow (list of phrases) into exactly max_len ids.

	        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...

	        Sequences that are too long are cut so that the last id is still
	        EOS; shorter ones are right-padded with PAD.

	        Args:
	            phrases: Iterable of strings, one per phrase.
	            max_len: Length of the returned list; must be at least 1.

	        Raises:
	            ValueError: If ``max_len`` is smaller than 1.
	        """
	        if max_len < 1:
	            # A non-positive max_len would make the truncation slice below
	            # count from the end and return more than max_len ids.
	            raise ValueError(f"max_len must be at least 1, got {max_len}")

	        ids = [BOS]
	        for i, phrase in enumerate(phrases):
	            if i > 0:
	                ids.append(SEP)
	            ids.extend(self.encode(phrase))
	        ids.append(EOS)

	        if len(ids) > max_len:
	            ids = ids[:max_len - 1] + [EOS]
	        ids.extend([PAD] * (max_len - len(ids)))
	        return ids

	    def save(self, path):
	        """Write ``vocab_size`` and ``word2idx`` to *path* as UTF-8 JSON."""
	        data = {
	            "vocab_size": self.vocab_size,
	            "word2idx": self.word2idx,
	        }
	        with open(path, "w", encoding="utf-8") as f:
	            json.dump(data, f)

	    @classmethod
	    def load(cls, path):
	        """Load a tokenizer from a JSON file written by :meth:`save`.

	        The returned tokenizer is marked as fitted.
	        """
	        with open(path, encoding="utf-8") as f:
	            data = json.load(f)
	        tok = cls(data["vocab_size"])
	        tok.word2idx = {word: int(idx) for word, idx in data["word2idx"].items()}
	        tok.idx2word = {idx: word for word, idx in tok.word2idx.items()}
	        tok.fitted = True
	        return tok


	def build_tokenizer(data_path, vocab_size=8192):
	    """Build a tokenizer from a JSON Lines file of conversation flows.

	    Every non-blank line of the file is parsed as a JSON object and the
	    strings under its "phrases" key are used as the fitting corpus.

	    Args:
	        data_path: Path to the JSON Lines file (read as UTF-8).
	        vocab_size: Maximum vocabulary size passed on to ``Tokenizer``.

	    Returns:
	        The fitted ``Tokenizer``.

	    Raises:
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	        KeyError: If an entry has no "phrases" key.
	    """
	    print("Building tokenizer...")
	    texts = []
	    with open(data_path, encoding="utf-8") as f:
	        for line in f:
	            # Blank lines (for example a trailing newline at the end of
	            # the file) are not JSON documents; skip them instead of
	            # crashing.
	            if not line.strip():
	                continue
	            texts.extend(json.loads(line)["phrases"])

	    tok = Tokenizer(vocab_size)
	    tok.fit(texts)
	    return tok


	if __name__ == "__main__":
	    import sys

	    # Usage: python tokenizer.py [DATA_JSONL [OUTPUT_JSON]]
	    # Without arguments the original hard-coded locations are used.
	    cli_args = sys.argv[1:]
	    data_path = (
	        cli_args[0]
	        if len(cli_args) > 0
	        else "/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl"
	    )
	    output_path = (
	        cli_args[1]
	        if len(cli_args) > 1
	        else "/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json"
	    )

	    tok = build_tokenizer(data_path)
	    tok.save(output_path)

	    # Smoke test: round-trip a single sentence through encode/decode.
	    test = "I want to go to the airport please"
	    encoded = tok.encode(test)
	    decoded = tok.decode(encoded)
	    print(f"Test: '{test}'")
	    print(f"Encoded: {encoded}")
	    print(f"Decoded: '{decoded}'")

	    # Smoke test: encode a multi-phrase flow and show the first 30 ids.
	    seq = tok.encode_sequence(["Hello how are you", "I'm doing great", "Want to get lunch"])
	    print(f"Sequence: {seq[:30]}...")