TinkyBrain-31M / tokenizer.py
Hoodrobot's picture
Upload tokenizer.py with huggingface_hub
c16617e verified
"""
Simple word-level tokenizer for the AAC Micro Brain.
No BPE complexity needed — the vocabulary is everyday conversational English.
"""
import json
import re
from collections import Counter
# Special tokens
# IDs 0-4 are reserved; words learned by Tokenizer.fit() are numbered from
# len(SPECIAL_TOKENS) upward.
PAD = 0
BOS = 1  # beginning of sequence
EOS = 2  # end of sequence
SEP = 3  # separator between phrases in a flow
UNK = 4  # unknown word
SPECIAL_TOKENS = {"<pad>": PAD, "<bos>": BOS, "<eos>": EOS, "<sep>": SEP, "<unk>": UNK}


class Tokenizer:
    """Word-level tokenizer with a frequency-ranked, size-capped vocabulary.

    The vocabulary always contains the five special tokens; ``fit`` adds the
    most frequent words from a corpus until ``vocab_size`` entries exist.
    Words that are not in the vocabulary encode to ``UNK``.

    Attributes:
        vocab_size: Maximum number of vocabulary entries, special tokens
            included.
        word2idx: Mapping from token string to integer ID.
        idx2word: Mapping from integer ID to token string.
        fitted: True once ``fit`` has run or the tokenizer was loaded
            from disk.
    """

    # A token is either a run of lowercase ASCII letters/apostrophes or one
    # of the listed punctuation marks. Any other character (digits, other
    # symbols, non-ASCII letters) matches neither branch and is dropped.
    _TOKEN_RE = re.compile(r"[a-z']+|[.,!?;:]")

    def __init__(self, vocab_size=8192):
        """Create an unfitted tokenizer that knows only the special tokens."""
        self.vocab_size = vocab_size
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {v: k for k, v in SPECIAL_TOKENS.items()}
        self.fitted = False

    def _tokenize_text(self, text):
        """Lowercase ``text`` and split it into word and punctuation tokens.

        Punctuation marks become separate tokens; characters outside the
        token pattern (for example digits) are discarded, not mapped to
        ``UNK``.
        """
        text = text.lower().strip()
        return self._TOKEN_RE.findall(text)

    def fit(self, texts):
        """Build the vocabulary from an iterable of text strings.

        Any previously learned vocabulary is discarded first, so the result
        depends only on ``texts`` and ``vocab_size``.

        Args:
            texts: Iterable of strings to count tokens from.
        """
        counts = Counter()
        for text in texts:
            counts.update(self._tokenize_text(text))

        # Start from the special tokens only. Without this reset, a second
        # fit() (or fit() after load()) would see words that already exist,
        # len(word2idx) would stop growing, and several words would be
        # handed the same ID.
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}

        # Slots left for learned words once the special tokens are counted;
        # clamped so a tiny vocab_size simply learns nothing.
        budget = max(self.vocab_size - len(SPECIAL_TOKENS), 0)
        for word, _ in counts.most_common(budget):
            # Tokens never contain "<" or ">", so they cannot collide with
            # a special token and every word here gets a fresh ID.
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.fitted = True
        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

    def encode(self, text):
        """Convert text to a list of token IDs (``UNK`` for unknown tokens)."""
        lookup = self.word2idx.get
        return [lookup(token, UNK) for token in self._tokenize_text(text)]

    def decode(self, ids):
        """Convert token IDs back to space-joined text.

        ``PAD``, ``BOS``, ``EOS`` and ``SEP`` are skipped; IDs that are not
        in the vocabulary are rendered as ``"<unk>"``.
        """
        skipped = (PAD, BOS, EOS, SEP)
        words = [self.idx2word.get(i, "<unk>") for i in ids if i not in skipped]
        return " ".join(words)

    def encode_sequence(self, phrases, max_len=128):
        """Encode a conversation flow (list of phrases) into a token sequence.

        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...

        Args:
            phrases: Sequence of phrase strings, in conversation order.
            max_len: Exact length of the returned list; must be at least 1.

        Returns:
            A list of exactly ``max_len`` token IDs. Sequences that are too
            long are cut and terminated with ``EOS``; shorter ones are
            right-padded with ``PAD``.

        Raises:
            ValueError: If ``max_len`` is smaller than 1.
        """
        if max_len < 1:
            # A non-positive max_len would turn the truncation slice below
            # into a negative slice and return more than max_len tokens.
            raise ValueError(f"max_len must be >= 1, got {max_len}")

        ids = [BOS]
        for i, phrase in enumerate(phrases):
            if i > 0:
                ids.append(SEP)
            ids.extend(self.encode(phrase))
        ids.append(EOS)

        if len(ids) > max_len:
            # Keep the last slot for EOS so truncated sequences still end.
            ids = ids[:max_len - 1] + [EOS]
        else:
            ids += [PAD] * (max_len - len(ids))
        return ids

    def save(self, path):
        """Save ``vocab_size`` and ``word2idx`` to a JSON file at ``path``."""
        data = {
            "vocab_size": self.vocab_size,
            "word2idx": self.word2idx,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file written by ``save``.

        ``idx2word`` is not stored in the file; it is rebuilt by inverting
        ``word2idx``.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(data["vocab_size"])
        tok.word2idx = data["word2idx"]
        tok.idx2word = {int(idx): word for word, idx in tok.word2idx.items()}
        tok.fitted = True
        return tok
def build_tokenizer(data_path, vocab_size=8192):
    """Build tokenizer from conversation_flows.jsonl.

    Args:
        data_path: Path to a JSON Lines file. Every non-blank line must be
            a JSON object with a "phrases" key holding the phrase strings
            of one conversation flow.
        vocab_size: Maximum vocabulary size passed to ``Tokenizer``.

    Returns:
        A ``Tokenizer`` fitted on every phrase found in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry has no "phrases" key.
    """
    print("Building tokenizer...")
    texts = []
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines are not valid JSON; skip them rather than
                # crash in json.loads.
                continue
            entry = json.loads(line)
            texts.extend(entry["phrases"])
    tok = Tokenizer(vocab_size)
    tok.fit(texts)
    return tok
if __name__ == "__main__":
    # Script entry point: build the tokenizer from the conversation-flow
    # data, save it, then print a small encode/decode smoke test.
    import argparse

    # The defaults are the original hard-coded locations, so running the
    # script with no arguments behaves as before; --data / --out allow it
    # to run on machines without that volume mounted.
    parser = argparse.ArgumentParser(
        description="Build, save and smoke-test the word-level tokenizer."
    )
    parser.add_argument(
        "--data",
        default="/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl",
        help="path to the conversation flows JSONL file",
    )
    parser.add_argument(
        "--out",
        default="/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json",
        help="path the tokenizer JSON is written to",
    )
    args = parser.parse_args()

    tok = build_tokenizer(args.data)
    tok.save(args.out)

    # Test
    test = "I want to go to the airport please"
    encoded = tok.encode(test)
    decoded = tok.decode(encoded)
    print(f"Test: '{test}'")
    print(f"Encoded: {encoded}")
    print(f"Decoded: '{decoded}'")

    # Test sequence
    seq = tok.encode_sequence(["Hello how are you", "I'm doing great", "Want to get lunch"])
    print(f"Sequence: {seq[:30]}...")