"""Simple word-level tokenizer for the AAC Micro Brain.

No BPE complexity needed — the vocabulary is everyday conversational English.
"""

import json
import re
from collections import Counter

# Special token ids. They always occupy the first slots of the vocabulary.
PAD = 0
BOS = 1  # beginning of sequence
EOS = 2  # end of sequence
SEP = 3  # separator between phrases in a flow
UNK = 4  # unknown word

# Every special token needs its own distinct name: with duplicate keys the
# dict literal collapses to a single entry, only one id gets reserved, and
# regular words are then assigned ids that collide with BOS/EOS/SEP/UNK.
SPECIAL_TOKENS = {
    "<pad>": PAD,
    "<bos>": BOS,
    "<eos>": EOS,
    "<sep>": SEP,
    "<unk>": UNK,
}

# Runs of lowercase letters/apostrophes, or a single basic punctuation mark.
# The angle brackets of the special-token names can never match, so corpus
# words cannot shadow a special token.
_TOKEN_RE = re.compile(r"[a-z']+|[.,!?;:]")


class Tokenizer:
    """Word-level tokenizer with a frequency-ranked, size-capped vocabulary.

    Attributes:
        vocab_size: Maximum number of entries, special tokens included.
        word2idx: Mapping from token string to integer id.
        idx2word: Inverse mapping from integer id to token string.
        fitted: True once ``fit`` has run or a vocabulary has been loaded.
    """

    def __init__(self, vocab_size=8192):
        """Create an unfitted tokenizer that knows only the special tokens."""
        self.vocab_size = vocab_size
        self.fitted = False
        self._reset_vocab()

    def _reset_vocab(self):
        """Reset both lookup tables to contain only the special tokens."""
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: word for word, idx in SPECIAL_TOKENS.items()}

    def _tokenize_text(self, text):
        """Split text into lowercase words and basic punctuation tokens.

        Characters other than ``a-z``, ``'`` and ``.,!?;:`` are dropped.
        """
        return _TOKEN_RE.findall(text.lower().strip())

    def fit(self, texts):
        """Build the vocabulary from an iterable of texts.

        The most frequent words fill the slots left after the special tokens,
        up to ``vocab_size`` entries in total. Any previously fitted
        vocabulary is discarded, so calling ``fit`` again is safe.
        """
        counts = Counter()
        for text in texts:
            counts.update(self._tokenize_text(text))

        # Start from a clean table; re-adding words to an existing mapping
        # would hand out ids that are already in use.
        self._reset_vocab()
        first_free = len(self.word2idx)
        budget = max(self.vocab_size - first_free, 0)
        ranked = counts.most_common(budget)
        for idx, (word, _) in enumerate(ranked, start=first_free):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.fitted = True
        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

    def encode(self, text):
        """Convert text to a list of token ids; unseen tokens map to UNK."""
        return [self.word2idx.get(tok, UNK) for tok in self._tokenize_text(text)]

    def decode(self, ids):
        """Convert token ids back to a space-joined string.

        PAD, BOS, EOS and SEP are skipped; ids missing from the vocabulary
        are rendered as ``<unk>``.
        """
        structural = (PAD, BOS, EOS, SEP)
        words = [self.idx2word.get(i, "<unk>") for i in ids if i not in structural]
        return " ".join(words)

    def encode_sequence(self, phrases, max_len=128):
        """Encode a conversation flow (list of phrases) into a token sequence.

        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...

        The result is truncated (keeping a trailing EOS) or right-padded with
        PAD to ``max_len``; ``max_len`` is expected to be at least 1.
        """
        ids = [BOS]
        for i, phrase in enumerate(phrases):
            if i > 0:
                ids.append(SEP)
            ids.extend(self.encode(phrase))
        ids.append(EOS)

        if len(ids) > max_len:
            # Keep the sequence terminated even when it has to be cut short.
            ids = ids[:max_len - 1] + [EOS]
        # A non-positive repeat count yields an empty list, so this is a
        # no-op when the sequence is already long enough.
        ids.extend([PAD] * (max_len - len(ids)))
        return ids

    def save(self, path):
        """Save the vocabulary size and word-to-id mapping as JSON."""
        data = {
            "vocab_size": self.vocab_size,
            "word2idx": self.word2idx,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer previously written by ``save``."""
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(data["vocab_size"])
        tok.word2idx = data["word2idx"]
        tok.idx2word = {int(idx): word for word, idx in data["word2idx"].items()}
        tok.fitted = True
        return tok


def build_tokenizer(data_path, vocab_size=8192):
    """Build a tokenizer from a JSONL file of conversation flows.

    Each non-blank line must be a JSON object with a ``"phrases"`` list.
    """
    print("Building tokenizer...")
    texts = []
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entry = json.loads(line)
            texts.extend(entry["phrases"])

    tok = Tokenizer(vocab_size)
    tok.fit(texts)
    return tok


if __name__ == "__main__":
    tok = build_tokenizer("/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl")
    tok.save("/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json")

    # Test
    test = "I want to go to the airport please"
    encoded = tok.encode(test)
    decoded = tok.decode(encoded)
    print(f"Test: '{test}'")
    print(f"Encoded: {encoded}")
    print(f"Decoded: '{decoded}'")

    # Test sequence
    seq = tok.encode_sequence(["Hello how are you", "I'm doing great", "Want to get lunch"])
    print(f"Sequence: {seq[:30]}...")