File size: 4,167 Bytes

c16617e

"""
Simple word-level tokenizer for the AAC Micro Brain.
No BPE complexity needed — the vocabulary is everyday conversational English.
"""

import json
import re
from collections import Counter


# Reserved token IDs, in this fixed order:
#   PAD - padding used to fill a sequence up to its target length
#   BOS - beginning of sequence
#   EOS - end of sequence
#   SEP - separator between phrases in a flow
#   UNK - unknown word
PAD, BOS, EOS, SEP, UNK = range(5)

# Surface form of each special token mapped to its reserved ID.
SPECIAL_TOKENS = dict(
    zip(
        ("<pad>", "<bos>", "<eos>", "<sep>", "<unk>"),
        (PAD, BOS, EOS, SEP, UNK),
    )
)


class Tokenizer:
    """Word-level tokenizer with a fixed-size, frequency-ranked vocabulary.

    The vocabulary always starts with the entries of ``SPECIAL_TOKENS``;
    words learned by :meth:`fit` are appended after them, most frequent
    first, until ``vocab_size`` entries exist.

    Attributes:
        vocab_size: Maximum number of entries, special tokens included.
        word2idx: Mapping from token string to integer ID.
        idx2word: Mapping from integer ID to token string.
        fitted: True once ``fit`` or ``load`` has populated the vocabulary.
    """

    # A token is either a run of ASCII letters/apostrophes or a single
    # punctuation mark from the listed set. Anything else (digits, other
    # symbols, non-ASCII letters) is silently dropped.
    _TOKEN_RE = re.compile(r"[a-z']+|[.,!?;:]")

    def __init__(self, vocab_size=8192):
        """Create an unfitted tokenizer that knows only the special tokens."""
        self.vocab_size = vocab_size
        self._reset_vocab()
        self.fitted = False

    def _reset_vocab(self):
        """Restore both lookup tables to just the special tokens."""
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}

    def _tokenize_text(self, text):
        """Lowercase ``text`` and split it into word and punctuation tokens.

        Punctuation marks (``. , ! ? ; :``) become tokens of their own;
        apostrophes stay inside words (e.g. ``i'm``). Characters matching
        neither pattern are discarded.
        """
        return self._TOKEN_RE.findall(text.lower().strip())

    def fit(self, texts):
        """Build the vocabulary from an iterable of text strings.

        Any previously learned vocabulary is discarded first, so calling
        ``fit`` again re-fits from scratch instead of corrupting the ID
        mapping. Prints a one-line summary when done.
        """
        counts = Counter()
        for text in texts:
            counts.update(self._tokenize_text(text))

        # Without this reset a second fit() would compute new IDs from
        # len(word2idx) while overwriting existing keys, handing the same ID
        # to several different words.
        self._reset_vocab()

        first_free = len(self.word2idx)
        budget = max(self.vocab_size - first_free, 0)
        ranked = counts.most_common(budget)
        for idx, (word, _) in enumerate(ranked, start=first_free):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.fitted = True
        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

    def encode(self, text):
        """Convert text to a list of token IDs; unseen tokens map to UNK."""
        return [self.word2idx.get(token, UNK) for token in self._tokenize_text(text)]

    def decode(self, ids):
        """Convert token IDs back to a space-joined string.

        PAD, BOS, EOS and SEP are skipped. IDs missing from the vocabulary
        are rendered as ``<unk>``.
        """
        words = [self.idx2word.get(i, "<unk>") for i in ids if i not in (PAD, BOS, EOS, SEP)]
        return " ".join(words)

    def encode_sequence(self, phrases, max_len=128):
        """Encode a conversation flow (list of phrases) into exactly ``max_len`` IDs.

        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...
        Sequences that are too long are cut so that the final ID is still EOS.

        Raises:
            ValueError: If ``max_len`` is smaller than 1. The truncation
                slice would otherwise use a negative bound and return a
                sequence longer than requested.
        """
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")

        ids = [BOS]
        for position, phrase in enumerate(phrases):
            if position > 0:
                ids.append(SEP)
            ids.extend(self.encode(phrase))
        ids.append(EOS)

        if len(ids) > max_len:
            # Keep the terminator even when the content is truncated.
            ids = ids[:max_len - 1] + [EOS]
        # A non-positive repeat count yields an empty list, so this is a
        # no-op when the sequence already has max_len entries.
        ids.extend([PAD] * (max_len - len(ids)))
        return ids

    def save(self, path):
        """Write ``vocab_size`` and ``word2idx`` to ``path`` as JSON."""
        data = {
            "vocab_size": self.vocab_size,
            "word2idx": self.word2idx,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file written by :meth:`save`.

        ``idx2word`` is not stored in the file; it is rebuilt by inverting
        ``word2idx``.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(data["vocab_size"])
        tok.word2idx = data["word2idx"]
        tok.idx2word = {int(idx): word for word, idx in data["word2idx"].items()}
        tok.fitted = True
        return tok


def build_tokenizer(data_path, vocab_size=8192):
    """Build a tokenizer from a JSON Lines file of conversation flows.

    Every non-blank line of ``data_path`` is parsed as a JSON object whose
    ``"phrases"`` entry is a list of text strings; all phrases from all lines
    are used to fit the vocabulary.

    Args:
        data_path: Path to the ``.jsonl`` file (read as UTF-8).
        vocab_size: Maximum vocabulary size passed to ``Tokenizer``.

    Returns:
        The fitted ``Tokenizer``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed line has no ``"phrases"`` key.
    """
    print("Building tokenizer...")
    texts = []
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            # Blank lines are not valid JSON and would make json.loads raise;
            # skip them.
            if not line.strip():
                continue
            texts.extend(json.loads(line)["phrases"])

    tok = Tokenizer(vocab_size)
    tok.fit(texts)
    return tok


if __name__ == "__main__":
    # Fit a tokenizer on the conversation-flow corpus and persist it.
    tokenizer = build_tokenizer(
        "/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl"
    )
    tokenizer.save("/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json")

    # Smoke test: encode a single sentence and decode it back.
    sample = "I want to go to the airport please"
    sample_ids = tokenizer.encode(sample)
    round_trip = tokenizer.decode(sample_ids)
    for report_line in (
        f"Test: '{sample}'",
        f"Encoded: {sample_ids}",
        f"Decoded: '{round_trip}'",
    ):
        print(report_line)

    # Smoke test: encode a multi-phrase flow and show the first 30 IDs.
    flow = ["Hello how are you", "I'm doing great", "Want to get lunch"]
    flow_ids = tokenizer.encode_sequence(flow)
    print(f"Sequence: {flow_ids[:30]}...")