File size: 4,167 Bytes
c16617e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Simple word-level tokenizer for the AAC Micro Brain.
No BPE complexity needed — the vocabulary is everyday conversational English.
"""

import json
import re
from collections import Counter


# Special tokens: reserved IDs that occupy the first slots of every vocabulary.
PAD = 0  # padding used to fill a sequence up to its fixed length
BOS = 1  # beginning of sequence
EOS = 2  # end of sequence
SEP = 3  # separator between phrases in a flow
UNK = 4  # unknown word

SPECIAL_TOKENS = {"<pad>": PAD, "<bos>": BOS, "<eos>": EOS, "<sep>": SEP, "<unk>": UNK}


class Tokenizer:
    """Word-level tokenizer with a fixed-size, frequency-ranked vocabulary.

    The vocabulary always starts with the entries of ``SPECIAL_TOKENS``; the
    remaining slots (up to ``vocab_size``) are filled by :meth:`fit` with the
    most frequent tokens of the training texts.

    Attributes set on every instance:
        vocab_size: maximum number of entries in the vocabulary, special
            tokens included.
        word2idx: mapping from token string to integer ID.
        idx2word: inverse mapping from integer ID to token string.
        fitted: True once :meth:`fit` has run or the tokenizer was loaded
            from disk.
    """

    # Words are runs of lowercase letters/apostrophes; each of . , ! ? ; : is
    # emitted as its own token. Compiled once because it is used on every text.
    _TOKEN_PATTERN = re.compile(r"[a-z']+|[.,!?;:]")

    def __init__(self, vocab_size=8192):
        """Create an unfitted tokenizer that knows only the special tokens."""
        self.vocab_size = vocab_size
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}
        self.fitted = False

    def _tokenize_text(self, text):
        """Lowercase ``text`` and split it into word and punctuation tokens.

        Any character that is not a-z, an apostrophe, or one of ``.,!?;:``
        (digits, non-ASCII letters, other symbols, whitespace) is discarded
        and acts as a separator between tokens.
        """
        return self._TOKEN_PATTERN.findall(text.lower())

    def fit(self, texts):
        """Build the vocabulary from an iterable of texts.

        Any previously learned vocabulary is replaced, so calling ``fit``
        again (or after :meth:`load`) yields the same result as fitting a
        fresh tokenizer. Prints a one-line summary when done.
        """
        counts = Counter()
        for text in texts:
            counts.update(self._tokenize_text(text))

        # Reset to the special tokens only. Without this, re-fitting would
        # overwrite existing words in place, the dict would never grow, and
        # every word would be assigned the same index.
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}

        # The special tokens consume the first IDs; the rest go to the most
        # common words, in descending frequency order.
        n_special = len(SPECIAL_TOKENS)
        n_slots = max(self.vocab_size - n_special, 0)
        for idx, (word, _) in enumerate(counts.most_common(n_slots), start=n_special):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.fitted = True
        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

    def encode(self, text):
        """Convert text to a list of token IDs; unknown tokens map to ``UNK``."""
        return [self.word2idx.get(token, UNK) for token in self._tokenize_text(text)]

    def decode(self, ids):
        """Convert token IDs back to a space-joined string.

        ``PAD``, ``BOS``, ``EOS`` and ``SEP`` are dropped; ``UNK`` and any ID
        missing from the vocabulary are rendered as ``"<unk>"``.
        """
        words = [self.idx2word.get(i, "<unk>") for i in ids if i not in (PAD, BOS, EOS, SEP)]
        return " ".join(words)

    def encode_sequence(self, phrases, max_len=128):
        """Encode a conversation flow (list of phrases) into a token sequence.

        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...

        The result always has exactly ``max_len`` IDs: longer sequences are
        cut and terminated with ``EOS``, shorter ones are right-padded with
        ``PAD``.

        Raises:
            ValueError: if ``max_len`` is smaller than 1, since the sequence
                must at least hold the terminating ``EOS``.
        """
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")

        ids = [BOS]
        for i, phrase in enumerate(phrases):
            if i > 0:
                ids.append(SEP)
            ids.extend(self.encode(phrase))
        ids.append(EOS)

        if len(ids) > max_len:
            # Keep the last slot for EOS so a truncated sequence still terminates.
            ids = ids[:max_len - 1] + [EOS]
        else:
            ids.extend([PAD] * (max_len - len(ids)))

        return ids

    def save(self, path):
        """Save ``vocab_size`` and ``word2idx`` to a JSON file at ``path``."""
        data = {
            "vocab_size": self.vocab_size,
            "word2idx": self.word2idx,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer from a JSON file written by :meth:`save`."""
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(data["vocab_size"])
        tok.word2idx = data["word2idx"]
        # idx2word is not stored in the file; rebuild it from word2idx.
        tok.idx2word = {int(idx): word for word, idx in data["word2idx"].items()}
        tok.fitted = True
        return tok


def build_tokenizer(data_path, vocab_size=8192):
    """Build a fitted tokenizer from a conversation-flows JSONL file.

    Each non-blank line of ``data_path`` is parsed as a JSON object whose
    ``"phrases"`` entry is a list of phrase strings; all phrases from all
    lines are used to fit the vocabulary.

    Args:
        data_path: path to the JSONL file.
        vocab_size: maximum vocabulary size passed to ``Tokenizer``.

    Returns:
        The fitted ``Tokenizer``.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry has no ``"phrases"`` key.
    """
    print("Building tokenizer...")
    texts = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no entry and would otherwise make json.loads raise.
                continue
            entry = json.loads(line)
            texts.extend(entry["phrases"])

    tok = Tokenizer(vocab_size)
    tok.fit(texts)
    return tok


if __name__ == "__main__":
    # Fit the vocabulary on the conversation-flow corpus and persist it.
    tokenizer = build_tokenizer(
        "/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl"
    )
    tokenizer.save("/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json")

    # Smoke test: encode a single sentence and decode it again.
    sample = "I want to go to the airport please"
    sample_ids = tokenizer.encode(sample)
    round_trip = tokenizer.decode(sample_ids)
    print(f"Test: '{sample}'")
    print(f"Encoded: {sample_ids}")
    print(f"Decoded: '{round_trip}'")

    # Smoke test: encode a multi-phrase flow and show the first 30 IDs.
    flow = ["Hello how are you", "I'm doing great", "Want to get lunch"]
    flow_ids = tokenizer.encode_sequence(flow)
    print(f"Sequence: {flow_ids[:30]}...")