Instructions to use Hoodrobot/TinkyBrain-31M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use Hoodrobot/TinkyBrain-31M with MLX:
# Download the model from the Hub
pip install huggingface_hub[hf_xet]
huggingface-cli download --local-dir TinkyBrain-31M Hoodrobot/TinkyBrain-31M
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- LM Studio
File size: 4,167 Bytes
"""
Simple word-level tokenizer for the AAC Micro Brain.
No BPE complexity needed — the vocabulary is everyday conversational English.
"""
import json
import re
from collections import Counter
# Reserved token ids, assigned consecutively from zero:
#   PAD - padding
#   BOS - beginning of sequence
#   EOS - end of sequence
#   SEP - separator between phrases in a flow
#   UNK - unknown word
PAD, BOS, EOS, SEP, UNK = range(5)

# Surface form of each reserved token mapped to its id.
SPECIAL_TOKENS = {
    "<pad>": PAD,
    "<bos>": BOS,
    "<eos>": EOS,
    "<sep>": SEP,
    "<unk>": UNK,
}
class Tokenizer:
    """Word-level tokenizer with a fixed-size, frequency-ranked vocabulary.

    The vocabulary always starts with the entries of ``SPECIAL_TOKENS``;
    :meth:`fit` then appends the most frequent words of a corpus until
    ``vocab_size`` entries exist. Text is lowercased before lookup, and words
    missing from the vocabulary encode to ``UNK``.
    """

    # Runs of a-z/apostrophe are words; each of . , ! ? ; : is its own token.
    # Compiled once here instead of being looked up on every call.
    _TOKEN_PATTERN = re.compile(r"[a-z']+|[.,!?;:]")

    def __init__(self, vocab_size=8192):
        """Create an unfitted tokenizer holding only the special tokens.

        Args:
            vocab_size: Maximum number of entries (special tokens included).
        """
        self.vocab_size = vocab_size
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {v: k for k, v in SPECIAL_TOKENS.items()}
        self.fitted = False

    def _tokenize_text(self, text):
        """Split text into lowercase word and punctuation tokens.

        Punctuation marks (``.,!?;:``) are emitted as separate tokens, not
        attached to the neighbouring word. Any other character (digits,
        non-ASCII letters, other symbols) is dropped and acts as a separator.
        """
        return self._TOKEN_PATTERN.findall(text.lower().strip())

    def fit(self, texts):
        """Build the vocabulary from an iterable of texts.

        Any previously learned words are discarded first, so calling ``fit``
        again (or on a loaded tokenizer) yields the same mapping as fitting a
        fresh instance instead of leaving stale id -> word entries behind.
        """
        counts = Counter()
        for text in texts:
            counts.update(self._tokenize_text(text))

        # Restart from the reserved tokens only.
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}

        # Remaining room after the special tokens; never negative.
        budget = max(0, self.vocab_size - len(SPECIAL_TOKENS))
        for word, _ in counts.most_common(budget):
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.fitted = True
        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

    def encode(self, text):
        """Convert text to a list of token ids (``UNK`` for unknown words)."""
        return [self.word2idx.get(token, UNK) for token in self._tokenize_text(text)]

    def decode(self, ids):
        """Convert token ids back to a space-joined string.

        ``PAD``, ``BOS``, ``EOS`` and ``SEP`` are skipped; ids that are not in
        the vocabulary are rendered as ``"<unk>"``.
        """
        skipped = (PAD, BOS, EOS, SEP)
        words = [self.idx2word.get(i, "<unk>") for i in ids if i not in skipped]
        return " ".join(words)

    def encode_sequence(self, phrases, max_len=128):
        """Encode a conversation flow (list of phrases) into a token sequence.

        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...

        The result always has exactly ``max_len`` ids: longer sequences are
        cut and terminated with ``EOS``, shorter ones are right-padded with
        ``PAD``.

        Raises:
            ValueError: If ``max_len`` is smaller than 1; there would be no
                room for the terminating ``EOS``.
        """
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")

        ids = [BOS]
        for i, phrase in enumerate(phrases):
            if i > 0:
                ids.append(SEP)
            ids.extend(self.encode(phrase))
        ids.append(EOS)

        if len(ids) > max_len:
            # Keep the last slot for EOS so the sequence stays terminated.
            ids = ids[:max_len - 1] + [EOS]
        else:
            ids.extend([PAD] * (max_len - len(ids)))
        return ids

    def save(self, path):
        """Save ``vocab_size`` and the word -> id mapping to a JSON file."""
        data = {
            "vocab_size": self.vocab_size,
            "word2idx": self.word2idx,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer previously written by :meth:`save`.

        The returned tokenizer is marked as fitted.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(data["vocab_size"])
        # Normalise ids to int so both lookup directions agree on key type.
        tok.word2idx = {word: int(idx) for word, idx in data["word2idx"].items()}
        tok.idx2word = {idx: word for word, idx in tok.word2idx.items()}
        tok.fitted = True
        return tok
def build_tokenizer(data_path, vocab_size=8192):
    """Build a tokenizer from a JSON Lines file of conversation flows.

    Each non-blank line must be a JSON object with a ``"phrases"`` list; every
    phrase from every line is used to fit the vocabulary.

    Args:
        data_path: Path to the ``.jsonl`` file (read as UTF-8).
        vocab_size: Maximum vocabulary size passed to ``Tokenizer``.

    Returns:
        The fitted ``Tokenizer``.
    """
    print("Building tokenizer...")
    texts = []
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are not
                # valid JSON documents; skip them instead of crashing.
                continue
            entry = json.loads(line)
            texts.extend(entry["phrases"])
    tok = Tokenizer(vocab_size)
    tok.fit(texts)
    return tok
if __name__ == "__main__":
    # Fit the vocabulary on the conversation-flow corpus and persist it.
    tokenizer = build_tokenizer("/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl")
    tokenizer.save("/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json")

    # Smoke test: encode one sentence and decode it again.
    sample = "I want to go to the airport please"
    sample_ids = tokenizer.encode(sample)
    round_trip = tokenizer.decode(sample_ids)
    print(f"Test: '{sample}'")
    print(f"Encoded: {sample_ids}")
    print(f"Decoded: '{round_trip}'")

    # Smoke test: encode a three-phrase flow and show its first 30 ids.
    flow = ["Hello how are you", "I'm doing great", "Want to get lunch"]
    flow_ids = tokenizer.encode_sequence(flow)
    print(f"Sequence: {flow_ids[:30]}...")
|