Instructions to use Hoodrobot/TinkyBrain-31M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use Hoodrobot/TinkyBrain-31M with MLX:
# Download the model from the Hub
pip install huggingface_hub[hf_xet]
huggingface-cli download --local-dir TinkyBrain-31M Hoodrobot/TinkyBrain-31M
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
| """ | |
| Simple word-level tokenizer for the AAC Micro Brain. | |
| No BPE complexity needed — the vocabulary is everyday conversational English. | |
| """ | |
| import json | |
| import re | |
| from collections import Counter | |
# Special token IDs. They always occupy the first slots of the vocabulary.
PAD = 0
BOS = 1  # beginning of sequence
EOS = 2  # end of sequence
SEP = 3  # separator between phrases in a flow
UNK = 4  # unknown word
SPECIAL_TOKENS = {"<pad>": PAD, "<bos>": BOS, "<eos>": EOS, "<sep>": SEP, "<unk>": UNK}


class Tokenizer:
    """Word-level tokenizer with a frequency-ranked, size-capped vocabulary.

    The vocabulary consists of the entries in ``SPECIAL_TOKENS`` followed by
    the most frequent tokens seen by :meth:`fit`, up to ``vocab_size`` entries
    in total. Tokens that are not in the vocabulary encode to ``UNK``.
    """

    # A token is either a run of lowercase letters/apostrophes (a word) or a
    # single punctuation mark from the listed set. Anything else (digits,
    # other symbols, whitespace) matches neither alternative and is dropped.
    _TOKEN_RE = re.compile(r"[a-z']+|[.,!?;:]")

    def __init__(self, vocab_size=8192):
        """Create an unfitted tokenizer that knows only the special tokens.

        Args:
            vocab_size: Maximum number of vocabulary entries, special tokens
                included.
        """
        self.vocab_size = vocab_size
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}
        self.fitted = False

    def _tokenize_text(self, text):
        """Lowercase ``text`` and split it into word and punctuation tokens.

        Punctuation marks are returned as separate tokens, not attached to
        the neighbouring word.
        """
        return self._TOKEN_RE.findall(text.lower().strip())

    def fit(self, texts):
        """Build the vocabulary from an iterable of texts.

        The vocabulary is rebuilt from scratch on every call, so calling
        ``fit`` again replaces the previous vocabulary instead of leaving
        ``word2idx`` and ``idx2word`` out of sync.

        Args:
            texts: Iterable of strings to count tokens from.
        """
        counts = Counter()
        for text in texts:
            counts.update(self._tokenize_text(text))

        # Reset to the special tokens only; appending to an existing
        # vocabulary would reassign ids of words that are already present.
        self.word2idx = dict(SPECIAL_TOKENS)
        self.idx2word = {idx: token for token, idx in SPECIAL_TOKENS.items()}

        # Slots left for real words once the special tokens are accounted for.
        budget = max(self.vocab_size - len(SPECIAL_TOKENS), 0)
        for word, _ in counts.most_common(budget):
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.fitted = True
        print(f"Tokenizer: {len(self.word2idx)} tokens (from {len(counts)} unique words)")

    def encode(self, text):
        """Convert ``text`` to a list of token IDs (``UNK`` for unknown tokens)."""
        return [self.word2idx.get(token, UNK) for token in self._tokenize_text(text)]

    def decode(self, ids):
        """Convert token IDs back to a space-joined string.

        ``PAD``, ``BOS``, ``EOS`` and ``SEP`` are skipped; IDs that are not in
        the vocabulary are rendered as ``"<unk>"``.
        """
        skipped = (PAD, BOS, EOS, SEP)
        return " ".join(
            self.idx2word.get(i, "<unk>") for i in ids if i not in skipped
        )

    def encode_sequence(self, phrases, max_len=128):
        """Encode a conversation flow (list of phrases) into a token sequence.

        Format: <bos> phrase1 <sep> phrase2 <sep> ... phraseN <eos> <pad>...

        Args:
            phrases: Iterable of phrase strings, in order.
            max_len: Exact length of the returned list. Longer sequences are
                truncated (the last kept slot is forced to ``EOS``); shorter
                ones are right-padded with ``PAD``.

        Returns:
            A list of exactly ``max_len`` token IDs.

        Raises:
            ValueError: If ``max_len`` is smaller than 1, since not even the
                closing ``EOS`` would fit.
        """
        if max_len < 1:
            raise ValueError(f"max_len must be at least 1, got {max_len}")

        ids = [BOS]
        for position, phrase in enumerate(phrases):
            if position > 0:
                ids.append(SEP)
            ids.extend(self.encode(phrase))
        ids.append(EOS)

        if len(ids) > max_len:
            # Keep the sequence terminated even after truncation.
            ids = ids[:max_len - 1] + [EOS]
        ids.extend([PAD] * (max_len - len(ids)))
        return ids

    def save(self, path):
        """Save ``vocab_size`` and ``word2idx`` to ``path`` as JSON."""
        data = {
            "vocab_size": self.vocab_size,
            "word2idx": self.word2idx,
        }
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)

    @classmethod
    def load(cls, path):
        """Load a tokenizer previously written by :meth:`save`.

        Only ``word2idx`` is stored in the file; ``idx2word`` is rebuilt by
        inverting it.

        Args:
            path: Path of the JSON file to read.

        Returns:
            A tokenizer instance with ``fitted`` set to ``True``.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        tok = cls(data["vocab_size"])
        tok.word2idx = data["word2idx"]
        tok.idx2word = {int(idx): word for word, idx in data["word2idx"].items()}
        tok.fitted = True
        return tok
def build_tokenizer(data_path, vocab_size=8192):
    """Build a tokenizer from a conversation-flow JSONL file.

    Each non-blank line of the file is parsed as a JSON object, and every
    string in its ``"phrases"`` list is used to fit the vocabulary.

    Args:
        data_path: Path to the JSONL file (e.g. ``conversation_flows.jsonl``).
        vocab_size: Maximum vocabulary size passed to ``Tokenizer``.

    Returns:
        The fitted ``Tokenizer``.
    """
    print("Building tokenizer...")
    texts = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            texts.extend(json.loads(line)["phrases"])
    tok = Tokenizer(vocab_size)
    tok.fit(texts)
    return tok
if __name__ == "__main__":
    # Build the vocabulary from the conversation-flow corpus and persist it.
    corpus_path = "/Volumes/PRO-G40/models/aac-micro-brain/data/conversation_flows.jsonl"
    tokenizer_path = "/Volumes/PRO-G40/models/aac-micro-brain/data/tokenizer.json"
    tok = build_tokenizer(corpus_path)
    tok.save(tokenizer_path)

    # Smoke test: encode one sentence and decode it back.
    test = "I want to go to the airport please"
    encoded = tok.encode(test)
    decoded = tok.decode(encoded)
    print(f"Test: '{test}'")
    print(f"Encoded: {encoded}")
    print(f"Decoded: '{decoded}'")

    # Smoke test: encode a three-phrase flow and show the first 30 ids.
    flow = ["Hello how are you", "I'm doing great", "Want to get lunch"]
    seq = tok.encode_sequence(flow)
    print(f"Sequence: {seq[:30]}...")