"""Build the conversational corpus + train a BPE tokenizer, then tokenize to a binary token stream for fast training. """ import os, json, numpy as np from tokenizers import ByteLevelBPETokenizer DATA = 'data' SLICE = f'{DATA}/ts_slice.txt' # TinyStories slice (coherent prose+dialogue) ALPACA = f'{DATA}/alpaca.json' VOCAB_SIZE = 8192 EOT = '<|endoftext|>' # ---------- 1. assemble a small mixed text for TOKENIZER TRAINING ---------- # (tokenizer only needs a representative sample, not the whole corpus) sample_path = f'{DATA}/tok_sample.txt' if not os.path.exists(sample_path): with open(sample_path, 'w') as out: # ~80MB of stories with open(SLICE) as f: out.write(f.read(80_000_000)) # + Alpaca Q&A so the tokenizer knows the chat format tokens alp = json.load(open(ALPACA)) for ex in alp[:20000]: if ex['input'].strip(): continue out.write(f"\nUser: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}\n") print("tokenizer sample built") # ---------- 2. train BPE ---------- tok_dir = 'tokenizer_bpe' if not os.path.exists(f'{tok_dir}/vocab.json'): os.makedirs(tok_dir, exist_ok=True) tk = ByteLevelBPETokenizer() tk.train(files=[sample_path], vocab_size=VOCAB_SIZE, min_frequency=2, special_tokens=[EOT, '']) tk.save_model(tok_dir) print(f"BPE trained -> {tok_dir} (vocab {VOCAB_SIZE})") # ---------- 3. tokenize the full corpus to a uint16 .bin ---------- from tokenizers import ByteLevelBPETokenizer as T tk = T(f'{tok_dir}/vocab.json', f'{tok_dir}/merges.txt') eot_id = tk.token_to_id(EOT) print('eot id', eot_id, 'vocab', tk.get_vocab_size()) out_bin = f'{DATA}/train.bin' if not os.path.exists(out_bin): ids_all = [] # stories with open(SLICE) as f: text = f.read() # TinyStories already uses <|endoftext|> between stories parts = text.split(EOT) print(f"encoding {len(parts):,} stories...") buf = [] BIN = np.memmap(out_bin, dtype=np.uint16, mode='w+', shape=(400_000_000,)) pos = 0 for i, p in enumerate(parts): p = p.strip() if not p: continue ids = tk.encode(p).ids + [eot_id] if pos + len(ids) >= BIN.shape[0]: break BIN[pos:pos+len(ids)] = ids pos += len(ids) if i % 50000 == 0: print(f" {i:,} stories, {pos:,} tokens") # append Alpaca Q&A as User/Bot turns alp = json.load(open(ALPACA)) for ex in alp: if ex['input'].strip(): continue s = f"User: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}" ids = tk.encode(s).ids + [eot_id] if pos + len(ids) >= BIN.shape[0]: break BIN[pos:pos+len(ids)] = ids pos += len(ids) BIN.flush() # trim to actual size final = np.memmap(out_bin, dtype=np.uint16, mode='r', shape=(pos,)) np.array(final).tofile(out_bin) print(f"DONE: {pos:,} tokens written to {out_bin}") json.dump({'tokens': int(pos), 'vocab_size': tk.get_vocab_size(), 'eot': eot_id}, open(f'{DATA}/meta.json', 'w'))