# RecursiveComplete / prep_bpe.py
# (Hugging Face file-viewer residue, not part of the script: uploaded by
# Gentraxyz with huggingface_hub, commit 3c38b94, verified, 3.16 kB.)
"""Build the conversational corpus + train a BPE tokenizer, then tokenize to a
binary token stream for fast training.
"""
import os, json, numpy as np
from tokenizers import ByteLevelBPETokenizer
# Directory holding the input corpora and the generated sample, token-stream
# and metadata files.
DATA = 'data'
SLICE = f'{DATA}/ts_slice.txt' # TinyStories slice (coherent prose+dialogue)
# Alpaca instruction data: a JSON list of records read through their
# 'instruction', 'input' and 'output' keys.
ALPACA = f'{DATA}/alpaca.json'
# Vocabulary size requested from the BPE trainer; ids below 65536 fit the
# uint16 token stream written in step 3.
VOCAB_SIZE = 8192
# Document separator: registered as a special token when training and
# appended after every encoded document.
EOT = '<|endoftext|>'
# ---------- 1. assemble a small mixed text for TOKENIZER TRAINING ----------
# (tokenizer only needs a representative sample, not the whole corpus)
#
# The sample is a prefix of the story slice followed by Alpaca Q&A pairs
# rendered in the "User: ... / Bot: ..." chat format. It is built only when
# the sample file does not exist yet.
sample_path = f'{DATA}/tok_sample.txt'
if not os.path.exists(sample_path):
    # Load the Q&A data first so a missing or corrupt Alpaca file fails
    # before any output is created.
    with open(ALPACA, encoding='utf-8') as f:
        alp = json.load(f)

    # Write to a temporary file and rename it once complete: an interrupted
    # run must not leave a partial sample that later runs treat as finished.
    tmp_sample_path = f'{sample_path}.tmp'
    with open(tmp_sample_path, 'w', encoding='utf-8') as out:
        # First 80M characters of stories (about 80 MB for mostly-ASCII text).
        with open(SLICE, encoding='utf-8') as f:
            out.write(f.read(80_000_000))
        # + Alpaca Q&A so the tokenizer knows the chat format tokens
        for ex in alp[:20000]:
            # Skip examples that carry an extra 'input' context; only plain
            # instruction -> output pairs are used. A missing key counts as
            # "no input".
            if (ex.get('input') or '').strip():
                continue
            out.write(f"\nUser: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}\n")
    os.replace(tmp_sample_path, sample_path)
    print("tokenizer sample built")
# ---------- 2. train BPE ----------
# Train a byte-level BPE tokenizer on the sample file, unless a saved
# vocabulary is already present in the tokenizer directory.
tok_dir = 'tokenizer_bpe'
vocab_file = f'{tok_dir}/vocab.json'
if not os.path.exists(vocab_file):
    os.makedirs(tok_dir, exist_ok=True)
    # The end-of-text separator and a padding token are reserved up front.
    reserved_tokens = [EOT, '<pad>']
    tk = ByteLevelBPETokenizer()
    tk.train(
        files=[sample_path],
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=reserved_tokens,
    )
    # Persist the trained model into tok_dir for the tokenization step.
    tk.save_model(tok_dir)
    print(f"BPE trained -> {tok_dir} (vocab {VOCAB_SIZE})")
# ---------- 3. tokenize the full corpus to a uint16 .bin ----------
# Reload the saved tokenizer, encode every story and every input-free Alpaca
# example (each followed by the end-of-text id) into one flat uint16 stream,
# and record the token count, vocabulary size and end-of-text id in meta.json.
tk = ByteLevelBPETokenizer(f'{tok_dir}/vocab.json', f'{tok_dir}/merges.txt')
eot_id = tk.token_to_id(EOT)
if eot_id is None:
    # Without this check the failure only shows up later as an obscure error
    # when None is assigned into the uint16 buffer.
    raise ValueError(f"tokenizer in {tok_dir} has no {EOT!r} token")
if tk.get_vocab_size() > np.iinfo(np.uint16).max + 1:
    raise ValueError("vocabulary too large to store token ids as uint16")
print('eot id', eot_id, 'vocab', tk.get_vocab_size())

out_bin = f'{DATA}/train.bin'
if not os.path.exists(out_bin):
    # Upper bound on the stream length; the file is trimmed to the number of
    # tokens actually written once encoding finishes.
    max_tokens = 400_000_000
    # Encode into a temporary file and rename it at the end, so an
    # interrupted run never leaves an untrimmed train.bin behind that later
    # runs would accept as complete.
    tmp_bin = f'{out_bin}.tmp'

    # stories
    with open(SLICE, encoding='utf-8') as f:
        # TinyStories already uses <|endoftext|> between stories
        parts = f.read().split(EOT)
    print(f"encoding {len(parts):,} stories...")

    token_buf = np.memmap(tmp_bin, dtype=np.uint16, mode='w+', shape=(max_tokens,))
    pos = 0
    for i, p in enumerate(parts):
        p = p.strip()
        if not p:
            continue
        ids = tk.encode(p).ids + [eot_id]
        # Strict '>' so a document that exactly fills the buffer is kept.
        if pos + len(ids) > max_tokens:
            print(f"  token budget of {max_tokens:,} reached; remaining stories skipped")
            break
        token_buf[pos:pos + len(ids)] = ids
        pos += len(ids)
        if i % 50000 == 0:
            print(f" {i:,} stories, {pos:,} tokens")

    # append Alpaca Q&A as User/Bot turns
    with open(ALPACA, encoding='utf-8') as f:
        alp = json.load(f)
    for ex in alp:
        # Only plain instruction -> output pairs are used; a missing 'input'
        # key counts as "no input".
        if (ex.get('input') or '').strip():
            continue
        s = f"User: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}"
        ids = tk.encode(s).ids + [eot_id]
        if pos + len(ids) > max_tokens:
            print(f"  token budget of {max_tokens:,} reached; remaining Q&A skipped")
            break
        token_buf[pos:pos + len(ids)] = ids
        pos += len(ids)

    token_buf.flush()
    # Drop the mapping before resizing, then trim the file in place instead
    # of copying the whole stream through memory.
    del token_buf
    os.truncate(tmp_bin, pos * np.dtype(np.uint16).itemsize)
    os.replace(tmp_bin, out_bin)
    print(f"DONE: {pos:,} tokens written to {out_bin}")

    with open(f'{DATA}/meta.json', 'w', encoding='utf-8') as f:
        json.dump({'tokens': int(pos), 'vocab_size': tk.get_vocab_size(), 'eot': eot_id}, f)