Gentraxyz
/

RecursiveComplete

Text Generation

Model card Files Files and versions

RecursiveComplete / prep_bpe.py

Gentraxyz's picture

Upload folder using huggingface_hub

3c38b94 verified 12 days ago

History Blame Contribute Delete

3.16 kB

	"""Build the conversational corpus + train a BPE tokenizer, then tokenize to a
	binary token stream for fast training.
	"""
	import os, json, numpy as np
	from tokenizers import ByteLevelBPETokenizer

	# Input locations and tokenizer settings shared by every step below.
	DATA = 'data'
	SLICE = f'{DATA}/ts_slice.txt'  # TinyStories slice (coherent prose+dialogue)
	ALPACA = f'{DATA}/alpaca.json'  # Alpaca examples, loaded as a JSON list below
	VOCAB_SIZE = 8192  # vocab_size handed to the BPE trainer
	# Story separator / end-of-text special token. Spelled without backslashes:
	# '\|' is not a Python escape, so an escaped spelling keeps the backslashes in
	# the string and the token no longer matches '<|endoftext|>' in the corpus.
	EOT = '<|endoftext|>'

	# ---------- 1. assemble a small mixed text for TOKENIZER TRAINING ----------
	# (tokenizer only needs a representative sample, not the whole corpus)
	sample_path = f'{DATA}/tok_sample.txt'
	if not os.path.exists(sample_path):
	    # Build into a temporary file and rename it at the end, so an interrupted
	    # run never leaves a truncated sample that the existence check would accept.
	    sample_tmp = f'{sample_path}.tmp'
	    with open(sample_tmp, 'w', encoding='utf-8') as out:
	        # up to 80M characters of stories (read() counts characters in text mode)
	        with open(SLICE, encoding='utf-8') as f:
	            out.write(f.read(80_000_000))
	        # + Alpaca Q&A so the tokenizer knows the chat format tokens
	        with open(ALPACA, encoding='utf-8') as f:
	            alp = json.load(f)
	        for ex in alp[:20000]:
	            # only examples whose 'input' field is blank are used
	            if ex['input'].strip():
	                continue
	            out.write(f"\nUser: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}\n")
	    os.replace(sample_tmp, sample_path)
	    print("tokenizer sample built")

	# ---------- 2. train BPE ----------
	tok_dir = 'tokenizer_bpe'
	# The tokenization step loads both vocab.json and merges.txt, so training is
	# skipped only when both files are already present.
	tok_files = [f'{tok_dir}/vocab.json', f'{tok_dir}/merges.txt']
	if not all(os.path.exists(path) for path in tok_files):
	    os.makedirs(tok_dir, exist_ok=True)
	    tk = ByteLevelBPETokenizer()
	    tk.train(
	        files=[sample_path],
	        vocab_size=VOCAB_SIZE,
	        min_frequency=2,
	        special_tokens=[EOT, '<pad>'],
	    )
	    tk.save_model(tok_dir)
	    print(f"BPE trained -> {tok_dir} (vocab {VOCAB_SIZE})")

	# ---------- 3. tokenize the full corpus to a uint16 .bin ----------
	# Load the tokenizer from the saved files, so this step works whether or not
	# training ran in this process. ByteLevelBPETokenizer is already imported at
	# the top of the file; no second aliased import is needed.
	tk = ByteLevelBPETokenizer(f'{tok_dir}/vocab.json', f'{tok_dir}/merges.txt')
	eot_id = tk.token_to_id(EOT)
	if eot_id is None:
	    # Fail here with a clear message rather than later, when the missing id
	    # would be appended to every encoded sequence.
	    raise ValueError(f"{EOT!r} is not in {tok_dir}/vocab.json; retrain the tokenizer")
	print('eot id', eot_id, 'vocab', tk.get_vocab_size())

	out_bin = f'{DATA}/train.bin'
	if not os.path.exists(out_bin):
	    # Upper bound on the token count: the file is pre-allocated at this size
	    # and cut down to the real length once encoding is finished.
	    max_tokens = 400_000_000
	    # Encode into a temporary file and rename it at the very end, so an
	    # interrupted run cannot leave a half-written train.bin that the
	    # existence check above would treat as complete.
	    tmp_bin = f'{out_bin}.tmp'
	    bin_map = np.memmap(tmp_bin, dtype=np.uint16, mode='w+', shape=(max_tokens,))
	    pos = 0  # next free slot in bin_map == number of tokens written so far

	    # stories
	    with open(SLICE, encoding='utf-8') as f:
	        text = f.read()
	    # TinyStories already uses <|endoftext|> between stories
	    parts = text.split(EOT)
	    del text  # only the split pieces are needed from here on
	    print(f"encoding {len(parts):,} stories...")
	    for i, p in enumerate(parts):
	        p = p.strip()
	        if not p:
	            continue
	        ids = tk.encode(p).ids + [eot_id]
	        # stop once the next story would not fit (an exact fit is still fine)
	        if pos + len(ids) > max_tokens:
	            break
	        bin_map[pos:pos + len(ids)] = ids
	        pos += len(ids)
	        if i % 50000 == 0:
	            print(f" {i:,} stories, {pos:,} tokens")

	    # append Alpaca Q&A as User/Bot turns
	    with open(ALPACA, encoding='utf-8') as f:
	        alp = json.load(f)
	    for ex in alp:
	        # only examples whose 'input' field is blank are used
	        if ex['input'].strip():
	            continue
	        s = f"User: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}"
	        ids = tk.encode(s).ids + [eot_id]
	        if pos + len(ids) > max_tokens:
	            break
	        bin_map[pos:pos + len(ids)] = ids
	        pos += len(ids)

	    bin_map.flush()
	    # trim to actual size: release the mapping, then shorten the file in place
	    # instead of copying every token through memory and rewriting the file
	    # while it is still mapped
	    del bin_map
	    with open(tmp_bin, 'r+b') as fh:
	        fh.truncate(pos * np.dtype(np.uint16).itemsize)
	    os.replace(tmp_bin, out_bin)
	    print(f"DONE: {pos:,} tokens written to {out_bin}")

	    # record the stream length and tokenizer facts next to the token file
	    meta = {'tokens': int(pos), 'vocab_size': tk.get_vocab_size(), 'eot': eot_id}
	    with open(f'{DATA}/meta.json', 'w', encoding='utf-8') as fh:
	        json.dump(meta, fh)