AlexWortega
/

moe100m-physics-tinybpe

Mixture of Experts

Model card Files Files and versions

moe100m-physics-tinybpe / tokenizer_build.py

AlexWortega's picture

Upload tokenizer_build.py with huggingface_hub

59b2c69 verified 4 days ago

history blame contribute delete

3.81 kB

	"""Train a tiny ByteLevel-BPE on serialized physics scenes.

	Streams ~N scenes (interleaved across the 24 train shards so all scenario
	types are represented), serializes each with the reduced (sim-only)
	serialization, trains a ByteLevel BPE to vocab~=512, saves tokenizer.json.

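	Reports final vocab size plus median / p90 / max tokens per scene (measured on
	at most the first 4000 collected scenes) and writes them to TOKENIZER_STATS.json.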
	"""
	from __future__ import annotations
	import argparse, json, statistics, sys, time

	from datasets import load_dataset, interleave_datasets
	from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

	import physics_serialize as psz

	# Dataset identifier passed as the first argument to load_dataset() in scene_stream().
	REPO = "AlexWortega/physics-scenarios-packed"
	# Scenario names. scene_stream() opens one streaming shard "train/<name>.tar.gz"
	# per entry; there are 24 entries, matching the "24 train shards" in the module docstring.
	TRAIN_TYPES = [
	"avalanche","basketball","billiards","breakout","bridge","chain","conveyor",
	"dominos","explosion","funnel","head_on","jenga","marble_run","orbit",
	"pendulum","pinball","plinko","projectile","pyramid","seesaw","ski_jump",
	"tower","wind","wrecking_ball",
	]
	# Special tokens handed to the BPE trainer in main(); "<bos>" and "<eos>" are also
	# wired into the post-processor template there. "<pad>" is only registered with the
	# trainer in the visible code.
	SPECIALS = ["<pad>", "<bos>", "<eos>"]


	def scene_stream(n_scenes: int, seed: int = 0):
	    """Yield up to ``n_scenes`` serialized scenes drawn from all train shards.

	    One streaming dataset is opened per entry of ``TRAIN_TYPES`` (shard path
	    ``train/<type>.tar.gz`` inside ``REPO``) and the datasets are combined with
	    ``interleave_datasets`` so every scenario type contributes to the stream.

	    Each record's ``"jsonl"`` payload is decoded and split into lines: the first
	    line is parsed as the scene header, and every following line that starts
	    with ``"{"`` is parsed as a frame. Records that are empty, cannot be
	    decoded/parsed, or contain no frames are skipped silently (best effort).

	    Args:
	        n_scenes: Maximum number of scenes to yield. Values <= 0 yield nothing.
	        seed: Forwarded to ``interleave_datasets``.

	    Yields:
	        Whatever ``psz.serialize_scene(header, frames)`` returns for each valid
	        scene (main() treats these as training texts).
	    """
	    # The limit used to be checked only *after* a yield, so n_scenes <= 0 still
	    # produced one scene (and opened all shards). Bail out before doing any work.
	    if n_scenes <= 0:
	        return

	    per_type = [
	        load_dataset(
	            REPO,
	            data_files={"train": [f"train/{t}.tar.gz"]},
	            split="train",
	            streaming=True,
	        )
	        for t in TRAIN_TYPES
	    ]
	    mix = interleave_datasets(per_type, seed=seed, stopping_strategy="all_exhausted")

	    emitted = 0
	    for record in mix:
	        try:
	            # Decoding lives inside the try so that a badly encoded record is
	            # skipped like malformed JSON instead of aborting the whole stream.
	            lines = record["jsonl"].decode().splitlines()
	            if not lines:
	                continue
	            header = json.loads(lines[0])
	            frames = [json.loads(x) for x in lines[1:] if x.startswith("{")]
	        except (ValueError, RecursionError):
	            # json.JSONDecodeError and UnicodeDecodeError are ValueError
	            # subclasses; RecursionError covers pathologically nested JSON.
	            continue
	        if not frames:
	            continue

	        yield psz.serialize_scene(header, frames)
	        emitted += 1
	        if emitted >= n_scenes:
	            return


	def main():
	    """CLI entry point: collect scenes, train a ByteLevel BPE, save it, report stats.

	    Command-line options:
	        --n-scenes  number of scenes to stream for the training corpus (default 30000)
	        --vocab     target vocabulary size passed to the BPE trainer (default 512)
	        --out       path the trained tokenizer is saved to (default "tokenizer.json")
	        --seed      seed forwarded to scene_stream() (default 0)

	    Side effects:
	        * writes the tokenizer to ``--out``;
	        * writes ``TOKENIZER_STATS.json`` in the current working directory;
	        * prints progress and a final summary line to stdout.

	    Raises:
	        SystemExit: if the stream produced no scenes, so there is nothing to
	            train on (previously this surfaced later as a StatisticsError,
	            after an empty tokenizer had already been saved).
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--n-scenes", type=int, default=30000)
	    ap.add_argument("--vocab", type=int, default=512)
	    ap.add_argument("--out", default="tokenizer.json")
	    ap.add_argument("--seed", type=int, default=0)
	    args = ap.parse_args()

	    t0 = time.time()
	    print(f"[tok] streaming {args.n_scenes} scenes for training corpus...", flush=True)
	    # The whole corpus is held in memory: it is reused below for the stats sample.
	    texts = []
	    for i, s in enumerate(scene_stream(args.n_scenes, args.seed)):
	        texts.append(s)
	        if (i + 1) % 5000 == 0:
	            print(f"[tok] collected {i+1} scenes ({time.time()-t0:.0f}s)", flush=True)
	    print(f"[tok] collected {len(texts)} scenes in {time.time()-t0:.0f}s", flush=True)

	    if not texts:
	        # Fail early with a clear message instead of saving a tokenizer trained
	        # on nothing and then crashing inside the statistics calls.
	        raise SystemExit("[tok] no scenes collected; nothing to train on")

	    tok = Tokenizer(models.BPE(unk_token=None))
	    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
	    tok.decoder = decoders.ByteLevel()
	    trainer = trainers.BpeTrainer(
	        vocab_size=args.vocab,
	        special_tokens=SPECIALS,
	        # Seed the vocabulary with the full byte-level alphabet.
	        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
	        show_progress=True,
	    )
	    tok.train_from_iterator(texts, trainer=trainer)

	    # Special-token ids are only known after training, so the <bos>/<eos>
	    # template is attached afterwards and before saving.
	    bos_id = tok.token_to_id("<bos>")
	    eos_id = tok.token_to_id("<eos>")
	    tok.post_processor = processors.TemplateProcessing(
	        single="<bos> $A <eos>",
	        special_tokens=[("<bos>", bos_id), ("<eos>", eos_id)],
	    )
	    tok.save(args.out)

	    # stats, measured on at most the first 4000 collected scenes
	    # NOTE(review): encoding happens after the post-processor is attached, so the
	    # counts presumably include the <bos>/<eos> pair -- confirm if exact lengths matter.
	    tok_counts = [len(tok.encode(s).ids) for s in texts[:4000]]
	    vsz = tok.get_vocab_size()
	    med = statistics.median(tok_counts)
	    if len(tok_counts) >= 2:
	        p90 = statistics.quantiles(tok_counts, n=10)[8]
	    else:
	        # statistics.quantiles() needs two or more points on Python < 3.13;
	        # with a single sample every percentile is that sample.
	        p90 = tok_counts[0]
	    longest = max(tok_counts)
	    print(f"[tok] DONE vocab_size={vsz} median_tokens/scene={med:.0f} "
	          f"p90={p90:.0f} max={longest} (sample n={len(tok_counts)})",
	          flush=True)
	    with open("TOKENIZER_STATS.json", "w", encoding="utf-8") as f:
	        json.dump({"vocab_size": vsz, "median_tokens_per_scene": med,
	                   "p90_tokens_per_scene": p90, "max_tokens": longest,
	                   "n_train_scenes": len(texts)}, f, indent=2)


	# Run the CLI only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	main()