"""Train a tiny ByteLevel-BPE on serialized physics scenes. Streams ~N scenes (interleaved across the 24 train shards so all scenario types are represented), serializes each with the reduced (sim-only) serialization, trains a ByteLevel BPE to vocab~=512, saves tokenizer.json. Reports final vocab size + median tokens/scene. """ from __future__ import annotations import argparse, json, statistics, sys, time from datasets import load_dataset, interleave_datasets from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors import physics_serialize as psz REPO = "AlexWortega/physics-scenarios-packed" TRAIN_TYPES = [ "avalanche","basketball","billiards","breakout","bridge","chain","conveyor", "dominos","explosion","funnel","head_on","jenga","marble_run","orbit", "pendulum","pinball","plinko","projectile","pyramid","seesaw","ski_jump", "tower","wind","wrecking_ball", ] SPECIALS = ["", "", ""] def scene_stream(n_scenes: int, seed: int = 0): shards = [f"train/{t}.tar.gz" for t in TRAIN_TYPES] dss = [load_dataset(REPO, data_files={"train": [s]}, split="train", streaming=True) for s in shards] mix = interleave_datasets(dss, seed=seed, stopping_strategy="all_exhausted") n = 0 for r in mix: lines = r["jsonl"].decode().splitlines() if not lines: continue try: header = json.loads(lines[0]) frames = [json.loads(x) for x in lines[1:] if x.startswith("{")] except Exception: continue if not frames: continue yield psz.serialize_scene(header, frames) n += 1 if n >= n_scenes: return def main(): ap = argparse.ArgumentParser() ap.add_argument("--n-scenes", type=int, default=30000) ap.add_argument("--vocab", type=int, default=512) ap.add_argument("--out", default="tokenizer.json") ap.add_argument("--seed", type=int, default=0) args = ap.parse_args() t0 = time.time() print(f"[tok] streaming {args.n_scenes} scenes for training corpus...", flush=True) texts = [] tok_counts = [] for i, s in enumerate(scene_stream(args.n_scenes, args.seed)): texts.append(s) if (i + 1) % 5000 == 0: print(f"[tok] collected {i+1} scenes ({time.time()-t0:.0f}s)", flush=True) print(f"[tok] collected {len(texts)} scenes in {time.time()-t0:.0f}s", flush=True) tok = Tokenizer(models.BPE(unk_token=None)) tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) tok.decoder = decoders.ByteLevel() trainer = trainers.BpeTrainer( vocab_size=args.vocab, special_tokens=SPECIALS, initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), show_progress=True, ) tok.train_from_iterator(texts, trainer=trainer) bos_id = tok.token_to_id("") eos_id = tok.token_to_id("") tok.post_processor = processors.TemplateProcessing( single=" $A ", special_tokens=[("", bos_id), ("", eos_id)], ) tok.save(args.out) # stats for s in texts[:4000]: tok_counts.append(len(tok.encode(s).ids)) vsz = tok.get_vocab_size() med = statistics.median(tok_counts) p90 = statistics.quantiles(tok_counts, n=10)[8] print(f"[tok] DONE vocab_size={vsz} median_tokens/scene={med:.0f} " f"p90={p90:.0f} max={max(tok_counts)} (sample n={len(tok_counts)})", flush=True) with open("TOKENIZER_STATS.json", "w") as f: json.dump({"vocab_size": vsz, "median_tokens_per_scene": med, "p90_tokens_per_scene": p90, "max_tokens": max(tok_counts), "n_train_scenes": len(texts)}, f, indent=2) if __name__ == "__main__": main()