AlexWortega committed on
Commit
59b2c69
·
verified ·
1 Parent(s): 3ab4691

Upload tokenizer_build.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer_build.py +103 -0
tokenizer_build.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Train a tiny ByteLevel-BPE on serialized physics scenes.
2
+
3
+ Streams ~N scenes (interleaved across the 24 train shards so all scenario
4
+ types are represented), serializes each with the reduced (sim-only)
5
+ serialization, trains a ByteLevel BPE to vocab~=512, saves tokenizer.json.
6
+
7
+ Reports final vocab size + median tokens/scene.
8
+ """
9
+ from __future__ import annotations
10
+ import argparse, json, statistics, sys, time
11
+
12
+ from datasets import load_dataset, interleave_datasets
13
+ from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
14
+
15
+ import physics_serialize as psz
16
+
17
# Hugging Face Hub dataset repository the training scenes are streamed from.
REPO = "AlexWortega/physics-scenarios-packed"

# Scenario types; each one names a `train/<type>.tar.gz` shard in REPO.
TRAIN_TYPES = [
    "avalanche",
    "basketball",
    "billiards",
    "breakout",
    "bridge",
    "chain",
    "conveyor",
    "dominos",
    "explosion",
    "funnel",
    "head_on",
    "jenga",
    "marble_run",
    "orbit",
    "pendulum",
    "pinball",
    "plinko",
    "projectile",
    "pyramid",
    "seesaw",
    "ski_jump",
    "tower",
    "wind",
    "wrecking_ball",
]

# Special tokens handed to the BPE trainer; <bos>/<eos> also wrap every
# encoded sequence through the post-processor template.
SPECIALS = ["<pad>", "<bos>", "<eos>"]
25
+
26
+
27
def scene_stream(n_scenes: int, seed: int = 0):
    """Yield up to ``n_scenes`` serialized scenes drawn from all train shards.

    One streaming dataset is opened per entry of ``TRAIN_TYPES`` and the
    streams are interleaved, so every scenario type contributes to the
    corpus. Each record's ``"jsonl"`` payload is decoded and split into
    lines: the first line is parsed as the scene header and every following
    line that starts with ``{`` is parsed as a frame.

    Records that cannot be decoded or parsed, and records without any
    frames, are skipped silently (best effort).

    Args:
        n_scenes: Maximum number of scenes to yield. Values <= 0 yield
            nothing.
        seed: Seed forwarded to ``interleave_datasets``.

    Yields:
        The value returned by ``psz.serialize_scene(header, frames)`` for
        each usable record.
    """
    # Without this guard the loop below would emit one scene before it ever
    # compares the counter against ``n_scenes``.
    if n_scenes <= 0:
        return

    shard_paths = [f"train/{t}.tar.gz" for t in TRAIN_TYPES]
    streams = [
        load_dataset(REPO, data_files={"train": [path]}, split="train", streaming=True)
        for path in shard_paths
    ]
    mix = interleave_datasets(streams, seed=seed, stopping_strategy="all_exhausted")

    n = 0
    for r in mix:
        try:
            lines = r["jsonl"].decode().splitlines()
            if not lines:
                continue
            header = json.loads(lines[0])
            frames = [json.loads(x) for x in lines[1:] if x.startswith("{")]
        except ValueError:
            # Covers UnicodeDecodeError and json.JSONDecodeError: one corrupt
            # record should not abort the whole corpus build.
            continue
        if not frames:
            continue
        yield psz.serialize_scene(header, frames)
        n += 1
        if n >= n_scenes:
            return
48
+
49
+
50
def main():
    """Train the ByteLevel BPE tokenizer and report token-count statistics.

    Command-line flags:
        --n-scenes  number of scenes to stream for the training corpus
        --vocab     target vocabulary size passed to the BPE trainer
        --out       path the trained tokenizer JSON is written to
        --seed      seed forwarded to ``scene_stream``

    Side effects: writes the tokenizer to ``--out`` and a summary to
    ``TOKENIZER_STATS.json`` in the current working directory, and prints
    progress lines to stdout.

    Raises:
        SystemExit: if no scene could be collected, since there is nothing
            to train on and no statistics could be computed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n-scenes", type=int, default=30000)
    ap.add_argument("--vocab", type=int, default=512)
    ap.add_argument("--out", default="tokenizer.json")
    ap.add_argument("--seed", type=int, default=0)
    args = ap.parse_args()

    t0 = time.time()
    print(f"[tok] streaming {args.n_scenes} scenes for training corpus...", flush=True)
    texts = []
    for i, s in enumerate(scene_stream(args.n_scenes, args.seed)):
        texts.append(s)
        if (i + 1) % 5000 == 0:
            print(f"[tok] collected {i+1} scenes ({time.time()-t0:.0f}s)", flush=True)
    print(f"[tok] collected {len(texts)} scenes in {time.time()-t0:.0f}s", flush=True)

    # Fail before training: an empty corpus would otherwise save a useless
    # tokenizer and then crash in the statistics section below.
    if not texts:
        raise SystemExit("[tok] no scenes collected; nothing to train on")

    tok = Tokenizer(models.BPE(unk_token=None))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab,
        special_tokens=SPECIALS,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        show_progress=True,
    )
    tok.train_from_iterator(texts, trainer=trainer)

    # Wrap every encoded sequence as "<bos> ... <eos>".
    bos_id = tok.token_to_id("<bos>")
    eos_id = tok.token_to_id("<eos>")
    tok.post_processor = processors.TemplateProcessing(
        single="<bos> $A <eos>",
        special_tokens=[("<bos>", bos_id), ("<eos>", eos_id)],
    )
    tok.save(args.out)

    # Statistics are computed on at most the first 4000 collected scenes.
    tok_counts = [len(tok.encode(s).ids) for s in texts[:4000]]
    vsz = tok.get_vocab_size()
    med = statistics.median(tok_counts)
    if len(tok_counts) >= 2:
        # n=10 gives nine cut points; index 8 is the 90th percentile.
        p90 = statistics.quantiles(tok_counts, n=10)[8]
    else:
        # statistics.quantiles rejects a single data point on older Pythons.
        p90 = tok_counts[0]
    max_tokens = max(tok_counts)
    print(f"[tok] DONE vocab_size={vsz} median_tokens/scene={med:.0f} "
          f"p90={p90:.0f} max={max_tokens} (sample n={len(tok_counts)})",
          flush=True)
    with open("TOKENIZER_STATS.json", "w", encoding="utf-8") as f:
        json.dump({"vocab_size": vsz, "median_tokens_per_scene": med,
                   "p90_tokens_per_scene": p90, "max_tokens": max_tokens,
                   "n_train_scenes": len(texts)}, f, indent=2)
100
+
101
+
102
# Run the tokenizer build only when executed as a script, not on import.
if __name__ == "__main__":
    main()