Instructions to use AlexWortega/moe100m-physics-tinybpe with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use AlexWortega/moe100m-physics-tinybpe with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("AlexWortega/moe100m-physics-tinybpe", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Upload tokenizer_build.py with huggingface_hub
Browse files- tokenizer_build.py +103 -0
tokenizer_build.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Train a tiny ByteLevel-BPE on serialized physics scenes.
|
| 2 |
+
|
| 3 |
+
Streams ~N scenes (interleaved across the 24 train shards so all scenario
|
| 4 |
+
types are represented), serializes each with the reduced (sim-only)
|
| 5 |
+
serialization, trains a ByteLevel BPE to vocab~=512, saves tokenizer.json.
|
| 6 |
+
|
| 7 |
+
Reports final vocab size + median tokens/scene.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
import argparse, json, statistics, sys, time
|
| 11 |
+
|
| 12 |
+
from datasets import load_dataset, interleave_datasets
|
| 13 |
+
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors
|
| 14 |
+
|
| 15 |
+
import physics_serialize as psz
|
| 16 |
+
|
| 17 |
+
# Hugging Face dataset repository the training shards are streamed from.
REPO = "AlexWortega/physics-scenarios-packed"

# Scenario names; scene_stream() maps each one to a "train/<name>.tar.gz" shard.
TRAIN_TYPES = [
    "avalanche",
    "basketball",
    "billiards",
    "breakout",
    "bridge",
    "chain",
    "conveyor",
    "dominos",
    "explosion",
    "funnel",
    "head_on",
    "jenga",
    "marble_run",
    "orbit",
    "pendulum",
    "pinball",
    "plinko",
    "projectile",
    "pyramid",
    "seesaw",
    "ski_jump",
    "tower",
    "wind",
    "wrecking_ball",
]

# Special tokens handed to the BPE trainer; <bos>/<eos> are also used by the
# post-processing template in main().
SPECIALS = ["<pad>", "<bos>", "<eos>"]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def scene_stream(n_scenes: int, seed: int = 0):
    """Yield up to ``n_scenes`` serialized scenes mixed across all train shards.

    One streaming dataset is opened per entry of ``TRAIN_TYPES`` and the
    streams are combined with ``interleave_datasets``. For every record the
    ``"jsonl"`` payload is decoded and split into lines: the first line is
    parsed as the scene header, and each following line that starts with
    ``{`` is parsed as a frame. The value returned by
    ``psz.serialize_scene(header, frames)`` is yielded.

    Records that are empty, cannot be decoded/parsed, or contain no frames
    are skipped and do not count toward ``n_scenes``.

    Args:
        n_scenes: Maximum number of scenes to yield. Values <= 0 yield nothing.
        seed: Seed forwarded to ``interleave_datasets``.

    Yields:
        Whatever ``psz.serialize_scene`` returns for each usable record.
    """
    # Check the budget up front: the limit below is only tested after a
    # yield, so without this guard a non-positive budget would still open
    # every shard and emit one scene.
    if n_scenes <= 0:
        return

    shards = [f"train/{t}.tar.gz" for t in TRAIN_TYPES]
    dss = [
        load_dataset(REPO, data_files={"train": [s]}, split="train", streaming=True)
        for s in shards
    ]
    mix = interleave_datasets(dss, seed=seed, stopping_strategy="all_exhausted")

    n = 0
    for r in mix:
        try:
            lines = r["jsonl"].decode().splitlines()
            if not lines:
                continue
            header = json.loads(lines[0])
            frames = [json.loads(x) for x in lines[1:] if x.startswith("{")]
        except ValueError:
            # Covers both json.JSONDecodeError and UnicodeDecodeError: a
            # malformed record is skipped, while unrelated errors propagate
            # instead of being silently swallowed.
            continue
        if not frames:
            continue

        yield psz.serialize_scene(header, frames)
        n += 1
        if n >= n_scenes:
            return
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main():
    """Train the ByteLevel BPE tokenizer from the command line and report stats.

    Steps:
      1. Collect up to ``--n-scenes`` serialized scenes from ``scene_stream``.
      2. Train a ByteLevel BPE with ``--vocab`` entries and the ``SPECIALS``
         tokens, seeded with the full byte-level alphabet.
      3. Attach a ``<bos> $A <eos>`` post-processing template and save the
         tokenizer to ``--out``.
      4. Encode the first 4000 collected scenes, print vocab size and
         median / p90 / max tokens per scene, and write the same numbers to
         ``TOKENIZER_STATS.json`` in the current working directory.

    Raises:
        SystemExit: If no scene could be collected, so there is nothing to
            train on or to compute statistics from.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--n-scenes", type=int, default=30000)
    ap.add_argument("--vocab", type=int, default=512)
    ap.add_argument("--out", default="tokenizer.json")
    ap.add_argument("--seed", type=int, default=0)
    args = ap.parse_args()

    t0 = time.time()
    print(f"[tok] streaming {args.n_scenes} scenes for training corpus...", flush=True)
    texts = []
    for i, s in enumerate(scene_stream(args.n_scenes, args.seed)):
        texts.append(s)
        if (i + 1) % 5000 == 0:
            print(f"[tok] collected {i+1} scenes ({time.time()-t0:.0f}s)", flush=True)
    print(f"[tok] collected {len(texts)} scenes in {time.time()-t0:.0f}s", flush=True)

    # Fail before training: with an empty corpus the statistics below
    # (median / max) would raise only after a tokenizer file was already
    # written.
    if not texts:
        raise SystemExit("[tok] no scenes collected; nothing to train on")

    tok = Tokenizer(models.BPE(unk_token=None))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab,
        special_tokens=SPECIALS,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        show_progress=True,
    )
    tok.train_from_iterator(texts, trainer=trainer)

    # The template needs the ids the trainer assigned to the special tokens.
    bos_id = tok.token_to_id("<bos>")
    eos_id = tok.token_to_id("<eos>")
    tok.post_processor = processors.TemplateProcessing(
        single="<bos> $A <eos>",
        special_tokens=[("<bos>", bos_id), ("<eos>", eos_id)],
    )
    tok.save(args.out)

    # stats -- computed on a 4000-scene sample. Encoding happens after the
    # post-processor is attached, so the counts presumably include the
    # <bos>/<eos> tokens it adds.
    tok_counts = [len(tok.encode(s).ids) for s in texts[:4000]]
    vsz = tok.get_vocab_size()
    med = statistics.median(tok_counts)
    max_tokens = max(tok_counts)
    if len(tok_counts) > 1:
        p90 = statistics.quantiles(tok_counts, n=10)[8]
    else:
        # statistics.quantiles rejects a single data point on Python < 3.13;
        # with one sample that sample is the only sensible p90.
        p90 = tok_counts[0]
    print(f"[tok] DONE vocab_size={vsz} median_tokens/scene={med:.0f} "
          f"p90={p90:.0f} max={max_tokens} (sample n={len(tok_counts)})",
          flush=True)
    with open("TOKENIZER_STATS.json", "w", encoding="utf-8") as f:
        json.dump({"vocab_size": vsz, "median_tokens_per_scene": med,
                   "p90_tokens_per_scene": p90, "max_tokens": max_tokens,
                   "n_train_scenes": len(texts)}, f, indent=2)


if __name__ == "__main__":
    main()
|