Instructions to use AlexWortega/moe100m-physics-tinybpe with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use AlexWortega/moe100m-physics-tinybpe with Transformers:
```python
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("AlexWortega/moe100m-physics-tinybpe", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """Train a tiny ByteLevel-BPE on serialized physics scenes. | |
| Streams ~N scenes (interleaved across the 24 train shards so all scenario | |
| types are represented), serializes each with the reduced (sim-only) | |
| serialization, trains a ByteLevel BPE to vocab~=512, saves tokenizer.json. | |
| Reports final vocab size + median tokens/scene. | |
| """ | |
| from __future__ import annotations | |
| import argparse, json, statistics, sys, time | |
| from datasets import load_dataset, interleave_datasets | |
| from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors | |
| import physics_serialize as psz | |
# Dataset repository id handed to `load_dataset` when streaming scenes.
REPO = "AlexWortega/physics-scenarios-packed"

# Scenario types used for tokenizer training; each name maps to one shard
# at `train/<name>.tar.gz` inside REPO.
TRAIN_TYPES = [
    "avalanche",
    "basketball",
    "billiards",
    "breakout",
    "bridge",
    "chain",
    "conveyor",
    "dominos",
    "explosion",
    "funnel",
    "head_on",
    "jenga",
    "marble_run",
    "orbit",
    "pendulum",
    "pinball",
    "plinko",
    "projectile",
    "pyramid",
    "seesaw",
    "ski_jump",
    "tower",
    "wind",
    "wrecking_ball",
]

# Special tokens passed to the BPE trainer; <bos>/<eos> are later looked up
# by name to build the post-processing template.
SPECIALS = ["<pad>", "<bos>", "<eos>"]
def scene_stream(n_scenes: int, seed: int = 0):
    """Yield up to ``n_scenes`` serialized scenes, interleaved across shards.

    One streaming dataset is opened per entry of ``TRAIN_TYPES`` and the
    streams are mixed with ``interleave_datasets`` so that every scenario
    type contributes to the corpus.

    Each record is expected to carry a ``"jsonl"`` field of bytes whose first
    line is a JSON header and whose remaining lines starting with ``{`` are
    JSON frames. Records that are empty, contain invalid JSON, or have no
    frames are skipped and do not count towards ``n_scenes``.

    Args:
        n_scenes: Maximum number of scenes to yield. Values <= 0 yield
            nothing and do not touch the dataset at all.
        seed: Seed forwarded to ``interleave_datasets``.

    Yields:
        The result of ``psz.serialize_scene(header, frames)`` for each usable
        record (presumably a string, since the caller feeds it to the
        tokenizer -- confirm against ``physics_serialize``).
    """
    # The limit check below only runs after a yield, so without this guard a
    # non-positive request would still open every stream and emit one scene.
    if n_scenes <= 0:
        return

    streams = [
        load_dataset(
            REPO,
            data_files={"train": [f"train/{scenario}.tar.gz"]},
            split="train",
            streaming=True,
        )
        for scenario in TRAIN_TYPES
    ]
    mixed = interleave_datasets(
        streams, seed=seed, stopping_strategy="all_exhausted"
    )

    emitted = 0
    for record in mixed:
        lines = record["jsonl"].decode().splitlines()
        if not lines:
            continue
        try:
            header = json.loads(lines[0])
            # Lines that do not look like a JSON object are ignored rather
            # than treated as corrupt frames.
            frames = [
                json.loads(line) for line in lines[1:] if line.startswith("{")
            ]
        except json.JSONDecodeError:
            # Best-effort corpus building: drop the malformed record.
            continue
        if not frames:
            continue
        yield psz.serialize_scene(header, frames)
        emitted += 1
        if emitted >= n_scenes:
            return
def main():
    """Train a ByteLevel BPE tokenizer on streamed scenes and report stats.

    Command-line options:
        --n-scenes  number of scenes to collect for the corpus (default 30000)
        --vocab     target vocabulary size (default 512)
        --out       path of the tokenizer JSON to write (default tokenizer.json)
        --seed      seed forwarded to ``scene_stream`` (default 0)

    Side effects:
        Writes the trained tokenizer to ``--out`` and a summary to
        ``TOKENIZER_STATS.json`` in the current directory, and prints
        progress lines prefixed with ``[tok]``.

    Raises:
        SystemExit: if no scene could be collected, since there is nothing
            to train on or to compute statistics from.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--n-scenes", type=int, default=30000)
    parser.add_argument("--vocab", type=int, default=512)
    parser.add_argument("--out", default="tokenizer.json")
    parser.add_argument("--seed", type=int, default=0)
    args = parser.parse_args()

    t0 = time.time()
    print(f"[tok] streaming {args.n_scenes} scenes for training corpus...", flush=True)
    texts = []
    for i, s in enumerate(scene_stream(args.n_scenes, args.seed)):
        texts.append(s)
        if (i + 1) % 5000 == 0:
            print(f"[tok] collected {i+1} scenes ({time.time()-t0:.0f}s)", flush=True)
    print(f"[tok] collected {len(texts)} scenes in {time.time()-t0:.0f}s", flush=True)

    # Fail early with a clear message: statistics.median() below would
    # otherwise raise StatisticsError after a pointless training run.
    if not texts:
        raise SystemExit("[tok] no scenes collected; nothing to train on")

    tok = Tokenizer(models.BPE(unk_token=None))
    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab,
        special_tokens=SPECIALS,
        # Seed the vocabulary with the full byte-level alphabet.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        show_progress=True,
    )
    tok.train_from_iterator(texts, trainer=trainer)

    # Wrap every encoded sequence in <bos> ... <eos>; the ids are only known
    # after training, hence the lookup here.
    bos_id = tok.token_to_id("<bos>")
    eos_id = tok.token_to_id("<eos>")
    tok.post_processor = processors.TemplateProcessing(
        single="<bos> $A <eos>",
        special_tokens=[("<bos>", bos_id), ("<eos>", eos_id)],
    )
    tok.save(args.out)

    # Stats are computed on at most the first 4000 collected scenes.
    tok_counts = [len(tok.encode(s).ids) for s in texts[:4000]]
    vsz = tok.get_vocab_size()
    med = statistics.median(tok_counts)
    # statistics.quantiles() needs at least two data points; with a single
    # sample the 90th percentile is that sample.
    if len(tok_counts) >= 2:
        p90 = statistics.quantiles(tok_counts, n=10)[8]
    else:
        p90 = tok_counts[0]
    longest = max(tok_counts)
    print(f"[tok] DONE vocab_size={vsz} median_tokens/scene={med:.0f} "
          f"p90={p90:.0f} max={longest} (sample n={len(tok_counts)})",
          flush=True)
    with open("TOKENIZER_STATS.json", "w", encoding="utf-8") as f:
        json.dump({"vocab_size": vsz, "median_tokens_per_scene": med,
                   "p90_tokens_per_scene": p90, "max_tokens": longest,
                   "n_train_scenes": len(texts)}, f, indent=2)


# Run the training pipeline only when executed as a script.
if __name__ == "__main__":
    main()