#!/usr/bin/env python3
"""Train a byte-level BPE tokenizer and write uint16 token-id shards.

This is a teaching/reference script for reproducing the data shape expected by
the Slayer GPT-style training code. Input is a directory of UTF-8 .txt files.
"""

from __future__ import annotations

import argparse
import random
from pathlib import Path

import numpy as np
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

SPECIAL_TOKEN = "<|endoftext|>"

# Shards store token ids as uint16, so every id must fit in 0..65535.
MAX_TOKEN_ID = int(np.iinfo(np.uint16).max)


def iter_text_files(raw_dir: Path) -> list[Path]:
    """Return every ``.txt`` file under *raw_dir* (recursively), sorted by path.

    Raises:
        SystemExit: if no ``.txt`` file is found.
    """
    # rglob matches directories too; a directory named "foo.txt" would later
    # make read_text() fail, so keep regular files only.
    files = sorted(path for path in raw_dir.rglob("*.txt") if path.is_file())
    if not files:
        raise SystemExit(f"No .txt files found under {raw_dir}")
    return files


def train_tokenizer(files: list[Path], out_path: Path, vocab_size: int) -> Tokenizer:
    """Train a byte-level BPE tokenizer on *files* and save it to *out_path*.

    Args:
        files: Text files used as the training corpus.
        out_path: Destination of the serialized tokenizer JSON; parent
            directories are created as needed.
        vocab_size: Target vocabulary size passed to the BPE trainer.

    Returns:
        The trained tokenizer.
    """
    tokenizer = Tokenizer(BPE(unk_token=None))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    tokenizer.decoder = ByteLevelDecoder()
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[SPECIAL_TOKEN],
        # Seed the vocabulary with all 256 byte-level symbols. The model has
        # no unk token, so a byte symbol that never occurs in the training
        # files (but does occur in held-out files) would otherwise be dropped
        # silently at encode time.
        initial_alphabet=ByteLevel.alphabet(),
        show_progress=True,
    )
    tokenizer.train([str(path) for path in files], trainer)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(out_path))
    return tokenizer


def load_tokenizer(path: Path) -> Tokenizer:
    """Load a previously saved tokenizer from the JSON file at *path*."""
    return Tokenizer.from_file(str(path))


def write_shard(tokenizer: Tokenizer, files: list[Path], out_path: Path) -> int:
    """Encode *files* and write their token ids to *out_path* as raw uint16.

    Each non-empty document (after stripping surrounding whitespace) is written
    as the end-of-text id followed by the document's token ids. Files that are
    empty after stripping are skipped. Undecodable bytes are replaced rather
    than raising.

    Args:
        tokenizer: Tokenizer that must contain ``SPECIAL_TOKEN``.
        files: Documents to encode, written in the given order.
        out_path: Destination shard; parent directories are created as needed.

    Returns:
        The number of token ids written.

    Raises:
        SystemExit: if the tokenizer lacks ``SPECIAL_TOKEN`` or produces an id
            that does not fit in uint16.
    """
    eot = tokenizer.token_to_id(SPECIAL_TOKEN)
    if eot is None:
        raise SystemExit(f"Tokenizer is missing {SPECIAL_TOKEN!r}")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    total = 0
    # Stream one document at a time: holding the whole corpus as a Python list
    # of ints costs tens of bytes per token, versus two bytes on disk.
    with out_path.open("wb") as sink:
        for path in files:
            text = path.read_text(encoding="utf-8", errors="replace").strip()
            if not text:
                continue
            # Build in a wide dtype first so an oversized id is detected
            # explicitly instead of being mangled by the uint16 cast.
            doc_ids = np.asarray([eot, *tokenizer.encode(text).ids], dtype=np.int64)
            largest = int(doc_ids.max())
            if largest > MAX_TOKEN_ID:
                raise SystemExit(
                    f"Token id {largest} from {path} does not fit in uint16"
                )
            sink.write(doc_ids.astype(np.uint16).tobytes())
            total += int(doc_ids.size)
    return total


def main() -> None:
    """Parse CLI arguments, obtain a tokenizer, and write train/val shards."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--raw-dir", type=Path, required=True,
        help="directory searched recursively for .txt files",
    )
    parser.add_argument(
        "--out-dir", type=Path, required=True,
        help="directory receiving tokenizer.json and shards/",
    )
    parser.add_argument(
        "--vocab-size", type=int, default=32768,
        help="target vocabulary size when training the tokenizer",
    )
    parser.add_argument(
        "--val-frac", type=float, default=0.01,
        help="fraction of files held out for validation (at least one file)",
    )
    parser.add_argument(
        "--seed", type=int, default=42,
        help="seed for the train/val file shuffle",
    )
    parser.add_argument(
        "--train-tokenizer", action="store_true",
        help="retrain even if tokenizer.json already exists",
    )
    args = parser.parse_args()

    if args.vocab_size > 65536:
        raise SystemExit("This shard writer uses uint16; vocab size must be <= 65536")
    # Written as a negated chained comparison so NaN is rejected as well.
    if not 0.0 <= args.val_frac < 1.0:
        raise SystemExit("--val-frac must be in the range [0, 1)")

    files = iter_text_files(args.raw_dir)
    # Shuffle the sorted list with a fixed seed so the split is reproducible,
    # then sort each side again so shard contents have a stable order.
    random.Random(args.seed).shuffle(files)
    val_count = max(1, int(len(files) * args.val_frac))
    val_files = sorted(files[:val_count])
    train_files = sorted(files[val_count:])
    if not train_files:
        raise SystemExit(
            f"No files left for training after holding out {len(val_files)} "
            f"validation file(s); add more input files under {args.raw_dir}"
        )

    tokenizer_path = args.out_dir / "tokenizer.json"
    if args.train_tokenizer or not tokenizer_path.exists():
        # Train on the training split only, keeping validation text unseen.
        tokenizer = train_tokenizer(train_files, tokenizer_path, args.vocab_size)
    else:
        tokenizer = load_tokenizer(tokenizer_path)

    # --vocab-size says nothing about a tokenizer loaded from disk, so check
    # the actual vocabulary before writing any shard.
    actual_vocab = tokenizer.get_vocab_size()
    if actual_vocab > MAX_TOKEN_ID + 1:
        raise SystemExit(
            f"Tokenizer {tokenizer_path} has {actual_vocab} tokens; "
            "this shard writer uses uint16 (at most 65536)"
        )

    train_tokens = write_shard(
        tokenizer,
        train_files,
        args.out_dir / "shards" / "polish_train_000000.bin",
    )
    val_tokens = write_shard(
        tokenizer,
        val_files,
        args.out_dir / "shards" / "polish_val_000000.bin",
    )

    print(f"tokenizer={tokenizer_path}")
    print(f"train_files={len(train_files)} train_tokens={train_tokens}")
    print(f"val_files={len(val_files)} val_tokens={val_tokens}")


if __name__ == "__main__":
    main()