"""Train a byte-level BPE tokenizer and write uint16 token-id shards.

This is a teaching/reference script for reproducing the data shape expected by
the Slayer GPT-style training code. Input is a directory of UTF-8 .txt files.
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import random |
| from pathlib import Path |
|
|
| import numpy as np |
| from tokenizers import Tokenizer |
| from tokenizers.decoders import ByteLevel as ByteLevelDecoder |
| from tokenizers.models import BPE |
| from tokenizers.pre_tokenizers import ByteLevel |
| from tokenizers.trainers import BpeTrainer |
|
|
|
|
# Document-boundary token. It is registered as a special token when the
# tokenizer is trained, and its id is written in front of every document when
# a shard is built.
SPECIAL_TOKEN = "<|endoftext|>"
|
|
|
|
def iter_text_files(raw_dir: Path) -> list[Path]:
    """Return every ``.txt`` file found recursively under *raw_dir*, sorted.

    Only regular files are returned. A directory whose name ends in ``.txt``
    also matches the glob pattern, and it would make the caller fail later
    when it tries to read the entry as text.

    Args:
        raw_dir: Root directory to search.

    Returns:
        The matching file paths in sorted order (never empty).

    Raises:
        SystemExit: If no matching file exists under *raw_dir*.
    """
    files = sorted(path for path in raw_dir.rglob("*.txt") if path.is_file())
    if not files:
        raise SystemExit(f"No .txt files found under {raw_dir}")
    return files
|
|
|
|
def train_tokenizer(files: list[Path], out_path: Path, vocab_size: int) -> Tokenizer:
    """Train a byte-level BPE tokenizer on *files* and save it to *out_path*.

    Args:
        files: Text files that make up the training corpus.
        out_path: Where the trained tokenizer is saved; missing parent
            directories are created.
        vocab_size: Target vocabulary size handed to the BPE trainer.

    Returns:
        The trained tokenizer.

    Raises:
        SystemExit: If *files* is empty.
    """
    if not files:
        raise SystemExit("Cannot train a tokenizer without any training files")

    tokenizer = Tokenizer(BPE(unk_token=None))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    tokenizer.decoder = ByteLevelDecoder()

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[SPECIAL_TOKEN],
        # Seed the vocabulary with all 256 byte-level symbols. The model has
        # no unk token, so a byte that never occurs in the training files
        # could not be represented when other text (for example the held-out
        # validation files) is encoded later.
        initial_alphabet=ByteLevel.alphabet(),
        show_progress=True,
    )
    tokenizer.train([str(path) for path in files], trainer)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(out_path))
    return tokenizer
|
|
|
|
def load_tokenizer(path: Path) -> Tokenizer:
    """Read a serialized tokenizer back from the file at *path*."""
    tokenizer_file = str(path)
    return Tokenizer.from_file(tokenizer_file)
|
|
|
|
def write_shard(tokenizer: Tokenizer, files: list[Path], out_path: Path) -> int:
    """Encode *files* into one flat uint16 token stream written to *out_path*.

    Every non-empty document contributes the ``SPECIAL_TOKEN`` id followed by
    the ids of its whitespace-stripped text. Files that are empty after
    stripping are skipped. The ids are dumped with ``ndarray.tofile``, so the
    output holds the raw values only.

    Args:
        tokenizer: Tokenizer used to encode the text; must know
            ``SPECIAL_TOKEN``.
        files: Text files to encode, in the order they should appear.
        out_path: Destination shard; missing parent directories are created.

    Returns:
        The number of token ids written.

    Raises:
        SystemExit: If the tokenizer lacks ``SPECIAL_TOKEN`` or produces an id
            that does not fit in uint16.
    """
    eot = tokenizer.token_to_id(SPECIAL_TOKEN)
    if eot is None:
        raise SystemExit(f"Tokenizer is missing {SPECIAL_TOKEN!r}")

    ids: list[int] = []
    for path in files:
        # Undecodable bytes are replaced rather than aborting the whole run.
        text = path.read_text(encoding="utf-8", errors="replace").strip()
        if not text:
            continue
        ids.append(eot)
        ids.extend(tokenizer.encode(text).ids)

    # A tokenizer loaded from disk may be larger than uint16 can address.
    # Depending on the NumPy version the cast below would either wrap such ids
    # silently or fail with a bare OverflowError, so check explicitly first.
    uint16_max = int(np.iinfo(np.uint16).max)
    if ids and max(ids) > uint16_max:
        raise SystemExit(
            f"Token id {max(ids)} does not fit in uint16 (max {uint16_max}); "
            "use a tokenizer with at most 65536 entries"
        )

    arr = np.asarray(ids, dtype=np.uint16)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    arr.tofile(out_path)
    return int(arr.size)
|
|
|
|
def main() -> None:
    """Command-line entry point: split the corpus and write train/val shards.

    Steps:
        1. Collect the ``.txt`` files under ``--raw-dir`` and shuffle them
           with a seeded RNG so the split is reproducible.
        2. Hold out ``--val-frac`` of the files (at least one) for validation.
        3. Train a tokenizer on the training files, or load the existing
           ``tokenizer.json`` from ``--out-dir``.
        4. Write one train shard and one val shard under ``--out-dir/shards``.

    Raises:
        SystemExit: On invalid arguments, too few input files, or a tokenizer
            whose vocabulary does not fit in uint16.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw-dir", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--vocab-size", type=int, default=32768)
    parser.add_argument("--val-frac", type=float, default=0.01)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--train-tokenizer", action="store_true")
    args = parser.parse_args()

    if args.vocab_size > 65536:
        raise SystemExit("This shard writer uses uint16; vocab size must be <= 65536")
    if not 0.0 <= args.val_frac < 1.0:
        raise SystemExit("--val-frac must be in the range [0, 1)")

    files = iter_text_files(args.raw_dir)
    if len(files) < 2:
        # One file is always held out for validation, so a single input file
        # would leave nothing to train on.
        raise SystemExit("Need at least two .txt files to build train and val shards")

    random.Random(args.seed).shuffle(files)
    # Hold out at least one file, but never all of them.
    val_count = min(max(1, int(len(files) * args.val_frac)), len(files) - 1)
    val_files = sorted(files[:val_count])
    train_files = sorted(files[val_count:])

    tokenizer_path = args.out_dir / "tokenizer.json"
    if args.train_tokenizer or not tokenizer_path.exists():
        # Only the training split is used, so validation text stays unseen.
        tokenizer = train_tokenizer(train_files, tokenizer_path, args.vocab_size)
    else:
        tokenizer = load_tokenizer(tokenizer_path)

    # --vocab-size says nothing about a tokenizer loaded from disk, so check
    # the tokenizer that will actually be used.
    if tokenizer.get_vocab_size() > 65536:
        raise SystemExit(
            f"{tokenizer_path} has more than 65536 tokens; ids would not fit in uint16"
        )

    shard_dir = args.out_dir / "shards"
    train_tokens = write_shard(
        tokenizer,
        train_files,
        shard_dir / "polish_train_000000.bin",
    )
    val_tokens = write_shard(
        tokenizer,
        val_files,
        shard_dir / "polish_val_000000.bin",
    )

    print(f"tokenizer={tokenizer_path}")
    print(f"train_files={len(train_files)} train_tokens={train_tokens}")
    print(f"val_files={len(val_files)} val_tokens={val_tokens}")
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|