slayer-gpt-tokenizer-model / examples /prepare_corpus.py
kacperwikiel's picture
Upload Slayer GPT tokenizer model archive
78c54ec verified
Raw
History Blame Contribute Delete
3.65 kB
#!/usr/bin/env python3
"""Train a byte-level BPE tokenizer and write uint16 token-id shards.
This is a teaching/reference script for reproducing the data shape expected by
the Slayer GPT-style training code. Input is a directory of UTF-8 .txt files.
"""
from __future__ import annotations
import argparse
import random
from pathlib import Path
import numpy as np
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
# Document-boundary marker: registered as a special token when the tokenizer
# is trained, and its id is written in front of every document in a shard.
SPECIAL_TOKEN = "<|endoftext|>"
def iter_text_files(raw_dir: Path) -> list[Path]:
    """Return every regular ``.txt`` file under ``raw_dir``, sorted by path.

    The search is recursive. Directories whose name happens to end in
    ``.txt`` are skipped so that callers can safely read every returned path.

    Args:
        raw_dir: Root directory to search.

    Returns:
        A sorted list of file paths (never empty).

    Raises:
        SystemExit: If no ``.txt`` file exists under ``raw_dir``.
    """
    # rglob matches on name only, so filter out non-file entries explicitly.
    files = sorted(path for path in raw_dir.rglob("*.txt") if path.is_file())
    if not files:
        raise SystemExit(f"No .txt files found under {raw_dir}")
    return files
def train_tokenizer(files: list[Path], out_path: Path, vocab_size: int) -> Tokenizer:
    """Train a byte-level BPE tokenizer on ``files`` and save it to ``out_path``.

    Args:
        files: Text files used as the training corpus.
        out_path: Destination of the serialized tokenizer; missing parent
            directories are created.
        vocab_size: Target vocabulary size passed to the BPE trainer.

    Returns:
        The trained tokenizer (also persisted at ``out_path``).
    """
    tokenizer = Tokenizer(BPE(unk_token=None))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    tokenizer.decoder = ByteLevelDecoder()
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[SPECIAL_TOKEN],
        # Seed the vocabulary with the full byte-level alphabet. The model has
        # no unk token, so a byte symbol that never occurs in the training
        # files would otherwise have no id when it shows up in other text
        # (for example in the held-out validation files).
        initial_alphabet=ByteLevel.alphabet(),
        show_progress=True,
    )
    tokenizer.train([str(path) for path in files], trainer)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(out_path))
    return tokenizer
def load_tokenizer(path: Path) -> Tokenizer:
    """Load a previously saved tokenizer from the file at ``path``."""
    tokenizer_file = str(path)
    return Tokenizer.from_file(tokenizer_file)
def write_shard(tokenizer: Tokenizer, files: list[Path], out_path: Path) -> int:
    """Encode ``files`` and write their token ids to ``out_path`` as raw uint16.

    Each file is read as UTF-8 (undecodable bytes are replaced) and stripped
    of surrounding whitespace. Files that are empty after stripping are
    skipped. Every remaining document is written as the ``SPECIAL_TOKEN`` id
    followed by the document's token ids. Documents are written one at a time
    so the whole corpus never has to be held in memory as a Python list.

    Args:
        tokenizer: Tokenizer used to encode the text; must know
            ``SPECIAL_TOKEN``.
        files: Text files to encode, in the order they should appear.
        out_path: Destination shard; missing parent directories are created.

    Returns:
        The total number of token ids written.

    Raises:
        SystemExit: If the tokenizer has no id for ``SPECIAL_TOKEN`` or
            produces an id that does not fit in uint16. In the latter case a
            partially written shard may be left at ``out_path``.
    """
    eot = tokenizer.token_to_id(SPECIAL_TOKEN)
    if eot is None:
        raise SystemExit(f"Tokenizer is missing {SPECIAL_TOKEN!r}")

    max_id = int(np.iinfo(np.uint16).max)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    total = 0
    with out_path.open("wb") as shard:
        for path in files:
            text = path.read_text(encoding="utf-8", errors="replace").strip()
            if not text:
                continue
            # Build in a wide dtype first so an oversized id is detected
            # instead of being wrapped (or raising an opaque OverflowError)
            # during the uint16 conversion.
            doc = np.asarray([eot, *tokenizer.encode(text).ids], dtype=np.int64)
            largest = int(doc.max())
            if largest > max_id:
                raise SystemExit(
                    f"Token id {largest} from {path} does not fit in uint16; "
                    "vocab size must be <= 65536"
                )
            doc.astype(np.uint16).tofile(shard)
            total += int(doc.size)
    return total
def main() -> None:
    """Command-line entry point: split the corpus, get a tokenizer, write shards.

    Steps:
      1. Collect the ``.txt`` files under ``--raw-dir`` and shuffle them with
         ``--seed``.
      2. Hold out a validation split of ``--val-frac`` of the files (at least
         one file, and always leaving at least one file for training).
      3. Train a tokenizer on the training split when ``--train-tokenizer``
         is given or ``<out-dir>/tokenizer.json`` does not exist; otherwise
         load the existing one.
      4. Write one uint16 shard each for the training and validation split
         under ``<out-dir>/shards`` and print a short summary.

    Raises:
        SystemExit: On invalid arguments, too few input files, or a
            tokenizer whose vocabulary does not fit in uint16.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw-dir", type=Path, required=True)
    parser.add_argument("--out-dir", type=Path, required=True)
    parser.add_argument("--vocab-size", type=int, default=32768)
    parser.add_argument("--val-frac", type=float, default=0.01)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--train-tokenizer", action="store_true")
    args = parser.parse_args()

    if args.vocab_size > 65536:
        raise SystemExit("This shard writer uses uint16; vocab size must be <= 65536")
    # A fraction of 1 or more would leave nothing to train on; this check
    # also rejects NaN.
    if not 0.0 <= args.val_frac < 1.0:
        raise SystemExit("--val-frac must be in the range [0, 1)")

    files = iter_text_files(args.raw_dir)
    if len(files) < 2:
        raise SystemExit(
            "Need at least two .txt files to build both train and validation shards"
        )
    random.Random(args.seed).shuffle(files)
    # At least one validation file, but never the whole corpus.
    val_count = min(max(1, int(len(files) * args.val_frac)), len(files) - 1)
    val_files = sorted(files[:val_count])
    train_files = sorted(files[val_count:])

    tokenizer_path = args.out_dir / "tokenizer.json"
    if args.train_tokenizer or not tokenizer_path.exists():
        tokenizer = train_tokenizer(train_files, tokenizer_path, args.vocab_size)
    else:
        tokenizer = load_tokenizer(tokenizer_path)
        # --vocab-size says nothing about a tokenizer loaded from disk, so
        # check the loaded vocabulary against the uint16 limit as well.
        if tokenizer.get_vocab_size() > 65536:
            raise SystemExit(
                "This shard writer uses uint16; vocab size must be <= 65536"
            )

    train_tokens = write_shard(
        tokenizer,
        train_files,
        args.out_dir / "shards" / "polish_train_000000.bin",
    )
    val_tokens = write_shard(
        tokenizer,
        val_files,
        args.out_dir / "shards" / "polish_val_000000.bin",
    )
    print(f"tokenizer={tokenizer_path}")
    print(f"train_files={len(train_files)} train_tokens={train_tokens}")
    print(f"val_files={len(val_files)} val_tokens={val_tokens}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()