File size: 5,022 Bytes

f0169be

"""
Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).

We train on a balanced sample drawn from the same dense mix used for
pretraining, so the tokenizer's merges reflect the actual data distribution
(web text + textbooks + math + code). A from-scratch tokenizer matters at this
scale: every wasted token in the vocab is embedding-table budget burned.

Usage:
    python tokenizer.py --train      # train and save ivme_tokenizer.json
    python tokenizer.py --test       # quick round-trip check on saved tokenizer
"""

from __future__ import annotations

import argparse
import os

# The tokenizer backend is Rust; disable its internal parallelism (unless the
# environment already sets a value) to avoid fork/threading crashes while it
# consumes Python data.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Target vocabulary size and the file the trained tokenizer is saved to /
# loaded from.
VOCAB_SIZE = 2 ** 14  # 16,384
TOKENIZER_PATH = "ivme_tokenizer.json"

# Per-document character cap. Only the first MAX_CHARS characters of each
# document are used: prefixes are enough to learn subword merges, and the cap
# keeps the trainer's in-memory word counts bounded (no segfaults).
MAX_CHARS = 8_000

# Special tokens, kept deliberately small: pad, bos, eos and unk, followed by
# a small bank of chat-control tokens reused later for instruct fine-tuning.
SPECIAL_TOKENS = [
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|user|>",
    "<|assistant|>",
    "<|system|>",
]

# Number of documents sampled from each source for tokenizer training. A 16K
# vocab needs very little data; these quotas add up to 150K documents, which
# is plenty and keeps RAM bounded.
SAMPLE_PER_SOURCE = dict(
    fineweb_hq=60_000,
    cosmopedia=30_000,
    finemath=20_000,
    python=20_000,
    wikipedia=20_000,
)


def text_iterator():
    """Yield raw text strings sampled from each source in the dense mix.

    The sources are streamed one after another, in a fixed order. From each
    one, up to ``SAMPLE_PER_SOURCE[key]`` rows with a non-empty text field are
    taken, and every document is cut to its first ``MAX_CHARS`` characters.

    Yields:
        str: one (possibly truncated) document at a time.
    """
    from itertools import islice

    from datasets import load_dataset

    def take(ds, n, field="text"):
        """Yield at most ``n`` truncated, non-empty ``field`` values from ``ds``.

        Rows whose field is missing or empty are skipped and do not count
        toward ``n``. A quota of zero or less yields nothing.
        """
        if n <= 0:
            return
        non_empty = filter(None, (row.get(field) for row in ds))
        # islice stops right after the n-th document, so the stream is never
        # advanced further than needed.
        for txt in islice(non_empty, n):
            yield txt[:MAX_CHARS]

    # (log label, dataset repo, config name or None, quota key, text field)
    sources = [
        ("FineWeb-HQ", "epfml/FineWeb-HQ", None, "fineweb_hq", "text"),
        ("Cosmopedia", "HuggingFaceTB/cosmopedia", "stanford", "cosmopedia", "text"),
        ("FineMath", "HuggingFaceTB/finemath", "finemath-4plus", "finemath", "text"),
        ("Python stack", "bigcode/python-stack-v1-functions-filtered", None, "python", "content"),
        ("Wikipedia", "wikimedia/wikipedia", "20231101.en", "wikipedia", "text"),
    ]

    for label, repo, config, quota_key, field in sources:
        print(f"[tok] streaming {label} ...")
        ds = load_dataset(repo, config, split="train", streaming=True)
        yield from take(ds, SAMPLE_PER_SOURCE[quota_key], field=field)


def train():
    """Train the byte-level BPE tokenizer and save it to ``TOKENIZER_PATH``.

    The whole sample from ``text_iterator()`` is collected into a list first,
    then a BPE model with ``VOCAB_SIZE`` entries (including
    ``SPECIAL_TOKENS``) is trained on it and written to disk.

    Raises:
        RuntimeError: if ``text_iterator()`` produced no documents, so that an
            existing tokenizer file is not overwritten by one trained on
            nothing.
    """
    from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
    # ByteLevel pre-tokenizer: no out-of-vocab characters ever, GPT-2 style.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocab with every byte-level symbol. Without this, a byte
        # that never occurs in the training sample is missing from the vocab
        # and the "no out-of-vocab" property above does not hold.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # Collect ALL text into memory first, in the main thread. This is the fix
    # for the segfault: train_from_iterator consumes its input from Rust threads,
    # so doing network I/O / dataset loading lazily mid-iteration crashes. By
    # fully materializing first, every download happens here, safely.
    print("[tok] collecting corpus into memory (this is where downloads happen)...")
    texts = list(text_iterator())
    print(f"[tok] collected {len(texts):,} documents")
    if not texts:
        raise RuntimeError("no training documents collected; refusing to train an empty tokenizer")

    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
    tokenizer.save(TOKENIZER_PATH)
    print(f"[tok] saved -> {TOKENIZER_PATH}  (vocab {tokenizer.get_vocab_size():,})")


def test():
    """Round-trip a few sample strings through the saved tokenizer.

    Loads the tokenizer from ``TOKENIZER_PATH`` and, for each sample, prints
    the text, its token count with the chars-per-token ratio, the decoded
    string, and whether decoding reproduced the input exactly.
    """
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(TOKENIZER_PATH)
    samples = [
        "İvme is a stupidly small language model.",
        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
        "The derivative of x^2 is 2x.",
    ]
    for s in samples:
        ids = tok.encode(s).ids
        back = tok.decode(ids)
        print(f"\n  text   : {s}")
        # max(1, ...) guards the ratio against an empty encoding.
        print(f"  tokens : {len(ids)}  ({len(s)/max(1,len(ids)):.2f} chars/token)")
        print(f"  decoded: {back}")
        # Make the round-trip result explicit instead of leaving the
        # comparison to the reader's eye.
        print(f"  match  : {back == s}")


if __name__ == "__main__":
    cli = argparse.ArgumentParser()
    for flag in ("--train", "--test"):
        cli.add_argument(flag, action="store_true")
    opts = cli.parse_args()

    # --train takes precedence when both flags are given; with neither flag
    # we only print a hint.
    action = train if opts.train else test if opts.test else None
    if action is None:
        print("pass --train or --test")
    else:
        action()