File size: 5,022 Bytes
"""
Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).
We train on a balanced sample drawn from the same dense mix used for
pretraining, so the tokenizer's merges reflect the actual data distribution
(web text + textbooks + math + code). A from-scratch tokenizer matters at this
scale: every wasted token in the vocab is embedding-table budget burned.
Usage:
python tokenizer.py --train # train and save ivme_tokenizer.json
python tokenizer.py --test # quick round-trip check on saved tokenizer
"""
from __future__ import annotations
import argparse
import os
# Avoid fork/threading crashes when the Rust tokenizer consumes Python data.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
# Target vocabulary size handed to the BPE trainer (16,384 entries).
VOCAB_SIZE = 2 ** 14

# File the trained tokenizer is written to and later read back from.
TOKENIZER_PATH = "ivme_tokenizer.json"

# Per-document character cap. Merges are learned well enough from document
# prefixes, and the cap keeps the trainer's in-memory word counts bounded
# (the author introduced it to avoid segfaults).
MAX_CHARS = 8_000

# Special tokens, kept deliberately small: pad / bos / eos / unk, followed by
# the chat-control tokens intended for reuse during instruct fine-tuning.
SPECIAL_TOKENS = [
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|user|>",
    "<|assistant|>",
    "<|system|>",
]

# Number of documents sampled from each source for tokenizer training. The
# quotas below add up to 150K documents, which is plenty for a 16K vocab and
# keeps RAM usage bounded.
SAMPLE_PER_SOURCE = dict(
    fineweb_hq=60_000,
    cosmopedia=30_000,
    finemath=20_000,
    python=20_000,
    wikipedia=20_000,
)
def text_iterator():
    """Yield truncated raw documents from every source in the training mix.

    Sources are streamed one after another in a fixed order (FineWeb-HQ,
    Cosmopedia, FineMath, Python functions, Wikipedia). For each source a
    progress line is printed, the dataset is opened in streaming mode, and
    documents are yielded until that source's quota from
    ``SAMPLE_PER_SOURCE`` has been reached. Every yielded string is cut to
    at most ``MAX_CHARS`` characters.
    """
    from datasets import load_dataset

    def take(stream, limit, field="text"):
        # Rows whose `field` is missing or empty are skipped and do not count
        # toward the quota. The quota is checked after each yielded document.
        emitted = 0
        for record in stream:
            text = record.get(field)
            if not text:
                continue
            yield text[:MAX_CHARS]
            emitted += 1
            if emitted >= limit:
                break

    # (progress message, load_dataset positional args, quota key, text field)
    plan = (
        (
            "[tok] streaming FineWeb-HQ ...",
            ("epfml/FineWeb-HQ",),
            "fineweb_hq",
            "text",
        ),
        (
            "[tok] streaming Cosmopedia ...",
            ("HuggingFaceTB/cosmopedia", "stanford"),
            "cosmopedia",
            "text",
        ),
        (
            "[tok] streaming FineMath ...",
            ("HuggingFaceTB/finemath", "finemath-4plus"),
            "finemath",
            "text",
        ),
        (
            "[tok] streaming Python stack ...",
            ("bigcode/python-stack-v1-functions-filtered",),
            "python",
            "content",
        ),
        (
            "[tok] streaming Wikipedia ...",
            ("wikimedia/wikipedia", "20231101.en"),
            "wikipedia",
            "text",
        ),
    )

    for banner, dataset_args, quota_key, field in plan:
        print(banner)
        stream = load_dataset(*dataset_args, split="train", streaming=True)
        yield from take(stream, SAMPLE_PER_SOURCE[quota_key], field)
def train():
    """Train the byte-level BPE tokenizer and save it to ``TOKENIZER_PATH``.

    Builds a BPE model with a ByteLevel pre-tokenizer/decoder pair, collects
    the whole sampled corpus from ``text_iterator()`` into a list, trains to
    ``VOCAB_SIZE`` entries (special tokens included), and writes the result
    as JSON. Progress is reported on stdout with a ``[tok]`` prefix.
    """
    from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
    # ByteLevel pre-tokenizer, GPT-2 style: text is mapped to byte symbols
    # before BPE merges are applied.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocab with all 256 byte-level symbols. Without this only
        # the symbols that happen to occur in the training sample get a
        # token, and any unseen byte would fall back to <|unk|> at encode
        # time -- breaking the "no out-of-vocab characters" guarantee that
        # byte-level BPE is chosen for.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # Materialize the full corpus in this thread before training starts, so
    # every download / dataset read has finished by the time the trainer
    # runs. The original author adopted this after lazily streaming data
    # into train_from_iterator caused segfaults.
    print("[tok] collecting corpus into memory (this is where downloads happen)...")
    texts = list(text_iterator())
    print(f"[tok] collected {len(texts):,} documents")

    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))

    tokenizer.save(TOKENIZER_PATH)
    print(f"[tok] saved -> {TOKENIZER_PATH} (vocab {tokenizer.get_vocab_size():,})")
def test():
    """Load the saved tokenizer and print an encode/decode report.

    For a few fixed sample strings, prints the original text, the number of
    token ids it encodes to together with the characters-per-token ratio,
    and the text obtained by decoding those ids again.
    """
    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

    sample_texts = (
        "İvme is a stupidly small language model.",
        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
        "The derivative of x^2 is 2x.",
    )

    for original in sample_texts:
        token_ids = tokenizer.encode(original).ids
        restored = tokenizer.decode(token_ids)
        n_tokens = len(token_ids)
        # Guard the division so an empty encoding cannot divide by zero.
        chars_per_token = len(original) / max(1, n_tokens)

        print(f"\n text : {original}")
        print(f" tokens : {n_tokens} ({chars_per_token:.2f} chars/token)")
        print(f" decoded: {restored}")
if __name__ == "__main__":
    # Command-line entry point: two boolean flags select the action.
    parser = argparse.ArgumentParser()
    for flag in ("--train", "--test"):
        parser.add_argument(flag, action="store_true")
    cli = parser.parse_args()

    if not (cli.train or cli.test):
        # Neither flag given: print a hint instead of doing any work.
        print("pass --train or --test")
    elif cli.train:
        # --train takes precedence when both flags are supplied.
        train()
    else:
        test()