File size: 5,022 Bytes
f0169be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).

We train on a balanced sample drawn from the same dense mix used for
pretraining, so the tokenizer's merges reflect the actual data distribution
(web text + textbooks + math + code). A from-scratch tokenizer matters at this
scale: every wasted token in the vocab is embedding-table budget burned.

Usage:
    python tokenizer.py --train      # train and save ivme_tokenizer.json
    python tokenizer.py --test       # quick round-trip check on saved tokenizer
"""

from __future__ import annotations

import argparse
import os

# Avoid fork/threading crashes when the Rust tokenizer consumes Python data.
# setdefault() leaves an explicit value already present in the environment
# untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Target vocabulary size and the file the trained tokenizer is saved to
# (and loaded from by the --test path).
VOCAB_SIZE = 2 ** 14  # 16,384
TOKENIZER_PATH = "ivme_tokenizer.json"

# Per-document character cap. Subword merges are learned fine from document
# prefixes, and truncating keeps the trainer's in-memory word counts bounded
# (no segfaults).
MAX_CHARS = 8_000

# Special tokens. A small, purposeful set: pad, bos, eos and unk, plus a small
# bank of chat-control tokens reused later for instruct fine-tuning.
SPECIAL_TOKENS = [
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|user|>",
    "<|assistant|>",
    "<|system|>",
]

# Number of documents sampled from each source for tokenizer training. A 16K
# vocab needs very little data: the quotas below add up to 150K documents,
# which is plenty and keeps RAM bounded.
SAMPLE_PER_SOURCE = dict(
    fineweb_hq=60_000,
    cosmopedia=30_000,
    finemath=20_000,
    python=20_000,
    wikipedia=20_000,
)


def text_iterator():
    """Yield raw text strings sampled from each source in the dense mix.

    Sources are streamed one after another in a fixed order. Each contributes
    at most ``SAMPLE_PER_SOURCE[key]`` non-empty documents, and every document
    is truncated to its first ``MAX_CHARS`` characters. A source whose quota
    is zero or negative is skipped entirely: it is neither opened nor sampled.

    Yields:
        str: one (possibly truncated) document at a time.
    """
    from itertools import islice

    from datasets import load_dataset

    # (log label, SAMPLE_PER_SOURCE key, dataset path, config name, text field)
    sources = [
        ("FineWeb-HQ", "fineweb_hq", "epfml/FineWeb-HQ", None, "text"),
        ("Cosmopedia", "cosmopedia", "HuggingFaceTB/cosmopedia", "stanford", "text"),
        ("FineMath", "finemath", "HuggingFaceTB/finemath", "finemath-4plus", "text"),
        ("Python stack", "python", "bigcode/python-stack-v1-functions-filtered", None, "content"),
        ("Wikipedia", "wikipedia", "wikimedia/wikipedia", "20231101.en", "text"),
    ]

    def take(ds, n, field="text"):
        """Return an iterator over at most *n* non-empty, truncated texts."""
        texts = (row.get(field) for row in ds)
        # Rows whose field is missing or empty are dropped and do not count
        # toward the quota.
        kept = (txt[:MAX_CHARS] for txt in texts if txt)
        # islice stops after exactly n items without pulling an extra row,
        # and yields nothing at all when n is 0.
        return islice(kept, max(n, 0))

    for label, key, path, config, field in sources:
        quota = SAMPLE_PER_SOURCE[key]
        if quota <= 0:
            # Nothing requested from this source; do not open the stream.
            continue
        print(f"[tok] streaming {label} ...")
        ds = load_dataset(path, name=config, split="train", streaming=True)
        yield from take(ds, quota, field=field)


def train():
    """Train the byte-level BPE tokenizer and save it to ``TOKENIZER_PATH``.

    The corpus produced by ``text_iterator()`` is fully materialized in memory
    before training starts. The trained tokenizer has ``VOCAB_SIZE`` entries
    at most, including ``SPECIAL_TOKENS``.

    Raises:
        RuntimeError: if no documents could be collected, so that an existing
            tokenizer file is not overwritten with a useless one.
    """
    from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
    # ByteLevel pre-tokenizer: text is mapped to byte symbols, GPT-2 style.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocab with all 256 byte symbols. Without this, a byte that
        # never occurs in the training sample gets no token and is encoded as
        # <|unk|>, which defeats the point of a byte-level tokenizer.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # Collect ALL text into memory first, in the main thread. This is the fix
    # for the segfault: train_from_iterator consumes its input from Rust threads,
    # so doing network I/O / dataset loading lazily mid-iteration crashes. By
    # fully materializing first, every download happens here, safely.
    print("[tok] collecting corpus into memory (this is where downloads happen)...")
    texts = list(text_iterator())
    print(f"[tok] collected {len(texts):,} documents")
    if not texts:
        raise RuntimeError("no training documents were collected; refusing to train")

    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
    tokenizer.save(TOKENIZER_PATH)
    print(f"[tok] saved -> {TOKENIZER_PATH}  (vocab {tokenizer.get_vocab_size():,})")


def test():
    """Round-trip a few sample strings through the saved tokenizer.

    Loads the tokenizer from ``TOKENIZER_PATH``, encodes and decodes each
    sample, and prints the token count, the chars-per-token ratio, the decoded
    text, and whether the decoded text equals the input exactly.
    """
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(TOKENIZER_PATH)
    samples = [
        "İvme is a stupidly small language model.",
        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
        "The derivative of x^2 is 2x.",
    ]
    for s in samples:
        ids = tok.encode(s).ids
        back = tok.decode(ids)
        # Compare explicitly: a lossy round trip is easy to miss by eye.
        status = "OK" if back == s else "MISMATCH"
        print(f"\n  text   : {s}")
        # max(1, ...) guards the ratio against an empty encoding.
        print(f"  tokens : {len(ids)}  ({len(s)/max(1,len(ids)):.2f} chars/token)")
        print(f"  decoded: {back}")
        print(f"  match  : {status}")


if __name__ == "__main__":
    # Command-line entry point. Both switches are plain booleans; when both
    # are given, --train wins and --test is ignored.
    parser = argparse.ArgumentParser()
    for flag in ("--train", "--test"):
        parser.add_argument(flag, action="store_true")
    cli = parser.parse_args()

    if cli.train:
        train()
    elif cli.test:
        test()
    else:
        # Neither switch given: print a hint instead of doing any work.
        print("pass --train or --test")