IvmeLabs
/

Ivme-Conversate-22M-Base

+"""
+Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).
+We train on a balanced sample drawn from the same dense mix used for
+pretraining, so the tokenizer's merges reflect the actual data distribution
+(web text + textbooks + math + code). A from-scratch tokenizer matters at this
+scale: every wasted token in the vocab is embedding-table budget burned.
+Usage:
+    python tokenizer.py --train      # train and save ivme_tokenizer.json
+    python tokenizer.py --test       # quick round-trip check on saved tokenizer
+"""
+from __future__ import annotations
+import argparse
+import os
+# Avoid fork/threading crashes when the Rust tokenizer consumes Python data.
+# setdefault keeps any value the caller already exported in the environment.
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+# Target vocabulary size passed to the BPE trainer.
+VOCAB_SIZE = 16_384
+# File the trained tokenizer is saved to by --train and reloaded from by --test.
+TOKENIZER_PATH = "ivme_tokenizer.json"
+# Truncate each document — subword merges are learned fine from prefixes, and
+# this keeps the trainer's in-memory word counts bounded (no segfaults).
+# Applied as a plain character slice to every sampled document.
+MAX_CHARS = 8_000
+# Special tokens. We keep a small, purposeful set: pad, bos, eos, unk, and a
+# small bank of chat-control tokens reused later for instruct fine-tuning.
+SPECIAL_TOKENS = [
+    "<|pad|>", "<|bos|>", "<|eos|>", "<|unk|>",
+    "<|user|>", "<|assistant|>", "<|system|>",
+]
+# How many documents to sample per source for tokenizer training. A 16K vocab
+# needs very little data — ~150K docs total (the sum of the quotas below) is
+# plenty and keeps RAM bounded. Keys are looked up by text_iterator().
+SAMPLE_PER_SOURCE = {
+    "fineweb_hq": 60_000,
+    "cosmopedia": 30_000,
+    "finemath":   20_000,
+    "python":     20_000,
+    "wikipedia":  20_000,
+}
+def text_iterator():
+    """Yield raw text strings sampled from each source in the dense mix."""
+    from datasets import load_dataset
+    def take(ds, n, field="text"):
+        count = 0
+        for row in ds:
+            txt = row.get(field)
+            if txt:
+                yield txt[:MAX_CHARS]
+                count += 1
+                if count >= n:
+                    return
+    print("[tok] streaming FineWeb-HQ ...")
+    ds = load_dataset("epfml/FineWeb-HQ", split="train", streaming=True)
+    yield from take(ds, SAMPLE_PER_SOURCE["fineweb_hq"])
+    print("[tok] streaming Cosmopedia ...")
+    ds = load_dataset("HuggingFaceTB/cosmopedia", "stanford", split="train", streaming=True)
+    yield from take(ds, SAMPLE_PER_SOURCE["cosmopedia"])
+    print("[tok] streaming FineMath ...")
+    ds = load_dataset("HuggingFaceTB/finemath", "finemath-4plus", split="train", streaming=True)
+    yield from take(ds, SAMPLE_PER_SOURCE["finemath"])
+    print("[tok] streaming Python stack ...")
+    ds = load_dataset("bigcode/python-stack-v1-functions-filtered", split="train", streaming=True)
+    yield from take(ds, SAMPLE_PER_SOURCE["python"], field="content")
+    print("[tok] streaming Wikipedia ...")
+    ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True)
+    yield from take(ds, SAMPLE_PER_SOURCE["wikipedia"])
+def train():
+    from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
+    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
+    # ByteLevel pre-tokenizer: no out-of-vocab characters ever, GPT-2 style.
+    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    tokenizer.decoder = decoders.ByteLevel()
+    trainer = trainers.BpeTrainer(
+        vocab_size=VOCAB_SIZE,
+        special_tokens=SPECIAL_TOKENS,
+        min_frequency=2,
+        show_progress=True,
+    )
+    # Collect ALL text into memory first, in the main thread. This is the fix
+    # for the segfault: train_from_iterator consumes its input from Rust threads,
+    # so doing network I/O / dataset loading lazily mid-iteration crashes. By
+    # fully materializing first, every download happens here, safely.
+    print("[tok] collecting corpus into memory (this is where downloads happen)...")
+    texts = list(text_iterator())
+    print(f"[tok] collected {len(texts):,} documents")
+    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
+    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
+    tokenizer.save(TOKENIZER_PATH)
+    print(f"[tok] saved -> {TOKENIZER_PATH}  (vocab {tokenizer.get_vocab_size():,})")
+def test():
+    from tokenizers import Tokenizer
+    tok = Tokenizer.from_file(TOKENIZER_PATH)
+    samples = [
+        "İvme is a stupidly small language model.",
+        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
+        "The derivative of x^2 is 2x.",
+    ]
+    for s in samples:
+        ids = tok.encode(s).ids
+        back = tok.decode(ids)
+        print(f"\n  text   : {s}")
+        print(f"  tokens : {len(ids)}  ({len(s)/max(1,len(ids)):.2f} chars/token)")
+        print(f"  decoded: {back}")
+if __name__ == "__main__":
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--train", action="store_true")
+    ap.add_argument("--test", action="store_true")
+    args = ap.parse_args()
+    if args.train:
+        train()
+    elif args.test:
+        test()
+    else:
+        print("pass --train or --test")