Upload tokenizer.py with huggingface_hub
Browse files- tokenizer.py +138 -0
tokenizer.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).
|
| 3 |
+
|
| 4 |
+
We train on a balanced sample drawn from the same dense mix used for
|
| 5 |
+
pretraining, so the tokenizer's merges reflect the actual data distribution
|
| 6 |
+
(web text + textbooks + math + code). A from-scratch tokenizer matters at this
|
| 7 |
+
scale: every wasted token in the vocab is embedding-table budget burned.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python tokenizer.py --train # train and save ivme_tokenizer.json
|
| 11 |
+
python tokenizer.py --test # quick round-trip check on saved tokenizer
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
# Avoid fork/threading crashes when the Rust tokenizer consumes Python data.
# setdefault leaves any value already exported in the environment untouched.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Target vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 16_384
# File the trained tokenizer is saved to by --train and loaded from by --test.
TOKENIZER_PATH = "ivme_tokenizer.json"

# Truncate each document — subword merges are learned fine from prefixes, and
# this keeps the trainer's in-memory word counts bounded (no segfaults).
MAX_CHARS = 8_000

# Special tokens. We keep a small, purposeful set: pad, bos, eos, unk, and a
# small bank of chat-control tokens (user / assistant / system) reused later
# for instruct fine-tuning.
SPECIAL_TOKENS = [
    "<|pad|>", "<|bos|>", "<|eos|>", "<|unk|>",
    "<|user|>", "<|assistant|>", "<|system|>",
]

# How many documents to sample per source for tokenizer training. A 16K vocab
# needs very little data — the quotas below sum to 150K docs, which is plenty
# and keeps RAM bounded.
SAMPLE_PER_SOURCE = {
    "fineweb_hq": 60_000,
    "cosmopedia": 30_000,
    "finemath": 20_000,
    "python": 20_000,
    "wikipedia": 20_000,
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def text_iterator():
    """Yield truncated raw documents from every source of the dense mix.

    Sources are streamed one after another in a fixed order. For each source
    at most ``SAMPLE_PER_SOURCE[key]`` non-empty documents are emitted, and
    every document is cut to its first ``MAX_CHARS`` characters.
    """
    from datasets import load_dataset

    def take(ds, n, field="text"):
        # Rows whose text field is missing or empty are skipped and do not
        # count towards the quota.
        emitted = 0
        for record in ds:
            text = record.get(field)
            if not text:
                continue
            yield text[:MAX_CHARS]
            emitted += 1
            if emitted >= n:
                break

    # (progress message, load_dataset positional args, quota key, text field)
    sources = (
        ("[tok] streaming FineWeb-HQ ...",
         ("epfml/FineWeb-HQ",), "fineweb_hq", "text"),
        ("[tok] streaming Cosmopedia ...",
         ("HuggingFaceTB/cosmopedia", "stanford"), "cosmopedia", "text"),
        ("[tok] streaming FineMath ...",
         ("HuggingFaceTB/finemath", "finemath-4plus"), "finemath", "text"),
        ("[tok] streaming Python stack ...",
         ("bigcode/python-stack-v1-functions-filtered",), "python", "content"),
        ("[tok] streaming Wikipedia ...",
         ("wikimedia/wikipedia", "20231101.en"), "wikipedia", "text"),
    )

    for message, dataset_args, quota_key, text_field in sources:
        print(message)
        stream = load_dataset(*dataset_args, split="train", streaming=True)
        yield from take(stream, SAMPLE_PER_SOURCE[quota_key], field=text_field)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def train():
    """Train the byte-level BPE tokenizer on the sampled corpus and save it.

    The whole corpus from ``text_iterator()`` is materialized in memory, a
    BPE model with ``VOCAB_SIZE`` entries (including ``SPECIAL_TOKENS``) is
    trained on it, and the result is written to ``TOKENIZER_PATH``.
    """
    from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
    # ByteLevel pre-tokenizer: no out-of-vocab characters ever, GPT-2 style.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocab with every byte-level symbol. Without this, a byte
        # that never occurs in the sampled corpus gets no token and would
        # fall back to <|unk|> at encode time, breaking the "no OOV" promise.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # Collect ALL text into memory first, in the main thread. This is the fix
    # for the segfault: train_from_iterator consumes its input from Rust threads,
    # so doing network I/O / dataset loading lazily mid-iteration crashes. By
    # fully materializing first, every download happens here, safely.
    print("[tok] collecting corpus into memory (this is where downloads happen)...")
    texts = list(text_iterator())
    print(f"[tok] collected {len(texts):,} documents")

    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
    tokenizer.save(TOKENIZER_PATH)
    print(f"[tok] saved -> {TOKENIZER_PATH} (vocab {tokenizer.get_vocab_size():,})")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def test():
    """Encode and decode a few sample strings with the saved tokenizer.

    Loads the tokenizer from ``TOKENIZER_PATH`` and, for each sample, prints
    the original text, the token count with a chars-per-token ratio, the
    decoded text, and whether decoding reproduced the input exactly.
    """
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(TOKENIZER_PATH)
    samples = [
        "İvme is a stupidly small language model.",
        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
        "The derivative of x^2 is 2x.",
    ]
    for s in samples:
        ids = tok.encode(s).ids
        back = tok.decode(ids)
        print(f"\n text : {s}")
        # max(1, ...) guards the ratio against an empty encoding.
        print(f" tokens : {len(ids)} ({len(s)/max(1,len(ids)):.2f} chars/token)")
        print(f" decoded: {back}")
        # Actually verify the round trip instead of leaving it to eyeballing.
        print(f" round-trip: {'OK' if back == s else 'MISMATCH'}")
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
    # Command-line entry point: two boolean switches select the action.
    parser = argparse.ArgumentParser()
    for switch in ("--train", "--test"):
        parser.add_argument(switch, action="store_true")
    cli = parser.parse_args()

    # --train takes precedence when both switches are given.
    if cli.train:
        action = train
    elif cli.test:
        action = test
    else:
        action = None

    if action is None:
        print("pass --train or --test")
    else:
        action()
|