ereniko committed on
Commit
f0169be
·
verified ·
1 Parent(s): edfd803

Upload tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.py +138 -0
tokenizer.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).
3
+
4
+ We train on a balanced sample drawn from the same dense mix used for
5
+ pretraining, so the tokenizer's merges reflect the actual data distribution
6
+ (web text + textbooks + math + code). A from-scratch tokenizer matters at this
7
+ scale: every wasted token in the vocab is embedding-table budget burned.
8
+
9
+ Usage:
10
+ python tokenizer.py --train # train and save ivme_tokenizer.json
11
+ python tokenizer.py --test # quick round-trip check on saved tokenizer
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import os
18
+
19
+ # Avoid fork/threading crashes when the Rust tokenizer consumes Python data.
20
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
21
+
22
+ VOCAB_SIZE = 16_384
23
+ TOKENIZER_PATH = "ivme_tokenizer.json"
24
+
25
+ # Truncate each document — subword merges are learned fine from prefixes, and
26
+ # this keeps the trainer's in-memory word counts bounded (no segfaults).
27
+ MAX_CHARS = 8_000
28
+
29
+ # Special tokens. We keep a small, purposeful set: pad, bos, eos, unk, and a small
30
+ # bank of chat-control tokens reused later for instruct fine-tuning.
31
+ SPECIAL_TOKENS = [
32
+ "<|pad|>", "<|bos|>", "<|eos|>", "<|unk|>",
33
+ "<|user|>", "<|assistant|>", "<|system|>",
34
+ ]
35
+
36
+ # How many documents to sample per source for tokenizer training. A 16K vocab
37
+ # needs very little data — ~150K docs total is plenty and keeps RAM bounded.
38
+ SAMPLE_PER_SOURCE = {
39
+ "fineweb_hq": 60_000,
40
+ "cosmopedia": 30_000,
41
+ "finemath": 20_000,
42
+ "python": 20_000,
43
+ "wikipedia": 20_000,
44
+ }
45
+
46
+
47
def text_iterator():
    """Yield raw text strings sampled from each source in the dense mix.

    Each source is opened in streaming mode and contributes at most
    ``SAMPLE_PER_SOURCE[key]`` non-empty documents, each truncated to
    ``MAX_CHARS`` characters. Sources are visited one after another, in the
    order listed below, and each dataset is only opened when its turn comes.

    Yields:
        str: one (possibly truncated) document per iteration.
    """
    from datasets import load_dataset

    def take(ds, n, field="text"):
        """Yield up to ``n`` non-empty, truncated texts from ``ds``.

        Rows are assumed to be dict-like (they must support ``.get``); rows
        whose ``field`` is missing or empty are skipped and do not count
        towards ``n``.
        """
        # Guard the degenerate quota: without this, the limit check below only
        # runs after the first yield, so n == 0 would still emit one document.
        if n <= 0:
            return
        count = 0
        for row in ds:
            txt = row.get(field)
            if not txt:
                continue
            yield txt[:MAX_CHARS]
            count += 1
            if count >= n:
                # Stop right away so no further row is pulled from the stream.
                return

    # (log label, SAMPLE_PER_SOURCE key, load_dataset positional args, text field)
    sources = [
        ("FineWeb-HQ", "fineweb_hq", ("epfml/FineWeb-HQ",), "text"),
        ("Cosmopedia", "cosmopedia", ("HuggingFaceTB/cosmopedia", "stanford"), "text"),
        ("FineMath", "finemath", ("HuggingFaceTB/finemath", "finemath-4plus"), "text"),
        ("Python stack", "python", ("bigcode/python-stack-v1-functions-filtered",), "content"),
        ("Wikipedia", "wikipedia", ("wikimedia/wikipedia", "20231101.en"), "text"),
    ]

    for label, key, ds_args, field in sources:
        print(f"[tok] streaming {label} ...")
        ds = load_dataset(*ds_args, split="train", streaming=True)
        yield from take(ds, SAMPLE_PER_SOURCE[key], field=field)
80
+
81
+
82
def train():
    """Train the byte-level BPE tokenizer and save it to ``TOKENIZER_PATH``.

    Builds a BPE model with a ByteLevel pre-tokenizer and decoder, collects
    the sampled corpus from ``text_iterator()`` into a list, trains up to
    ``VOCAB_SIZE`` tokens (special tokens included), and writes the tokenizer
    to disk as JSON.
    """
    from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
    # ByteLevel pre-tokenizer: no out-of-vocab characters ever, GPT-2 style.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocab with every byte-level symbol. Without this, only the
        # byte symbols that happen to occur in the sampled corpus get a token,
        # and the "no out-of-vocab characters" guarantee above does not hold
        # for unseen bytes (e.g. rare scripts or emoji).
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # Collect ALL text into memory first, in the main thread. This is the fix
    # for the segfault: train_from_iterator consumes its input from Rust threads,
    # so doing network I/O / dataset loading lazily mid-iteration crashes. By
    # fully materializing first, every download happens here, safely.
    print("[tok] collecting corpus into memory (this is where downloads happen)...")
    texts = list(text_iterator())
    print(f"[tok] collected {len(texts):,} documents")

    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
    tokenizer.save(TOKENIZER_PATH)
    print(f"[tok] saved -> {TOKENIZER_PATH} (vocab {tokenizer.get_vocab_size():,})")
109
+
110
+
111
def test():
    """Round-trip a few sample strings through the saved tokenizer.

    Loads the tokenizer from ``TOKENIZER_PATH``, encodes and decodes each
    sample, and prints the token count, the characters-per-token ratio, the
    decoded text, and whether the decoded text equals the input.
    """
    from tokenizers import Tokenizer

    tok = Tokenizer.from_file(TOKENIZER_PATH)
    samples = [
        "İvme is a stupidly small language model.",
        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
        "The derivative of x^2 is 2x.",
    ]
    for s in samples:
        ids = tok.encode(s).ids
        back = tok.decode(ids)
        print(f"\n text : {s}")
        # max(1, ...) avoids a division by zero if encoding yields no tokens.
        print(f" tokens : {len(ids)} ({len(s)/max(1,len(ids)):.2f} chars/token)")
        print(f" decoded: {back}")
        # Actually perform the round-trip check instead of leaving it to the
        # reader to compare the two printed strings by eye.
        status = "ok" if back == s else "MISMATCH"
        print(f" roundtrip: {status}")
126
+
127
+
128
if __name__ == "__main__":
    # Command-line entry point: two independent boolean switches; --train
    # takes precedence when both are supplied.
    parser = argparse.ArgumentParser()
    for flag in ("--train", "--test"):
        parser.add_argument(flag, action="store_true")
    opts = parser.parse_args()

    if opts.train:
        train()
    elif opts.test:
        test()
    else:
        # Neither switch given: print a usage hint.
        print("pass --train or --test")