# File size: 6,948 Bytes
# 7f974df
from datasets import load_dataset
from tokenizers import Tokenizer
# Import our components
from normalizer import normalization # our normalize function
from bpe import build_tokenizer, build_trainer, get_special_token_ids
from post_processor import add_post_processor
# ------------------------------------------------------------------ #
# CONSTANTS
# ------------------------------------------------------------------ #
import os

# Which dataset to stream, and which of its subsets (configs) to read.
DATASET_NAME = "HuggingFaceFW/fineweb-edu"
DATASET_SUBSET = "CC-MAIN-2014-49"

# Document filters applied while streaming.
MIN_QUALITY = 3  # keep only documents with int_score >= 3
MIN_DOC_LENGTH = 100  # shorter documents are likely boilerplate; skip them

# Training budget, measured in tokens. FineWeb-Edu tokens average 4-5
# characters each, so this is roughly 100M characters -- enough for BPE.
MAX_TOKENS = 25_000_000

# The trained tokenizer is written next to this script, wherever it is run from.
_THIS_FILE = os.path.abspath(__file__)
SCRIPT_DIR = os.path.dirname(_THIS_FILE)
SAVE_PATH = os.path.join(SCRIPT_DIR, "fineweb_edu_tokenizer")
# ------------------------------------------------------------------ #
# DATA GENERATOR
# ------------------------------------------------------------------ #
def fineweb_edu_iterator(
    max_tokens: int = MAX_TOKENS,
    min_quality: int = MIN_QUALITY,
    min_length: int = MIN_DOC_LENGTH,
    *,
    dataset_name: str = DATASET_NAME,
    dataset_subset: str = DATASET_SUBSET,
):
    """
    Streams documents from a dataset, filters them by quality and length,
    normalizes the text, and yields clean strings for BPE training.

    Args:
        max_tokens     : stop once the running sum of the yielded documents'
                         ``token_count`` field reaches this value
        min_quality    : only yield docs with int_score >= this value
        min_length     : skip docs shorter than this many characters, checked
                         both before and after normalization
        dataset_name   : dataset identifier passed to ``load_dataset``
                         (keyword-only, defaults to DATASET_NAME)
        dataset_subset : subset/config name passed as ``name=`` to
                         ``load_dataset`` (keyword-only, defaults to
                         DATASET_SUBSET)

    Yields:
        str: normalized, clean document text
    """
    print(f"Loading dataset stream: {dataset_name} / {dataset_subset}")
    ds = load_dataset(
        dataset_name,
        name=dataset_subset,
        split="train",
        streaming=True,
    )

    tokens_seen = 0   # running total of token_count over yielded docs
    docs_yielded = 0  # how many docs passed all filters
    docs_skipped = 0  # how many docs were filtered out

    for doc in ds:
        # ---- Stop condition ----------------------------------------
        # Checked before any filtering, so a non-positive budget yields nothing.
        if tokens_seen >= max_tokens:
            break

        # ---- Quality filter ----------------------------------------
        # int_score is 0-5, we want educational quality >= min_quality
        if doc["int_score"] < min_quality:
            docs_skipped += 1
            continue

        # ---- Extract and normalize ---------------------------------
        text = doc["text"]

        # Cheap length check first: avoids running normalization on
        # boilerplate/empty docs.
        if len(text) < min_length:
            docs_skipped += 1
            continue

        text = normalization(text)

        # Normalization can shrink a document (e.g. one that was mostly
        # HTML tags or control chars), so re-check the length afterwards.
        if len(text) < min_length:
            docs_skipped += 1
            continue

        # ---- Track progress ----------------------------------------
        # The budget is charged with the dataset's own per-document count,
        # not with a count derived from the normalized text.
        tokens_seen += doc["token_count"]
        docs_yielded += 1

        # Log progress every 100k documents
        if docs_yielded % 100_000 == 0:
            print(
                f" docs yielded: {docs_yielded:,} | "
                f"docs skipped: {docs_skipped:,} | "
                f"tokens seen: {tokens_seen:,} / {max_tokens:,} "
                f"({100 * tokens_seen / max_tokens:.1f}%)"
            )

        yield text

    # Final stats (printed only when the consumer exhausts the generator)
    print("\nStream complete:")
    print(f" docs yielded : {docs_yielded:,}")
    print(f" docs skipped : {docs_skipped:,}")
    print(f" tokens seen : {tokens_seen:,}")
# ------------------------------------------------------------------ #
# TRAINING
# ------------------------------------------------------------------ #
def train_tokenizer() -> Tokenizer:
    """
    Builds, trains, and saves the tokenizer.

    Steps: build the untrained tokenizer and its trainer, train on the
    document stream from ``fineweb_edu_iterator()``, attach the post
    processor, print the special token IDs, and write the result to
    ``{SAVE_PATH}.json``.

    Returns:
        Trained Tokenizer object
    """
    # Build untrained tokenizer and trainer
    tokenizer = build_tokenizer()
    trainer = build_trainer()

    print("\nStarting BPE training...")
    print(f" vocab size : {trainer.vocab_size:,}")
    print(f" min frequency : {trainer.min_frequency}")
    print(f" quality filter: int_score >= {MIN_QUALITY}")
    print(f" max tokens : {MAX_TOKENS:,}\n")

    # train_from_iterator expects an iterable of strings; our generator
    # yields one clean document string at a time.
    #
    # No `length` hint is passed: that argument is the number of sequences
    # (documents) in the iterator, which is unknown for a filtered stream.
    # Passing the token budget there would make the progress bar misleading.
    tokenizer.train_from_iterator(
        iterator=fineweb_edu_iterator(),
        trainer=trainer,
    )
    print("\nTraining complete.")

    tokenizer = add_post_processor(tokenizer)

    # Print special token IDs
    ids = get_special_token_ids(tokenizer)
    print("\nSpecial token IDs:")
    for token, token_id in ids.items():
        print(f" {token} -> {token_id}")

    # Save tokenizer to disk
    tokenizer.save(f"{SAVE_PATH}.json")
    print(f"\nTokenizer saved to: {SAVE_PATH}.json")
    return tokenizer
# ------------------------------------------------------------------ #
# QUICK VERIFICATION after training
# ------------------------------------------------------------------ #
def verify_tokenizer(tokenizer: Tokenizer):
    """
    Prints a short post-training report for the given tokenizer.

    Each sample string is encoded and decoded again; the tokens, IDs, token
    count, decoded text, and whether the round trip reproduced the input are
    printed. The final vocabulary size and the ID of ``<|endoftext|>`` follow.
    Nothing is returned and nothing is raised on a mismatch -- the report is
    for a human to read.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(" TOKENIZER VERIFICATION")
    print(f"{rule}\n")

    samples = (
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm in 2024.",
        "def compute_loss(y_pred, y_true):\n return (y_pred - y_true)**2",
        "U.S.A has a Ph.D program e.g. at MIT.",
        "don't they've she'll",
        "∇f(x) = 0 is a necessary condition.",  # tests byte fallback
    )

    for sample in samples:
        encoding = tokenizer.encode(sample)
        token_ids = encoding.ids
        round_trip = tokenizer.decode(token_ids)

        print(f"Input : {sample!r}")
        print(f"Tokens : {encoding.tokens}")
        print(f"IDs : {token_ids}")
        print(f"N tokens: {len(token_ids)}")
        print(f"Decoded : {round_trip!r}")
        print(f"Lossless: {sample == round_trip}")
        print()

    # Report the size of the trained vocabulary
    print(f"Final vocab size: {tokenizer.get_vocab_size():,}")

    # Report the ID assigned to the end-of-text token
    print(f"<|endoftext|> ID: {tokenizer.token_to_id('<|endoftext|>')}")
# ------------------------------------------------------------------ #
# ENTRY POINT
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Script mode: train first, then print the verification report
    # for the freshly trained tokenizer.
    trained_tokenizer = train_tokenizer()
    verify_tokenizer(trained_tokenizer)