# File size: 4,909 Bytes | 7f974df
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Sequence, ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from pretokenizer import get_pretokenizer
# Total vocabulary size requested from the trainer; passed to BpeTrainer
# as vocab_size in build_trainer().
VOCAB_SIZE = 32_000
# Passed to BpeTrainer as min_frequency in build_trainer() (the minimum
# number of times a pair must occur before it can be merged).
MIN_FREQUENCY = 3
# Special tokens handed to the trainer; get_special_token_ids() looks up
# the ID of every entry in this list.
SPECIAL_TOKENS = ["<|endoftext|>"]
def build_tokenizer() -> Tokenizer:
    """
    Build and return an untrained byte-level BPE tokenizer.

    Call .train_from_iterator() or .train() on the returned object to
    train it.

    Pipeline:
        Raw text
        -> Normalizer     (handled externally in our normalize() fn)
        -> Pre-tokenizer  (custom regex splits, then byte-level conversion)
        -> BPE model      (learns merge rules during training)
        -> Decoder        (reverses byte-level for human readable output)

    Returns:
        Tokenizer: a tokenizer with model, pre-tokenizer and decoder set.
        No normalizer or post-processor is attached here.
    """
    # ---- 1. BPE Model ------------------------------------------------
    # unk_token=None: after ByteLevel pre-tokenization every symbol the
    # model sees belongs to the 256-character byte alphabet, so no
    # dedicated unknown token is configured.
    model = BPE(
        unk_token=None,
        # byte_fallback only rewrites out-of-vocabulary symbols as
        # <0xXX> tokens, and only if those tokens exist in the vocab.
        # NOTE(review): with ByteLevel in front this path should never
        # trigger - confirm whether the flag is still wanted.
        byte_fallback=True,
    )
    tokenizer = Tokenizer(model)

    # ---- 2. Pre-tokenizer --------------------------------------------
    # Sequence chains two pre-tokenizers in order:
    #
    #   Step A: Our custom regex splits text into meaningful chunks
    #           (contractions, abbreviations, numbers, operators etc.)
    #
    #   Step B: ByteLevel converts each chunk's characters to their
    #           byte representation using a 256-char printable alphabet
    #           e.g. é (bytes 0xC3 0xA9) -> "Ã©"
    #
    # add_prefix_space=False because our regex already handles
    # whitespace explicitly as its own token category.
    #
    # use_regex=False because ByteLevel otherwise applies the GPT-2
    # split pattern on top of Step A, re-splitting the chunks our regex
    # deliberately kept together. Step B must only do byte conversion.
    tokenizer.pre_tokenizer = Sequence([
        get_pretokenizer(),                                  # Step A
        ByteLevel(add_prefix_space=False, use_regex=False),  # Step B
    ])

    # ---- 3. Decoder --------------------------------------------------
    # Reverses the ByteLevel encoding so output is human readable.
    # Without this, tokenizer.decode() would return "Ã©" instead of "é".
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer
# ------------------------------------------------------------------ #
# TRAINER CONFIG
# ------------------------------------------------------------------ #
def build_trainer() -> BpeTrainer:
    """
    Create the BpeTrainer used to learn the BPE merge rules.

    How the vocab_size budget is spent:
        256     base byte tokens (one per possible byte value)
      + 31,743  learned BPE merge tokens
      + 1       special token (<|endoftext|>)
      = 32,000  total

    vocab_size is the final total, base tokens included, so passing
    VOCAB_SIZE directly yields the count above.

    Returns:
        BpeTrainer: trainer configured from VOCAB_SIZE, MIN_FREQUENCY
        and SPECIAL_TOKENS, with the progress bar enabled.
    """
    # Seed the vocabulary with every ByteLevel symbol before any merge
    # is learned, so each byte always has a token of its own.
    byte_alphabet = ByteLevel.alphabet()

    trainer = BpeTrainer(
        vocab_size=VOCAB_SIZE,
        min_frequency=MIN_FREQUENCY,
        special_tokens=SPECIAL_TOKENS,
        initial_alphabet=byte_alphabet,
        show_progress=True,  # display a progress bar while training
    )
    return trainer
# CONVENIENCE: get special token IDs after training
def get_special_token_ids(tokenizer: Tokenizer) -> dict:
    """
    Map every entry of SPECIAL_TOKENS to its ID in the given tokenizer.

    Intended to be called AFTER training, once the final IDs exist.
    The value is whatever tokenizer.token_to_id() returns for the token
    (the tokenizers library documents None for tokens not in the vocab).

    Example:
        ids = get_special_token_ids(tokenizer)
        eot_id = ids["<|endoftext|>"]  # typically 0
    """
    lookup = tokenizer.token_to_id
    ids = {}
    for special in SPECIAL_TOKENS:
        ids[special] = lookup(special)
    return ids
# QUICK SANITY CHECK
def _run_sanity_check() -> None:
    """
    Build the tokenizer and trainer and print their configuration.

    Nothing is trained here; this only confirms that both objects can be
    constructed and shows what they were configured with.
    """
    print("Building tokenizer...")
    tokenizer = build_tokenizer()
    print("Building trainer...")
    trainer = build_trainer()

    # Verify pre-tokenizer chain is set up correctly
    print("\nPre-tokenizer chain:")
    print(f" {tokenizer.pre_tokenizer}")

    # Verify decoder is set
    print("\nDecoder:")
    print(f" {tokenizer.decoder}")

    # Verify trainer config
    print("\nTrainer config:")
    print(f" vocab_size : {trainer.vocab_size}")
    print(f" min_frequency : {trainer.min_frequency}")
    print(f" special_tokens: {trainer.special_tokens}")
    print(f" base alphabet : {len(ByteLevel.alphabet())} byte tokens")

    print("\nAll good - ready to train.")
    print("Next step: pipe FineWeb-Edu text into tokenizer.train_from_iterator()")


if __name__ == "__main__":
    # Kept inside a function so the locals above do not become module
    # globals that shadow parameter names used elsewhere in this file.
    _run_sanity_check()