| from tokenizers import Tokenizer, AddedToken
|
| from tokenizers.models import BPE
|
| from tokenizers.trainers import BpeTrainer
|
| from tokenizers.pre_tokenizers import Sequence, ByteLevel
|
| from tokenizers.decoders import ByteLevel as ByteLevelDecoder
|
|
|
| from pretokenizer import get_pretokenizer
|
|
|
# Target vocabulary size; passed to BpeTrainer as vocab_size in build_trainer().
VOCAB_SIZE = 32_000

# Passed to BpeTrainer as min_frequency in build_trainer().
MIN_FREQUENCY = 3

# Special tokens registered with the trainer; get_special_token_ids() looks up
# the ID of each entry in this list.
SPECIAL_TOKENS = ["<|endoftext|>"]
|
|
|
def build_tokenizer() -> Tokenizer:
    """
    Build and return an untrained tokenizer with all components configured.

    Call .train_from_iterator() or .train() on the returned object to train it.

    Pipeline:
        Raw text
        -> Normalizer    (handled externally in our normalize() fn)
        -> Pre-tokenizer (custom regex splits + byte level conversion)
        -> BPE Model     (learns merge rules during training)
        -> Decoder       (reverses byte level for human readable output)

    Returns:
        A Tokenizer with an empty BPE model, the pre-tokenizer chain and the
        byte-level decoder attached. No normalizer is set on the object.
    """
    # NOTE(review): byte_fallback is kept as in the original configuration.
    # With a byte-level alphabet it is presumably never triggered - confirm
    # before removing it.
    model = BPE(
        unk_token=None,
        byte_fallback=True,
    )

    tokenizer = Tokenizer(model)

    tokenizer.pre_tokenizer = Sequence([
        # Project-specific splitting rules come first.
        get_pretokenizer(),
        # ByteLevel is only meant to map each piece to its byte-level
        # representation here. use_regex=False disables its built-in GPT-2
        # split pattern, which would otherwise re-split the pieces produced
        # by the custom pre-tokenizer above.
        ByteLevel(add_prefix_space=False, use_regex=False),
    ])

    # Reverses the byte-level mapping when decoding IDs back to text.
    tokenizer.decoder = ByteLevelDecoder()

    return tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_trainer() -> BpeTrainer:
    """
    Create the BpeTrainer used to learn the merge rules.

    Vocabulary budget (vocab_size):
          256    base byte tokens (one per possible byte value, always present)
        + 31,743 learned BPE merge tokens
        + 1      special token (<|endoftext|>)
        = 32,000 total

    The 256 base tokens are counted by the trainer as part of vocab_size,
    so vocab_size=32_000 yields the intended final vocabulary size.

    Returns:
        A BpeTrainer configured from the module-level constants.
    """
    trainer_options = {
        "vocab_size": VOCAB_SIZE,
        "min_frequency": MIN_FREQUENCY,
        "special_tokens": SPECIAL_TOKENS,
        "show_progress": True,
        # Seed the vocabulary with the full byte-level alphabet.
        "initial_alphabet": ByteLevel.alphabet(),
    }
    return BpeTrainer(**trainer_options)
|
|
|
|
|
|
|
def get_special_token_ids(tokenizer: Tokenizer) -> dict:
    """
    Map every entry of SPECIAL_TOKENS to its token ID in `tokenizer`.

    Call this AFTER training so the IDs reflect the final vocabulary.

    Args:
        tokenizer: The tokenizer whose vocabulary is queried.

    Returns:
        A dict of special token string -> value of tokenizer.token_to_id().

    Example:
        ids = get_special_token_ids(tokenizer)
        eot_id = ids["<|endoftext|>"]  # typically 0
    """
    special_ids = {}
    for special in SPECIAL_TOKENS:
        special_ids[special] = tokenizer.token_to_id(special)
    return special_ids
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke check: build both objects and print their configuration.
    print("Building tokenizer...")
    tokenizer = build_tokenizer()

    print("Building trainer...")
    trainer = build_trainer()

    # Each section is a header line followed by its detail line.
    print("\nPre-tokenizer chain:", f" {tokenizer.pre_tokenizer}", sep="\n")

    print("\nDecoder:", f" {tokenizer.decoder}", sep="\n")

    print("\nTrainer config:")
    config_lines = [
        f" vocab_size : {trainer.vocab_size}",
        f" min_frequency : {trainer.min_frequency}",
        f" special_tokens: {trainer.special_tokens}",
        f" base alphabet : {len(ByteLevel.alphabet())} byte tokens",
    ]
    print("\n".join(config_lines))

    print("\nAll good - ready to train.")
    print("Next step: pipe FineWeb-Edu text into tokenizer.train_from_iterator()")