from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer
# ------------------------------------------------------------------ #
# POST-PROCESSOR
# Runs after BPE encoding, appends <|endoftext|> to every sequence
# ------------------------------------------------------------------ #
def add_post_processor(tokenizer: Tokenizer) -> Tokenizer:
    """
    Attach a post-processor that appends <|endoftext|> to every encoded sequence.

    Must be called AFTER training because the real token ID of
    <|endoftext|> is looked up in the trained vocab.

    Args:
        tokenizer: a trained Tokenizer object

    Returns:
        The same tokenizer object, with the post-processor attached

    Raises:
        ValueError: if <|endoftext|> is not present in the tokenizer's vocab
    """
    eot_token = "<|endoftext|>"

    # Get the real ID from the trained vocab
    # This is why we can only do this after training
    eot_id = tokenizer.token_to_id(eot_token)
    if eot_id is None:
        raise ValueError(
            f"{eot_token} not found in vocab. "
            "Make sure the tokenizer is trained before adding post-processor."
        )

    # TemplateProcessing defines the final sequence structure
    # using a simple template syntax:
    #
    #   $A                -> the first (or only) encoded sequence
    #   $B                -> the second sequence of a pair (e.g. QA)
    #   <|endoftext|>     -> insert this special token
    #   piece:N           -> N is the TYPE id (segment id) given to that
    #                        piece, NOT the token's vocab ID
    #
    # The vocab ID of the special token is supplied separately through
    # `special_tokens` below.
    #
    # Our template:
    #   single : [tokens...] <|endoftext|>
    #   pair   : [tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>
    #
    # Each <|endoftext|> carries the type id of the sequence it closes, so
    # the resulting type_ids stay contiguous (0...0 1...1).
    #
    # pair template handles future use cases like
    # question-context pairs without needing to change the tokenizer
    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {eot_token}:0",
        pair=f"$A {eot_token}:0 $B:1 {eot_token}:1",
        special_tokens=[
            (eot_token, eot_id),
        ],
    )
    print(f"Post-processor added: {eot_token} (ID: {eot_id}) appended to sequences")
    return tokenizer
# ------------------------------------------------------------------ #
# VERIFICATION
# ------------------------------------------------------------------ #
def verify_post_processor(tokenizer: Tokenizer) -> bool:
    """
    Verify that the post-processor appends <|endoftext|> as expected.

    Encodes a handful of sample texts and checks that the last token of
    each one is <|endoftext|> (by both token string and ID), then encodes
    one sequence pair and checks that it contains exactly two
    <|endoftext|> tokens. A report is printed along the way.

    Args:
        tokenizer: the Tokenizer to check

    Returns:
        True if every single-sequence check AND the pair check passed,
        False otherwise.
    """
    eot_token = "<|endoftext|>"
    eot_id = tokenizer.token_to_id(eot_token)

    print("\n" + "=" * 60)
    print(" POST-PROCESSOR VERIFICATION")
    print("=" * 60 + "\n")

    test_cases = [
        # Single documents
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm.",
        # Short edge cases
        "Hi.",
        "42",
    ]

    all_passed = True
    for text in test_cases:
        encoded = tokenizer.encode(text)
        # An empty encoding has no last token: report FAIL instead of
        # crashing with IndexError.
        last_token = encoded.tokens[-1] if encoded.tokens else None
        last_id = encoded.ids[-1] if encoded.ids else None
        passed = last_token == eot_token and last_id == eot_id
        all_passed = all_passed and passed

        status = "PASS" if passed else "FAIL"
        print(f"[{status}] {repr(text)}")
        print(f" tokens : {encoded.tokens}")
        print(f" last : {last_token!r} (ID: {last_id})")
        print()

    # Verify pair encoding
    encoded_pair = tokenizer.encode("question here", "answer here")
    eot_positions = [
        pos for pos, token_id in enumerate(encoded_pair.ids) if token_id == eot_id
    ]
    pair_passed = len(eot_positions) == 2
    # The pair check counts towards the overall verdict too.
    all_passed = all_passed and pair_passed

    print("Pair encoding test:")
    print(f" tokens : {encoded_pair.tokens}")
    print(f" eot positions: {eot_positions}")
    print(" expected : 2 eot tokens (one after each sequence)")
    print(f" [{'PASS' if pair_passed else 'FAIL'}]")

    print(f"\nAll tests passed: {all_passed}")
    return all_passed
# ------------------------------------------------------------------ #
# HOW THIS FITS INTO THE FULL PIPELINE
# ------------------------------------------------------------------ #
# The correct order when building your full tokenizer:
#
# 1. build_tokenizer() <- sets up model + pre-tokenizer + decoder
# 2. train_from_iterator() <- trains BPE, assigns real vocab IDs
# 3. add_post_processor() <- NOW we can add post-processor (needs real IDs)
# 4. tokenizer.save() <- saves everything including post-processor
#
# Loading later:
# tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
# <- post-processor is automatically restored, no extra steps
if __name__ == "__main__":
    import sys

    # Usage: python post_processor.py [path/to/tokenizer.json]
    # With no argument, the default tokenizer file name is used.
    cli_args = sys.argv[1:]
    path = cli_args[0] if cli_args else "fineweb_edu_tokenizer.json"
    print(f"Loading tokenizer from: {path}")

    # Load the trained tokenizer from disk, attach the post-processor,
    # then run the verification report against it.
    tokenizer = add_post_processor(Tokenizer.from_file(path))
    verify_post_processor(tokenizer)

    # Write back to the same file so the post-processor is saved with it.
    tokenizer.save(path)
    print(f"\nTokenizer re-saved with post-processor to: {path}")