sllm / tokenizer /post_processor.py
geeteshcodes's picture
Initial commit
7f974df verified
raw
history blame
5.2 kB
from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer
# ------------------------------------------------------------------ #
# POST-PROCESSOR
# Runs after BPE encoding and appends <|endoftext|> to every sequence
# ------------------------------------------------------------------ #
def add_post_processor(tokenizer: Tokenizer) -> Tokenizer:
    """
    Attach a post-processor that appends <|endoftext|> to every encoded sequence.

    Must be called AFTER training because the template needs the real
    token ID of <|endoftext|> from the trained vocab.

    Args:
        tokenizer: a trained Tokenizer object whose vocab contains <|endoftext|>

    Returns:
        The same tokenizer object, with its post_processor attribute set.

    Raises:
        ValueError: if <|endoftext|> is not present in the tokenizer's vocab.
    """
    eot_token = "<|endoftext|>"

    # Get the real ID from the trained vocab.
    # This is why we can only do this after training.
    eot_id = tokenizer.token_to_id(eot_token)
    if eot_id is None:
        raise ValueError(
            "<|endoftext|> not found in vocab. "
            "Make sure the tokenizer is trained before adding post-processor."
        )

    # TemplateProcessing defines the final sequence structure
    # using a simple template syntax:
    #
    #   $A                 -> the encoded sequence (single sequence)
    #   $A $B              -> two sequences (for pair tasks like QA)
    #   <|endoftext|>:N    -> insert this special token with TYPE id N
    #                         (the vocab ID is supplied via special_tokens)
    #
    # Our template:
    #   single : [tokens...] <|endoftext|>
    #   pair   : [tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>
    #
    # The pair template handles future use cases like question-context
    # pairs without needing to change the tokenizer. The terminator that
    # follows sequence B carries type id 1 so it belongs to the same
    # segment as the tokens it terminates.
    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {eot_token}:0",
        pair=f"$A {eot_token}:0 $B:1 {eot_token}:1",
        special_tokens=[
            (eot_token, eot_id),
        ],
    )
    print(f"Post-processor added: <|endoftext|> (ID: {eot_id}) appended to sequences")
    return tokenizer
# ------------------------------------------------------------------ #
# VERIFICATION
# ------------------------------------------------------------------ #
def verify_post_processor(tokenizer: Tokenizer) -> bool:
    """
    Verify that the post-processor is working and print a report.

    Encodes a few sample texts and checks that the last token of each
    encoding is <|endoftext|>, then encodes one sequence pair and checks
    that exactly two <|endoftext|> tokens are present.

    Args:
        tokenizer: a Tokenizer that should already have the post-processor attached

    Returns:
        True if every single-sequence check AND the pair check passed,
        False otherwise.
    """
    eot_token = "<|endoftext|>"
    eot_id = tokenizer.token_to_id(eot_token)

    print("\n" + "=" * 60)
    print(" POST-PROCESSOR VERIFICATION")
    print("=" * 60 + "\n")

    test_cases = [
        # Single documents
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm.",
        # Short edge cases
        "Hi.",
        "42",
    ]

    all_passed = True
    for text in test_cases:
        encoded = tokenizer.encode(text)
        # An empty encoding has no last token; report it as a failure
        # instead of crashing with IndexError.
        last_token = encoded.tokens[-1] if encoded.tokens else None
        last_id = encoded.ids[-1] if encoded.ids else None
        passed = last_token == eot_token and last_id == eot_id
        if not passed:
            all_passed = False
        status = "PASS" if passed else "FAIL"
        print(f"[{status}] {repr(text)}")
        print(f" tokens : {encoded.tokens}")
        print(f" last : {last_token!r} (ID: {last_id})")
        print()

    # Verify pair encoding
    encoded_pair = tokenizer.encode("question here", "answer here")
    eot_positions = [
        position
        for position, token_id in enumerate(encoded_pair.ids)
        if token_id == eot_id
    ]
    pair_passed = len(eot_positions) == 2
    # The pair check counts towards the overall result too; otherwise the
    # summary line could claim success after a pair failure.
    all_passed = all_passed and pair_passed

    print("Pair encoding test:")
    print(f" tokens : {encoded_pair.tokens}")
    print(f" eot positions: {eot_positions}")
    print(" expected : 2 eot tokens (one after each sequence)")
    print(f" [{'PASS' if pair_passed else 'FAIL'}]")
    print(f"\nAll tests passed: {all_passed}")
    return all_passed
# ------------------------------------------------------------------ #
# HOW THIS FITS INTO THE FULL PIPELINE
# ------------------------------------------------------------------ #
# The correct order when building your full tokenizer:
#
# 1. build_tokenizer() <- sets up model + pre-tokenizer + decoder
# 2. train_from_iterator() <- trains BPE, assigns real vocab IDs
# 3. add_post_processor() <- NOW we can add post-processor (needs real IDs)
# 4. tokenizer.save() <- saves everything including post-processor
#
# Loading later:
# tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
# <- post-processor is automatically restored, no extra steps
if __name__ == "__main__":
    import sys

    # Usage: python post_processor.py [path/to/tokenizer.json]
    # When no path is given, the default file name below is used.
    cli_args = sys.argv[1:]
    path = cli_args[0] if cli_args else "fineweb_edu_tokenizer.json"
    print(f"Loading tokenizer from: {path}")

    # Load the trained tokenizer, attach the post-processor, then check it.
    tokenizer = add_post_processor(Tokenizer.from_file(path))
    verify_post_processor(tokenizer)

    # Write back to the same file so the post-processor is persisted.
    tokenizer.save(path)
    print(f"\nTokenizer re-saved with post-processor to: {path}")