from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer


# ------------------------------------------------------------------ #
#  POST-PROCESSOR
#  Runs after BPE encoding, appends <|endoftext|> to every sequence
# ------------------------------------------------------------------ #

def add_post_processor(tokenizer: Tokenizer) -> Tokenizer:
    """
    Attach a post-processor that appends <|endoftext|> to every
    encoded sequence.

    Must be called AFTER training because the template needs the real
    token ID of <|endoftext|> from the trained vocab.

    Args:
        tokenizer: a trained Tokenizer object

    Returns:
        The same tokenizer object (mutated in place) with the
        post-processor attached

    Raises:
        ValueError: if <|endoftext|> is not present in the vocab
    """
    # Get the real ID from the trained vocab.
    # This is why we can only do this after training.
    eot_id = tokenizer.token_to_id("<|endoftext|>")

    if eot_id is None:
        raise ValueError(
            "<|endoftext|> not found in vocab. "
            "Make sure the tokenizer is trained before adding post-processor."
        )

    # TemplateProcessing defines the final sequence structure
    # using a simple template syntax:
    #
    #   $A                 -> the encoded sequence (single sequence)
    #   $A $B              -> two sequences (for pair tasks like QA)
    #   <|endoftext|>:N    -> insert this special token with TYPE id N
    #                         (the segment id, not the vocab id; the vocab
    #                         id is supplied through `special_tokens`)
    #
    # Our template:
    #   single : [tokens...] <|endoftext|>
    #   pair   : [tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>
    #
    # The pair template handles future use cases like question-context
    # pairs without needing to change the tokenizer. The trailing
    # <|endoftext|> closes sequence B, so it carries type id 1 like $B;
    # this keeps type ids monotonic (0...0 1...1).
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>:0",
        pair="$A <|endoftext|>:0 $B:1 <|endoftext|>:1",
        special_tokens=[
            ("<|endoftext|>", eot_id),
        ],
    )

    print(f"Post-processor added: <|endoftext|> (ID: {eot_id}) appended to sequences")
    return tokenizer


# ------------------------------------------------------------------ #
#  VERIFICATION
# ------------------------------------------------------------------ #

def verify_post_processor(tokenizer: Tokenizer) -> bool:
    """
    Verify that the post-processor is working correctly.

    Encodes a few single sequences and one sequence pair, printing a
    PASS/FAIL report for each. Single sequences must end with
    <|endoftext|>; the pair must contain exactly two <|endoftext|>
    tokens, the second of which is the final token.

    Args:
        tokenizer: a Tokenizer with the post-processor attached

    Returns:
        True if every check (single sequences AND the pair) passed,
        False otherwise
    """
    eot_token = "<|endoftext|>"
    eot_id = tokenizer.token_to_id(eot_token)

    print("\n" + "=" * 60)
    print("  POST-PROCESSOR VERIFICATION")
    print("=" * 60 + "\n")

    test_cases = [
        # Single documents
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm.",
        # Short edge cases
        "Hi.",
        "42",
    ]

    all_passed = True
    for text in test_cases:
        encoded = tokenizer.encode(text)
        # An empty encoding is reported as a FAIL instead of crashing
        # the whole verification run with an IndexError.
        last_token = encoded.tokens[-1] if encoded.tokens else None
        last_id = encoded.ids[-1] if encoded.ids else None
        passed = last_token == eot_token and last_id == eot_id

        if not passed:
            all_passed = False

        status = "PASS" if passed else "FAIL"
        print(f"[{status}] {repr(text)}")
        print(f"       tokens : {encoded.tokens}")
        print(f"       last   : {last_token!r} (ID: {last_id})")
        print()

    # Verify pair encoding
    encoded_pair = tokenizer.encode("question here", "answer here")
    pair_ids = encoded_pair.ids
    eot_positions = [
        pos for pos, token_id in enumerate(pair_ids) if token_id == eot_id
    ]
    # Two EOT tokens are expected, and the second one must close the pair.
    pair_passed = (
        len(eot_positions) == 2 and eot_positions[-1] == len(pair_ids) - 1
    )
    if not pair_passed:
        # The pair result must count toward the overall summary too.
        all_passed = False

    print("Pair encoding test:")
    print(f"  tokens       : {encoded_pair.tokens}")
    print(f"  eot positions: {eot_positions}")
    print(f"  expected     : 2 eot tokens (one after each sequence)")
    print(f"  [{'PASS' if pair_passed else 'FAIL'}]")

    print(f"\nAll tests passed: {all_passed}")
    return all_passed


# ------------------------------------------------------------------ #
#  HOW THIS FITS INTO THE FULL PIPELINE
# ------------------------------------------------------------------ #
#
# The correct order when building your full tokenizer:
#
#   1. build_tokenizer()          <- sets up model + pre-tokenizer + decoder
#   2. train_from_iterator()      <- trains BPE, assigns real vocab IDs
#   3. add_post_processor()       <- NOW we can add post-processor (needs real IDs)
#   4. tokenizer.save()           <- saves everything including post-processor
#
# Loading later:
#   tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
#   # <- post-processor is automatically restored, no extra steps


if __name__ == "__main__":
    import sys

    # Load a trained tokenizer from disk to test.
    # Pass the path as argument: python post_processor.py fineweb_edu_tokenizer.json
    # Or it will try the default path.
    path = sys.argv[1] if len(sys.argv) > 1 else "fineweb_edu_tokenizer.json"

    print(f"Loading tokenizer from: {path}")
    tokenizer = Tokenizer.from_file(path)

    tokenizer = add_post_processor(tokenizer)

    # Never overwrite the tokenizer on disk with one that failed
    # verification; exit non-zero so calling scripts notice.
    if not verify_post_processor(tokenizer):
        print(
            f"\nVerification failed; tokenizer NOT re-saved to: {path}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Save with post-processor included
    tokenizer.save(path)
    print(f"\nTokenizer re-saved with post-processor to: {path}")