| from tokenizers.processors import TemplateProcessing
|
| from tokenizers import Tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_post_processor(
    tokenizer: Tokenizer,
    *,
    eot_token: str = "<|endoftext|>",
) -> Tokenizer:
    """Attach a post-processor that appends an end-of-text token.

    After this call, single sequences are encoded as ``A <eot>`` and
    sequence pairs as ``A <eot> B <eot>``.

    Must be called AFTER training, because the template needs the real
    token ID that the trained vocab assigned to the end-of-text token.

    Args:
        tokenizer: A trained Tokenizer object. Its ``post_processor``
            attribute is overwritten in place.
        eot_token: The end-of-text token to append. Defaults to
            "<|endoftext|>". It is interpolated into a space-separated
            template whose pieces use ":" to introduce a type ID, so it
            is assumed to contain neither whitespace nor ":".

    Returns:
        The same tokenizer with the post-processor attached.

    Raises:
        ValueError: If ``eot_token`` is not found in the tokenizer's vocab.
    """
    eot_id = tokenizer.token_to_id(eot_token)
    if eot_id is None:
        raise ValueError(
            f"{eot_token} not found in vocab. "
            "Make sure the tokenizer is trained before adding post-processor."
        )

    # "$A" / "$B" stand for the first / second input sequence; the ":N"
    # suffix is the type ID given to that piece of the output.
    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {eot_token}:0",
        pair=f"$A {eot_token}:0 $B:1 {eot_token}:0",
        special_tokens=[(eot_token, eot_id)],
    )

    print(f"Post-processor added: {eot_token} (ID: {eot_id}) appended to sequences")
    return tokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
def verify_post_processor(
    tokenizer: Tokenizer,
    *,
    eot_token: str = "<|endoftext|>",
) -> bool:
    """Check that encodings end with the end-of-text token.

    Encodes a few sample strings and confirms that the last token and
    last ID of each encoding are the end-of-text token, then encodes one
    sequence pair and confirms the end-of-text ID occurs exactly twice.
    A PASS/FAIL report is printed to stdout.

    Args:
        tokenizer: The tokenizer whose post-processor is being checked.
        eot_token: The end-of-text token expected at the end of every
            sequence. Defaults to "<|endoftext|>".

    Returns:
        True if every single-sequence check AND the pair check passed,
        False otherwise.
    """
    eot_id = tokenizer.token_to_id(eot_token)

    print("\n" + "=" * 60)
    print(" POST-PROCESSOR VERIFICATION")
    print("=" * 60 + "\n")

    test_cases = [
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm.",
        "Hi.",
        "42",
    ]

    all_passed = True

    for text in test_cases:
        encoded = tokenizer.encode(text)
        # An empty encoding has no last element; report it as a failure
        # rather than letting the [-1] lookup raise IndexError.
        last_token = encoded.tokens[-1] if encoded.tokens else None
        last_id = encoded.ids[-1] if encoded.ids else None
        # The explicit None guard keeps "token missing from vocab" plus
        # "empty encoding" from comparing None == None and passing.
        passed = (
            eot_id is not None
            and last_token == eot_token
            and last_id == eot_id
        )

        if not passed:
            all_passed = False

        status = "PASS" if passed else "FAIL"
        print(f"[{status}] {repr(text)}")
        print(f" tokens : {encoded.tokens}")
        print(f" last : {last_token!r} (ID: {last_id})")
        print()

    encoded_pair = tokenizer.encode("question here", "answer here")
    eot_positions = [
        pos for pos, token_id in enumerate(encoded_pair.ids) if token_id == eot_id
    ]
    pair_passed = len(eot_positions) == 2
    # The pair check counts toward the overall verdict too; otherwise the
    # summary line could claim success right after printing a FAIL.
    all_passed = all_passed and pair_passed

    print("Pair encoding test:")
    print(f" tokens : {encoded_pair.tokens}")
    print(f" eot positions: {eot_positions}")
    print(" expected : 2 eot tokens (one after each sequence)")
    print(f" [{'PASS' if pair_passed else 'FAIL'}]")

    print(f"\nAll tests passed: {all_passed}")
    return all_passed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # The tokenizer file is taken from the first command-line argument
    # when one is given; otherwise the default file name is used.
    cli_args = sys.argv[1:]
    path = cli_args[0] if cli_args else "fineweb_edu_tokenizer.json"

    print(f"Loading tokenizer from: {path}")
    tokenizer = add_post_processor(Tokenizer.from_file(path))
    verify_post_processor(tokenizer)

    # The tokenizer is written back over the file it was loaded from.
    # NOTE: this happens regardless of what the verification reported.
    tokenizer.save(path)
    print(f"\nTokenizer re-saved with post-processor to: {path}")