sllm / tokenizer /post_processor.py

Initial commit

7f974df verified 4 days ago

5.2 kB

	from tokenizers.processors import TemplateProcessing
	from tokenizers import Tokenizer


	# ------------------------------------------------------------------ #
	# POST-PROCESSOR
	# Runs after BPE encoding and appends <|endoftext|> to every sequence
	# ------------------------------------------------------------------ #

	def add_post_processor(tokenizer: Tokenizer) -> Tokenizer:
	    """Attach a post-processor that appends <|endoftext|> to each sequence.

	    Must be called AFTER training, because the template needs the real
	    token ID that the trained vocab assigned to <|endoftext|>.

	    Args:
	        tokenizer: a trained Tokenizer object.

	    Returns:
	        The same tokenizer object, with ``post_processor`` set.

	    Raises:
	        ValueError: if <|endoftext|> is not present in the vocab.
	    """
	    # Keep the special token in one place and spell it without any
	    # backslash escapes: an escaped spelling is a different string and
	    # would never match the entry in the vocab.
	    eot_token = "<|endoftext|>"

	    # The ID only exists once the vocab has been trained.
	    eot_id = tokenizer.token_to_id(eot_token)
	    if eot_id is None:
	        raise ValueError(
	            f"{eot_token} not found in vocab. "
	            "Make sure the tokenizer is trained before adding post-processor."
	        )

	    # TemplateProcessing describes the final sequence layout:
	    #
	    #   $A, $B             -> the first / second encoded sequence
	    #   <|endoftext|>:N    -> insert the special token with type id N
	    #                         (its vocab ID comes from special_tokens)
	    #
	    # Layouts produced here:
	    #   single : [tokens...] <|endoftext|>
	    #   pair   : [tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>
	    #
	    # The pair template covers future uses such as question-context
	    # pairs without having to change the tokenizer again.
	    #
	    # NOTE(review): the trailing <|endoftext|> after $B carries type id 0
	    # while $B itself is type id 1 -- confirm this is intended.
	    tokenizer.post_processor = TemplateProcessing(
	        single=f"$A {eot_token}:0",
	        pair=f"$A {eot_token}:0 $B:1 {eot_token}:0",
	        special_tokens=[(eot_token, eot_id)],
	    )

	    print(f"Post-processor added: {eot_token} (ID: {eot_id}) appended to sequences")

	    return tokenizer


	# ------------------------------------------------------------------ #
	# VERIFICATION
	# ------------------------------------------------------------------ #

	def verify_post_processor(tokenizer: Tokenizer) -> bool:
	    """Print a verification report for the <|endoftext|> post-processor.

	    Encodes a few sample texts and checks that each encoding ends with
	    <|endoftext|>, then encodes one sequence pair and checks that it
	    contains exactly two <|endoftext|> tokens.

	    Args:
	        tokenizer: the tokenizer whose post-processor should be checked.

	    Returns:
	        True only if every single-sequence check and the pair check pass.
	    """
	    # Spelled without backslash escapes so it matches the vocab entry.
	    eot_token = "<|endoftext|>"
	    eot_id = tokenizer.token_to_id(eot_token)

	    print("\n" + "=" * 60)
	    print(" POST-PROCESSOR VERIFICATION")
	    print("=" * 60 + "\n")

	    test_cases = [
	        # Single documents
	        "The mitochondria is the powerhouse of the cell.",
	        "CO2 levels rose by 1.5e-3 ppm.",
	        # Short edge cases
	        "Hi.",
	        "42",
	    ]

	    all_passed = True

	    for text in test_cases:
	        encoded = tokenizer.encode(text)
	        # An empty encoding has no last token; report it as a FAIL
	        # instead of raising IndexError.
	        last_token = encoded.tokens[-1] if encoded.tokens else None
	        last_id = encoded.ids[-1] if encoded.ids else None
	        passed = last_token == eot_token and last_id == eot_id

	        if not passed:
	            all_passed = False

	        status = "PASS" if passed else "FAIL"
	        print(f"[{status}] {repr(text)}")
	        print(f" tokens : {encoded.tokens}")
	        print(f" last : {last_token!r} (ID: {last_id})")
	        print()

	    # Verify pair encoding: one <|endoftext|> is expected after each sequence.
	    encoded_pair = tokenizer.encode("question here", "answer here")
	    eot_positions = [
	        pos for pos, token_id in enumerate(encoded_pair.ids) if token_id == eot_id
	    ]
	    pair_passed = len(eot_positions) == 2
	    # The pair result must count towards the overall verdict too.
	    all_passed = all_passed and pair_passed

	    print("Pair encoding test:")
	    print(f" tokens : {encoded_pair.tokens}")
	    print(f" eot positions: {eot_positions}")
	    print(" expected : 2 eot tokens (one after each sequence)")
	    print(f" [{'PASS' if pair_passed else 'FAIL'}]")

	    print(f"\nAll tests passed: {all_passed}")
	    return all_passed


	# ------------------------------------------------------------------ #
	# HOW THIS FITS INTO THE FULL PIPELINE
	# ------------------------------------------------------------------ #

	# The correct order when building your full tokenizer:
	#
	# 1. build_tokenizer() <- sets up model + pre-tokenizer + decoder
	# 2. train_from_iterator() <- trains BPE, assigns real vocab IDs
	# 3. add_post_processor() <- NOW we can add post-processor (needs real IDs)
	# 4. tokenizer.save() <- saves everything including post-processor
	#
	# Loading later:
	# tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
	# <- post-processor is automatically restored, no extra steps


	if __name__ == "__main__":
	    import sys

	    # Usage: python post_processor.py fineweb_edu_tokenizer.json
	    # When no path is given on the command line, the default file name
	    # below is used instead.
	    cli_args = sys.argv[1:]
	    path = cli_args[0] if cli_args else "fineweb_edu_tokenizer.json"

	    print(f"Loading tokenizer from: {path}")
	    tokenizer = add_post_processor(Tokenizer.from_file(path))
	    verify_post_processor(tokenizer)

	    # Write the tokenizer, post-processor included, back to the same file.
	    tokenizer.save(path)
	    print(f"\nTokenizer re-saved with post-processor to: {path}")