File size: 5,203 Bytes

7f974df

from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer


# ------------------------------------------------------------------ #
#  POST-PROCESSOR
#  Runs after BPE encoding, appends <|endoftext|> to every sequence
# ------------------------------------------------------------------ #

def add_post_processor(
    tokenizer: Tokenizer,
    eot_token: str = "<|endoftext|>",
) -> Tokenizer:
    """
    Attach a post-processor that appends an end-of-text token to every
    encoded sequence.

    Must be called AFTER training, because the template has to be bound
    to the real ID that the trained vocab assigned to the token.

    Args:
        tokenizer: a trained Tokenizer object; its post_processor
            attribute is replaced in place.
        eot_token: the special token to append. It is interpolated into
            a space-separated template that uses ":" as the type-ID
            separator, so it should contain neither whitespace nor ":".
            Defaults to "<|endoftext|>".

    Returns:
        The same tokenizer with the post-processor attached.

    Raises:
        ValueError: if eot_token is not present in the tokenizer's vocab.
    """
    # Look the ID up in the trained vocab rather than hard-coding it;
    # this is why the function can only run after training.
    eot_id = tokenizer.token_to_id(eot_token)

    if eot_id is None:
        raise ValueError(
            f"{eot_token} not found in vocab. "
            "Make sure the tokenizer is trained before adding post-processor."
        )

    # TemplateProcessing defines the final sequence structure using a
    # simple template syntax:
    #
    #   $A          -> the first encoded sequence
    #   $B          -> the second encoded sequence (pair tasks like QA)
    #   <token>     -> a special token, resolved through `special_tokens`
    #   <piece>:N   -> N is the TYPE ID (segment ID) given to that piece;
    #                  it is NOT the vocab ID of the token
    #
    # Our template:
    #   single : [tokens...] <eot>
    #   pair   : [tokens_A...] <eot> [tokens_B...] <eot>
    #
    # The trailing end-of-text token closes sequence B, so it carries
    # type ID 1 just like $B; everything belonging to sequence A is 0.
    #
    # The pair template covers future use cases such as question-context
    # pairs without needing to change the tokenizer.
    tokenizer.post_processor = TemplateProcessing(
        single=f"$A {eot_token}:0",
        pair=f"$A {eot_token}:0 $B:1 {eot_token}:1",
        special_tokens=[
            (eot_token, eot_id),
        ],
    )

    print(f"Post-processor added: {eot_token} (ID: {eot_id}) appended to sequences")

    return tokenizer


# ------------------------------------------------------------------ #
#  VERIFICATION
# ------------------------------------------------------------------ #

def verify_post_processor(tokenizer: Tokenizer) -> bool:
    """
    Verify that the post-processor is working correctly.

    Encodes a few sample texts and checks that each encoding ends with
    <|endoftext|>, then encodes one sequence pair and checks that it
    contains exactly two <|endoftext|> tokens, the second of which closes
    the encoding. A report is printed to stdout.

    Args:
        tokenizer: a Tokenizer that is expected to have the
            post-processor attached

    Returns:
        True if every single-sequence check AND the pair check passed,
        False otherwise.
    """
    eot_token = "<|endoftext|>"
    eot_id = tokenizer.token_to_id(eot_token)

    print("\n" + "=" * 60)
    print("  POST-PROCESSOR VERIFICATION")
    print("=" * 60 + "\n")

    test_cases = [
        # Single documents
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm.",
        # Short edge cases
        "Hi.",
        "42",
    ]

    all_passed = True

    for text in test_cases:
        encoded = tokenizer.encode(text)
        # Guard against an empty encoding so a broken tokenizer is
        # reported as FAIL instead of crashing with IndexError.
        last_token = encoded.tokens[-1] if encoded.tokens else None
        last_id = encoded.ids[-1] if encoded.ids else None
        # eot_id is None when the token is missing from the vocab; that
        # must never count as a pass.
        passed = (
            eot_id is not None
            and last_token == eot_token
            and last_id == eot_id
        )

        if not passed:
            all_passed = False

        status = "PASS" if passed else "FAIL"
        print(f"[{status}] {repr(text)}")
        print(f"       tokens : {encoded.tokens}")
        print(f"       last   : {last_token!r} (ID: {last_id})")
        print()

    # Verify pair encoding: one end-of-text token after each sequence,
    # so exactly two in total and the second one in the final position.
    encoded_pair = tokenizer.encode("question here", "answer here")
    pair_ids = encoded_pair.ids
    eot_positions = [
        pos for pos, token_id in enumerate(pair_ids) if token_id == eot_id
    ]
    pair_passed = (
        len(eot_positions) == 2 and eot_positions[-1] == len(pair_ids) - 1
    )

    # The pair result is part of the overall verdict; without this the
    # summary line could report success while the pair check failed.
    if not pair_passed:
        all_passed = False

    print("Pair encoding test:")
    print(f"  tokens      : {encoded_pair.tokens}")
    print(f"  eot positions: {eot_positions}")
    print("  expected     : 2 eot tokens (one after each sequence)")
    print(f"  [{'PASS' if pair_passed else 'FAIL'}]")

    print(f"\nAll tests passed: {all_passed}")

    return all_passed


# ------------------------------------------------------------------ #
#  HOW THIS FITS INTO THE FULL PIPELINE
# ------------------------------------------------------------------ #

# The correct order when building your full tokenizer:
#
#   1. build_tokenizer()       <- sets up model + pre-tokenizer + decoder
#   2. train_from_iterator()   <- trains BPE, assigns real vocab IDs
#   3. add_post_processor()    <- NOW we can add post-processor (needs real IDs)
#   4. tokenizer.save()        <- saves everything including post-processor
#
# Loading later:
#   tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
#   <- post-processor is automatically restored, no extra steps


if __name__ == "__main__":
    import sys

    # Usage: python post_processor.py [path/to/tokenizer.json]
    # When no path is given, the default file name below is used.
    default_path = "fineweb_edu_tokenizer.json"
    cli_args = sys.argv[1:]
    path = cli_args[0] if cli_args else default_path

    print(f"Loading tokenizer from: {path}")

    # Load the trained tokenizer from disk, attach the post-processor,
    # then print the verification report for it.
    tokenizer = add_post_processor(Tokenizer.from_file(path))
    verify_post_processor(tokenizer)

    # Write back to the same file so the post-processor is saved with it.
    tokenizer.save(path)
    print(f"\nTokenizer re-saved with post-processor to: {path}")