File size: 5,203 Bytes
7f974df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from tokenizers.processors import TemplateProcessing
from tokenizers import Tokenizer


# ------------------------------------------------------------------ #
#  POST-PROCESSOR
#  Runs after BPE encoding, appends <|endoftext|> to every sequence
# ------------------------------------------------------------------ #

def add_post_processor(tokenizer: Tokenizer) -> Tokenizer:
    """
    Attach a post-processor that appends <|endoftext|> to every encoded sequence.

    Must be called AFTER training, because the template has to be bound to
    the ID that <|endoftext|> actually received in the trained vocab.

    Args:
        tokenizer: a trained Tokenizer object (its post_processor attribute
            is overwritten in place)

    Returns:
        The same tokenizer object, with the post-processor attached

    Raises:
        ValueError: if <|endoftext|> is not present in the tokenizer's vocab
    """

    # Look up the real ID from the trained vocab
    # This is why we can only do this after training
    eot_id = tokenizer.token_to_id("<|endoftext|>")

    if eot_id is None:
        raise ValueError(
            "<|endoftext|> not found in vocab. "
            "Make sure the tokenizer is trained before adding post-processor."
        )

    # TemplateProcessing defines the final sequence structure
    # using a simple template syntax:
    #
    #   $A                -> the first (or only) encoded sequence
    #   $B                -> the second sequence (for pair tasks like QA)
    #   <|endoftext|>     -> a special token, inserted literally
    #   :N suffix         -> the TYPE id (segment id) given to that piece.
    #                        It is NOT the vocab ID; the vocab ID is supplied
    #                        separately through `special_tokens` below.
    #
    # Our template:
    #   single   : [tokens...] <|endoftext|>
    #   pair     : [tokens_A...] <|endoftext|> [tokens_B...] <|endoftext|>
    #
    # Each <|endoftext|> carries the type id of the sequence it terminates,
    # so the one that closes $B:1 is tagged :1 as well.
    #
    # pair template handles future use cases like
    # question-context pairs without needing to change the tokenizer

    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>:0",
        pair="$A <|endoftext|>:0 $B:1 <|endoftext|>:1",
        special_tokens=[
            ("<|endoftext|>", eot_id),
        ],
    )

    print(f"Post-processor added: <|endoftext|> (ID: {eot_id}) appended to sequences")

    return tokenizer


# ------------------------------------------------------------------ #
#  VERIFICATION
# ------------------------------------------------------------------ #

def verify_post_processor(tokenizer: Tokenizer) -> bool:
    """
    Verify that the post-processor is working correctly.

    Encodes a few sample texts and checks that each encoding ends with
    <|endoftext|> (both the token string and its vocab ID). Then encodes
    one sequence pair and checks that it contains exactly two
    <|endoftext|> tokens, the last of which ends the encoding.
    A report is printed to stdout.

    Args:
        tokenizer: a Tokenizer whose post-processor should already be attached

    Returns:
        True if every single-sequence check AND the pair check passed,
        False otherwise
    """

    eot_token = "<|endoftext|>"
    eot_id    = tokenizer.token_to_id(eot_token)

    print("\n" + "="*60)
    print("  POST-PROCESSOR VERIFICATION")
    print("="*60 + "\n")

    test_cases = [
        # Single documents
        "The mitochondria is the powerhouse of the cell.",
        "CO2 levels rose by 1.5e-3 ppm.",
        # Short edge cases
        "Hi.",
        "42",
    ]

    all_passed = True

    for text in test_cases:
        encoded     = tokenizer.encode(text)
        # An empty encoding is reported as FAIL instead of raising IndexError
        last_token  = encoded.tokens[-1] if encoded.tokens else None
        last_id     = encoded.ids[-1] if encoded.ids else None
        passed      = last_token == eot_token and last_id == eot_id

        if not passed:
            all_passed = False

        status = "PASS" if passed else "FAIL"
        print(f"[{status}] {repr(text)}")
        print(f"       tokens : {encoded.tokens}")
        print(f"       last   : {last_token!r} (ID: {last_id})")
        print()

    # Verify pair encoding
    encoded_pair  = tokenizer.encode("question here", "answer here")
    pair_ids      = encoded_pair.ids
    eot_positions = [
        pos for pos, token_id in enumerate(pair_ids) if token_id == eot_id
    ]

    # Two terminators are expected, and the second one must be the very last
    # token; counting alone would accept a misplaced <|endoftext|>.
    pair_passed = (
        len(eot_positions) == 2
        and eot_positions[-1] == len(pair_ids) - 1
    )

    # The pair result counts toward the overall verdict too
    if not pair_passed:
        all_passed = False

    print("Pair encoding test:")
    print(f"  tokens      : {encoded_pair.tokens}")
    print(f"  eot positions: {eot_positions}")
    print("  expected     : 2 eot tokens (one after each sequence)")
    print(f"  [{'PASS' if pair_passed else 'FAIL'}]")

    print(f"\nAll tests passed: {all_passed}")

    return all_passed


# ------------------------------------------------------------------ #
#  HOW THIS FITS INTO THE FULL PIPELINE
# ------------------------------------------------------------------ #

# The correct order when building your full tokenizer:
#
#   1. build_tokenizer()       <- sets up model + pre-tokenizer + decoder
#   2. train_from_iterator()   <- trains BPE, assigns real vocab IDs
#   3. add_post_processor()    <- NOW we can add post-processor (needs real IDs)
#   4. tokenizer.save()        <- saves everything including post-processor
#
# Loading later:
#   tokenizer = Tokenizer.from_file("fineweb_edu_tokenizer.json")
#   <- post-processor is automatically restored, no extra steps


if __name__ == "__main__":
    import sys

    # Usage: python post_processor.py [path/to/tokenizer.json]
    # With no argument, the default tokenizer file name is used.
    cli_args = sys.argv[1:]
    tokenizer_path = cli_args[0] if cli_args else "fineweb_edu_tokenizer.json"

    print(f"Loading tokenizer from: {tokenizer_path}")

    # Load the trained tokenizer, attach the post-processor, then check it
    trained = add_post_processor(Tokenizer.from_file(tokenizer_path))
    verify_post_processor(trained)

    # Write back to the same file so the post-processor is stored with it
    trained.save(tokenizer_path)
    print(f"\nTokenizer re-saved with post-processor to: {tokenizer_path}")