File size: 5,950 Bytes
7f974df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import re
from tokenizers.pre_tokenizers import PreTokenizer, Split
from tokenizers import Regex

#  Each category is defined separately so it's easy to understand, modify, or debug individually


# 1. Contractions
#    Matches: 's  't  're  've  'll  'm  'd
#    Example: "don't" -> ["don", "'t"]
#    The trailing \b requires the suffix to end at a word boundary, so an
#    opening quote is not glued to the first letter of a quoted word:
#             "'data'" -> ["'", "data", "'"]   (not ["'d", "ata", "'"])
CONTRACTIONS = r"'(?:s|t|re|ve|ll|m|d)\b"

# 2. Abbreviations
#    Matches: dot-separated letters, optional trailing dot
#    Example: "U.S.A" -> ["U.S.A"]
#             "e.g."  -> ["e.g."]
#             "Ph.D"  -> ["Ph.D"]
#    Leading \b  = word boundary, ensures we don't start matching inside a word
#    {1,2}       = the first segment may be two letters (needed for "Ph.D");
#                  later segments stay single letters to keep the pattern narrow
#    Trailing \b = the last letter must end a word, so attribute access such as
#                  "x.append" is not split into ["x.a", "ppend"]
ABBREVIATIONS = r"\b[A-Za-z]{1,2}(?:\.[A-Za-z])+\b\.?"

# 3. Scientific Notation
#    Matches: number, optional decimal, e/E, optional sign, exponent
#    Example: "1.5e-3"  -> ["1.5e-3"]
#             "3e10"    -> ["3e10"]
#             "2.0E+4"  -> ["2.0E+4"]
#    Must come BEFORE decimals otherwise "1.5" in "1.5e-3" matches first
SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"

# 4. Decimal Numbers
#    Matches: digits, dot, digits
#    Example: "3.14"  -> ["3.14"]
#             "0.001" -> ["0.001"]
#    Must come BEFORE integers otherwise "3" in "3.14" matches first
DECIMALS = r"\d+\.\d+"

# 5. Integers
#    Matches: any sequence of digits
#    Example: "42"   -> ["42"]
#             "1984" -> ["1984"]
#    Comes last among numbers since scientific and decimal match first
INTEGERS = r"\d+"

# 6. Multi-character Operators
#    Matches: common programming operators that are 2 characters
#    Example: "==" -> ["=="]   "!=" -> ["!="]
#             "->" -> ["->"]   "+=" -> ["+="]
#    Must come BEFORE single punctuation catch-all
#    [-+*/]= matches +=, -=, *=, /= in one pattern
OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

# 7. Snake Case Identifiers
#    Matches: words that contain at least one underscore (code identifiers)
#    Example: "snake_case"  -> ["snake_case"]
#             "var_name_2"  -> ["var_name_2"]
#             "_private"    -> ["_private"]
#    Shape: optional ASCII letter/digit head, the first underscore, then any
#    further word characters.
#    The underscore is mandatory because this pattern is tried BEFORE regular
#    words: an ASCII-only pattern without it would claim the "caf" of "café"
#    and leave "é" behind as a separate token.
SNAKE_CASE = r"(?:[A-Za-z][A-Za-z0-9]*)?_\w*"

# 8. Regular Unicode Words
#    Matches: any sequence of word characters (letters, digits, underscore)
#    \w+ in unicode mode covers non-english letters too
#    Example: "hello" -> ["hello"]
#             "café"  -> ["café"]
WORDS = r"\w+"

# 9. Whitespace
#    Newlines are matched separately from spaces/tabs
#    This preserves document structure (paragraph breaks etc.)
#    Example: "\n\n" -> ["\n\n"]  "   " -> ["   "]
WHITESPACE = r"\n+|[ \t]+"

# 10. Punctuation Catch-all
#     Matches any single non-whitespace character that nothing above caught
#     Example: "!" -> ["!"]  "@" -> ["@"]  "." -> ["."]
PUNCTUATION = r"[^\s]"

# ------------------------------------------------------------------ #
#  Combine all patterns in ORDER - at each position the first
#  alternative that matches wins
# ------------------------------------------------------------------ #

PRETOKENIZER_PATTERN = "|".join([
    CONTRACTIONS,   # 1 - most specific first
    ABBREVIATIONS,  # 2 - before plain words
    SCIENTIFIC,     # 3 - before decimals
    DECIMALS,       # 4 - before integers
    INTEGERS,       # 5
    OPERATORS,      # 6 - before single punctuation
    SNAKE_CASE,     # 7 - before plain words
    WORDS,          # 8
    WHITESPACE,     # 9
    PUNCTUATION,    # 10 - catch everything else
])


def get_pretokenizer():
    """Build the HuggingFace ``Split`` pre-tokenizer for our custom regex.

    The regex is ``PRETOKENIZER_PATTERN``, wrapped in ``tokenizers.Regex``.

    ``Split`` options used here:

    - ``pattern``  : the regex to split/match on.
    - ``behavior`` : one of
        * ``"removed"``  - split on matches and discard them
        * ``"isolated"`` - split on matches and keep them as tokens
        * ``"merged_with_previous"`` / ``"merged_with_next"``
      ``"isolated"`` is chosen because whitespace, operators, punctuation
      etc. should survive as their own tokens instead of being dropped.
    - ``invert``   : ``True``, i.e. the pattern matches are the pieces kept
      as tokens rather than being treated as split points.

    Returns:
        The configured ``Split`` pre-tokenizer instance.
    """
    token_regex = Regex(PRETOKENIZER_PATTERN)
    return Split(pattern=token_regex, behavior="isolated", invert=True)


# ------------------------------------------------------------------ #
#  Quick test - run this file directly to verify behavior
# ------------------------------------------------------------------ #

if __name__ == "__main__":
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # A bare BPE tokenizer is enough: only its pre-tokenizer is exercised here.
    demo_tokenizer = Tokenizer(BPE())
    demo_tokenizer.pre_tokenizer = get_pretokenizer()

    # (label, sample text) pairs: one per pattern category, then two mixed inputs.
    samples = [
        ("Contractions",        "don't she'll they've"),
        ("Abbreviations",       "U.S.A has a Ph.D e.g. this"),
        ("Scientific",          "the value is 1.5e-3 and 2.0E+4"),
        ("Decimals",            "pi is 3.14159 and e is 2.718"),
        ("Integers",            "there are 1000 students in 2024"),
        ("Operators",           "if x==0 or y!=1 then z+=2"),
        ("Snake case",          "my_variable and snake_case_name"),
        ("Real world",          "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
        ("Code-like",           "def my_func(x):\n    return x**2 + 1"),
    ]

    # Banner framed by a 60-character rule above and below the title.
    rule = "=" * 60
    print(f"\n{rule}")
    print("  PRE-TOKENIZER TEST")
    print(f"{rule}\n")

    for name, sample in samples:
        # pre_tokenize_str yields (string, offset) tuples; only the strings are shown.
        pairs = demo_tokenizer.pre_tokenizer.pre_tokenize_str(sample)
        pieces = [piece for piece, _ in pairs]
        print(f"[{name}]")
        print(f"  Input  : {sample!r}")
        print(f"  Tokens : {pieces}")
        print()