File size: 5,950 Bytes
7f974df | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | import re
from tokenizers.pre_tokenizers import PreTokenizer, Split
from tokenizers import Regex
# Each category is defined separately so it's easy to understand, modify, or
# debug individually. The fragments are OR-ed together at the bottom of this
# section; at every position the alternatives are tried left to right and the
# first one that matches wins, so ORDER MATTERS.
# All groups are non-capturing and only \b and lookarounds are used as anchors.
# NOTE(review): verified against Python's `re`; the same constructs are
# expected to behave identically in the engine behind `tokenizers.Regex`.

# 1. Contractions
#    Matches: 's 't 're 've 'll 'm 'd
#    Example: "don't" -> ["don", "'t"]
#    (?<=\w) : the apostrophe must directly follow a word character, so the
#              opening quote of a string literal is never treated as a suffix
#    \b      : the suffix must end there, so "'text'" is NOT split into
#              "'t" + "ext" + "'"
CONTRACTIONS = r"(?<=\w)'(?:s|t|re|ve|ll|m|d)\b"

# 2. Abbreviations
#    Matches: dot-separated segments, optional trailing dot. A segment is a
#             single letter, or a capital followed by one lowercase letter.
#    Example: "U.S.A" -> ["U.S.A"]
#             "e.g."  -> ["e.g."]
#             "Ph.D"  -> ["Ph.D"]
#    Leading \b  : ensures we don't start matching inside a word
#    Trailing \b : ensures the last segment is a complete word, so attribute
#                  access such as "x.append" stays "x", ".", "append" instead
#                  of being cut into "x.a" + "ppend"
ABBREVIATIONS = r"\b(?:[A-Z][a-z]|[A-Za-z])(?:\.(?:[A-Z][a-z]|[A-Za-z]))+\b\.?"

# 3. Scientific Notation
#    Matches: number, optional decimal part, e/E, optional sign, exponent
#    Example: "1.5e-3" -> ["1.5e-3"]
#             "3e10"   -> ["3e10"]
#             "2.0E+4" -> ["2.0E+4"]
#    Must come BEFORE decimals, otherwise "1.5" in "1.5e-3" matches first
SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"

# 4. Decimal Numbers
#    Matches: digits, dot, digits
#    Example: "3.14"  -> ["3.14"]
#             "0.001" -> ["0.001"]
#    Must come BEFORE integers, otherwise "3" in "3.14" matches first
DECIMALS = r"\d+\.\d+"

# 5. Integers
#    Matches: any sequence of digits
#    Example: "42"   -> ["42"]
#             "1984" -> ["1984"]
#    Comes last among numbers since scientific and decimal are tried first
INTEGERS = r"\d+"

# 6. Multi-character Operators
#    Matches: common two-character programming operators
#    Example: "==" -> ["=="]    "!=" -> ["!="]
#             "->" -> ["->"]    "+=" -> ["+="]
#    Must come BEFORE the single-character punctuation catch-all
#    [-+*/]= matches +=, -=, *=, /= in one pattern
OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

# 7. Snake Case Identifiers
#    Matches: a run of word characters that contains at least one underscore
#             and does not start with a digit
#    Example: "snake_case" -> ["snake_case"]
#             "var_name_2" -> ["var_name_2"]
#             "_private"   -> ["_private"]
#    The underscore is REQUIRED: without it this rule would claim every
#    ASCII-initial word ahead of WORDS and cut it at the first non-ASCII
#    letter ("café" -> "caf" + "é").
#    \w is used rather than [A-Za-z0-9_] so non-ASCII identifiers stay whole.
SNAKE_CASE = r"(?!\d)\w*_\w*"

# 8. Regular Unicode Words
#    Matches: any sequence of word characters (letters, digits, underscore)
#    \w+ in unicode mode covers non-English letters too
#    Example: "hello" -> ["hello"]
#             "café"  -> ["café"]
WORDS = r"\w+"

# 9. Whitespace
#    Newlines are matched separately from spaces/tabs
#    This preserves document structure (paragraph breaks etc.)
#    Example: "\n\n" -> ["\n\n"]    " " -> [" "]
WHITESPACE = r"\n+|[ \t]+"

# 10. Punctuation Catch-all
#     Matches any single non-whitespace character that nothing above caught
#     Example: "!" -> ["!"]    "@" -> ["@"]    "." -> ["."]
PUNCTUATION = r"[^\s]"

# ------------------------------------------------------------------ #
# Combine all patterns in ORDER - first match wins
# ------------------------------------------------------------------ #
PRETOKENIZER_PATTERN = "|".join([
    CONTRACTIONS,   # 1  - most specific first
    ABBREVIATIONS,  # 2  - before plain words
    SCIENTIFIC,     # 3  - before decimals
    DECIMALS,       # 4  - before integers
    INTEGERS,       # 5
    OPERATORS,      # 6  - before single punctuation
    SNAKE_CASE,     # 7  - before plain words
    WORDS,          # 8
    WHITESPACE,     # 9
    PUNCTUATION,    # 10 - catch everything else
])
def get_pretokenizer():
    """
    Build the HuggingFace ``Split`` pre-tokenizer driven by our custom regex.

    The regex is ``PRETOKENIZER_PATTERN``, wrapped in ``tokenizers.Regex`` so
    it is treated as a regular expression rather than a literal string.

    Split options used here:
    - pattern  : the regex to split/match on
    - behavior : "isolated" keeps every matched piece as its own token.
                 Alternatives are "removed" (discard the matches) and
                 "merged_with_previous" / "merged_with_next".
    - invert   : True, so the regex matches are the pieces we KEEP as
                 tokens instead of being treated as split points.

    "isolated" is chosen because whitespace, operators, punctuation etc.
    should survive as their own tokens rather than be thrown away.
    """
    split_regex = Regex(PRETOKENIZER_PATTERN)
    splitter = Split(pattern=split_regex, behavior="isolated", invert=True)
    return splitter
# ------------------------------------------------------------------ #
# Smoke test - execute this file as a script to eyeball the splits
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # A bare BPE tokenizer is used only as a host for the pre-tokenizer.
    demo_tokenizer = Tokenizer(BPE())
    demo_tokenizer.pre_tokenizer = get_pretokenizer()

    # label -> sample text, one entry per pattern category plus mixed inputs
    samples = {
        "Contractions": "don't she'll they've",
        "Abbreviations": "U.S.A has a Ph.D e.g. this",
        "Scientific": "the value is 1.5e-3 and 2.0E+4",
        "Decimals": "pi is 3.14159 and e is 2.718",
        "Integers": "there are 1000 students in 2024",
        "Operators": "if x==0 or y!=1 then z+=2",
        "Snake case": "my_variable and snake_case_name",
        # Mixed real-world prose
        "Real world": "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al.",
        # Code-like input
        "Code-like": "def my_func(x):\n return x**2 + 1",
    }

    print(f"\n{'='*60}")
    print(f" PRE-TOKENIZER TEST")
    print(f"{'='*60}\n")

    for title, sample in samples.items():
        pieces = demo_tokenizer.pre_tokenizer.pre_tokenize_str(sample)
        # Each piece is a (string, offset) tuple; only the string is shown.
        piece_texts = [piece_text for piece_text, _offsets in pieces]
        print(f"[{title}]")
        print(f" Input : {repr(sample)}")
        print(f" Tokens : {piece_texts}")
        print()