# sllm/tokenizer/pretokenizer.py
# geeteshcodes - Initial commit 7f974df (verified)
import re
from tokenizers.pre_tokenizers import PreTokenizer, Split
from tokenizers import Regex
# Each category is defined separately so it is easy to understand, modify,
# or debug individually. The fragments are joined with "|" at the bottom, so
# at any position the FIRST alternative that matches wins.

# 1. Contractions
#    Matches: 's 't 're 've 'll 'm 'd
#    Example: "don't" -> ["don", "'t"]
#    (?!\w) stops the suffix from biting into a quoted word:
#    "'text'" stays ["'", "text", "'"] instead of ["'t", "ext", "'"].
CONTRACTIONS = r"'(?:s|t|re|ve|ll|m|d)(?!\w)"

# 2. Abbreviations
#    Matches: one- or two-letter segments separated by dots, with an optional
#    trailing dot.
#    Example: "U.S.A" -> ["U.S.A"]
#             "e.g."  -> ["e.g."]
#             "Ph.D"  -> ["Ph.D"]
#    \b makes sure we do not start in the middle of a word, and the (?!\w)
#    lookaheads make sure we do not stop in the middle of one either:
#    "x.shape" is NOT an abbreviation and falls through to
#    ["x", ".", "shape"]. The trailing dot is only absorbed when no word
#    character follows it.
ABBREVIATIONS = r"\b[A-Za-z]{1,2}(?:\.[A-Za-z]{1,2}(?!\w))+(?:\.(?!\w))?"

# 3. Scientific Notation
#    Matches: number, optional decimal, e/E, optional sign, exponent
#    Example: "1.5e-3" -> ["1.5e-3"]
#             "3e10"   -> ["3e10"]
#             "2.0E+4" -> ["2.0E+4"]
#    Must come BEFORE decimals, otherwise "1.5" in "1.5e-3" matches first.
SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"

# 4. Decimal Numbers
#    Matches: digits, dot, digits
#    Example: "3.14"  -> ["3.14"]
#             "0.001" -> ["0.001"]
#    Must come BEFORE integers, otherwise "3" in "3.14" matches first.
DECIMALS = r"\d+\.\d+"

# 5. Integers
#    Matches: any sequence of digits
#    Example: "42"   -> ["42"]
#             "1984" -> ["1984"]
#    Comes last among numbers since scientific and decimal match first.
INTEGERS = r"\d+"

# 6. Multi-character Operators
#    Matches: common programming operators that are 2 characters
#    Example: "==" -> ["=="]   "!=" -> ["!="]
#             "->" -> ["->"]   "+=" -> ["+="]
#    Must come BEFORE the single-punctuation catch-all.
#    [-+*/]= matches +=, -=, *=, /= in one pattern.
OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

# 7. Identifier-like Words (including snake_case)
#    Matches: a letter or underscore followed by any word characters
#    Example: "snake_case" -> ["snake_case"]
#             "var_name_2" -> ["var_name_2"]
#             "_private"   -> ["_private"]
#    [^\W\d] means "a word character that is not a digit". It is used instead
#    of an ASCII-only class because this alternative is tried before WORDS;
#    an ASCII-only class would cut "café" into "caf" + "é".
SNAKE_CASE = r"[^\W\d]\w*"

# 8. Regular Unicode Words
#    Matches: any sequence of word characters (letters, digits, underscore)
#    \w+ in unicode mode covers non-English letters too.
#    Example: "hello" -> ["hello"]
#             "café"  -> ["café"]
WORDS = r"\w+"

# 9. Whitespace
#    Newlines are matched separately from spaces/tabs.
#    This preserves document structure (paragraph breaks etc.).
#    Example: "\n\n" -> ["\n\n"]   " " -> [" "]
WHITESPACE = r"\n+|[ \t]+"

# 10. Punctuation Catch-all
#     Matches any single non-whitespace character that nothing above caught.
#     Example: "!" -> ["!"]   "@" -> ["@"]   "." -> ["."]
PUNCTUATION = r"[^\s]"

# ------------------------------------------------------------------ #
# Combine all patterns in ORDER - first match wins
# ------------------------------------------------------------------ #
PRETOKENIZER_PATTERN = "|".join([
    CONTRACTIONS,   # 1  - most specific first
    ABBREVIATIONS,  # 2  - before plain words
    SCIENTIFIC,     # 3  - before decimals
    DECIMALS,       # 4  - before integers
    INTEGERS,       # 5
    OPERATORS,      # 6  - before single punctuation
    SNAKE_CASE,     # 7  - before plain words
    WORDS,          # 8
    WHITESPACE,     # 9
    PUNCTUATION,    # 10 - catch everything else
])
def get_pretokenizer():
    """
    Build the HuggingFace ``Split`` pre-tokenizer driven by our custom regex.

    The combined ``PRETOKENIZER_PATTERN`` string is wrapped in ``Regex`` and
    handed to ``Split`` together with two options:

    - behavior="isolated": chosen over "removed", "merged_with_previous" and
      "merged_with_next" because whitespace, operators, punctuation etc.
      should survive as their own tokens rather than be discarded or glued
      to a neighbour.
    - invert=True: intended to make the regex matches themselves the pieces
      that are kept as tokens, rather than treating them as split points.
      NOTE(review): confirm the exact interaction of ``invert`` with
      "isolated" against the tokenizers documentation.

    Returns:
        The configured ``Split`` pre-tokenizer instance.
    """
    compiled_pattern = Regex(PRETOKENIZER_PATTERN)
    return Split(pattern=compiled_pattern, behavior="isolated", invert=True)
# ------------------------------------------------------------------ #
# Quick test - run this file directly to verify behavior
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # A bare BPE tokenizer is only a carrier here: nothing is trained, we
    # just attach the pre-tokenizer and inspect how it splits sample text.
    demo_tokenizer = Tokenizer(BPE())
    demo_tokenizer.pre_tokenizer = get_pretokenizer()

    # (label, sample text) pairs, one per pattern category plus mixed input.
    samples = [
        ("Contractions", "don't she'll they've"),
        ("Abbreviations", "U.S.A has a Ph.D e.g. this"),
        ("Scientific", "the value is 1.5e-3 and 2.0E+4"),
        ("Decimals", "pi is 3.14159 and e is 2.718"),
        ("Integers", "there are 1000 students in 2024"),
        ("Operators", "if x==0 or y!=1 then z+=2"),
        ("Snake case", "my_variable and snake_case_name"),
        ("Real world", "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
        ("Code-like", "def my_func(x):\n return x**2 + 1"),
    ]

    rule = "=" * 60
    print(f"\n{rule}")
    print(" PRE-TOKENIZER TEST")
    print(f"{rule}\n")

    for label, text in samples:
        pieces = demo_tokenizer.pre_tokenizer.pre_tokenize_str(text)
        # Each piece is a (string, offsets) tuple; only the string is shown.
        token_strings = [piece for piece, _offsets in pieces]
        print(f"[{label}]")
        print(f" Input : {text!r}")
        print(f" Tokens : {token_strings}")
        print()