"""Regex-based pre-tokenizer built on the HuggingFace ``tokenizers`` library.

Each token category is defined as its own pattern so it is easy to understand,
modify, or debug individually. The patterns are then OR-ed together in
priority order: at any position the first alternative that matches wins.
"""

import re

from tokenizers import Regex
from tokenizers.pre_tokenizers import PreTokenizer, Split

# 1. Contractions
#    Matches: 's 't 're 've 'll 'm 'd (any letter case)
#    Example: "don't"  -> ["don", "'t"]
#             "'test'" -> ["'", "test", "'"]
#    (?<=\w) requires a word character before the apostrophe and (?!\w)
#    requires the suffix to end the word; without both guards an opening
#    quote such as 'test' would be mis-read as the contraction 't + "est".
CONTRACTIONS = r"(?<=\w)'(?i:s|t|re|ve|ll|m|d)(?!\w)"

# 2. Abbreviations
#    Matches: dot-separated segments of one or two letters, optional trailing dot
#    Example: "U.S.A" -> ["U.S.A"]
#             "e.g."  -> ["e.g."]
#             "Ph.D"  -> ["Ph.D"]
#    \b = word boundary, ensures we don't start matching inside a word.
#    (?!\.?\w) ensures we don't STOP inside a word either: "x.shape" must not
#    become "x.s" + "hape", so a dotted run that continues with more word
#    characters is rejected as a whole and tokenized by the later patterns.
ABBREVIATIONS = r"\b[A-Za-z]{1,2}(?:\.[A-Za-z]{1,2})+(?!\.?\w)\.?"

# 3. Scientific Notation
#    Matches: number, optional decimal, e/E, optional sign, exponent
#    Example: "1.5e-3" -> ["1.5e-3"]
#             "3e10"   -> ["3e10"]
#             "2.0E+4" -> ["2.0E+4"]
#    Must come BEFORE decimals, otherwise "1.5" in "1.5e-3" matches first.
SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"

# 4. Decimal Numbers
#    Matches: digits, dot, digits
#    Example: "3.14"  -> ["3.14"]
#             "0.001" -> ["0.001"]
#    Must come BEFORE integers, otherwise "3" in "3.14" matches first.
DECIMALS = r"\d+\.\d+"

# 5. Integers
#    Matches: any sequence of digits
#    Example: "42"   -> ["42"]
#             "1984" -> ["1984"]
#    Comes last among numbers since scientific and decimal match first.
INTEGERS = r"\d+"

# 6. Multi-character Operators
#    Matches: common two-character programming operators
#    Example: "==" -> ["=="]    "!=" -> ["!="]
#             "->" -> ["->"]    "+=" -> ["+="]
#    Must come BEFORE the single-character punctuation catch-all.
#    [-+*/]= matches +=, -=, *=, /= in one pattern.
OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

# 7. ASCII Identifiers (snake_case and friends)
#    Matches: a letter or underscore followed by letters, digits, underscores
#    Example: "snake_case" -> ["snake_case"]
#             "var_name_2" -> ["var_name_2"]
#             "_private"   -> ["_private"]
#    (?!\w) makes the match all-or-nothing: a word that continues with a
#    non-ASCII letter (e.g. "café") is left for WORDS below instead of being
#    cut into "caf" + "é".
SNAKE_CASE = r"[A-Za-z_][A-Za-z0-9_]*(?!\w)"

# 8. Regular Unicode Words
#    Matches: any sequence of word characters (letters, digits, underscore)
#    \w+ in unicode mode covers non-English letters too
#    Example: "hello" -> ["hello"]
#             "café"  -> ["café"]
WORDS = r"\w+"

# 9. Whitespace
#    Newlines are matched separately from spaces/tabs.
#    This preserves document structure (paragraph breaks etc.)
#    Example: "\n\n" -> ["\n\n"]    "  " -> ["  "]
WHITESPACE = r"\n+|[ \t]+"

# 10. Punctuation Catch-all
#     Matches any single non-whitespace character that nothing above caught
#     Example: "!" -> ["!"]    "@" -> ["@"]    "." -> ["."]
PUNCTUATION = r"[^\s]"

# ------------------------------------------------------------------ #
#  Combine all patterns in ORDER - first match wins
# ------------------------------------------------------------------ #
PRETOKENIZER_PATTERN = "|".join([
    CONTRACTIONS,    # 1  - most specific first
    ABBREVIATIONS,   # 2  - before plain words
    SCIENTIFIC,      # 3  - before decimals
    DECIMALS,        # 4  - before integers
    INTEGERS,        # 5
    OPERATORS,       # 6  - before single punctuation
    SNAKE_CASE,      # 7  - before plain words
    WORDS,           # 8
    WHITESPACE,      # 9
    PUNCTUATION,     # 10 - catch everything else
])


def get_pretokenizer() -> Split:
    """Return a HuggingFace ``Split`` pre-tokenizer using our custom regex.

    Split options used here:
        pattern  : ``PRETOKENIZER_PATTERN`` wrapped in ``Regex`` so that it is
                   interpreted as a regular expression, not a literal string.
        behavior : "removed"  -> splits on matches and discards them
                   "isolated" -> splits on matches and keeps them as tokens
                   "merged_with_previous" / "merged_with_next"
                   We use "isolated" because we WANT to keep whitespace,
                   operators, punctuation etc. as their own tokens rather
                   than discard them.
        invert   : True, so the regex describes the tokens themselves rather
                   than the delimiters between tokens.

    Returns:
        A ``Split`` instance that can be assigned to ``Tokenizer.pre_tokenizer``.
    """
    return Split(
        pattern=Regex(PRETOKENIZER_PATTERN),
        behavior="isolated",
        invert=True,
    )


# ------------------------------------------------------------------ #
#  Quick test - run this file directly to verify behavior
# ------------------------------------------------------------------ #
def _run_demo() -> None:
    """Print the pre-tokenization of a few representative inputs."""
    # Imported locally: only the demo needs a full Tokenizer/model.
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # Build a bare tokenizer just to test the pre-tokenizer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = get_pretokenizer()

    test_cases = [
        ("Contractions", "don't she'll they've"),
        ("Abbreviations", "U.S.A has a Ph.D e.g. this"),
        ("Scientific", "the value is 1.5e-3 and 2.0E+4"),
        ("Decimals", "pi is 3.14159 and e is 2.718"),
        ("Integers", "there are 1000 students in 2024"),
        ("Operators", "if x==0 or y!=1 then z+=2"),
        ("Snake case", "my_variable and snake_case_name"),
        # Mixed real world
        ("Real world", "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
        ("Code-like", "def my_func(x):\n return x**2 + 1"),
        # Regression cases for the pattern guards
        ("Unicode words", "café naïve"),
        ("Attribute access", "x.shape and np.zeros"),
        ("Quoted words", "'test' isn't 'd'"),
    ]

    banner = "=" * 60
    print(f"\n{banner}")
    print(" PRE-TOKENIZER TEST")
    print(f"{banner}\n")

    for label, text in test_cases:
        # pre_tokenize_str yields (string, offsets) tuples; keep the strings
        pieces = tokenizer.pre_tokenizer.pre_tokenize_str(text)
        token_strings = [piece for piece, _offsets in pieces]
        print(f"[{label}]")
        print(f" Input : {repr(text)}")
        print(f" Tokens : {token_strings}")
        print()


if __name__ == "__main__":
    _run_demo()