| import re | |
| from tokenizers.pre_tokenizers import PreTokenizer, Split | |
| from tokenizers import Regex | |
# Each category is defined separately so it is easy to understand, modify,
# or debug individually. The categories are OR-ed together at the bottom;
# at any position the first alternative that matches wins, so order matters.

# 1. Contractions
#    Matches: 's 't 're 've 'll 'm 'd
#    Example: "don't" -> ["don", "'t"]
#    The trailing \b requires the suffix to end right there, so an opening
#    quote followed by a word ("'test'", "'data'") is not cut into
#    "'t" + "est" / "'d" + "ata".
CONTRACTIONS = r"'(?:s|t|re|ve|ll|m|d)\b"

# 2. Abbreviations
#    Matches: short segments separated by dots, optional trailing dot.
#    A segment is a single letter, or a capital followed by one lowercase
#    letter (so "Ph" qualifies but an ordinary word such as "shape" does not).
#    Example: "U.S.A" -> ["U.S.A"]
#             "e.g."  -> ["e.g."]
#             "Ph.D"  -> ["Ph.D"]
#    Leading \b : never start in the middle of a word.
#    \b before the optional final dot : the last segment must be a complete
#    word, otherwise "x.shape" would be split into "x.s" + "hape".
_ABBREVIATION_SEGMENT = r"(?:[A-Z][a-z]?|[a-z])"
ABBREVIATIONS = (
    rf"\b{_ABBREVIATION_SEGMENT}(?:\.{_ABBREVIATION_SEGMENT})+\b\.?"
)

# 3. Scientific Notation
#    Matches: number, optional decimal part, e/E, optional sign, exponent
#    Example: "1.5e-3" -> ["1.5e-3"]
#             "3e10"   -> ["3e10"]
#             "2.0E+4" -> ["2.0E+4"]
#    Must come BEFORE decimals otherwise "1.5" in "1.5e-3" matches first.
#    The fraction is grouped as (?:\.\d*)? instead of \.?\d* so two digit
#    runs are never adjacent; the accepted strings are the same, but a long
#    plain digit sequence no longer triggers quadratic backtracking.
SCIENTIFIC = r"\d+(?:\.\d*)?[eE][+-]?\d+"

# 4. Decimal Numbers
#    Matches: digits, dot, digits
#    Example: "3.14"  -> ["3.14"]
#             "0.001" -> ["0.001"]
#    Must come BEFORE integers otherwise "3" in "3.14" matches first.
DECIMALS = r"\d+\.\d+"

# 5. Integers
#    Matches: any sequence of digits
#    Example: "42"   -> ["42"]
#             "1984" -> ["1984"]
#    Comes last among numbers since scientific and decimal match first.
INTEGERS = r"\d+"

# 6. Multi-character Operators
#    Matches: common programming operators that are 2 characters
#    Example: "==" -> ["=="]   "!=" -> ["!="]
#             "->" -> ["->"]   "+=" -> ["+="]
#    Must come BEFORE the single punctuation catch-all.
#    [-+*/]= matches +=, -=, *=, /= in one pattern.
OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

# 7. Snake Case Identifiers
#    Matches: words that contain at least one underscore (code identifiers)
#    Example: "snake_case" -> ["snake_case"]
#             "var_name_2" -> ["var_name_2"]
#             "_private"   -> ["_private"]
#    Uses \w rather than [A-Za-z0-9_] and insists on the underscore: an
#    ASCII-only "any identifier" pattern placed before WORDS would swallow
#    the ASCII prefix of a Unicode word and split "café" into "caf" + "é".
#    (?!\d) keeps a leading digit out of the identifier.
SNAKE_CASE = r"(?!\d)\w*_\w*"

# 8. Regular Unicode Words
#    Matches: any sequence of word characters (letters, digits, underscore)
#    \w+ in unicode mode covers non-English letters too
#    Example: "hello" -> ["hello"]
#             "café"  -> ["café"]
WORDS = r"\w+"

# 9. Whitespace
#    Newlines are matched separately from spaces/tabs.
#    This preserves document structure (paragraph breaks etc.)
#    Example: "\n\n" -> ["\n\n"]   "  " -> ["  "]
WHITESPACE = r"\n+|[ \t]+"

# 10. Punctuation Catch-all
#     Matches any single non-whitespace character that nothing above caught
#     Example: "!" -> ["!"]   "@" -> ["@"]   "." -> ["."]
PUNCTUATION = r"[^\s]"

# ------------------------------------------------------------------ #
# Combine all patterns in ORDER - first match wins
# ------------------------------------------------------------------ #
PRETOKENIZER_PATTERN = "|".join([
    CONTRACTIONS,   # 1  - most specific first
    ABBREVIATIONS,  # 2  - before plain words
    SCIENTIFIC,     # 3  - before decimals
    DECIMALS,       # 4  - before integers
    INTEGERS,       # 5
    OPERATORS,      # 6  - before single punctuation
    SNAKE_CASE,     # 7  - before plain words
    WORDS,          # 8
    WHITESPACE,     # 9
    PUNCTUATION,    # 10 - catch everything else
])
def get_pretokenizer():
    """
    Build the HuggingFace ``Split`` pre-tokenizer driven by our custom regex.

    Configuration handed to ``Split``:
      - pattern  : PRETOKENIZER_PATTERN wrapped in ``tokenizers.Regex``
      - behavior : "isolated" - we WANT whitespace, operators, punctuation
                   etc. to survive as tokens of their own. The other
                   choices are "removed" (splits on matches and discards
                   them) and "merged_with_previous" / "merged_with_next".
      - invert   : True

    Returns:
        The configured ``Split`` instance.
    """
    token_regex = Regex(PRETOKENIZER_PATTERN)
    # invert=True is intended to mean: match the pattern and KEEP the matches
    # as tokens (rather than treating matches as split points).
    # NOTE(review): confirm how `invert` interacts with "isolated" against
    # the tokenizers documentation.
    return Split(pattern=token_regex, behavior="isolated", invert=True)
# ------------------------------------------------------------------ #
# Smoke test - execute this file as a script to eyeball the output
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # A bare BPE-backed tokenizer is created only so that a pre-tokenizer
    # can be attached to it and exercised.
    demo_tokenizer = Tokenizer(BPE())
    demo_tokenizer.pre_tokenizer = get_pretokenizer()

    # (label, input text) pairs, one per pattern category plus mixed inputs.
    samples = [
        ("Contractions", "don't she'll they've"),
        ("Abbreviations", "U.S.A has a Ph.D e.g. this"),
        ("Scientific", "the value is 1.5e-3 and 2.0E+4"),
        ("Decimals", "pi is 3.14159 and e is 2.718"),
        ("Integers", "there are 1000 students in 2024"),
        ("Operators", "if x==0 or y!=1 then z+=2"),
        ("Snake case", "my_variable and snake_case_name"),
        # Mixed real world
        ("Real world", "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
        # Code like
        ("Code-like", "def my_func(x):\n return x**2 + 1"),
    ]

    rule = "=" * 60
    print(f"\n{rule}")
    print(" PRE-TOKENIZER TEST")
    print(f"{rule}\n")

    for title, sample in samples:
        pieces = demo_tokenizer.pre_tokenizer.pre_tokenize_str(sample)
        # Each entry is a (string, offset) tuple; only the string is shown.
        token_strings = [entry[0] for entry in pieces]
        print(f"[{title}]")
        print(f" Input : {sample!r}")
        print(f" Tokens : {token_strings}")
        print()