# File size: 5,950 Bytes
#
# 7f974df

import re
from tokenizers.pre_tokenizers import PreTokenizer, Split
from tokenizers import Regex

#  Each category is defined separately so its easy to understand, modify, or debug individually


# 1. Contractions
#    Matches: 's  't  're  've  'll  'm  'd
#    Example: "don't" -> ["don", "'t"]
CONTRACTIONS = r"'(?:s|t|re|ve|ll|m|d)"

# 2. Abbreviations
#    Matches: letter(s) separated by dots, optional trailing dot
#    Example: "U.S.A" -> ["U.S.A"]
#             "e.g."  -> ["e.g."]
#             "Ph.D"  -> ["Ph.D"]
#    \b = word boundary, ensures we dont partially match inside a word
ABBREVIATIONS = r"\b[A-Za-z](?:\.[A-Za-z])+\.?"

# 3. Scientific Notation
#    Matches: number, optional decimal, e/E, optional sign, exponent
#    Example: "1.5e-3"  -> ["1.5e-3"]
#             "3e10"    -> ["3e10"]
#             "2.0E+4"  -> ["2.0E+4"]
#    Must come BEFORE decimals otherwise "1.5" in "1.5e-3" matches first
SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"

# 4. Decimal Numbers
#    Matches: digits, dot, digits
#    Example: "3.14"  -> ["3.14"]
#             "0.001" -> ["0.001"]
#    Must come BEFORE integers otherwise "3" in "3.14" matches first
DECIMALS = r"\d+\.\d+"

# 5. Integers
#    Matches: any sequence of digits
#    Example: "42"   -> ["42"]
#             "1984" -> ["1984"]
#    Comes last among numbers since scientific and decimal match first
INTEGERS = r"\d+"

# 6. Multi-character Operators
#    Matches: common programming operators that are 2 characters
#    Example: "==" -> ["=="]   "!=" -> ["!="]
#             "->" -> ["->"]   "+=" -> ["+="]
#    Must come BEFORE single punctuation catch-all
#    [-+*/]= matches +=, -=, *=, /= in one pattern
OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

# 7. Snake Case Identifiers
#    Matches: words that contain underscores (code identifiers)
#    Example: "snake_case"  -> ["snake_case"]
#             "var_name_2"  -> ["var_name_2"]
#             "_private"    -> ["_private"]
#    Must come BEFORE regular words otherwise "snake" matches first
SNAKE_CASE = r"[A-Za-z_][A-Za-z0-9_]*"

# 8. Regular Unicode Words
#    Matches: any sequence of word characters (letters, digits)
#    \w+ in unicode mode covers non-english letters too
#    Example: "hello" -> ["hello"]
#             "café"  -> ["café"]
WORDS = r"\w+"

# 9. Whitespace
#    Newlines are matched separately from spaces/tabs
#    This preserves document structure (paragraph breaks etc.)
#    Example: "\n\n" -> ["\n\n"]  "   " -> ["   "]
WHITESPACE = r"\n+|[ \t]+"

# 10. Punctuation Catch-all
#     Matches any single non-whitespace character that nothing above caught
#     Example: "!" -> ["!"]  "@" -> ["@"]  "." -> ["."]
PUNCTUATION = r"[^\s]"

# ------------------------------------------------------------------ #
#  Combine all patterns in ORDER - first match wins
# ------------------------------------------------------------------ #

PRETOKENIZER_PATTERN = "|".join([
    CONTRACTIONS,   # 1 - most specific first
    ABBREVIATIONS,  # 2 - before plain words
    SCIENTIFIC,     # 3 - before decimals
    DECIMALS,       # 4 - before integers
    INTEGERS,       # 5
    OPERATORS,      # 6 - before single punctuation
    SNAKE_CASE,     # 7 - before plain words
    WORDS,          # 8
    WHITESPACE,     # 9
    PUNCTUATION,    # 10 - catch everything else
])


def get_pretokenizer():
    """

    Returns a HuggingFace Split pre-tokenizer using our custom regex.

    

    Split behavior:

    - pattern    : the regex to split/match on

    - behavior   : "removed"  -> splits on matches and discards them

                   "isolated" -> splits on matches and keeps them as tokens

                   "merged_with_previous" / "merged_with_next"

    

    We use "isolated" because we WANT to keep whitespace, operators,

    punctuation etc. as their own tokens rather than discard them.

    """
    return Split(
        pattern=Regex(PRETOKENIZER_PATTERN),
        behavior="isolated",
        invert=True  # invert=True means: match the pattern and KEEP matches as tokens
                     # (rather than treating matches as split points)
    )


# ------------------------------------------------------------------ #
#  Quick test - run this file directly to verify behavior
# ------------------------------------------------------------------ #

if __name__ == "__main__":
    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # Build a bare tokenizer just to test the pre-tokenizer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = get_pretokenizer()

    test_cases = [
        # Contractions
        ("Contractions",        "don't she'll they've"),
        # Abbreviations  
        ("Abbreviations",       "U.S.A has a Ph.D e.g. this"),
        # Scientific notation
        ("Scientific",          "the value is 1.5e-3 and 2.0E+4"),
        # Decimals
        ("Decimals",            "pi is 3.14159 and e is 2.718"),
        # Integers
        ("Integers",            "there are 1000 students in 2024"),
        # Operators
        ("Operators",           "if x==0 or y!=1 then z+=2"),
        # Snake case
        ("Snake case",          "my_variable and snake_case_name"),
        # Mixed real world
        ("Real world",          "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
        # Code like
        ("Code-like",           "def my_func(x):\n    return x**2 + 1"),
    ]

    print(f"\n{'='*60}")
    print(f"  PRE-TOKENIZER TEST")
    print(f"{'='*60}\n")

    for label, text in test_cases:
        tokens = tokenizer.pre_tokenizer.pre_tokenize_str(text)
        token_strings = [t[0] for t in tokens]  # tokens are (string, offset) tuples
        print(f"[{label}]")
        print(f"  Input  : {repr(text)}")
        print(f"  Tokens : {token_strings}")
        print()