sllm / tokenizer /pretokenizer.py

Initial commit

7f974df verified 3 days ago

5.95 kB

	import re
	from tokenizers.pre_tokenizers import PreTokenizer, Split
	from tokenizers import Regex

	# Pre-tokenization regex, assembled from one named sub-pattern per category.
	# Each category is defined separately so it's easy to understand, modify, or
	# debug individually. Alternatives are tried left to right, so ORDER MATTERS:
	# a more specific pattern must come before the general one it overlaps with.
	#
	# NOTE: alternation must be written with a bare "|". An escaped "\|" matches
	# a literal pipe character and silently turns the alternatives into a single
	# sequence that never matches ordinary text.


	# 1. Contractions
	# Matches: 's 't 're 've 'll 'm 'd
	# Example: "don't" -> ["don", "'t"]
	CONTRACTIONS = r"'(?:s|t|re|ve|ll|m|d)"

	# 2. Abbreviations
	# Matches: single letters separated by dots, optional trailing dot
	# Example: "U.S.A" -> ["U.S.A"]
	#          "e.g."  -> ["e.g."]
	# \b = word boundary, ensures we don't start matching inside a word.
	# NOTE(review): every dotted part is exactly ONE letter, so "Ph.D" is NOT
	# kept whole by this pattern (it falls through to the word/punctuation
	# rules). Widen [A-Za-z] deliberately if multi-letter parts are wanted.
	ABBREVIATIONS = r"\b[A-Za-z](?:\.[A-Za-z])+\.?"

	# 3. Scientific Notation
	# Matches: number, optional decimal, e/E, optional sign, exponent
	# Example: "1.5e-3" -> ["1.5e-3"]
	#          "3e10"   -> ["3e10"]
	#          "2.0E+4" -> ["2.0E+4"]
	# Must come BEFORE decimals otherwise "1.5" in "1.5e-3" matches first
	SCIENTIFIC = r"\d+\.?\d*[eE][+-]?\d+"

	# 4. Decimal Numbers
	# Matches: digits, dot, digits
	# Example: "3.14"  -> ["3.14"]
	#          "0.001" -> ["0.001"]
	# Must come BEFORE integers otherwise "3" in "3.14" matches first
	DECIMALS = r"\d+\.\d+"

	# 5. Integers
	# Matches: any sequence of digits
	# Example: "42"   -> ["42"]
	#          "1984" -> ["1984"]
	# Comes last among numbers since scientific and decimal match first
	INTEGERS = r"\d+"

	# 6. Multi-character Operators
	# Matches: common programming operators that are 2 characters
	# Example: "==" -> ["=="]   "!=" -> ["!="]
	#          "->" -> ["->"]   "+=" -> ["+="]   "**" -> ["**"]
	# Must come BEFORE single punctuation catch-all
	# [-+*/]= matches +=, -=, *=, /= in one pattern
	# A lone backslash is intentionally not listed here: it is a single
	# character and is already emitted on its own by the punctuation catch-all.
	OPERATORS = r"==|!=|->|<=|>=|\*\*|//|[-+*/]="

	# 7. Snake Case / ASCII Identifiers
	# Matches: ASCII identifiers (letters, digits, underscores; no leading digit)
	# Example: "snake_case" -> ["snake_case"]
	#          "var_name_2" -> ["var_name_2"]
	#          "_private"   -> ["_private"]
	# The (?!\w) lookahead makes this rule match only when the WHOLE word is
	# ASCII. Without it, "café" would be cut into "caf" + "é" because this rule
	# runs before the Unicode word rule below.
	SNAKE_CASE = r"[A-Za-z_][A-Za-z0-9_]*(?!\w)"

	# 8. Regular Unicode Words
	# Matches: any sequence of word characters (letters, digits, underscore)
	# \w+ in unicode mode covers non-english letters too
	# Example: "hello" -> ["hello"]
	#          "café"  -> ["café"]
	WORDS = r"\w+"

	# 9. Whitespace
	# Newlines are matched separately from spaces/tabs
	# This preserves document structure (paragraph breaks etc.)
	# Example: "\n\n" -> ["\n\n"]   " " -> [" "]
	WHITESPACE = r"\n+|[ \t]+"

	# 10. Punctuation Catch-all
	# Matches any single non-whitespace character that nothing above caught
	# Example: "!" -> ["!"]   "@" -> ["@"]   "." -> ["."]
	PUNCTUATION = r"[^\s]"

	# ------------------------------------------------------------------ #
	# Combine all patterns in ORDER - first match wins
	# ------------------------------------------------------------------ #

	# Each category is wrapped in a non-capturing group so that a "|" inside one
	# category (e.g. OPERATORS, WHITESPACE) can never bind across its neighbours.
	PRETOKENIZER_PATTERN = "|".join(
	    f"(?:{category})"
	    for category in (
	        CONTRACTIONS,   # 1 - most specific first
	        ABBREVIATIONS,  # 2 - before plain words
	        SCIENTIFIC,     # 3 - before decimals
	        DECIMALS,       # 4 - before integers
	        INTEGERS,       # 5
	        OPERATORS,      # 6 - before single punctuation
	        SNAKE_CASE,     # 7 - before plain words
	        WORDS,          # 8
	        WHITESPACE,     # 9
	        PUNCTUATION,    # 10 - catch everything else
	    )
	)


	def get_pretokenizer():
	    """
	    Build the HuggingFace ``Split`` pre-tokenizer for our custom regex.

	    The module-level ``PRETOKENIZER_PATTERN`` string is wrapped in a
	    ``tokenizers.Regex`` and handed to ``Split`` with:

	    - behavior="isolated" : matched spans are kept as their own tokens
	      instead of being discarded (the other options are "removed",
	      "merged_with_previous" and "merged_with_next"). We WANT whitespace,
	      operators, punctuation etc. to survive as separate tokens.
	    - invert=True : the pattern describes the tokens to keep rather than
	      the split points between them.

	    Returns:
	        A ``Split`` pre-tokenizer instance.
	    """
	    token_regex = Regex(PRETOKENIZER_PATTERN)
	    splitter = Split(pattern=token_regex, behavior="isolated", invert=True)
	    return splitter


	# ------------------------------------------------------------------ #
	# Manual smoke test - execute this file as a script to eyeball the
	# pre-tokenizer output for one sample sentence per category.
	# ------------------------------------------------------------------ #

	if __name__ == "__main__":
	    from tokenizers import Tokenizer
	    from tokenizers.models import BPE

	    # A bare BPE tokenizer is enough: only its pre-tokenizer is exercised.
	    tokenizer = Tokenizer(BPE())
	    tokenizer.pre_tokenizer = get_pretokenizer()

	    # (label, sample text) pairs, one per pattern category plus mixed input.
	    test_cases = [
	        ("Contractions", "don't she'll they've"),
	        ("Abbreviations", "U.S.A has a Ph.D e.g. this"),
	        ("Scientific", "the value is 1.5e-3 and 2.0E+4"),
	        ("Decimals", "pi is 3.14159 and e is 2.718"),
	        ("Integers", "there are 1000 students in 2024"),
	        ("Operators", "if x==0 or y!=1 then z+=2"),
	        ("Snake case", "my_variable and snake_case_name"),
	        ("Real world", "The CO2 level is 415.2 ppm\n\nSee e.g. Smith et al."),
	        ("Code-like", "def my_func(x):\n return x**2 + 1"),
	    ]

	    rule = "=" * 60
	    print(f"\n{rule}")
	    print(" PRE-TOKENIZER TEST")
	    print(f"{rule}\n")

	    for label, text in test_cases:
	        # pre_tokenize_str yields (string, offsets) pairs; keep the strings.
	        pieces = tokenizer.pre_tokenizer.pre_tokenize_str(text)
	        token_strings = [piece for piece, _offsets in pieces]
	        print(f"[{label}]")
	        print(f" Input : {repr(text)}")
	        print(f" Tokens : {token_strings}")
	        print()