HusseinEid
/

cute-tokenizer

private-use-area

lossless-roundtrip

Model card Files Files and versions

cute-tokenizer / cute_tokenizer /patterns.py

HusseinEid's picture

Super-squash branch 'main' using huggingface_hub

68a4c53 6 days ago

history blame contribute delete

3.8 kB

	"""Token regex and identifier-splitting helpers.

	Uses the third-party `regex` module (NOT stdlib `re`) so that Unicode property
	classes like `\\p{Emoji_Presentation}` work. This is a hard requirement for
	the pure-Python fallback. The Rust extension uses the `regex` crate's own
	Unicode tables; the two are kept in parity by `tests/property/test_python_rust_parity.py`.
	"""

	from __future__ import annotations

	from collections.abc import Iterator

	import regex as re

	from ._accel_loader import USE_RUST, accel

	# The token regex covers, in priority order:
	#   1. Emoji sequences (incl. ZWJ, VS16, keycap, emoji modifiers)
	#   2. Word tokens (letters, digits, underscores, internal hyphens)
	#   3. Multi-char operators (==, !=, <=, >=, +=, ->, &&, ||, :=, etc.)
	#   4. Single non-space punctuation
	#
	# Note: we deliberately do NOT match \s+ here. Whitespace is preserved via
	# gap-fill in the pre-tokenizer (fix #6 from the build plan), avoiding the
	# double-counting bug present in the v2.1 draft.
	#
	# The joiner / selector code points are written as explicit escapes
	# (\u200D = ZERO WIDTH JOINER, \uFE0F = VARIATION SELECTOR-16,
	# \u20E3 = COMBINING ENCLOSING KEYCAP) because the literal characters are
	# invisible and easily lost or altered when the file is copied or rendered.
	#
	# NOTE(review): ZWJ links are only tried immediately after a base character,
	# so a ZWJ that follows VS16 or a skin-tone modifier appears not to be joined
	# into the same match -- confirm against the Rust implementation before
	# changing, since the two must stay in parity.
	TOKEN_REGEX = re.compile(
	    r"""
	    (?:
	        [\p{Emoji_Presentation}\p{Extended_Pictographic}]
	        (?:\u200D[\p{Emoji_Presentation}\p{Extended_Pictographic}])*
	        [\uFE0F\u20E3\p{Emoji_Modifier}]*
	    )+                                                    # emoji sequence
	    | [\p{L}\p{N}_](?:[\p{L}\p{N}_\-]*[\p{L}\p{N}_])?     # word / identifier
	    | (?:!=|==|<=|>=|<<|>>|\+=|-=|\*=|/=|%=|&&|\|\||->|=>|::|:=|\.\.\.|\.\.)
	    | [^\s\w]                                             # single punctuation
	    """,
	    re.VERBOSE | re.UNICODE,
	)

	# A strict ASCII identifier matcher; we only sub-split on ASCII identifiers
	# because non-ASCII identifiers don't have well-defined camelCase semantics.
	#
	# The end is anchored with `\Z` rather than `$`: `$` also matches just before
	# a trailing newline, which would let "foo\n" pass as an identifier.
	IDENTIFIER_REGEX = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*\Z")

	# Sub-part splitter for camelCase / PascalCase / SCREAMING_CASE / snake_case / digits.
	_SUBPART_REGEX = re.compile(
	r"[A-Z]+(?=[A-Z][a-z])" # acronym before camel: HTTPRequest -> HTTP
	r"\|[A-Z]?[a-z]+" # Capitalized or lowercase run
	r"\|[A-Z]+" # all-caps run
	r"\|[0-9]+" # digits
	)


	def is_identifier(s: str) -> bool:
	    """Return True iff the whole of `s` has the conservative ASCII identifier shape.

	    The shape is an ASCII letter or underscore followed by any number of
	    ASCII letters, digits, or underscores.

	    When `USE_RUST` is set the check is delegated to `accel.is_identifier`;
	    otherwise the pure-Python `IDENTIFIER_REGEX` is used.
	    """
	    if USE_RUST:
	        return accel.is_identifier(s)
	    # `fullmatch` rather than `match`: a `$` anchor also matches just before
	    # a trailing newline, so `match` would accept e.g. "foo\n".
	    return IDENTIFIER_REGEX.fullmatch(s) is not None


	def split_identifier(ident: str) -> list[str]:
	    """Break an identifier into its camelCase / PascalCase / snake_case pieces.

	    The input is first cut on underscores (empty chunks are skipped), then
	    each chunk is split with `_SUBPART_REGEX`. A chunk in which the regex
	    finds nothing is kept whole.

	    Property: for chunks made only of ASCII letters and digits,
	    ``''.join(split_identifier(x))`` reconstructs `x` minus underscores.
	    Characters the sub-part regex does not match are dropped from any chunk
	    that has at least one match, so callers should pass ASCII identifiers.

	    When `USE_RUST` is set the work is delegated to `accel.split_identifier`.

	    Examples
	    --------
	    >>> split_identifier("myVar")
	    ['my', 'Var']
	    >>> split_identifier("HTTPRequestParser")
	    ['HTTP', 'Request', 'Parser']
	    >>> split_identifier("MAX_BUFFER_SIZE")
	    ['MAX', 'BUFFER', 'SIZE']
	    >>> split_identifier("get_user_id_42")
	    ['get', 'user', 'id', '42']
	    >>> split_identifier("")
	    []
	    """
	    if USE_RUST:
	        return accel.split_identifier(ident)
	    pieces: list[str] = []
	    # An empty `ident` splits into a single empty segment, which is skipped,
	    # so the result is an empty list without a separate guard.
	    for segment in ident.split("_"):
	        if segment:
	            pieces.extend(_SUBPART_REGEX.findall(segment) or [segment])
	    return pieces


	def iter_tokens(text: str) -> Iterator[tuple[str, int, int]]:
	    """Lazily yield ``(token, start, end)`` for every `TOKEN_REGEX` match in `text`.

	    `start` and `end` are the match offsets into `text`. Whitespace between
	    matches is intentionally NOT yielded; consumers are responsible for
	    gap-filling if they need round-trip preservation.
	    """
	    for hit in TOKEN_REGEX.finditer(text):
	        begin, stop = hit.span()
	        yield hit.group(), begin, stop


	# Names exported by a star-import of this module.
	__all__ = [
	    # compiled patterns
	    "IDENTIFIER_REGEX",
	    "TOKEN_REGEX",
	    # helper functions
	    "is_identifier",
	    "iter_tokens",
	    "split_identifier",
	]