"""Token regex and identifier-splitting helpers.

Uses the third-party `regex` module (NOT stdlib `re`) so that Unicode property
classes like `\\p{Emoji_Presentation}` work. This is a hard requirement for the
pure-Python fallback. The Rust extension uses the `regex` crate's own Unicode
tables; the two are kept in parity by
`tests/property/test_python_rust_parity.py`.
"""

from __future__ import annotations

from collections.abc import Iterator

import regex as re

from ._accel_loader import USE_RUST, accel

# The token regex covers, in priority order:
#   1. Emoji sequences (incl. ZWJ, VS16, keycap, emoji modifiers)
#   2. Word tokens (letters, digits, underscores, internal hyphens)
#   3. Multi-char operators (==, !=, <=, >=, +=, ->, &&, ||, :=, etc.)
#   4. Single non-space punctuation
#
# Note: we deliberately do NOT match \s+ here. Whitespace is preserved via
# gap-fill in the pre-tokenizer (fix #6 from the build plan), avoiding the
# double-counting bug present in the v2.1 draft.
#
# The joiner/selector code points are spelled as explicit escapes instead of
# invisible literal characters, so they survive editors and formatters and can
# be reviewed:
#   \u200D = ZERO WIDTH JOINER, \uFE0F = VARIATION SELECTOR-16,
#   \u20E3 = COMBINING ENCLOSING KEYCAP.
#
# NOTE(review): the emoji branch must start on an Emoji_Presentation /
# Extended_Pictographic code point, so keycap sequences (which start on an
# ASCII digit, '#' or '*') do not appear to enter it, and a VS16 that comes
# *before* a ZWJ ends the sequence early. Confirm against the Rust side before
# changing the pattern -- the two implementations are parity-tested.
TOKEN_REGEX = re.compile(
    r"""
    (?:
        [\p{Emoji_Presentation}\p{Extended_Pictographic}]
        (?:\u200D[\p{Emoji_Presentation}\p{Extended_Pictographic}])*
        [\uFE0F\u20E3\p{Emoji_Modifier}]*
    )+                                                  # emoji sequence
    | [\p{L}\p{N}_](?:[\p{L}\p{N}_\-]*[\p{L}\p{N}_])?   # word / identifier
    | (?:!=|==|<=|>=|<<|>>|\+=|-=|\*=|/=|%=|&&|\|\||->|=>|::|:=|\.\.\.|\.\.)
    | [^\s\w]                                           # single punctuation
    """,
    re.VERBOSE | re.UNICODE,
)

# A strict ASCII identifier matcher; we only sub-split on ASCII identifiers
# because non-ASCII identifiers don't have well-defined camelCase semantics.
IDENTIFIER_REGEX = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")

# Sub-part splitter for camelCase / PascalCase / SCREAMING_CASE / snake_case / digits.
_SUBPART_REGEX = re.compile(
    r"[A-Z]+(?=[A-Z][a-z])"  # acronym before camel: HTTPRequest -> HTTP
    r"|[A-Z]?[a-z]+"  # Capitalized or lowercase run
    r"|[A-Z]+"  # all-caps run
    r"|[0-9]+"  # digits
)


def is_identifier(s: str) -> bool:
    """Return True iff `s` matches the conservative ASCII identifier shape.

    Delegates to the Rust accelerator when `USE_RUST` is set; otherwise the
    whole string must match `IDENTIFIER_REGEX`.
    """
    if USE_RUST:
        return accel.is_identifier(s)
    # `fullmatch`, not `match`: with `match`, the pattern's trailing `$` also
    # succeeds just before a final newline, so "foo\n" would be accepted.
    return IDENTIFIER_REGEX.fullmatch(s) is not None


def split_identifier(ident: str) -> list[str]:
    """Split camelCase / PascalCase / snake_case / SCREAMING_CASE into pieces.

    Underscores act as separators and are dropped; each remaining chunk is
    broken into acronym, capitalized/lowercase, all-caps and digit runs.

    Property: for any `x` accepted by `is_identifier`,
    ``''.join(split_identifier(x))`` reconstructs `x` minus underscores.
    The sub-part regex only matches ``[A-Za-z0-9]``: within a chunk that has
    at least one such match, any other characters are omitted from the
    result, while a chunk with no match at all is kept whole. Callers should
    therefore gate on `is_identifier` first.

    Examples
    --------
    >>> split_identifier("myVar")
    ['my', 'Var']
    >>> split_identifier("HTTPRequestParser")
    ['HTTP', 'Request', 'Parser']
    >>> split_identifier("MAX_BUFFER_SIZE")
    ['MAX', 'BUFFER', 'SIZE']
    >>> split_identifier("get_user_id_42")
    ['get', 'user', 'id', '42']
    >>> split_identifier("")
    []
    """
    if USE_RUST:
        return accel.split_identifier(ident)
    parts: list[str] = []
    # Splitting "" (or a run of underscores) yields only empty chunks, which
    # are skipped, so empty input naturally produces [].
    for chunk in ident.split("_"):
        if not chunk:
            continue
        # Fall back to the whole chunk when the regex finds nothing in it.
        parts.extend(_SUBPART_REGEX.findall(chunk) or [chunk])
    return parts


def iter_tokens(text: str) -> Iterator[tuple[str, int, int]]:
    """Yield (token, start, end) for each non-whitespace token in `text`.

    `start` and `end` are the match offsets into `text` as reported by
    `TOKEN_REGEX`. Whitespace between matches is intentionally NOT yielded;
    consumers are responsible for gap-filling if they need round-trip
    preservation.
    """
    for m in TOKEN_REGEX.finditer(text):
        yield m.group(), m.start(), m.end()


__all__ = [
    "IDENTIFIER_REGEX",
    "TOKEN_REGEX",
    "is_identifier",
    "iter_tokens",
    "split_identifier",
]