"""Token regex and identifier-splitting helpers.
Uses the third-party `regex` module (NOT stdlib `re`) so that Unicode property
classes like `\\p{Emoji_Presentation}` work. This is a hard requirement for
the pure-Python fallback. The Rust extension uses the `regex` crate's own
Unicode tables; the two are kept in parity by `tests/property/test_python_rust_parity.py`.
"""
from __future__ import annotations
from collections.abc import Iterator
import regex as re
from ._accel_loader import USE_RUST, accel
# The token regex covers, in priority order:
# 1. Emoji sequences (incl. ZWJ, VS16, keycap, emoji modifiers)
# 2. Word tokens (letters, digits, underscores, internal hyphens)
# 3. Multi-char operators (==, !=, <=, >=, +=, ->, &&, ||, :=, etc.)
# 4. Single non-space punctuation
#
# Note: we deliberately do NOT match \s+ here. Whitespace is preserved via
# gap-fill in the pre-tokenizer (fix #6 from the build plan), avoiding the
# double-counting bug present in the v2.1 draft.
#
# Reading notes for the pattern below:
# - `re` is the third-party `regex` module (imported as `re` at the top of this
#   file); the `\p{...}` property classes rely on it.
# - The pattern is compiled in VERBOSE mode, so whitespace and `# ...` text
#   inside the pattern string are ignored by the regex engine.
# - Alternatives are tried left to right, which is what makes the list above a
#   priority order (e.g. `==` is tried as an operator before `=` can fall
#   through to the single-punctuation branch).
# - Word branch: the token must start AND end with a letter, number or
#   underscore; hyphens are only accepted in between. A leading or trailing
#   hyphen is therefore left for the operator / punctuation branches.
# - Last branch: exactly one character that is neither whitespace nor `\w`.
#
# NOTE(review): the third line of the emoji branch is a character class holding
# literal non-ASCII codepoints (they look like VS16 U+FE0F and the combining
# keycap U+20E3; the list above also promises ZWJ U+200D, which is invisible).
# Confirm the exact codepoints and consider spelling them as \uXXXX escapes.
# NOTE(review): the emoji branch has to start with an Emoji_Presentation or
# Extended_Pictographic character; confirm that keycap bases (digits, `#`, `*`)
# tokenize the way item 1 intends.
# NOTE(review): a character that is `\w` but not in [\p{L}\p{N}_] and not an
# emoji start appears to match no branch and would be skipped silently, like
# whitespace -- confirm whether that is intended.
TOKEN_REGEX = re.compile(
r"""
(?:
[\p{Emoji_Presentation}\p{Extended_Pictographic}]
(?:[\p{Emoji_Presentation}\p{Extended_Pictographic}])*
[️⃣\p{Emoji_Modifier}]*
)+ # emoji sequence
| [\p{L}\p{N}_](?:[\p{L}\p{N}_\-]*[\p{L}\p{N}_])? # word / identifier
| (?:!=|==|<=|>=|<<|>>|\+=|-=|\*=|/=|%=|&&|\|\||->|=>|::|:=|\.\.\.|\.\.)
| [^\s\w] # single punctuation
""",
re.VERBOSE | re.UNICODE,
)
# A strict ASCII identifier matcher; we only sub-split on ASCII identifiers
# because non-ASCII identifiers don't have well-defined camelCase semantics.
#
# Shape: one ASCII letter or underscore, then any number of ASCII letters,
# digits or underscores, and nothing else.
#
# Anchored with \A ... \Z rather than ^ ... $: in Python `$` also matches just
# before a trailing newline, so a string such as "name" + newline would
# otherwise be accepted as an identifier.
IDENTIFIER_REGEX = re.compile(r"\A[A-Za-z_][A-Za-z0-9_]*\Z")
# Splits one underscore-free chunk into camelCase / PascalCase /
# SCREAMING_CASE / digit pieces. The alternatives are joined with "|" in the
# order listed, so earlier entries win when more than one could match.
_SUBPART_REGEX = re.compile(
    "|".join(
        (
            r"[A-Z]+(?=[A-Z][a-z])",  # acronym before camel: HTTPRequest -> HTTP
            r"[A-Z]?[a-z]+",  # Capitalized or lowercase run
            r"[A-Z]+",  # all-caps run
            r"[0-9]+",  # digits
        )
    )
)
def is_identifier(s: str) -> bool:
    """Return True iff `s` has the conservative ASCII identifier shape.

    The shape is the one described by ``IDENTIFIER_REGEX``: an ASCII letter or
    underscore followed by any number of ASCII letters, digits or underscores.
    The whole string must have that shape; in particular a string that ends
    in a newline is rejected.

    When ``USE_RUST`` is set the check is delegated to ``accel.is_identifier``;
    otherwise the pure-Python regex is used.
    """
    if USE_RUST:
        return accel.is_identifier(s)
    # fullmatch() rather than match(): with a `$`-anchored pattern, match()
    # also succeeds when the string ends in a single trailing newline, because
    # `$` matches just before it. fullmatch() requires the entire string.
    return IDENTIFIER_REGEX.fullmatch(s) is not None
def split_identifier(ident: str) -> list[str]:
    """Break an identifier into its camelCase / PascalCase / snake_case parts.

    Underscores act purely as separators and never appear in the output. For
    input made of ASCII letters, digits and underscores,
    ``''.join(split_identifier(x))`` equals `x` with the underscores removed.

    When ``USE_RUST`` is set the work is delegated to
    ``accel.split_identifier``; otherwise the pure-Python path below runs.

    Examples
    --------
    >>> split_identifier("myVar")
    ['my', 'Var']
    >>> split_identifier("HTTPRequestParser")
    ['HTTP', 'Request', 'Parser']
    >>> split_identifier("MAX_BUFFER_SIZE")
    ['MAX', 'BUFFER', 'SIZE']
    >>> split_identifier("get_user_id_42")
    ['get', 'user', 'id', '42']
    >>> split_identifier("")
    []
    """
    if USE_RUST:
        return accel.split_identifier(ident)
    if not ident:
        return []
    pieces: list[str] = []
    # filter(None, ...) drops the empty segments produced by leading, trailing
    # or doubled underscores.
    for segment in filter(None, ident.split("_")):
        # A segment the sub-part regex cannot match at all is kept whole.
        # NOTE(review): when a segment has at least one match, characters
        # outside [A-Za-z0-9] are not returned by findall() and are dropped.
        pieces.extend(_SUBPART_REGEX.findall(segment) or [segment])
    return pieces
def iter_tokens(text: str) -> Iterator[tuple[str, int, int]]:
    """Lazily scan `text` with ``TOKEN_REGEX`` and yield one triple per match.

    Each triple is ``(token, start, end)`` with ``text[start:end] == token``.
    Text that ``TOKEN_REGEX`` does not match (whitespace, by design) is
    skipped, so callers that need round-trip preservation must fill the gaps
    between consecutive spans themselves.
    """
    for hit in TOKEN_REGEX.finditer(text):
        begin, stop = hit.span()
        yield hit.group(), begin, stop
# Names exported by a star-import of this module: the two compiled patterns
# and the three helper functions. `_SUBPART_REGEX` is not listed here.
__all__ = [
"IDENTIFIER_REGEX",
"TOKEN_REGEX",
"is_identifier",
"iter_tokens",
"split_identifier",
]
|