Spaces:
Runtime error
Runtime error
| # | |
| # Secret Labs' Regular Expression Engine | |
| # | |
| # convert re-style regular expression to sre pattern | |
| # | |
| # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. | |
| # | |
| # See the __init__.py file for information on usage and redistribution. | |
| # | |
| """Internal support module for sre""" | |
| # XXX: show string offset and offending character for all errors | |
| from ._constants import * | |
| SPECIAL_CHARS = ".\\[{()*+?^$|" | |
| REPEAT_CHARS = "*+?{" | |
| DIGITS = frozenset("0123456789") | |
| OCTDIGITS = frozenset("01234567") | |
| HEXDIGITS = frozenset("0123456789abcdefABCDEF") | |
| ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") | |
| WHITESPACE = frozenset(" \t\n\r\v\f") | |
| _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) | |
| _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) | |
| ESCAPES = { | |
| r"\a": (LITERAL, ord("\a")), | |
| r"\b": (LITERAL, ord("\b")), | |
| r"\f": (LITERAL, ord("\f")), | |
| r"\n": (LITERAL, ord("\n")), | |
| r"\r": (LITERAL, ord("\r")), | |
| r"\t": (LITERAL, ord("\t")), | |
| r"\v": (LITERAL, ord("\v")), | |
| r"\\": (LITERAL, ord("\\")) | |
| } | |
| CATEGORIES = { | |
| r"\A": (AT, AT_BEGINNING_STRING), # start of string | |
| r"\b": (AT, AT_BOUNDARY), | |
| r"\B": (AT, AT_NON_BOUNDARY), | |
| r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), | |
| r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), | |
| r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), | |
| r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), | |
| r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), | |
| r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), | |
| r"\Z": (AT, AT_END_STRING), # end of string | |
| } | |
| FLAGS = { | |
| # standard flags | |
| "i": SRE_FLAG_IGNORECASE, | |
| "L": SRE_FLAG_LOCALE, | |
| "m": SRE_FLAG_MULTILINE, | |
| "s": SRE_FLAG_DOTALL, | |
| "x": SRE_FLAG_VERBOSE, | |
| # extensions | |
| "a": SRE_FLAG_ASCII, | |
| "t": SRE_FLAG_TEMPLATE, | |
| "u": SRE_FLAG_UNICODE, | |
| } | |
| TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | |
| GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE | |
| class State: | |
| # keeps track of state for parsing | |
| def __init__(self): | |
| self.flags = 0 | |
| self.groupdict = {} | |
| self.groupwidths = [None] # group 0 | |
| self.lookbehindgroups = None | |
| self.grouprefpos = {} | |
| def groups(self): | |
| return len(self.groupwidths) | |
| def opengroup(self, name=None): | |
| gid = self.groups | |
| self.groupwidths.append(None) | |
| if self.groups > MAXGROUPS: | |
| raise error("too many groups") | |
| if name is not None: | |
| ogid = self.groupdict.get(name, None) | |
| if ogid is not None: | |
| raise error("redefinition of group name %r as group %d; " | |
| "was group %d" % (name, gid, ogid)) | |
| self.groupdict[name] = gid | |
| return gid | |
| def closegroup(self, gid, p): | |
| self.groupwidths[gid] = p.getwidth() | |
| def checkgroup(self, gid): | |
| return gid < self.groups and self.groupwidths[gid] is not None | |
| def checklookbehindgroup(self, gid, source): | |
| if self.lookbehindgroups is not None: | |
| if not self.checkgroup(gid): | |
| raise source.error('cannot refer to an open group') | |
| if gid >= self.lookbehindgroups: | |
| raise source.error('cannot refer to group defined in the same ' | |
| 'lookbehind subpattern') | |
| class SubPattern: | |
| # a subpattern, in intermediate form | |
| def __init__(self, state, data=None): | |
| self.state = state | |
| if data is None: | |
| data = [] | |
| self.data = data | |
| self.width = None | |
| def dump(self, level=0): | |
| seqtypes = (tuple, list) | |
| for op, av in self.data: | |
| print(level*" " + str(op), end='') | |
| if op is IN: | |
| # member sublanguage | |
| print() | |
| for op, a in av: | |
| print((level+1)*" " + str(op), a) | |
| elif op is BRANCH: | |
| print() | |
| for i, a in enumerate(av[1]): | |
| if i: | |
| print(level*" " + "OR") | |
| a.dump(level+1) | |
| elif op is GROUPREF_EXISTS: | |
| condgroup, item_yes, item_no = av | |
| print('', condgroup) | |
| item_yes.dump(level+1) | |
| if item_no: | |
| print(level*" " + "ELSE") | |
| item_no.dump(level+1) | |
| elif isinstance(av, SubPattern): | |
| print() | |
| av.dump(level+1) | |
| elif isinstance(av, seqtypes): | |
| nl = False | |
| for a in av: | |
| if isinstance(a, SubPattern): | |
| if not nl: | |
| print() | |
| a.dump(level+1) | |
| nl = True | |
| else: | |
| if not nl: | |
| print(' ', end='') | |
| print(a, end='') | |
| nl = False | |
| if not nl: | |
| print() | |
| else: | |
| print('', av) | |
| def __repr__(self): | |
| return repr(self.data) | |
| def __len__(self): | |
| return len(self.data) | |
| def __delitem__(self, index): | |
| del self.data[index] | |
| def __getitem__(self, index): | |
| if isinstance(index, slice): | |
| return SubPattern(self.state, self.data[index]) | |
| return self.data[index] | |
| def __setitem__(self, index, code): | |
| self.data[index] = code | |
| def insert(self, index, code): | |
| self.data.insert(index, code) | |
| def append(self, code): | |
| self.data.append(code) | |
| def getwidth(self): | |
| # determine the width (min, max) for this subpattern | |
| if self.width is not None: | |
| return self.width | |
| lo = hi = 0 | |
| for op, av in self.data: | |
| if op is BRANCH: | |
| i = MAXREPEAT - 1 | |
| j = 0 | |
| for av in av[1]: | |
| l, h = av.getwidth() | |
| i = min(i, l) | |
| j = max(j, h) | |
| lo = lo + i | |
| hi = hi + j | |
| elif op is ATOMIC_GROUP: | |
| i, j = av.getwidth() | |
| lo = lo + i | |
| hi = hi + j | |
| elif op is SUBPATTERN: | |
| i, j = av[-1].getwidth() | |
| lo = lo + i | |
| hi = hi + j | |
| elif op in _REPEATCODES: | |
| i, j = av[2].getwidth() | |
| lo = lo + i * av[0] | |
| hi = hi + j * av[1] | |
| elif op in _UNITCODES: | |
| lo = lo + 1 | |
| hi = hi + 1 | |
| elif op is GROUPREF: | |
| i, j = self.state.groupwidths[av] | |
| lo = lo + i | |
| hi = hi + j | |
| elif op is GROUPREF_EXISTS: | |
| i, j = av[1].getwidth() | |
| if av[2] is not None: | |
| l, h = av[2].getwidth() | |
| i = min(i, l) | |
| j = max(j, h) | |
| else: | |
| i = 0 | |
| lo = lo + i | |
| hi = hi + j | |
| elif op is SUCCESS: | |
| break | |
| self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) | |
| return self.width | |
| class Tokenizer: | |
| def __init__(self, string): | |
| self.istext = isinstance(string, str) | |
| self.string = string | |
| if not self.istext: | |
| string = str(string, 'latin1') | |
| self.decoded_string = string | |
| self.index = 0 | |
| self.next = None | |
| self.__next() | |
| def __next(self): | |
| index = self.index | |
| try: | |
| char = self.decoded_string[index] | |
| except IndexError: | |
| self.next = None | |
| return | |
| if char == "\\": | |
| index += 1 | |
| try: | |
| char += self.decoded_string[index] | |
| except IndexError: | |
| raise error("bad escape (end of pattern)", | |
| self.string, len(self.string) - 1) from None | |
| self.index = index + 1 | |
| self.next = char | |
| def match(self, char): | |
| if char == self.next: | |
| self.__next() | |
| return True | |
| return False | |
| def get(self): | |
| this = self.next | |
| self.__next() | |
| return this | |
| def getwhile(self, n, charset): | |
| result = '' | |
| for _ in range(n): | |
| c = self.next | |
| if c not in charset: | |
| break | |
| result += c | |
| self.__next() | |
| return result | |
| def getuntil(self, terminator, name): | |
| result = '' | |
| while True: | |
| c = self.next | |
| self.__next() | |
| if c is None: | |
| if not result: | |
| raise self.error("missing " + name) | |
| raise self.error("missing %s, unterminated name" % terminator, | |
| len(result)) | |
| if c == terminator: | |
| if not result: | |
| raise self.error("missing " + name, 1) | |
| break | |
| result += c | |
| return result | |
| def pos(self): | |
| return self.index - len(self.next or '') | |
| def tell(self): | |
| return self.index - len(self.next or '') | |
| def seek(self, index): | |
| self.index = index | |
| self.__next() | |
| def error(self, msg, offset=0): | |
| if not self.istext: | |
| msg = msg.encode('ascii', 'backslashreplace').decode('ascii') | |
| return error(msg, self.string, self.tell() - offset) | |
| def checkgroupname(self, name, offset, nested): | |
| if not name.isidentifier(): | |
| msg = "bad character in group name %r" % name | |
| raise self.error(msg, len(name) + offset) | |
| if not (self.istext or name.isascii()): | |
| import warnings | |
| warnings.warn( | |
| "bad character in group name %a at position %d" % | |
| (name, self.tell() - len(name) - offset), | |
| DeprecationWarning, stacklevel=nested + 7 | |
| ) | |
def _class_escape(source, escape):
    """Translate an escape token met inside a character class.

    ``escape`` is the backslash token already read from ``source``; more
    characters are consumed from ``source`` where the escape needs them.
    Returns an ``(opcode, argument)`` pair or raises ``source.error(...)``.
    """
    known = ESCAPES.get(escape)
    if known:
        return known
    known = CATEGORIES.get(escape)
    if known and known[0] is IN:
        # only category escapes are accepted here, not position assertions
        return known
    try:
        kind = escape[1:2]
        if kind == "x" or (kind in ("u", "U") and source.istext):
            # hexadecimal / unicode escape with an exact number of digits:
            # two for \x, four for \u, eight for \U
            digits = {"x": 2, "u": 4, "U": 8}[kind]
            escape += source.getwhile(digits, HEXDIGITS)
            if len(escape) != digits + 2:
                raise source.error("incomplete escape %s" % escape, len(escape))
            value = int(escape[2:], 16)
            if kind == "U":
                chr(value)  # raise ValueError for invalid code
            return LITERAL, value
        if kind == "N" and source.istext:
            import unicodedata
            # named unicode escape e.g. \N{EM DASH}
            if not source.match('{'):
                raise source.error("missing {")
            charname = source.getuntil('}', 'character name')
            try:
                value = ord(unicodedata.lookup(charname))
            except (KeyError, TypeError):
                raise source.error("undefined character name %r" % charname,
                                   len(charname) + len(r'\N{}')) from None
            return LITERAL, value
        if kind in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            value = int(escape[1:], 8)
            if value > 0o377:
                raise source.error('octal escape value %s outside of '
                                   'range 0-0o377' % escape, len(escape))
            return LITERAL, value
        if kind in DIGITS:
            # a non-octal digit: reported as a bad escape below
            raise ValueError
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error('bad escape %s' % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
def _escape(source, escape, state):
    """Translate an escape token met outside a character class.

    ``escape`` is the backslash token already read from ``source``; more
    characters are consumed from ``source`` where the escape needs them.
    ``state`` is consulted to validate numeric group references.  Returns
    an ``(opcode, argument)`` pair or raises ``source.error(...)``.
    """
    known = CATEGORIES.get(escape)
    if known:
        return known
    known = ESCAPES.get(escape)
    if known:
        return known
    try:
        kind = escape[1:2]
        if kind == "x" or (kind in ("u", "U") and source.istext):
            # hexadecimal / unicode escape with an exact number of digits:
            # two for \x, four for \u, eight for \U
            digits = {"x": 2, "u": 4, "U": 8}[kind]
            escape += source.getwhile(digits, HEXDIGITS)
            if len(escape) != digits + 2:
                raise source.error("incomplete escape %s" % escape, len(escape))
            value = int(escape[2:], 16)
            if kind == "U":
                chr(value)  # raise ValueError for invalid code
            return LITERAL, value
        if kind == "N" and source.istext:
            import unicodedata
            # named unicode escape e.g. \N{EM DASH}
            if not source.match('{'):
                raise source.error("missing {")
            charname = source.getuntil('}', 'character name')
            try:
                value = ord(unicodedata.lookup(charname))
            except (KeyError, TypeError):
                raise source.error("undefined character name %r" % charname,
                                   len(charname) + len(r'\N{}')) from None
            return LITERAL, value
        if kind == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8)
        if kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape += source.get()
                three_octal = (escape[1] in OCTDIGITS and
                               escape[2] in OCTDIGITS and
                               source.next in OCTDIGITS)
                if three_octal:
                    # got three octal digits; this is an octal escape
                    escape += source.get()
                    value = int(escape[1:], 8)
                    if value > 0o377:
                        raise source.error('octal escape value %s outside of '
                                           'range 0-0o377' % escape,
                                           len(escape))
                    return LITERAL, value
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group >= state.groups:
                raise source.error("invalid group reference %d" % group,
                                   len(escape) - 1)
            if not state.checkgroup(group):
                raise source.error("cannot refer to an open group",
                                   len(escape))
            state.checklookbehindgroup(group, source)
            return GROUPREF, group
        if len(escape) == 2:
            if kind in ASCIILETTERS:
                raise source.error("bad escape %s" % escape, len(escape))
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise source.error("bad escape %s" % escape, len(escape))
| def _uniq(items): | |
| return list(dict.fromkeys(items)) | |
def _parse_sub(source, state, verbose, nested):
    """Parse an alternation ``a|b|c`` and return it as one SubPattern.

    A single alternative is returned unchanged.  Otherwise items that all
    alternatives start with are moved in front of the branch, and a branch
    made only of single literals / non-negated sets becomes one IN item.
    """
    alternatives = []
    while True:
        # only the very first alternative at the top level is "first"
        is_first = not nested and not alternatives
        alternatives.append(
            _parse(source, state, verbose, nested + 1, is_first))
        if not source.match("|"):
            break
        if not nested:
            # flags set inside the previous alternative may have changed this
            verbose = state.flags & SRE_FLAG_VERBOSE

    if len(alternatives) == 1:
        return alternatives[0]

    subpattern = SubPattern(state)

    # move every common leading item out of the branch
    while all(alternatives):
        head = alternatives[0][0]
        if any(item[0] != head for item in alternatives[1:]):
            break
        for item in alternatives:
            del item[0]
        subpattern.append(head)

    # check if the branch can be replaced by a character set
    members = []
    for item in alternatives:
        if len(item) != 1:
            break
        op, av = item[0]
        if op is LITERAL:
            members.append((op, av))
        elif op is IN and av[0][0] is not NEGATE:
            members.extend(av)
        else:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        subpattern.append((IN, _uniq(members)))
        return subpattern

    subpattern.append((BRANCH, (None, alternatives)))
    return subpattern
def _parse(source, state, verbose, nested, first=False):
    """Parse one alternative (everything up to ``|``, ``)`` or the end).

    Arguments:
      source  -- the Tokenizer to read from
      state   -- the shared State (groups, flags)
      verbose -- true if whitespace and ``#`` comments are to be skipped
      nested  -- nesting depth; also used to compute warning stack levels
      first   -- true only for the first alternative of the whole pattern,
                 the one place where global inline flags are accepted

    Returns a SubPattern.  Syntax errors are raised as ``source.error(...)``;
    an over-large repeat count raises OverflowError.
    """
    # parse a simple pattern
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    _ord = ord

    while True:

        this = source.next
        if this is None:
            break  # end of pattern
        if this in "|)":
            break  # end of subpattern
        sourceget()

        if verbose:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while True:
                    this = sourceget()
                    if this is None or this == "\n":
                        break
                continue

        if this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        elif this not in SPECIAL_CHARS:
            subpatternappend((LITERAL, _ord(this)))

        elif this == "[":
            here = source.tell() - 1
            # character set
            set = []
            setappend = set.append
            if source.next == '[':
                import warnings
                warnings.warn(
                    'Possible nested set at position %d' % source.tell(),
                    FutureWarning, stacklevel=nested + 6
                )
            negate = sourcematch("^")
            # check remaining characters
            while True:
                this = sourceget()
                if this is None:
                    raise source.error("unterminated character set",
                                       source.tell() - here)
                if this == "]" and set:
                    # a "]" only closes the set once it has a member
                    break
                elif this[0] == "\\":
                    code1 = _class_escape(source, this)
                else:
                    if set and this in '-&~|' and source.next == this:
                        import warnings
                        warnings.warn(
                            'Possible set %s at position %d' % (
                                'difference' if this == '-' else
                                'intersection' if this == '&' else
                                'symmetric difference' if this == '~' else
                                'union',
                                source.tell() - 1),
                            FutureWarning, stacklevel=nested + 6
                        )
                    code1 = LITERAL, _ord(this)
                if sourcematch("-"):
                    # potential range
                    that = sourceget()
                    if that is None:
                        raise source.error("unterminated character set",
                                           source.tell() - here)
                    if that == "]":
                        # trailing "-" before "]" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        setappend(code1)
                        setappend((LITERAL, _ord("-")))
                        break
                    if that[0] == "\\":
                        code2 = _class_escape(source, that)
                    else:
                        if that == '-':
                            import warnings
                            warnings.warn(
                                'Possible set difference at position %d' % (
                                    source.tell() - 2),
                                FutureWarning, stacklevel=nested + 6
                            )
                        code2 = LITERAL, _ord(that)
                    if code1[0] != LITERAL or code2[0] != LITERAL:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    lo = code1[1]
                    hi = code2[1]
                    if hi < lo:
                        msg = "bad character range %s-%s" % (this, that)
                        raise source.error(msg, len(this) + 1 + len(that))
                    setappend((RANGE, (lo, hi)))
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    setappend(code1)

            set = _uniq(set)
            # XXX: <fl> should move set optimization to compiler!
            if _len(set) == 1 and set[0][0] is LITERAL:
                # optimization
                if negate:
                    subpatternappend((NOT_LITERAL, set[0][1]))
                else:
                    subpatternappend(set[0])
            else:
                if negate:
                    set.insert(0, (NEGATE, None))
                # charmap optimization can't be added here because
                # global flags still are not known
                subpatternappend((IN, set))

        elif this in REPEAT_CHARS:
            # repeat previous item
            here = source.tell()
            if this == "?":
                min, max = 0, 1
            elif this == "*":
                min, max = 0, MAXREPEAT
            elif this == "+":
                min, max = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat; keep the brace as a literal
                    subpatternappend((LITERAL, _ord(this)))
                    continue

                min, max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo += sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi += sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed "{m,n}": treat "{" as a literal and
                    # re-read what was consumed while looking for the "}"
                    subpatternappend((LITERAL, _ord(this)))
                    source.seek(here)
                    continue

                if lo:
                    min = int(lo)
                    if min >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    max = int(hi)
                    if max >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if max < min:
                        raise source.error("min repeat greater than max repeat",
                                           source.tell() - here)
            else:
                # report the token being handled; "char" belongs to the
                # group-parsing branch and may not even be bound here
                raise AssertionError("unsupported quantifier %r" % (this,))
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or item[0][0] is AT:
                raise source.error("nothing to repeat",
                                   source.tell() - here + len(this))
            if item[0][0] in _REPEATCODES:
                raise source.error("multiple repeat",
                                   source.tell() - here + len(this))
            if item[0][0] is SUBPATTERN:
                group, add_flags, del_flags, p = item[0][1]
                if group is None and not add_flags and not del_flags:
                    # a plain non-capturing group: repeat its contents
                    item = p
            if sourcematch("?"):
                # Non-Greedy Match
                subpattern[-1] = (MIN_REPEAT, (min, max, item))
            elif sourcematch("+"):
                # Possessive Match (Always Greedy)
                subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
            else:
                # Greedy Match
                subpattern[-1] = (MAX_REPEAT, (min, max, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            start = source.tell() - 1
            capture = True
            atomic = False
            name = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
                if char is None:
                    raise source.error("unexpected end of pattern")
                if char == "P":
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">", "group name")
                        source.checkgroupname(name, 1, nested)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")", "group name")
                        source.checkgroupname(name, 1, nested)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        if not state.checkgroup(gid):
                            raise source.error("cannot refer to an open group",
                                               len(name) + 1)
                        state.checklookbehindgroup(gid, source)
                        subpatternappend((GROUPREF, gid))
                        continue

                    else:
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        raise source.error("unknown extension ?P" + char,
                                           len(char) + 2)
                elif char == ":":
                    # non-capturing group
                    capture = False
                elif char == "#":
                    # comment
                    while True:
                        if source.next is None:
                            raise source.error("missing ), unterminated comment",
                                               source.tell() - start)
                        if sourceget() == ")":
                            break
                    continue

                elif char in "=!<":
                    # lookahead assertions
                    dir = 1
                    if char == "<":
                        char = sourceget()
                        if char is None:
                            raise source.error("unexpected end of pattern")
                        if char not in "=!":
                            raise source.error("unknown extension ?<" + char,
                                               len(char) + 2)
                        dir = -1  # lookbehind
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            # outermost lookbehind: remember the group count
                            state.lookbehindgroups = state.groups
                    p = _parse_sub(source, state, verbose, nested + 1)
                    if dir < 0:
                        if lookbehindgroups is None:
                            # leaving the outermost lookbehind again
                            state.lookbehindgroups = None
                    if not sourcematch(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    if char == "=":
                        subpatternappend((ASSERT, (dir, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (dir, p)))
                    continue

                elif char == "(":
                    # conditional backreference group
                    condname = source.getuntil(")", "group name")
                    if condname.isidentifier():
                        source.checkgroupname(condname, 1, nested)
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
                            raise source.error(msg, len(condname) + 1)
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                raise ValueError
                        except ValueError:
                            msg = "bad character in group name %r" % condname
                            raise source.error(msg, len(condname) + 1) from None
                        if not condgroup:
                            raise source.error("bad group number",
                                               len(condname) + 1)
                        if condgroup >= MAXGROUPS:
                            msg = "invalid group reference %d" % condgroup
                            raise source.error(msg, len(condname) + 1)
                        if condgroup not in state.grouprefpos:
                            # remember where the number was first referenced
                            state.grouprefpos[condgroup] = (
                                source.tell() - len(condname) - 1
                            )
                        if not (condname.isdecimal() and condname.isascii()):
                            import warnings
                            warnings.warn(
                                "bad character in group name %s at position %d" %
                                (repr(condname) if source.istext else ascii(condname),
                                 source.tell() - len(condname) - 1),
                                DeprecationWarning, stacklevel=nested + 6
                            )
                    state.checklookbehindgroup(condgroup, source)
                    item_yes = _parse(source, state, verbose, nested + 1)
                    if source.match("|"):
                        item_no = _parse(source, state, verbose, nested + 1)
                        if source.next == "|":
                            raise source.error("conditional backref with more than two branches")
                    else:
                        item_no = None
                    if not source.match(")"):
                        raise source.error("missing ), unterminated subpattern",
                                           source.tell() - start)
                    subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
                    continue

                elif char == ">":
                    # non-capturing, atomic group
                    capture = False
                    atomic = True
                elif char in FLAGS or char == "-":
                    # flags
                    flags = _parse_flags(source, state, char)
                    if flags is None:  # global flags
                        if not first or subpattern:
                            raise source.error('global flags not at the start '
                                               'of the expression',
                                               source.tell() - start)
                        verbose = state.flags & SRE_FLAG_VERBOSE
                        continue

                    add_flags, del_flags = flags
                    capture = False
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)

            # parse group contents
            if capture:
                try:
                    group = state.opengroup(name)
                except error as err:
                    # A named group is reported at its name.  An unnamed
                    # group has no name to measure, so it is reported at
                    # its opening parenthesis instead.
                    if name is None:
                        offset = source.tell() - start
                    else:
                        offset = len(name) + 1
                    raise source.error(err.msg, offset) from None
            else:
                group = None
            sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                           not (del_flags & SRE_FLAG_VERBOSE))
            p = _parse_sub(source, state, sub_verbose, nested + 1)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
            if group is not None:
                state.closegroup(group, p)
            if atomic:
                assert group is None
                subpatternappend((ATOMIC_GROUP, p))
            else:
                subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        else:
            # report the token being handled, not the unrelated "char"
            raise AssertionError("unsupported special character %r" % (this,))

    # unpack non-capturing groups
    for i in range(len(subpattern))[::-1]:
        op, av = subpattern[i]
        if op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            if group is None and not add_flags and not del_flags:
                subpattern[i: i+1] = p

    return subpattern
def _parse_flags(source, state, char):
    """Parse inline flags; ``char`` is the first character after ``(?``.

    For the global form ``(?flags)`` the flags are or-ed into
    ``state.flags`` and ``None`` is returned.  For the scoped form
    ``(?flags-flags:`` the pair ``(add_flags, del_flags)`` is returned.
    Invalid input raises ``source.error(...)``.
    """
    read = source.get
    turn_on = 0
    turn_off = 0
    if char != "-":
        # flags to switch on, up to ")", "-" or ":"
        while True:
            bit = FLAGS[char]
            if source.istext:
                if char == 'L':
                    raise source.error(
                        "bad inline flags: cannot use 'L' flag with a str pattern")
            elif char == 'u':
                raise source.error(
                    "bad inline flags: cannot use 'u' flag with a bytes pattern")
            turn_on |= bit
            if (bit & TYPE_FLAGS) and (turn_on & TYPE_FLAGS) != bit:
                raise source.error(
                    "bad inline flags: flags 'a', 'u' and 'L' are incompatible")
            char = read()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                if char.isalpha():
                    raise source.error("unknown flag", len(char))
                raise source.error("missing -, : or )", len(char))
    if char == ")":
        # global form: applies to the whole pattern
        state.flags |= turn_on
        return None
    if turn_on & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    if char == "-":
        # flags to switch off, up to ":"
        char = read()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            if char.isalpha():
                raise source.error("unknown flag", len(char))
            raise source.error("missing flag", len(char))
        while True:
            bit = FLAGS[char]
            if bit & TYPE_FLAGS:
                raise source.error(
                    "bad inline flags: cannot turn off flags 'a', 'u' and 'L'")
            turn_off |= bit
            char = read()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                if char.isalpha():
                    raise source.error("unknown flag", len(char))
                raise source.error("missing :", len(char))
    assert char == ":"
    if turn_off & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if turn_on & turn_off:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return turn_on, turn_off
| def fix_flags(src, flags): | |
| # Check and fix flags according to the type of pattern (str or bytes) | |
| if isinstance(src, str): | |
| if flags & SRE_FLAG_LOCALE: | |
| raise ValueError("cannot use LOCALE flag with a str pattern") | |
| if not flags & SRE_FLAG_ASCII: | |
| flags |= SRE_FLAG_UNICODE | |
| elif flags & SRE_FLAG_UNICODE: | |
| raise ValueError("ASCII and UNICODE flags are incompatible") | |
| else: | |
| if flags & SRE_FLAG_UNICODE: | |
| raise ValueError("cannot use UNICODE flag with a bytes pattern") | |
| if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: | |
| raise ValueError("ASCII and LOCALE flags are incompatible") | |
| return flags | |
def parse(str, flags=0, state=None):
    """Parse the 're' pattern ``str`` into a SubPattern.

    The result holds a list of (opcode, argument) tuples.  ``state`` may
    be a State to reuse; a new one is created when it is ``None``.  Raises
    ``error`` for an unbalanced ``)`` or for a reference to a group number
    that the pattern does not define.
    """
    tokens = Tokenizer(str)

    if state is None:
        state = State()
    state.flags = flags
    state.str = str

    pattern = _parse_sub(tokens, state, flags & SRE_FLAG_VERBOSE, 0)
    final = pattern.state
    final.flags = fix_flags(str, final.flags)

    leftover = tokens.next
    if leftover is not None:
        # parsing only stops early at a ")" nobody was waiting for
        assert leftover == ")"
        raise tokens.error("unbalanced parenthesis")

    # numeric references recorded during parsing can be validated only
    # now that the total number of groups is known
    for group, position in final.grouprefpos.items():
        if group >= final.groups:
            raise error("invalid group reference %d" % group, str, position)

    if flags & SRE_FLAG_DEBUG:
        pattern.dump()

    return pattern
def parse_template(source, state):
    """Parse the 're' replacement string ``source``.

    ``state`` must provide ``groups`` (highest valid group number) and
    ``groupindex`` (group name -> number).  Returns ``(groups, literals)``:
    ``literals`` is a list of text chunks with ``None`` wherever a group
    goes, and ``groups`` lists ``(position in literals, group number)``.
    For a non-str ``source`` the chunks are encoded back to bytes.
    """
    tokens = Tokenizer(source)
    read = tokens.get
    groups = []
    literals = []
    pending = []  # pieces of the literal chunk currently being built
    add_text = pending.append

    def addgroup(index, pos):
        # Flush the pending literal, then reserve a slot for the group.
        if index > state.groups:
            raise tokens.error("invalid group reference %d" % index, pos)
        if pending:
            literals.append(''.join(pending))
            del pending[:]
        groups.append((len(literals), index))
        literals.append(None)

    groupindex = state.groupindex
    for this in iter(read, None):  # None marks the end of the string
        if this[0] != "\\":
            add_text(this)
            continue
        # group
        c = this[1]
        if c == "g":
            if not tokens.match("<"):
                raise tokens.error("missing <")
            name = tokens.getuntil(">", "group name")
            if name.isidentifier():
                tokens.checkgroupname(name, 1, -1)
                try:
                    index = groupindex[name]
                except KeyError:
                    raise IndexError("unknown group name %r" % name) from None
            else:
                try:
                    index = int(name)
                    if index < 0:
                        raise ValueError
                except ValueError:
                    raise tokens.error("bad character in group name %r" % name,
                                       len(name) + 1) from None
                if index >= MAXGROUPS:
                    raise tokens.error("invalid group reference %d" % index,
                                       len(name) + 1)
                if not (name.isdecimal() and name.isascii()):
                    import warnings
                    warnings.warn(
                        "bad character in group name %s at position %d" %
                        (repr(name) if tokens.istext else ascii(name),
                         tokens.tell() - len(name) - 1),
                        DeprecationWarning, stacklevel=5
                    )
            addgroup(index, len(name) + 1)
        elif c == "0":
            # octal escape of up to three digits, reduced to one byte
            if tokens.next in OCTDIGITS:
                this += read()
                if tokens.next in OCTDIGITS:
                    this += read()
            add_text(chr(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form a character, anything else is a group
            isoctal = False
            if tokens.next in DIGITS:
                this += read()
                if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        tokens.next in OCTDIGITS):
                    this += read()
                    isoctal = True
                    c = int(this[1:], 8)
                    if c > 0o377:
                        raise tokens.error('octal escape value %s outside of '
                                           'range 0-0o377' % this, len(this))
                    add_text(chr(c))
            if not isoctal:
                addgroup(int(this[1:]), len(this) - 1)
        else:
            try:
                this = chr(ESCAPES[this][1])
            except KeyError:
                # unknown escapes of ASCII letters are errors; any other
                # unknown escape is kept in the output as written
                if c in ASCIILETTERS:
                    raise tokens.error('bad escape %s' % this, len(this)) from None
            add_text(this)
    if pending:
        literals.append(''.join(pending))
    if not isinstance(source, str):
        # The tokenizer implicitly decodes bytes objects as latin-1, we must
        # therefore re-encode the final representation.
        literals = [None if chunk is None else chunk.encode('latin-1')
                    for chunk in literals]
    return groups, literals
| def expand_template(template, match): | |
| g = match.group | |
| empty = match.string[:0] | |
| groups, literals = template | |
| literals = literals[:] | |
| try: | |
| for index, group in groups: | |
| literals[index] = g(group) or empty | |
| except IndexError: | |
| raise error("invalid group reference %d" % index) from None | |
| return empty.join(literals) | |