| """ | |
| Parser for parsing a regular expression. | |
| Take a string representing a regular expression and return the root node of its | |
| parse tree. | |
| usage:: | |
| root_node = parse_regex('(hello|world)') | |
| Remarks: | |
| - The regex parser processes multiline, it ignores all whitespace and supports | |
| multiple named groups with the same name and #-style comments. | |
| Limitations: | |
| - Lookahead is not supported. | |
| """ | |
from __future__ import annotations

import re

__all__ = [
    "Repeat",
    "Variable",
    "Regex",
    "Lookahead",
    "tokenize_regex",
    "parse_regex",
]


class Node:
    """
    Base class for all the grammar nodes.
    (You don't initialize this one.)
    """

    def __add__(self, other_node: Node) -> NodeSequence:
        return NodeSequence([self, other_node])

    def __or__(self, other_node: Node) -> AnyNode:
        return AnyNode([self, other_node])
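
# Illustrative example (not part of the original module): nodes compose
# through the overloaded operators. "Regex('a') + Regex('b')" builds a
# NodeSequence (concatenation, like "ab"), while "x | y" builds an AnyNode
# (union, like "a|b"):
#
#     node = Regex("cd") | (Regex("a") + Regex("b"))
#     # -> AnyNode([Regex(/cd/), NodeSequence([Regex(/a/), Regex(/b/)])])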


class AnyNode(Node):
    """
    Union operation (OR operation) between several grammars. You don't
    initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __or__(self, other_node: Node) -> AnyNode:
        return AnyNode(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class NodeSequence(Node):
    """
    Concatenation operation of several grammars. You don't initialize this
    yourself, but it's a result of a "Grammar1 + Grammar2" operation.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __add__(self, other_node: Node) -> NodeSequence:
        return NodeSequence(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class Regex(Node):
    """
    Regular expression.
    """

    def __init__(self, regex: str) -> None:
        re.compile(regex)  # Validate

        self.regex = regex

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(/{self.regex}/)"


class Lookahead(Node):
    """
    Lookahead expression.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.childnode!r})"


class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have its own completer, validator, etc.

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String name of the variable.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(childnode={self.childnode!r}, varname={self.varname!r})"


class Repeat(Node):
    """
    Repetition of a grammar. This corresponds to the "*", "+" and "?"
    quantifiers in a regular expression.
    """

    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: int | None = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(childnode={self.childnode!r})"


def tokenize_regex(input: str) -> list[str]:
    """
    Take a string representing a regular expression and tokenize it.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    """
    # Regular expression for tokenizing other regular expressions.
    p = re.compile(
        r"""^(
        \(\?P\<[a-zA-Z0-9_-]+\>  | # Start of named group.
        \(\?#[^)]*\)             | # Comment
        \(\?=                    | # Start of lookahead assertion
        \(\?!                    | # Start of negative lookahead assertion
        \(\?<=                   | # If preceded by (lookbehind).
        \(\?<                    | # Negative lookbehind and other "(?<" constructs.
        \(\?:                    | # Start of non-capturing group.
        \(\?[iLmsux]+\)          | # Flags.
        \(\?P=[a-zA-Z]+\)        | # Back reference to named group
        \(                       | # Start of group.
        \)                       | # End of group.
        \{[^{}]*\}               | # Repetition
        \*\? | \+\? | \?\?       | # Non greedy repetition.
        \* | \+ | \?             | # Repetition
        \#.*\n                   | # Comment
        \\.                      |

        # Character group.
        \[
            ( [^\]\\]  |  \\.)*
        \]                       |

        [^(){}]                  |
        .
    )""",
        re.VERBOSE,
    )

    tokens = []

    while input:
        m = p.match(input)
        if m:
            token, input = input[: m.end()], input[m.end() :]
            if not token.isspace():
                tokens.append(token)
        else:
            raise Exception("Could not tokenize input regex.")

    return tokens
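
# Illustrative example: multi-character operators are kept together while
# ordinary characters become single-character tokens. For instance,
# tokenize_regex("(?P<name>[a-z]*?)") returns
# ['(?P<name>', '[a-z]', '*?', ')'].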


def parse_regex(regex_tokens: list[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.
    """
    # We add a closing parenthesis because that represents the final pop of
    # the stack.
    tokens: list[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: list[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        or_list: list[list[Node]] = []
        result: list[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                else:
                    greedy = t == "?"
                    result[-1] = Repeat(
                        result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                    )

            elif t == "|":
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                raise Exception(f"{t!r} not supported")

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parentheses.")
    else:
        return result
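

# Minimal usage sketch (illustrative; this "__main__" block is an addition,
# not part of the library). It tokenizes and parses a small grammar and
# prints the resulting parse tree.
if __name__ == "__main__":
    tree = parse_regex(tokenize_regex(r"(?P<cmd>pwd|cd) \s+ (?P<arg>[^\s]+)"))
    print(tree)
    # Prints a NodeSequence containing two Variable nodes ('cmd' and 'arg')
    # separated by a greedy Repeat of Regex(/\s/).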