| |
| |
|
|
| """Parser engine for the grammar tables generated by pgen. |
| |
| The grammar table must be loaded first. |
| |
| See Parser/parser.c in the Python distribution for additional info on |
| how this parsing engine works. |
| |
| """ |
|
|
| from collections.abc import Callable, Iterator |
| from contextlib import contextmanager |
| from typing import TYPE_CHECKING, Union, cast |
|
|
| from blib2to3.pgen2.grammar import Grammar |
| from blib2to3.pytree import NL, Context, Leaf, Node, RawNode, convert |
|
|
| |
| from . import grammar, token, tokenize |
|
|
| if TYPE_CHECKING: |
| from blib2to3.pgen2.driver import TokenProxy |
|
|
|
|
| Results = dict[str, NL] |
| Convert = Callable[[Grammar, RawNode], Union[Node, Leaf]] |
| DFA = list[list[tuple[int, int]]] |
| DFAS = tuple[DFA, dict[int, int]] |
|
|
|
|
| def lam_sub(grammar: Grammar, node: RawNode) -> NL: |
| assert node[3] is not None |
| return Node(type=node[0], children=node[3], context=node[2]) |
|
|
|
|
| |
| DUMMY_NODE = (-1, None, None, None) |
|
|
|
|
| def stack_copy( |
| stack: list[tuple[DFAS, int, RawNode]], |
| ) -> list[tuple[DFAS, int, RawNode]]: |
| """Nodeless stack copy.""" |
| return [(dfa, label, DUMMY_NODE) for dfa, label, _ in stack] |
|
|
|
|
| class Recorder: |
| def __init__(self, parser: "Parser", ilabels: list[int], context: Context) -> None: |
| self.parser = parser |
| self._ilabels = ilabels |
| self.context = context |
|
|
| self._dead_ilabels: set[int] = set() |
| self._start_point = self.parser.stack |
| self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels} |
|
|
| @property |
| def ilabels(self) -> set[int]: |
| return self._dead_ilabels.symmetric_difference(self._ilabels) |
|
|
| @contextmanager |
| def switch_to(self, ilabel: int) -> Iterator[None]: |
| with self.backtrack(): |
| self.parser.stack = self._points[ilabel] |
| try: |
| yield |
| except ParseError: |
| self._dead_ilabels.add(ilabel) |
| finally: |
| self.parser.stack = self._start_point |
|
|
| @contextmanager |
| def backtrack(self) -> Iterator[None]: |
| """ |
| Use the node-level invariant ones for basic parsing operations (push/pop/shift). |
| These still will operate on the stack; but they won't create any new nodes, or |
| modify the contents of any other existing nodes. |
| |
| This saves us a ton of time when we are backtracking, since we |
| want to restore to the initial state as quick as possible, which |
| can only be done by having as little mutatations as possible. |
| """ |
| is_backtracking = self.parser.is_backtracking |
| try: |
| self.parser.is_backtracking = True |
| yield |
| finally: |
| self.parser.is_backtracking = is_backtracking |
|
|
| def add_token(self, tok_type: int, tok_val: str, raw: bool = False) -> None: |
| for ilabel in self.ilabels: |
| with self.switch_to(ilabel): |
| if raw: |
| self.parser._addtoken(ilabel, tok_type, tok_val, self.context) |
| else: |
| self.parser.addtoken(tok_type, tok_val, self.context) |
|
|
| def determine_route( |
| self, value: str | None = None, force: bool = False |
| ) -> int | None: |
| alive_ilabels = self.ilabels |
| if len(alive_ilabels) == 0: |
| *_, most_successful_ilabel = self._dead_ilabels |
| raise ParseError("bad input", most_successful_ilabel, value, self.context) |
|
|
| ilabel, *rest = alive_ilabels |
| if force or not rest: |
| return ilabel |
| else: |
| return None |
|
|
|
|
| class ParseError(Exception): |
| """Exception to signal the parser is stuck.""" |
|
|
| def __init__( |
| self, msg: str, type: int | None, value: str | None, context: Context |
| ) -> None: |
| Exception.__init__( |
| self, f"{msg}: type={type!r}, value={value!r}, context={context!r}" |
| ) |
| self.msg = msg |
| self.type = type |
| self.value = value |
| self.context = context |
|
|
|
|
| class Parser: |
| """Parser engine. |
| |
| The proper usage sequence is: |
| |
| p = Parser(grammar, [converter]) # create instance |
| p.setup([start]) # prepare for parsing |
| <for each input token>: |
| if p.addtoken(...): # parse a token; may raise ParseError |
| break |
| root = p.rootnode # root of abstract syntax tree |
| |
| A Parser instance may be reused by calling setup() repeatedly. |
| |
| A Parser instance contains state pertaining to the current token |
| sequence, and should not be used concurrently by different threads |
| to parse separate token sequences. |
| |
| See driver.py for how to get input tokens by tokenizing a file or |
| string. |
| |
| Parsing is complete when addtoken() returns True; the root of the |
| abstract syntax tree can then be retrieved from the rootnode |
| instance variable. When a syntax error occurs, addtoken() raises |
| the ParseError exception. There is no error recovery; the parser |
| cannot be used after a syntax error was reported (but it can be |
| reinitialized by calling setup()). |
| |
| """ |
|
|
| def __init__(self, grammar: Grammar, convert: Convert | None = None) -> None: |
| """Constructor. |
| |
| The grammar argument is a grammar.Grammar instance; see the |
| grammar module for more information. |
| |
| The parser is not ready yet for parsing; you must call the |
| setup() method to get it started. |
| |
| The optional convert argument is a function mapping concrete |
| syntax tree nodes to abstract syntax tree nodes. If not |
| given, no conversion is done and the syntax tree produced is |
| the concrete syntax tree. If given, it must be a function of |
| two arguments, the first being the grammar (a grammar.Grammar |
| instance), and the second being the concrete syntax tree node |
| to be converted. The syntax tree is converted from the bottom |
| up. |
| |
| **post-note: the convert argument is ignored since for Black's |
| usage, convert will always be blib2to3.pytree.convert. Allowing |
| this to be dynamic hurts mypyc's ability to use early binding. |
| These docs are left for historical and informational value. |
| |
| A concrete syntax tree node is a (type, value, context, nodes) |
| tuple, where type is the node type (a token or symbol number), |
| value is None for symbols and a string for tokens, context is |
| None or an opaque value used for error reporting (typically a |
| (lineno, offset) pair), and nodes is a list of children for |
| symbols, and None for tokens. |
| |
| An abstract syntax tree node may be anything; this is entirely |
| up to the converter function. |
| |
| """ |
| self.grammar = grammar |
| |
| self.convert = convert or lam_sub |
| self.is_backtracking = False |
| self.last_token: int | None = None |
|
|
| def setup(self, proxy: "TokenProxy", start: int | None = None) -> None: |
| """Prepare for parsing. |
| |
| This *must* be called before starting to parse. |
| |
| The optional argument is an alternative start symbol; it |
| defaults to the grammar's start symbol. |
| |
| You can use a Parser instance to parse any number of programs; |
| each time you call setup() the parser is reset to an initial |
| state determined by the (implicit or explicit) start symbol. |
| |
| """ |
| if start is None: |
| start = self.grammar.start |
| |
| |
| |
| newnode: RawNode = (start, None, None, []) |
| stackentry = (self.grammar.dfas[start], 0, newnode) |
| self.stack: list[tuple[DFAS, int, RawNode]] = [stackentry] |
| self.rootnode: NL | None = None |
| self.used_names: set[str] = set() |
| self.proxy = proxy |
| self.last_token = None |
|
|
| def addtoken(self, type: int, value: str, context: Context) -> bool: |
| """Add a token; return True iff this is the end of the program.""" |
| |
| ilabels = self.classify(type, value, context) |
| assert len(ilabels) >= 1 |
|
|
| |
| |
| if len(ilabels) == 1: |
| [ilabel] = ilabels |
| return self._addtoken(ilabel, type, value, context) |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| with self.proxy.release() as proxy: |
| counter, force = 0, False |
| recorder = Recorder(self, ilabels, context) |
| recorder.add_token(type, value, raw=True) |
|
|
| next_token_value = value |
| while recorder.determine_route(next_token_value) is None: |
| if not proxy.can_advance(counter): |
| force = True |
| break |
|
|
| next_token_type, next_token_value, *_ = proxy.eat(counter) |
| if next_token_type in (tokenize.COMMENT, tokenize.NL): |
| counter += 1 |
| continue |
|
|
| if next_token_type == tokenize.OP: |
| next_token_type = grammar.opmap[next_token_value] |
|
|
| recorder.add_token(next_token_type, next_token_value) |
| counter += 1 |
|
|
| ilabel = cast(int, recorder.determine_route(next_token_value, force=force)) |
| assert ilabel is not None |
|
|
| return self._addtoken(ilabel, type, value, context) |
|
|
| def _addtoken(self, ilabel: int, type: int, value: str, context: Context) -> bool: |
| |
| while True: |
| dfa, state, node = self.stack[-1] |
| states, first = dfa |
| arcs = states[state] |
| |
| for i, newstate in arcs: |
| t = self.grammar.labels[i][0] |
| if t >= 256: |
| |
| itsdfa = self.grammar.dfas[t] |
| itsstates, itsfirst = itsdfa |
| if ilabel in itsfirst: |
| |
| self.push(t, itsdfa, newstate, context) |
| break |
|
|
| elif ilabel == i: |
| |
| |
| self.shift(type, value, newstate, context) |
| |
| state = newstate |
| while states[state] == [(0, state)]: |
| self.pop() |
| if not self.stack: |
| |
| return True |
| dfa, state, node = self.stack[-1] |
| states, first = dfa |
| |
| self.last_token = type |
| return False |
|
|
| else: |
| if (0, state) in arcs: |
| |
| self.pop() |
| if not self.stack: |
| |
| raise ParseError("too much input", type, value, context) |
| else: |
| |
| raise ParseError("bad input", type, value, context) |
|
|
| def classify(self, type: int, value: str, context: Context) -> list[int]: |
| """Turn a token into a label. (Internal) |
| |
| Depending on whether the value is a soft-keyword or not, |
| this function may return multiple labels to choose from.""" |
| if type == token.NAME: |
| |
| self.used_names.add(value) |
| |
| if value in self.grammar.keywords: |
| return [self.grammar.keywords[value]] |
| elif value in self.grammar.soft_keywords: |
| assert type in self.grammar.tokens |
| |
| |
| |
| |
| |
| if self.last_token not in ( |
| None, |
| token.INDENT, |
| token.DEDENT, |
| token.NEWLINE, |
| token.SEMI, |
| token.COLON, |
| ): |
| return [self.grammar.tokens[type]] |
| return [ |
| self.grammar.tokens[type], |
| self.grammar.soft_keywords[value], |
| ] |
|
|
| ilabel = self.grammar.tokens.get(type) |
| if ilabel is None: |
| raise ParseError("bad token", type, value, context) |
| return [ilabel] |
|
|
| def shift(self, type: int, value: str, newstate: int, context: Context) -> None: |
| """Shift a token. (Internal)""" |
| if self.is_backtracking: |
| dfa, state, _ = self.stack[-1] |
| self.stack[-1] = (dfa, newstate, DUMMY_NODE) |
| else: |
| dfa, state, node = self.stack[-1] |
| rawnode: RawNode = (type, value, context, None) |
| newnode = convert(self.grammar, rawnode) |
| assert node[-1] is not None |
| node[-1].append(newnode) |
| self.stack[-1] = (dfa, newstate, node) |
|
|
| def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None: |
| """Push a nonterminal. (Internal)""" |
| if self.is_backtracking: |
| dfa, state, _ = self.stack[-1] |
| self.stack[-1] = (dfa, newstate, DUMMY_NODE) |
| self.stack.append((newdfa, 0, DUMMY_NODE)) |
| else: |
| dfa, state, node = self.stack[-1] |
| newnode: RawNode = (type, None, context, []) |
| self.stack[-1] = (dfa, newstate, node) |
| self.stack.append((newdfa, 0, newnode)) |
|
|
| def pop(self) -> None: |
| """Pop a nonterminal. (Internal)""" |
| if self.is_backtracking: |
| self.stack.pop() |
| else: |
| popdfa, popstate, popnode = self.stack.pop() |
| newnode = convert(self.grammar, popnode) |
| if self.stack: |
| dfa, state, node = self.stack[-1] |
| assert node[-1] is not None |
| node[-1].append(newnode) |
| else: |
| self.rootnode = newnode |
| self.rootnode.used_names = self.used_names |
|
|