| """
|
| Python Lexical Analyser
|
|
|
| Lexical Analyser Specification
|
| """
|
| from __future__ import absolute_import
|
|
|
| from . import Actions
|
| from . import DFA
|
| from . import Errors
|
| from . import Machines
|
| from . import Regexps
|
|
|
|
|
# Debug bit flags.  Their values match the bits that Lexicon.__init__
# tests in its ``debug_flags`` argument before dumping the NFA (bit
# value 1) and the DFA (bit value 2).
DUMP_NFA = 1 << 0
DUMP_DFA = 1 << 1
|
|
|
|
|
class State(object):
    """
    Marker object placed inside a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)

    Both constructor arguments are stored unchanged, as the |name| and
    |tokens| attributes respectively.
    """

    # Class-level defaults; __init__ always overrides both per instance.
    name = tokens = None

    def __init__(self, name, tokens):
        self.name, self.tokens = name, tokens
|
|
|
|
|
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # Set by __init__ to the result of DFA.nfa_to_dfa().
    tables = None   # Never assigned anywhere in this class.

    def __init__(self, specifications, debug=None, debug_flags=7):
        """
        Build an NFA from |specifications| and convert it to a DFA,
        which is stored in self.machine.

        specifications -- list whose items are (pattern, action) tuples
                          or State instances.
        debug          -- if true, an object with a write() method that
                          receives the dump headers and is handed to the
                          machines' dump() methods (presumably a text
                          stream -- confirm against the callers).
        debug_flags    -- bit mask: DUMP_NFA enables the NFA dump,
                          DUMP_DFA enables the DFA dump; only when both
                          bits are set is |debug| also passed on to
                          DFA.nfa_to_dfa().

        Raises Errors.InvalidScanner if |specifications| is not a list,
        and Errors.InvalidToken if an item is neither a tuple nor a
        State instance.
        """
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")

        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        # Tokens are numbered from 1 in specification order, counting
        # the tokens nested inside State definitions as well.
        token_number = 1

        for spec in specifications:
            if isinstance(spec, State):
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")

        if debug and (debug_flags & DUMP_NFA):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)

        # The conversion itself only receives the debug object when both
        # dump flags are set; otherwise it receives False.
        dump_both = DUMP_NFA | DUMP_DFA
        dfa = DFA.nfa_to_dfa(
            nfa, debug=(debug_flags & dump_both) == dump_both and debug)

        if debug and (debug_flags & DUMP_DFA):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)

        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """
        Add the token described by |token_spec| to |machine|, on a path
        from |initial_state| to a newly created final state.

        The action part of the spec is normalised first: an
        Actions.Action instance is used as is, any other callable is
        wrapped in Actions.Call, and every other value is wrapped in
        Actions.Return.

        Any Errors.PlexError raised along the way is re-raised as the
        same exception class with the token number prefixed to the
        message.
        """
        try:
            (pattern, action_spec) = self.parse_token_definition(token_spec)
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            elif callable(action_spec):
                action = Actions.Call(action_spec)
            else:
                action = Actions.Return(action_spec)
            final_state = machine.new_state()
            pattern.build_machine(machine, initial_state, final_state,
                                  match_bol=1, nocase=0)
            # Earlier tokens get a numerically larger priority value
            # (presumably so that earlier definitions win when several
            # patterns match -- confirm in Machines).
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            # NOTE(review): this rebuilds the exception with a single
            # message argument, while __init__ constructs
            # Errors.InvalidToken with (token_number, message) -- confirm
            # that every PlexError subclass accepts this call shape.
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        """
        Check that |token_spec| is a 2-tuple whose first item is a
        Regexps.RE instance and return it as (pattern, action).

        Raises Errors.InvalidToken if any of these checks fails.
        """
        # NOTE(review): Errors.InvalidToken is given a single message
        # argument in this method but (token_number, message) in
        # __init__ -- confirm which form its constructor expects.
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")

        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return self.machine.get_initial_state(name)."""
        return self.machine.get_initial_state(name)
|
|
|