| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | from __future__ import absolute_import |
| |
|
| | import types |
| |
|
| | from . import Actions |
| | from . import DFA |
| | from . import Errors |
| | from . import Machines |
| | from . import Regexps |
| |
|
| | |
# Bit flags for the debug_flags argument of the Lexicon constructor.
DUMP_NFA = 1 << 0  # bit checked before the NFA dump is written
DUMP_DFA = 1 << 1  # bit checked before the DFA dump is written
| |
|
| |
|
class State(object):
    """
    Marker object placed in a Plex.Lexicon specification to group
    token definitions under a named, user-defined scanner state.

    Constructor:

       State(name, token_specifications)

    The instance simply records both arguments as the attributes
    |name| and |tokens|.
    """

    # Class-level placeholders; __init__ overwrites both on every instance.
    name = None
    tokens = None

    def __init__(self, name, tokens):
        self.name, self.tokens = name, tokens
| |
|
| |
|
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

        2) One of the following special actions:

           IGNORE means that the recognised characters will be treated as
                  white space and ignored. Scanning will continue until
                  the next non-ignored token is recognised before returning.

           TEXT   causes the scanned text itself to be returned as the
                  value of the token.

        3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # assigned by __init__: result of DFA.nfa_to_dfa()
    tables = None  # never assigned by any method of this class

    def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        """
        Build the scanner machine described by |specifications|.

        specifications -- list of (pattern, action) tuples and/or State
                          instances
        debug          -- optional object with a write() method that
                          receives the machine dumps
        debug_flags    -- bit mask; DUMP_NFA enables the NFA dump and
                          DUMP_DFA enables the DFA dump
        timings        -- optional object with a write() method that
                          receives the construction times

        Raises Errors.InvalidScanner if |specifications| is not a list,
        and Errors.InvalidToken if an item is neither a tuple nor a
        State instance or is a malformed token definition.
        """
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            # Imported lazily: only needed when timing output is requested.
            from .Timing import time

            time1 = time()
        nfa = self._build_nfa(specifications)
        if timings:
            time2 = time()
            time3 = time()
        dump_nfa = debug_flags & DUMP_NFA
        dump_dfa = debug_flags & DUMP_DFA
        if debug and dump_nfa:
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)
        # The debug stream is only handed to the conversion step when both
        # dump flags are set; otherwise False is passed.
        conversion_debug = debug if (dump_nfa and dump_dfa) else False
        dfa = DFA.nfa_to_dfa(nfa, debug=conversion_debug)
        if timings:
            time4 = time()
        if debug and dump_dfa:
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            nfa_time = time2 - time1
            dfa_time = time4 - time3
            timings.write("Constructing NFA : %5.2f\n" % nfa_time)
            timings.write("Converting to DFA: %5.2f\n" % dfa_time)
            timings.write("TOTAL : %5.2f\n" % (nfa_time + dfa_time))
        self.machine = dfa

    def _build_nfa(self, specifications):
        """
        Return a Machines.Machine holding every token in |specifications|.

        Tokens are numbered from 1 in the order they are encountered;
        the tokens of a State count in sequence with top-level tokens.
        """
        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")
        return nfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """
        Add the token |token_spec| to |machine|, reachable from
        |initial_state|.

        The action part of the definition is used as-is when it is an
        Actions.Action instance, wrapped in Actions.Call when it is
        callable, and wrapped in Actions.Return otherwise. The action is
        attached to a newly created state with priority -token_number.

        Raises Errors.InvalidToken for a malformed definition. Any other
        Errors.PlexError is re-raised as the same class with the token
        number prefixed to its message.
        """
        try:
            pattern, action_spec = self.parse_token_definition(
                token_spec, token_number)
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            elif callable(action_spec):
                action = Actions.Call(action_spec)
            else:
                action = Actions.Return(action_spec)
            final_state = machine.new_state()
            pattern.build_machine(machine, initial_state, final_state,
                                  match_bol=1, nocase=0)
            final_state.set_action(action, priority=-token_number)
        except Errors.InvalidToken:
            # InvalidToken is constructed with (token_number, message), so it
            # already identifies the token and cannot be rebuilt from a
            # single string like the generic case below.
            raise
        except Errors.PlexError as e:
            # NOTE(review): assumes the remaining PlexError subclasses accept
            # a single message argument -- confirm against the Errors module.
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec, token_number=0):
        """
        Validate |token_spec| and return it as a (pattern, action) tuple.

        token_number -- position of the token in the specification,
                        reported in the error; defaults to 0 for callers
                        that do not track positions.

        Raises Errors.InvalidToken if |token_spec| is not a 2-tuple whose
        first item is a Regexps.RE instance.
        """
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken(
                token_number, "Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken(
                token_number, "Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken(
                token_number, "Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return the machine's initial state registered under |name|."""
        return self.machine.get_initial_state(name)
| |
|
| |
|
| |
|
| |
|