| // Copyright 2020 Google LLC | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // https://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
| // | |
| include "expression.fbs"; | |
| include "buffer.fbs"; | |
| include "language-tag.fbs"; | |
| // The terminal rules map, stored as a sorted strings table. | |
| // The sorted terminal strings table is represented as offsets into the | |
| // global strings pool, which saves memory across localized rule | |
| // sets. | |
| namespace libtextclassifier3.grammar.RulesSet_.Rules_; | |
| table TerminalRulesMap { | |
| // The offsets of the terminal strings within the terminals pool | |
| // (`RulesSet.terminals`, whose strings are zero-byte delimited). | |
| terminal_offsets:[uint]; | |
| // The lhs set associated with a terminal rule. | |
| // Each value is an offset into the (deduplicated) global `lhs_set` vector. | |
| // NOTE(review): presumably parallel to `terminal_offsets` (entry i belongs | |
| // to terminal i) -- confirm against the lookup code. | |
| lhs_set_index:[uint]; | |
| // Bounds on the lengths of the terminal strings, used to abort a lookup | |
| // early. | |
| // NOTE(review): the length unit (bytes vs. code points) is not stated | |
| // here -- confirm. | |
| min_terminal_length:int; | |
| max_terminal_length:int; | |
| } | |
| // One key, value pair entry in the unary rules map (`Rules.unary_rules`). | |
| // That map goes from a nonterminal to an lhs set index into the | |
| // (deduplicated) global `lhs_set` vector. | |
| namespace libtextclassifier3.grammar.RulesSet_.Rules_; | |
| struct UnaryRulesEntry { | |
| // The nonterminal. The `(key)` attribute marks this field as the | |
| // FlatBuffers sort/lookup key for vectors of this struct. | |
| key:uint (key); | |
| // The index of the associated lhs set in the global `lhs_set` vector. | |
| value:uint; | |
| } | |
| // One key, value pair entry in the binary rules hash map. | |
| // The key is a pair of nonterminals and the value the index of the lhs set. | |
| namespace libtextclassifier3.grammar.RulesSet_.Rules_; | |
| struct BinaryRule { | |
| // The two rhs nonterminals; together they form the key of this entry. | |
| rhs_first:uint; | |
| rhs_second:uint; | |
| // The lhs set associated with this binary rule (the value of this entry). | |
| // This is an offset into the (deduplicated) global `lhs_set` vector. | |
| lhs_set_index:uint; | |
| } | |
| // One bucket in the binary rule hash map that contains all entries for a | |
| // given hash value. | |
| namespace libtextclassifier3.grammar.RulesSet_.Rules_; | |
| table BinaryRuleTableBucket { | |
| // All binary rule entries that share this bucket's hash value. | |
| // NOTE(review): the hash function and any ordering of the entries are not | |
| // defined in this schema -- confirm against the lookup code. | |
| rules:[BinaryRule]; | |
| } | |
| // The terminal, unary and binary rules that apply to a set of locales. | |
| namespace libtextclassifier3.grammar.RulesSet_; | |
| table Rules { | |
| // The locale this rule set applies to. | |
| locale:[LanguageTag]; | |
| // The terminal rules map. | |
| terminal_rules:Rules_.TerminalRulesMap; | |
| // A second terminal rules map. | |
| // NOTE(review): presumably holds lowercased terminals for | |
| // case-insensitive matching -- confirm against the matcher. | |
| lowercase_terminal_rules:Rules_.TerminalRulesMap; | |
| // The unary rules map. | |
| // This is a map from a nonterminal to an lhs set index into the | |
| // (deduplicated) global `lhs_set` vector. | |
| // Entries are keyed by `UnaryRulesEntry.key`, which carries the `(key)` | |
| // attribute. | |
| unary_rules:[Rules_.UnaryRulesEntry]; | |
| // The binary rules (hash) map. | |
| // This is a map from nonterminal pair to an lhs set index into the | |
| // (deduplicated) global `lhs_set` vector. | |
| // Each element is one hash bucket holding the entries for a hash value. | |
| binary_rules:[Rules_.BinaryRuleTableBucket]; | |
| } | |
| // A set of lhs nonterminals associated with a rule match. | |
| // Most commonly, an entry is just the id of the lhs nonterminal of the rule | |
| // that is triggered; in this case `lhs` is set to the id of the nonterminal. | |
| // If a callback needs to be triggered, `lhs` is instead the (negated) index | |
| // into the `lhs` vector below, whose element specifies, in addition to the | |
| // nonterminal, the callback and the parameter to call it with. | |
| namespace libtextclassifier3.grammar.RulesSet_; | |
| table LhsSet { | |
| // The entries of the set. Each entry is either the id of an lhs | |
| // nonterminal or, when a callback needs to be triggered, the (negated) | |
| // index of an `Lhs` element in the global `lhs` vector. | |
| lhs:[int]; | |
| } | |
| // An lhs entry that specifies, in addition to the nonterminal, a callback | |
| // and its parameter. Elements live in the global `lhs` vector and are | |
| // referenced from `LhsSet.lhs` by (negated) index. | |
| namespace libtextclassifier3.grammar.RulesSet_; | |
| struct Lhs { | |
| // The lhs nonterminal. | |
| nonterminal:uint; | |
| // The id of the callback to trigger. | |
| callback_id:uint; | |
| // A parameter to pass when invoking the callback. | |
| callback_param:ulong; | |
| // The maximum amount of whitespace allowed between the two parts. | |
| // A value of -1 allows for unbounded whitespace. | |
| // NOTE(review): "the two parts" presumably means the two rhs parts of a | |
| // binary rule, and the unit of the gap is not stated here -- confirm. | |
| max_whitespace_gap:byte; | |
| } | |
| // One key, value pair entry of the `Nonterminals.annotation_nt` map, which | |
| // maps annotation/collection names to non-terminal ids. | |
| namespace libtextclassifier3.grammar.RulesSet_.Nonterminals_; | |
| table AnnotationNtEntry { | |
| // The annotation/collection name. The `(key)` attribute marks this field | |
| // as the FlatBuffers sort/lookup key for vectors of this table. | |
| key:string (key); | |
| // The id of the non-terminal for that annotation/collection. | |
| value:int; | |
| } | |
| // Usage of pre-defined non-terminals that the lexer can generate if used by | |
| // the grammar. | |
| namespace libtextclassifier3.grammar.RulesSet_; | |
| table Nonterminals { | |
| // Id of the nonterminal indicating the start of input. | |
| start_nt:int; | |
| // Id of the nonterminal indicating the end of input. | |
| end_nt:int; | |
| // Id of the nonterminal indicating a token. | |
| token_nt:int; | |
| // Id of the nonterminal indicating a string of digits. | |
| digits_nt:int; | |
| // `n_digits_nt[k]` is the id of the nonterminal indicating a string of | |
| // `k` digits. | |
| n_digits_nt:[int]; | |
| // Id of the nonterminal indicating a word or token boundary. | |
| wordbreak_nt:int; | |
| // Id of the nonterminal indicating an uppercase token. | |
| uppercase_token_nt:int; | |
| // Predefined nonterminals for annotations. | |
| // Maps annotation/collection names to non-terminal ids. | |
| // Entries are keyed by name: `AnnotationNtEntry.key` carries the `(key)` | |
| // attribute. | |
| annotation_nt:[Nonterminals_.AnnotationNtEntry]; | |
| } | |
| // One key, value pair entry of `DebugInformation.nonterminal_names`. | |
| // NOTE(review): presumably maps a nonterminal id to its human-readable | |
| // name -- confirm against the code that writes this table. | |
| namespace libtextclassifier3.grammar.RulesSet_.DebugInformation_; | |
| table NonterminalNamesEntry { | |
| // The `(key)` attribute marks this field as the FlatBuffers sort/lookup | |
| // key for vectors of this table. | |
| key:int (key); | |
| value:string; | |
| } | |
| // Debug information, e.g. for printing parse trees and showing match | |
| // information. | |
| namespace libtextclassifier3.grammar.RulesSet_; | |
| table DebugInformation { | |
| // Map entries with an int key and a string value. | |
| // NOTE(review): presumably nonterminal id -> nonterminal name -- confirm. | |
| nonterminal_names:[DebugInformation_.NonterminalNamesEntry]; | |
| } | |
| // Regex annotators. | |
| namespace libtextclassifier3.grammar.RulesSet_; | |
| table RegexAnnotator { | |
| // The pattern to run. | |
| pattern:string; | |
| // NOTE(review): presumably a compressed form of `pattern`; which of the | |
| // two takes precedence when both are set is not stated here -- confirm. | |
| compressed_pattern:CompressedBuffer; | |
| // The nonterminal to trigger. | |
| nonterminal:uint; | |
| } | |
| // Context free grammar rules representation. | |
| // Rules are represented in (mostly) Chomsky Normal Form, where all rules are | |
| // of the following form, either: | |
| // * <nonterm> ::= term | |
| // * <nonterm> ::= <nonterm> | |
| // * <nonterm> ::= <nonterm> <nonterm> | |
| // The `terminal_rules`, `unary_rules` and `binary_rules` maps of | |
| // `RulesSet_.Rules` represent these sets of rules. | |
| namespace libtextclassifier3.grammar; | |
| table RulesSet { | |
| // The rule sets; each `Rules` entry carries the locales it applies to. | |
| rules:[RulesSet_.Rules]; | |
| // The (deduplicated) global vector of lhs sets, referenced by the | |
| // `lhs_set_index` / value fields of the rule maps. | |
| lhs_set:[RulesSet_.LhsSet]; | |
| // Lhs entries carrying callback information, referenced from | |
| // `LhsSet.lhs` by (negated) index. | |
| lhs:[RulesSet_.Lhs]; | |
| // Terminals string pool. | |
| // The strings are zero-byte delimited and offset indexed by | |
| // `terminal_offsets` in the terminals rules map. | |
| terminals:string; | |
| // Usage of the pre-defined nonterminals. | |
| nonterminals:RulesSet_.Nonterminals; | |
| // Deprecated placeholder. It must stay in place: without explicit ids, | |
| // FlatBuffers assigns field slots by declaration order, so removing it | |
| // would shift every field that follows. | |
| reserved_6:int16 (deprecated); | |
| // Optional debug information. | |
| debug_information:RulesSet_.DebugInformation; | |
| // The regex annotators. | |
| regex_annotator:[RulesSet_.RegexAnnotator]; | |
| // If true, will compile the regexes only on first use. | |
| lazy_regex_compilation:bool; | |
| // The semantic expressions associated with rule matches. | |
| semantic_expression:[SemanticExpression]; | |
| // The schema defining the semantic results. | |
| // NOTE(review): stored as raw bytes; presumably a serialized schema -- | |
| // confirm the expected format with the consumer. | |
| semantic_values_schema:[ubyte]; | |
| } | |