// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
include "bit-vector.fbs";
include "buffer.fbs";
include "codepoint-range.fbs";
include "entity-data.fbs";
include "experimental.fbs";
include "flatbuffers.fbs";
include "intent-config.fbs";
include "normalization.fbs";
include "resources.fbs";
include "rules.fbs";
include "tokenizer.fbs";

file_identifier "TC2 ";
// The possible model modes, represents a bit field.
// Combination values are bitwise ORs of ANNOTATION (1), CLASSIFICATION (2)
// and SELECTION (4).
namespace libtextclassifier3;
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}
// Enum for specifying the annotation usecase.
namespace libtextclassifier3;
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}
// Types of content that the datetime extractor patterns can match
// (months, weekdays, relative terms, spelled-out numbers, etc.).
namespace libtextclassifier3;
enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,

  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
  NOON = 73,
  MIDNIGHT = 74,
}
// Semantics of the capturing groups in datetime regex patterns
// (see DatetimeModelPattern_.Regex.groups).
namespace libtextclassifier3;
enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // Dummy groups serve just as an inflator of the selection. E.g. we might
  // want to select more text than was contained in an envelope of all
  // extractor spans.
  GROUP_DUMMY1 = 12,
  GROUP_DUMMY2 = 13,

  GROUP_ABSOLUTETIME = 14,
}
// Options for the model that predicts text selection.
namespace libtextclassifier3;
table SelectionModelOptions {
  // If true, before the selection is returned, the unpaired brackets contained
  // in the predicted selection are stripped from both selection ends.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // Number of hypothetical click positions on either side of the actual click
  // to consider in order to enforce symmetry.
  symmetry_context_size:int;

  // Number of examples to bundle in one batch for inference.
  batch_size:int = 1024;

  // Whether to always classify a suggested selection or only on demand.
  always_classify_suggested_selection:bool = false;
}
// Options for the model that classifies a text selection.
namespace libtextclassifier3;
table ClassificationModelOptions {
  // Limits for phone numbers (number of digits).
  phone_min_num_digits:int = 7;
  phone_max_num_digits:int = 15;

  // Limits for addresses: minimum number of tokens to consider a match.
  address_min_num_tokens:int;

  // Maximum number of tokens to attempt a classification (-1 is unlimited).
  max_num_tokens:int = -1;
}
// Options for post-checks, checksums and verification to apply on a match.
namespace libtextclassifier3;
table VerificationOptions {
  // If true, the matched number must pass a Luhn checksum check.
  verify_luhn_checksum:bool = false;

  // Lua verifier to use.
  // Index of the lua verifier in the model (-1 means no verifier).
  lua_verifier:int = -1;
}
// Behaviour of rule capturing groups.
// This specifies how the text and span of a capturing group, in a regular
// expression or from a capturing match in a grammar rule, should be handled.
namespace libtextclassifier3;
table CapturingGroup {
  // If true, the span of the capturing group will be used to
  // extend the selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group will be used to set a field in
  // the classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data will be merged with the
  // classification result entity data.
  serialized_entity_data:string;

  // If set, normalization to apply before text is used in entity data.
  normalization_options:NormalizationOptions;

  // Unserialized counterpart of serialized_entity_data; merged into the
  // classification result entity data.
  entity_data:EntityData;
}
// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;
table Pattern {
  // The name of the collection of a match.
  collection_name:string;

  // The pattern to check.
  pattern:string;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, will use an approximate matching implementation implemented
  // using Find() instead of the true Match(). This approximate matching will
  // use the first Find() result and then check that it spans the whole input.
  use_approximate_matching:bool = false;

  // Compressed form of `pattern` (used in its place when set).
  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  // Behaviour of capturing groups in the pattern.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string;
  entity_data:EntityData;
}
// The regex-based annotator model: a set of patterns plus verification
// scripts.
namespace libtextclassifier3;
table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  //   * `context`: The context as a string.
  //   * `match`: The groups of the regex match as an array, each group gives
  //       * `begin`: span start
  //       * `end`: span end
  //       * `text`: the text
  // The verifier is expected to return a boolean, indicating whether the
  // verification succeeded or not.
  lua_verifier:[string];
}
// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;
table Regex {
  pattern:string;

  // The ith entry specifies the type of the ith capturing group.
  // This is used to decide how the matched content has to be parsed.
  groups:[DatetimeGroupType];

  // Compressed form of `pattern` (used in its place when set).
  compressed_pattern:CompressedBuffer;
}
// A group of datetime regexes together with their scoring and applicability
// settings.
namespace libtextclassifier3;
table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // List of locale indices in DatetimeModel that represent the locales that
  // these patterns should be used for. If empty, can be used for all locales.
  locales:[int];

  // The final score to assign to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which to apply the patterns.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to apply the patterns.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;
}
// A regex pattern that extracts one type of datetime content
// (see DatetimeExtractorType).
namespace libtextclassifier3;
table DatetimeModelExtractor {
  // The type of content this extractor matches.
  extractor:DatetimeExtractorType;

  // The regex pattern for this extractor.
  pattern:string;

  // Indices into DatetimeModel.locales this extractor applies to.
  locales:[int];

  // Compressed form of `pattern` (used in its place when set).
  compressed_pattern:CompressedBuffer;
}
// The regex-based datetime annotator model.
namespace libtextclassifier3;
table DatetimeModel {
  // List of BCP 47 locale strings representing all locales supported by the
  // model. The individual patterns refer back to them using an index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, will use the extractors for determining the match location as
  // opposed to using the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // List of locale ids whose rules are always run, after the requested ones.
  default_locales:[int];

  // If true, will generate the alternative interpretations for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, will compile the regexes only on first use.
  lazy_regex_compilation:bool = true;

  // If true, will give only future dates (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}
// Configuration for the tokenizer.
namespace libtextclassifier3;
table GrammarTokenizerOptions {
  tokenization_type:TokenizationType = ICU;

  // If true, white space tokens will be kept when using the icu tokenizer.
  icu_preserve_whitespace_tokens:bool = false;

  // Codepoint ranges that determine what role the different codepoints play
  // during tokenization. The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;
}
// A named entry in a DatetimeModelLibrary.
namespace libtextclassifier3.DatetimeModelLibrary_;
table Item {
  // Name under which the model is registered.
  key:string;
  value:DatetimeModel;
}
// A set of named DateTime models.
namespace libtextclassifier3;
table DatetimeModelLibrary {
  models:[DatetimeModelLibrary_.Item];
}
// Classification result to instantiate for a rule match.
namespace libtextclassifier3.GrammarModel_;
table RuleClassificationResult {
  // The name of the collection.
  collection_name:string;

  // The score.
  target_classification_score:float = 1;

  // The priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // Behaviour of capturing matches.
  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string;

  // Enabled modes.
  enabled_modes:ModeFlag = ALL;

  // Unserialized counterpart of serialized_entity_data.
  entity_data:EntityData;
}
// Configuration for grammar based annotators.
namespace libtextclassifier3;
table GrammarModel {
  // The grammar rules.
  rules:grammar.RulesSet;

  // Classification results, indexed by the rules that produce them.
  rule_classification_result:[GrammarModel_.RuleClassificationResult];

  // Number of tokens in the context to use for classification and text
  // selection suggestion.
  // A value -1 uses the full context.
  context_left_num_tokens:int;
  context_right_num_tokens:int;

  // Grammar specific tokenizer options.
  tokenizer_options:GrammarTokenizerOptions;
}
// Map entry: quantity name (e.g. "million") -> power-of-ten exponent.
// See MoneyParsingOptions.quantities_name_to_exponent.
namespace libtextclassifier3.MoneyParsingOptions_;
table QuantitiesNameToExponentEntry {
  key:string (key);
  value:int;
}
// Options for parsing money amounts.
namespace libtextclassifier3;
table MoneyParsingOptions {
  // Separators (codepoints) marking decimal or thousand in the money amount.
  separators:[int];

  // Mapping between a quantity string (e.g. "million") and the power of 10
  // it multiplies the amount with (e.g. 6 in case of "million").
  // NOTE: The entries need to be sorted by key since we use LookupByKey.
  quantities_name_to_exponent:[MoneyParsingOptions_.QuantitiesNameToExponentEntry];
}
// Map entry: collection name -> priority score factor.
// See ModelTriggeringOptions.collection_to_priority.
namespace libtextclassifier3.ModelTriggeringOptions_;
table CollectionToPriorityEntry {
  key:string (key);
  value:float;
}
// Options controlling the output of the Tensorflow Lite models.
namespace libtextclassifier3;
table ModelTriggeringOptions {
  // Lower bound threshold for filtering annotation model outputs.
  min_annotate_confidence:float = 0;

  // The modes for which to enable the models.
  enabled_modes:ModeFlag = ALL;

  // Comma-separated list of locales (BCP 47 tags) that dictionary
  // classification supports.
  dictionary_locales:string;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  locales:string;

  // Priority score assigned to the "other" class from ML model.
  other_collection_priority_score:float = -1000;

  // Priority score assigned to knowledge engine annotations.
  knowledge_priority_score:float = 0;

  // Deprecated field, kept to preserve the field id.
  reserved_7:int16 (deprecated);

  // Apply a factor to the priority score for entities that are added to this
  // map. Key: collection type e.g. "address", "phone"..., Value: float number.
  // NOTE: The entries here need to be sorted since we use LookupByKey.
  collection_to_priority:[ModelTriggeringOptions_.CollectionToPriorityEntry];
}
// Options controlling the output of the classifier.
namespace libtextclassifier3;
table OutputOptions {
  // Lists of collection names that will be filtered out at the output:
  //   - For annotation, the spans of given collection are simply dropped.
  //   - For classification, the result is mapped to the class "other".
  //   - For selection, the spans of given class are returned as
  //     single-selection.
  filtered_collections_annotation:[string];
  filtered_collections_classification:[string];
  filtered_collections_selection:[string];
}
// Pruning configuration for the embedding matrix: maps hashed charactergram
// buckets that were pruned away to a single shared row.
namespace libtextclassifier3.Model_;
table EmbeddingPruningMask {
  // If true, use pruning mask. In this case, we use mask
  // pruning_mask to determine the mapping of hashed-charactergrams.
  enabled:bool;

  // Packing of the binary pruning mask into uint64 values.
  pruning_mask:[ulong] (force_align: 16);

  // Number of buckets before pruning.
  full_num_buckets:int;

  // Index of row of compressed embedding matrix to which all pruned buckets
  // are mapped.
  pruned_row_bucket_id:int;
}
// Options for resolving conflicts between overlapping annotations.
namespace libtextclassifier3.Model_;
table ConflictResolutionOptions {
  // If true, will prioritize the longest annotation during conflict
  // resolution.
  prioritize_longest_annotation:bool = false;

  // If true, the annotator will perform conflict resolution between the
  // different sub-annotators also in the RAW mode. If false, no conflict
  // resolution will be performed in RAW mode.
  do_conflict_resolution_in_raw_mode:bool = true;
}
// The top-level annotator model, bundling the TFLite models, the rule-based
// sub-annotators and all of their configuration.
namespace libtextclassifier3;
table Model {
  // Comma-separated list of locales supported by the model as BCP 47 tags.
  locales:string;

  version:int;

  // A name for the model that can be used for e.g. logging.
  name:string;

  selection_feature_options:FeatureProcessorOptions;
  classification_feature_options:FeatureProcessorOptions;

  // Tensorflow Lite models.
  selection_model:[ubyte] (force_align: 16);
  classification_model:[ubyte] (force_align: 16);
  embedding_model:[ubyte] (force_align: 16);

  // Options for the different models.
  selection_options:SelectionModelOptions;
  classification_options:ClassificationModelOptions;
  regex_model:RegexModel;
  datetime_model:DatetimeModel;

  // Options controlling the output of the models.
  triggering_options:ModelTriggeringOptions;

  // Global switch that controls if SuggestSelection(), ClassifyText() and
  // Annotate() will run. If a mode is disabled it returns empty/no-op results.
  enabled_modes:ModeFlag = ALL;

  // If true, will snap the selections that consist only of whitespaces to the
  // containing suggested span. Otherwise, no suggestion is proposed, since the
  // selections are not part of any token.
  snap_whitespace_selections:bool = true;

  // Global configuration for the output of SuggestSelection(), ClassifyText()
  // and Annotate().
  output_options:OutputOptions;

  // Configures how Intents should be generated on Android.
  android_intent_options:AndroidIntentFactoryOptions;
  intent_options:IntentFactoryModel;

  // Model resources.
  resources:ResourcePool;

  // Schema data for handling entity data.
  entity_data_schema:[ubyte];

  number_annotator_options:NumberAnnotatorOptions;
  duration_annotator_options:DurationAnnotatorOptions;

  // Comma-separated list of locales (BCP 47 tags) that the model supports,
  // that are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string;

  embedding_pruning_mask:Model_.EmbeddingPruningMask;

  // Deprecated field, kept to preserve the field id.
  reserved_25:int16 (deprecated);

  contact_annotator_options:ContactAnnotatorOptions;
  money_parsing_options:MoneyParsingOptions;
  translate_annotator_options:TranslateAnnotatorOptions;
  grammar_model:GrammarModel;
  conflict_resolution_options:Model_.ConflictResolutionOptions;
  experimental_model:ExperimentalModel;
  pod_ner_model:PodNerModel;
  vocab_model:VocabModel;
}
// Method for selecting the center token.
namespace libtextclassifier3.FeatureProcessorOptions_;
enum CenterTokenSelectionMethod : int {
  // Invalid option.
  DEFAULT_CENTER_TOKEN_METHOD = 0,

  // Use click indices to determine the center token.
  CENTER_TOKEN_FROM_CLICK = 1,

  // Use selection indices to get a token range, and select the middle of it
  // as the center token.
  CENTER_TOKEN_MIDDLE_OF_SELECTION = 2,
}
// Bounds-sensitive feature extraction configuration.
namespace libtextclassifier3.FeatureProcessorOptions_;
table BoundsSensitiveFeatures {
  // Enables the extraction of bounds-sensitive features, instead of the click
  // context features.
  enabled:bool;

  // The numbers of tokens to extract in specific locations relative to the
  // bounds.
  // Immediately before the span.
  num_tokens_before:int;
  // Inside the span, aligned with the beginning.
  num_tokens_inside_left:int;
  // Inside the span, aligned with the end.
  num_tokens_inside_right:int;
  // Immediately after the span.
  num_tokens_after:int;

  // If true, also extracts the tokens of the entire span and adds up their
  // features forming one "token" to include in the extracted features.
  include_inside_bag:bool;

  // If true, includes the selection length (in the number of tokens) as a
  // feature.
  include_inside_length:bool;

  // If true, for selection, single token spans are not run through the model
  // and their score is assumed to be zero.
  score_single_token_spans_as_zero:bool;
}
// Options for the feature processor that turns text into model inputs.
namespace libtextclassifier3;
table FeatureProcessorOptions {
  // Number of buckets used for hashing charactergrams.
  num_buckets:int = -1;

  // Size of the embedding.
  embedding_size:int = -1;

  // Number of bits for quantization for embeddings.
  embedding_quantization_bits:int = 8;

  // Context size defines the number of words to the left and to the right of
  // the selected word to be used as context. For example, if context size is
  // N, then we take N words to the left and N words to the right of the
  // selected word as its context.
  context_size:int = -1;

  // Maximum number of words of the context to select in total.
  max_selection_span:int = -1;

  // Orders of charactergrams to extract. E.g., 2 means character bigrams, 3
  // character trigrams etc.
  chargram_orders:[int];

  // Maximum length of a word, in codepoints.
  max_word_length:int = 20;

  // If true, will use the unicode-aware functionality for extracting features.
  unicode_aware_features:bool = false;

  // Whether to extract the token case feature.
  extract_case_feature:bool = false;

  // Whether to extract the selection mask feature.
  extract_selection_mask_feature:bool = false;

  // List of regexps to run over each token. For each regexp, if there is a
  // match, a dense feature of 1.0 is emitted. Otherwise -1.0 is used.
  regexp_feature:[string];

  // Whether to remap all digits to a single number.
  remap_digits:bool = false;

  // Whether to lower-case each token before generating hashgrams.
  lowercase_tokens:bool;

  // If true, the selection classifier output will contain only the selections
  // that are feasible (e.g., those that are shorter than max_selection_span),
  // if false, the output will be a complete cross-product of possible
  // selections to the left and possible selections to the right, including the
  // infeasible ones.
  // NOTE: Exists mainly for compatibility with older models that were trained
  // with the non-reduced output space.
  selection_reduced_output_space:bool = true;

  // Collection names.
  collections:[string];

  // An index of collection in collections to be used if a collection name
  // can't be mapped to an id.
  default_collection:int = -1;

  // If true, will split the input by lines, and only use the line that
  // contains the clicked token.
  only_use_line_with_click:bool = false;

  // If true, will split tokens that contain the selection boundary, at the
  // position of the boundary.
  // E.g. "foo{bar}@google.com" -> "foo", "bar", "@google.com"
  split_tokens_on_selection_boundaries:bool = false;

  // Codepoint ranges that determine how different codepoints are tokenized.
  // The ranges must not overlap.
  tokenization_codepoint_config:[TokenizationCodepointRange];

  center_token_selection_method:FeatureProcessorOptions_.CenterTokenSelectionMethod;

  // If true, span boundaries will be snapped to containing tokens and not
  // required to exactly match token boundaries.
  snap_label_span_boundaries_to_containing_tokens:bool;

  // A set of codepoint ranges supported by the model.
  supported_codepoint_ranges:[CodepointRange];

  // A set of codepoint ranges to use in the mixed tokenization mode to
  // identify stretches of tokens to re-tokenize using the internal tokenizer.
  internal_tokenizer_codepoint_ranges:[CodepointRange];

  // Minimum ratio of supported codepoints in the input context. If the ratio
  // is lower than this, the feature computation will fail.
  min_supported_codepoint_ratio:float = 0;

  // Used for versioning the format of features the model expects.
  //  - feature_version == 0:
  //      For each token the features consist of:
  //       - chargram embeddings
  //       - dense features
  //      Chargram embeddings for tokens are concatenated first together,
  //      and at the end, the dense features for the tokens are concatenated
  //      to it. So the resulting feature vector has two regions.
  feature_version:int = 0;

  tokenization_type:TokenizationType = INTERNAL_TOKENIZER;
  icu_preserve_whitespace_tokens:bool = false;

  // List of codepoints that will be stripped from beginning and end of
  // predicted spans.
  ignored_span_boundary_codepoints:[int];

  bounds_sensitive_features:FeatureProcessorOptions_.BoundsSensitiveFeatures;

  // List of allowed charactergrams. The extracted charactergrams are filtered
  // using this list, and charactergrams that are not present are interpreted
  // as out-of-vocabulary.
  // If no allowed_chargrams are specified, all charactergrams are allowed.
  // The field is typed as bytes type to allow non-UTF8 chargrams.
  allowed_chargrams:[string];

  // If true, tokens will be also split when the codepoint's script_id changes
  // as defined in TokenizationCodepointRange.
  tokenize_on_script_change:bool = false;

  // If true, the pipe character '|' will be used as a newline character when
  // splitting lines.
  use_pipe_character_for_newline:bool = true;
}
// Options for the number/percentage annotator.
namespace libtextclassifier3;
table NumberAnnotatorOptions {
  // If true, number and percentage annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated numbers and percentages in the annotator.
  score:float = 1;

  // Number priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable number and percentage annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce number annotations.
  // This is a flag field for values of AnnotationUsecase.
  enabled_annotation_usecases:uint = 4294967295;

  // [Deprecated] A list of codepoints that can form a prefix of a valid
  // number.
  allowed_prefix_codepoints:[int];

  // [Deprecated] A list of codepoints that can form a suffix of a valid
  // number.
  allowed_suffix_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from beginning of
  // predicted spans.
  ignored_prefix_span_boundary_codepoints:[int];

  // [Deprecated] List of codepoints that will be stripped from end of
  // predicted spans.
  ignored_suffix_span_boundary_codepoints:[int];

  // [Deprecated] If true, percent annotations will be produced.
  enable_percentage:bool = false;

  // Zero separated and ordered list of suffixes that mark a percent.
  percentage_pieces_string:string;

  // [Deprecated] List of suffixes offsets in the percent_pieces_string string.
  percentage_pieces_offsets:[int];

  // Priority score for the percentage annotation.
  percentage_priority_score:float = 1;

  // Float number priority score used for conflict resolution with the other
  // models.
  float_number_priority_score:float = 0;

  // The maximum number of digits an annotated number can have. Requirement:
  // the value should be less or equal to 20.
  max_number_of_digits:int = 20;

  // The annotation usecases for which to produce percentage annotations.
  // This is a flag field for values of AnnotationUsecase.
  percentage_annotation_usecases:uint = 2;
}
// DurationAnnotator is so far tailored for English and Japanese only.
namespace libtextclassifier3;
table DurationAnnotatorOptions {
  // If true, duration annotations will be produced.
  enabled:bool = false;

  // Score to assign to the annotated durations from the annotator.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes in which to enable duration annotations.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which to produce duration annotations.
  enabled_annotation_usecases:uint = 4294967295;

  // Durations typically look like XX hours and XX minutes etc... The list of
  // strings below enumerate variants of "hours", "minutes", etc. in these
  // expressions. These are verbatim strings that are matched against tokens in
  // the input.
  week_expressions:[string];
  day_expressions:[string];
  hour_expressions:[string];
  minute_expressions:[string];
  second_expressions:[string];

  // List of expressions that don't break a duration expression (can become
  // a part of it) but have no semantic meaning.
  filler_expressions:[string];

  // List of expressions that mean half of a unit of duration (e.g. "half an
  // hour").
  half_expressions:[string];

  // Set of codepoints that can split the Annotator tokens to sub-tokens for
  // sub-token matching.
  sub_token_separator_codepoints:[int];

  // If this is true, unit must be associated with quantity. For example, a
  // phrase "minute" is not parsed as one minute duration if this is true.
  require_quantity:bool;

  // If this is true, dangling quantity is included in the annotation. For
  // example, "10 minutes 20" is interpreted as 10 minutes and 20 seconds.
  enable_dangling_quantity_interpretation:bool = true;
}
// Options for the contact annotator.
namespace libtextclassifier3;
table ContactAnnotatorOptions {
  // Supported for English genitives only so far.
  enable_declension:bool;

  // For each language there is a customized list of supported declensions.
  language:string;
}
// Algorithm used by the translate annotator to decide whether to propose a
// translation.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
enum Algorithm : int {
  DEFAULT_ALGORITHM = 0,
  BACKOFF = 1,
}
// Backoff is the algorithm shipped with Android Q.
namespace libtextclassifier3.TranslateAnnotatorOptions_;
table BackoffOptions {
  // The minimum size of text to prefer for detection (in codepoints).
  min_text_size:int = 20;

  // For reducing the score when text is less than the preferred size.
  penalize_ratio:float = 1;

  // Original detection score to surrounding text detection score ratios.
  subject_text_score_ratio:float = 0.4;
}
// Options for the translate annotator.
namespace libtextclassifier3;
table TranslateAnnotatorOptions {
  enabled:bool = false;

  // Score to assign to the classification results.
  score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float;

  algorithm:TranslateAnnotatorOptions_.Algorithm;
  backoff_options:TranslateAnnotatorOptions_.BackoffOptions;
}
namespace libtextclassifier3.PodNerModel_;

// A named entity collection recognized by the POD NER model.
table Collection {
  // Collection's name (e.g., "location", "person").
  name:string;

  // Priority scores used for conflict resolution with the other annotators
  // when the annotation is made over a single/multi token text.
  single_token_priority_score:float;
  multi_token_priority_score:float;
}
namespace libtextclassifier3.PodNerModel_.Label_;

// Token tag in the BOISE (Begin/Outside/Intermediate/Single/End) tagging
// scheme, as indicated by the value names.
enum BoiseType : int {
  NONE = 0,

  // First token of a multi-token mention.
  BEGIN = 1,

  // Outside of any mention — no label.
  O = 2,

  // Interior token of a multi-token mention.
  INTERMEDIATE = 3,

  // The only token of a single-token mention.
  SINGLE = 4,

  // Last token of a multi-token mention.
  END = 5,
}
namespace libtextclassifier3.PodNerModel_.Label_;

// Type of an entity mention. NAM/NOM presumably follow the standard
// named/nominal mention distinction — TODO confirm against the model's
// training labels.
enum MentionType : int {
  UNDEFINED = 0,
  NAM = 1,
  NOM = 2,
}
namespace libtextclassifier3.PodNerModel_;

// A single output label of the POD NER model: a BOISE tag, a mention type,
// and the entity collection it belongs to.
table Label {
  boise_type:Label_.BoiseType;
  mention_type:Label_.MentionType;

  // Points to the PodNerModel.collections array.
  collection_id:int;
}
namespace libtextclassifier3;

// Configuration and weights for the POD NER annotator.
table PodNerModel {
  // Serialized TFLite model used for NER inference.
  tflite_model:[ubyte];

  // Serialized wordpiece vocabulary used to tokenize the input.
  word_piece_vocab:[ubyte];

  // Whether to lowercase the input before wordpiece tokenization.
  lowercase_input:bool = true;

  // Index of mention_logits tensor in the output of the tflite model. Can
  // be found in the textproto output after model is converted to tflite.
  logits_index_in_output_tensor:int = 0;

  // Whether to append a period at the end of an input that doesn't already
  // end in punctuation.
  append_final_period:bool = false;

  // Priority score used for conflict resolution with the other models. Used
  // only if collections_array is empty.
  priority_score:float = 0;

  // Maximum number of wordpieces supported by the model.
  max_num_wordpieces:int = 128;

  // In case of long text (number of wordpieces greater than the max) we use
  // sliding window approach, this determines the number of overlapping
  // wordpieces between two consecutive windows. This overlap enables context
  // for each word NER annotates.
  sliding_window_num_wordpieces_overlap:int = 20;

  // Deprecated field; slot kept so later field IDs stay stable.
  reserved_9:int16 (deprecated);

  // The possible labels the ner model can output. If empty the default labels
  // will be used.
  labels:[PodNerModel_.Label];

  // If the ratio of unknown wordpieces in the input text is greater than this
  // maximum, the text won't be annotated.
  max_ratio_unknown_wordpieces:float = 0.1;

  // Possible collections for labeled entities.
  collections:[PodNerModel_.Collection];

  // Minimum word-length and wordpieces-length required for the text to be
  // annotated.
  min_number_of_tokens:int = 1;
  min_number_of_wordpieces:int = 1;
}
namespace libtextclassifier3;

// Model for the vocabulary ("Define") annotator.
table VocabModel {
  // A trie that stores a list of vocabs that triggers "Define". A id is
  // returned when looking up a vocab from the trie and the id can be used
  // to access more information about that vocab. The marisa trie library
  // requires 8-byte alignment because the first thing in a marisa trie is a
  // 64-bit integer.
  vocab_trie:[ubyte] (force_align: 8);

  // A bit vector that tells if the vocab should trigger "Define" for users of
  // beginner proficiency only. To look up the bit vector, use the id returned
  // by the trie.
  beginner_level:BitVectorData;

  // A bit vector that tells if a vocab should not trigger "Define" when its
  // leading character is in upper case. To look up the bit vector, use the id
  // returned by the trie.
  // NOTE(review): an earlier comment described this as a sorted index list to
  // binary-search, but the field type is BitVectorData — confirm the intended
  // lookup semantics.
  do_not_trigger_in_upper_case:BitVectorData;

  // Comma-separated list of locales (BCP 47 tags) that the model supports, that
  // are used to prevent triggering on input in unsupported languages. If
  // empty, the model will trigger on all inputs.
  triggering_locales:string;

  // The final score to assign to the results of the vocab model
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;
}
| root_type libtextclassifier3.Model; |