| // Copyright 2020 Google LLC | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // https://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
| // | |
| // Configuration for the text encoder op. | |
| namespace libtextclassifier3; | |
| // Kind of sentence piece matcher, selected through | |
| // `TextEncoderConfig.matcher_type`. Values are stored as a `byte`. | |
| // NOTE(review): the matcher implementations are not part of this schema; | |
| // presumably each value names the lookup structure that the serialized | |
| // `pieces` data is laid out for -- confirm against the encoder code. | |
| enum SentencePieceMatcherType : byte { | |
| // Default value of `TextEncoderConfig.matcher_type`. | |
| MAPPED_TRIE = 0, | |
| SORTED_STRING_TABLE = 1, | |
| } | |
| // Configuration for the text encoder op: special codes, text normalization | |
| // options and the serialized sentence piece data. | |
| // | |
| // The fields carry no explicit `id` attributes, so their declaration order | |
| // determines the binary layout: do not reorder or remove fields, and append | |
| // new ones at the end. Scalar fields that are absent from a buffer read back | |
| // as the default declared here (0 when none is declared). | |
| table TextEncoderConfig { | |
| // Code that is used as encoding of the start code. | |
| start_code:int32 = 0; | |
| // Code that is used as encoding of the end code. | |
| end_code:int32 = 1; | |
| // This value is added to all codes to make them not intersect with | |
| // `start_code` and `end_code`. | |
| encoding_offset:int32 = 2; | |
| // Code that is used for out-of-dictionary characters. | |
| unknown_code:int32 = -1; | |
| // Penalty associated with the unknown code. | |
| // No default is declared, so an absent value reads as 0.0. | |
| unknown_score:float; | |
| // Normalization options. | |
| // Serialized normalization charsmap. | |
| normalization_charsmap:string; | |
| // NOTE(review): undocumented in the original schema; presumably the value | |
| // data that accompanies `normalization_charsmap` -- TODO confirm. | |
| normalization_charsmap_values:string; | |
| // Whether to add dummy whitespace at the beginning of the text in order to | |
| // treat "world" in "world" and "hello world" uniformly. | |
| add_dummy_prefix:bool = true; | |
| // Whether to remove leading, trailing and duplicate internal whitespace. | |
| remove_extra_whitespaces:bool = true; | |
| // Whether to replace whitespace with a meta symbol. | |
| escape_whitespaces:bool = true; | |
| // Sentence pieces scores. | |
| pieces_scores:[float]; | |
| // Serialized sentence pieces. | |
| pieces:string; | |
| // NOTE(review): undocumented in the original schema; presumably offsets | |
| // into `pieces` locating the individual pieces -- TODO confirm the unit | |
| // and whether this is index-aligned with `pieces_scores`. | |
| pieces_offsets:[uint32]; | |
| // Matcher kind to use for the sentence pieces; `MAPPED_TRIE` when unset. | |
| matcher_type: SentencePieceMatcherType = MAPPED_TRIE; | |
| } | |