// Copyright 2025 The ODML Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_SENTENCEPIECE_TOKENIZER_H_ #define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_SENTENCEPIECE_TOKENIZER_H_ #include #include #include #include #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "runtime/components/tokenizer.h" #include "sentencepiece_model.pb.h" // from @sentencepiece #include "sentencepiece_processor.h" // from @sentencepiece namespace litert::lm { // A Tokenizer implementation using SentencePiece. class SentencePieceTokenizer : public Tokenizer { public: // Creates a SentencePieceTokenizer from the given model path. // Note that the model path can only be a local file path but not a CNS path. static absl::StatusOr> CreateFromFile( absl::string_view model_path); // Creates a SentencePieceTokenizer from a preloaded model buffer. static absl::StatusOr> CreateFromBuffer(absl::string_view model_buffer); // Creates a SentencePieceTokenizer from a model proto. static absl::StatusOr> CreateFromProto(std::unique_ptr model_proto); TokenizerType GetTokenizerType() const override { return TokenizerType::kSentencePiece; } // Encodes the given text into a sequence of token ids. absl::StatusOr> TextToTokenIds( absl::string_view text) override; // Converts a token string to its token id. Uses SentencePiece's // PieceToId method. absl::StatusOr TokenToId(absl::string_view token) override; // Decodes the given sequence of token ids into a string. absl::StatusOr TokenIdsToText( const std::vector& token_ids) override; // Returns the tokens in the SentencePiece model. std::vector GetTokens() const override; const sentencepiece::SentencePieceProcessor& GetProcessor() const { return *processor_; } private: // Constructor. explicit SentencePieceTokenizer( std::unique_ptr processor) : processor_(std::move(processor)), vocab_size_(processor_->GetPieceSize()) {}; // SentencePiece processor. std::unique_ptr processor_; // The size of the vocabulary. Used to avoid decoding the invalid IDs that are // out of the range of the vocabulary. int vocab_size_; }; } // namespace litert::lm #endif // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_SENTENCEPIECE_TOKENIZER_H_