/*! * Copyright (c) 2023 by Contributors * \file tokenizers.cc * \brief C binding to tokenizers */ #ifndef TOKENIZERS_H_ #define TOKENIZERS_H_ // The C API #ifdef __cplusplus extern "C" { #endif #include #include typedef void* TokenizerHandle; TokenizerHandle tokenizers_new_from_str(const char* json, size_t len); TokenizerHandle byte_level_bpe_tokenizers_new_from_str(const char* vocab, size_t vocab_len, const char* merges, size_t merges_len, const char* added_tokens, size_t added_tokens_len); void tokenizers_encode(TokenizerHandle handle, const char* data, size_t len, int add_special_token); void tokenizers_decode(TokenizerHandle handle, const uint32_t* data, size_t len, int skip_special_token); void tokenizers_get_decode_str(TokenizerHandle handle, const char** data, size_t* len); void tokenizers_get_encode_ids(TokenizerHandle handle, const uint32_t** id_data, size_t* len); void tokenizers_free(TokenizerHandle handle); #ifdef __cplusplus } #endif #ifdef __cplusplus #include #include // simple c++ binding namespace tokenizers { /*! * \brief A simple c++ header of tokenizer via C API. */ class Tokenizer { public: Tokenizer(const Tokenizer&) = delete; Tokenizer(Tokenizer&& other) { std::swap(other.handle_, handle_); } ~Tokenizer() { if (handle_ != nullptr) { tokenizers_free(handle_); } } // use i32 to be consistent with sentencepiece std::vector Encode(const std::string& text, bool add_special_token) { tokenizers_encode(handle_, text.data(), text.length(), static_cast(add_special_token)); const uint32_t* data; size_t len; tokenizers_get_encode_ids(handle_, &data, &len); return std::vector(data, data + len); } // use i32 to be consistent with sentencepiece std::string Decode(const std::vector& ids, bool skip_special_token) { tokenizers_decode(handle_, reinterpret_cast(ids.data()), ids.size(), static_cast(skip_special_token)); const char* data; size_t len; tokenizers_get_decode_str(handle_, &data, &len); return std::string(data, len); } // avoid from_file so we don't have file system static Tokenizer FromJSON(const std::string& json) { return Tokenizer(tokenizers_new_from_str(json.data(), json.length())); } static Tokenizer FromBPE(const std::string& vocab, const std::string& merges, const std::string& added_tokens) { return Tokenizer(byte_level_bpe_tokenizers_new_from_str( vocab.data(), vocab.length(), merges.data(), merges.length(), added_tokens.data(), added_tokens.length())); } private: Tokenizer(TokenizerHandle handle) : handle_(handle) {} // internal handle TokenizerHandle handle_{nullptr}; }; } // namespace tokenizers #endif #endif // TOKENIZERS_H_