/*!
 * Copyright (c) 2023 by Contributors
 * \file tokenizers.h
 * \brief C binding to the tokenizers library, with a thin C++ wrapper.
 */
#ifndef TOKENIZERS_H_
#define TOKENIZERS_H_
#include <stddef.h>
#include <stdint.h>

// The C API
#ifdef __cplusplus
extern "C" {
#endif
typedef void* TokenizerHandle;  // opaque handle to a tokenizer instance

// Create a tokenizer from a JSON configuration string.
TokenizerHandle tokenizers_new_from_str(const char* json, size_t len);
// Create a byte-level BPE tokenizer from vocab, merges, and added-token strings.
TokenizerHandle byte_level_bpe_tokenizers_new_from_str(const char* vocab, size_t vocab_len,
                                                       const char* merges, size_t merges_len,
                                                       const char* added_tokens,
                                                       size_t added_tokens_len);
// Encode text into ids; fetch the result with tokenizers_get_encode_ids.
void tokenizers_encode(TokenizerHandle handle, const char* data, size_t len, int add_special_token);
// Decode ids into text; fetch the result with tokenizers_get_decode_str.
void tokenizers_decode(TokenizerHandle handle, const uint32_t* data, size_t len,
                       int skip_special_token);
// Get the string produced by the last tokenizers_decode call.
void tokenizers_get_decode_str(TokenizerHandle handle, const char** data, size_t* len);
// Get the ids produced by the last tokenizers_encode call.
void tokenizers_get_encode_ids(TokenizerHandle handle, const uint32_t** id_data, size_t* len);
// Destroy the tokenizer and release its memory.
void tokenizers_free(TokenizerHandle handle);
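
// A minimal sketch of the intended C-API call sequence (the input names are
// illustrative; the handle caches the result of the last encode/decode call):
//
//   TokenizerHandle h = tokenizers_new_from_str(json, json_len);
//   tokenizers_encode(h, "hello", 5, /*add_special_token=*/0);
//   const uint32_t* ids; size_t n;
//   tokenizers_get_encode_ids(h, &ids, &n);
//   tokenizers_decode(h, ids, n, /*skip_special_token=*/0);
//   const char* str; size_t str_len;
//   tokenizers_get_decode_str(h, &str, &str_len);
//   tokenizers_free(h);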
#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
#include <string>
#include <utility>  // for std::swap
#include <vector>
// Simple C++ binding on top of the C API.
namespace tokenizers {
/*!
 * \brief A simple C++ wrapper around the tokenizer C API.
 */
class Tokenizer {
 public:
  // A Tokenizer uniquely owns its underlying handle: movable but not copyable.
  Tokenizer(const Tokenizer&) = delete;
  Tokenizer(Tokenizer&& other) { std::swap(other.handle_, handle_); }

  ~Tokenizer() {
    if (handle_ != nullptr) {
      tokenizers_free(handle_);
    }
  }
  // Use int32_t ids to be consistent with sentencepiece.
std::vector<int32_t> Encode(const std::string& text, bool add_special_token) {
tokenizers_encode(handle_, text.data(), text.length(), static_cast<int>(add_special_token));
const uint32_t* data;
size_t len;
tokenizers_get_encode_ids(handle_, &data, &len);
return std::vector<int32_t>(data, data + len);
}
  // Use int32_t ids to be consistent with sentencepiece.
std::string Decode(const std::vector<int32_t>& ids, bool skip_special_token) {
tokenizers_decode(handle_, reinterpret_cast<const uint32_t*>(ids.data()), ids.size(),
static_cast<int>(skip_special_token));
const char* data;
size_t len;
tokenizers_get_decode_str(handle_, &data, &len);
return std::string(data, len);
}
  // No FromFile factory is provided, so the binding carries no file-system dependency.
static Tokenizer FromJSON(const std::string& json) {
return Tokenizer(tokenizers_new_from_str(json.data(), json.length()));
}
static Tokenizer FromBPE(const std::string& vocab, const std::string& merges,
const std::string& added_tokens) {
return Tokenizer(byte_level_bpe_tokenizers_new_from_str(
vocab.data(), vocab.length(), merges.data(), merges.length(), added_tokens.data(),
added_tokens.length()));
}

 private:
  // Wrap a raw C handle; only reachable through the factory functions above.
  explicit Tokenizer(TokenizerHandle handle) : handle_(handle) {}
// internal handle
TokenizerHandle handle_{nullptr};
};
} // namespace tokenizers
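
// A minimal usage sketch of the C++ wrapper, assuming `blob` holds a tokenizer
// JSON config obtained by some caller-provided loader (names are illustrative):
//
//   std::string blob = /* read tokenizer JSON from anywhere */;
//   tokenizers::Tokenizer tok = tokenizers::Tokenizer::FromJSON(blob);
//   std::vector<int32_t> ids = tok.Encode("hello world", /*add_special_token=*/true);
//   std::string text = tok.Decode(ids, /*skip_special_token=*/true);
//
// FromBPE works the same way, taking vocab, merges, and added-token strings.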
#endif  // __cplusplus
#endif  // TOKENIZERS_H_