// tokenizer.h — minimal Qwen3 tokenizer.
//
// M2-phase1: decode() is native C++ (simple vocab lookup). encode() is a Python subprocess
// (one-time cost at prompt setup). Native BPE encode is a future item.
//
#pragma once
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
class Tokenizer {
public:
  // Load the id -> raw-bytes vocabulary table from a binary file.
  // Returns true on success. NOTE(review): failure semantics (missing file vs.
  // malformed data) live in the .cpp — confirm there.
  bool load(const std::string& vocab_bin_path);

  // Decode a single token id to its UTF-8 byte string.
  std::string decode(int token_id) const;

  // Decode a list of token ids to one concatenated UTF-8 string.
  std::string decode(const std::vector<int>& token_ids) const;

  // Encode a prompt to token ids. Shells out to a Python subprocess because
  // Qwen3 needs proper BPE; the ~200ms subprocess cost is paid once per prompt.
  std::vector<int> encode_via_python(const std::string& model_dir,
                                     const std::string& prompt,
                                     bool apply_chat_template = false) const;

  // Encode a multi-turn conversation by applying the model's chat template.
  // Each pair is (role, content) — typical roles: "system", "user",
  // "assistant". Also uses the Python subprocess.
  std::vector<int> encode_conversation_via_python(
      const std::string& model_dir,
      const std::vector<std::pair<std::string, std::string>>& conversation,
      bool add_generation_prompt = true) const;

  // Number of vocabulary entries (0 until load() succeeds).
  // noexcept: vector::size() cannot throw, so advertise that to callers.
  size_t size() const noexcept { return id_to_bytes_.size(); }

private:
  std::vector<std::string> id_to_bytes_;  // token id -> raw UTF-8 bytes
};