// tokenizer.h — minimal Qwen3 tokenizer.
//
// M2-phase1: decode() is native C++ (simple vocab lookup). encode() is a Python subprocess
// (one-time cost at prompt setup). Native BPE encode is a future item.
//
#pragma once
#include <string>
#include <vector>
#include <cstdint>
#include <utility>  // std::pair, used in encode_conversation_via_python()

class Tokenizer {
public:
    bool load(const std::string& vocab_bin_path);

    // Decode a single token id to a UTF-8 string.
    std::string decode(int token_id) const;

    // Decode a list of token ids to one concatenated UTF-8 string.
    std::string decode(const std::vector<int>& token_ids) const;

    // Encode a prompt to token ids. Uses a Python subprocess, since Qwen3 requires
    // proper BPE. The subprocess call costs ~200 ms but runs only once per prompt.
    std::vector<int> encode_via_python(const std::string& model_dir,
                                       const std::string& prompt,
                                       bool apply_chat_template = false) const;

    // Encode a multi-turn conversation by applying the model's chat template. Each pair is
    // (role, content) — typical roles: "system", "user", "assistant". Uses Python subprocess.
    std::vector<int> encode_conversation_via_python(
        const std::string& model_dir,
        const std::vector<std::pair<std::string, std::string>>& conversation,
        bool add_generation_prompt = true) const;
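
    // Example (a sketch; `tok` and `model_dir` are illustrative names, not part
    // of this API):
    //
    //   std::vector<std::pair<std::string, std::string>> conv = {
    //       {"system", "You are a helpful assistant."},
    //       {"user",   "Give me a haiku about autumn."},
    //   };
    //   std::vector<int> ids = tok.encode_conversation_via_python(model_dir, conv);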

    // Number of entries in the loaded vocabulary.
    size_t size() const { return id_to_bytes_.size(); }

private:
    std::vector<std::string> id_to_bytes_;  // id -> raw utf-8 bytes
};
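
// Usage sketch (illustrative; the vocab path and model directory below are
// assumptions, not fixed by this header):
//
//   Tokenizer tok;
//   if (!tok.load("qwen3/vocab.bin")) { /* handle load failure */ }
//
//   // One-time, ~200 ms: BPE-encode the prompt via the Python subprocess.
//   std::vector<int> ids = tok.encode_via_python("qwen3", "Hello, world!",
//                                                /*apply_chat_template=*/true);
//
//   // Per-token, native C++: decode each sampled id as generation proceeds.
//   std::string piece = tok.decode(ids.back());
//   std::string text  = tok.decode(ids);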