#!/usr/bin/env python3
"""Export Qwen3 tokenizer vocab to a simple binary format.

Format (little-endian):
  u32 num_tokens
  for each id in [0, num_tokens):
    u32 byte_length
    u8[byte_length] utf8_bytes

Also emits special_tokens.txt with id + content pairs for reference.
"""
import json, sys, struct, os

model_dir = sys.argv[1] if len(sys.argv) > 1 else '/path/to/Qwen3-235B-A22B-Instruct-2507-BF16'
out_dir   = sys.argv[2] if len(sys.argv) > 2 else 'tokenizer_data'
os.makedirs(out_dir, exist_ok=True)

with open(os.path.join(model_dir, 'tokenizer.json'), 'r', encoding='utf-8') as f:
    tok = json.load(f)

# Byte-level decoder map: the HF Qwen tokenizer uses byte-level BPE like GPT-2.
# The encoder maps every raw byte to a printable Unicode character (bytes that are
# not printable Latin-1 are shifted up to code points U+0100 and above), and vocab
# entries are stored as strings of those characters. To recover raw UTF-8 bytes we
# only need the reverse map from those characters back to single bytes.
def build_byte_decoder():
    bs = list(range(ord('!'), ord('~')+1)) + list(range(ord('¡'), ord('¬')+1)) + list(range(ord('®'), ord('ÿ')+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return {chr(c): bytes([b]) for b, c in zip(bs, cs)}

byte_decoder = build_byte_decoder()
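
# Sanity check (illustrative; assumes the map matches the standard GPT-2 byte-level
# encoding, where the space byte encodes as 'Ġ' and newline as 'Ċ').
assert byte_decoder['Ġ'] == b' '
assert byte_decoder['Ċ'] == b'\n'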

# Merge vocab + added_tokens into id -> utf8_bytes lookup
vocab = tok['model']['vocab']  # {token_str: id}
added = tok.get('added_tokens', [])  # list of {id, content, ...}

id_to_bytes = {}
for token, tid in vocab.items():
    # Decode byte-level encoding back to raw utf8 bytes
    raw = b''
    for ch in token:
        if ch in byte_decoder:
            raw += byte_decoder[ch]
        else:
            raw += ch.encode('utf-8')
    id_to_bytes[int(tid)] = raw

for a in added:
    # Special tokens stored as raw utf8
    id_to_bytes[int(a['id'])] = a['content'].encode('utf-8')

max_id = max(id_to_bytes.keys())
num = max_id + 1
print(f"max_id = {max_id}, num_tokens = {num}")
print(f"num_special_tokens = {len(added)}")

# Write vocab.bin
vocab_path = os.path.join(out_dir, 'vocab.bin')
with open(vocab_path, 'wb') as f:
    f.write(struct.pack('<I', num))
    for i in range(num):
        b = id_to_bytes.get(i, b'')
        f.write(struct.pack('<I', len(b)))
        f.write(b)
print(f"Wrote {vocab_path} ({os.path.getsize(vocab_path)} bytes)")

# Write special tokens
with open(os.path.join(out_dir, 'special_tokens.txt'), 'w', encoding='utf-8') as f:
    for a in added:
        f.write(f"{a['id']}\t{a['content']}\n")
print(f"Wrote special_tokens.txt")

# Verify via a known prompt
from transformers import AutoTokenizer
atok = AutoTokenizer.from_pretrained(model_dir)
test = "The capital of France is"
ids = atok.encode(test)
print(f"\nTest encode '{test}' -> {ids}")
# Join raw bytes before decoding so UTF-8 sequences split across tokens survive
decoded = b''.join(id_to_bytes.get(i, b'?') for i in ids).decode('utf-8', errors='replace')
print(f"Our decode:   '{decoded}'")
print(f"HF  decode:   '{atok.decode(ids)}'")