#!/usr/bin/env python3
"""Export Qwen3 tokenizer vocab to a simple binary format.

Format (little-endian):
    u32 num_tokens
    for each id in [0, num_tokens):
        u32 byte_length
        u8[byte_length] utf8_bytes

Also emits special_tokens.txt with id + content pairs for reference.
"""
import json
import os
import struct
import sys


def build_byte_decoder():
    """Return the GPT-2 style byte-level BPE decoder map.

    HF Qwen uses byte-level BPE like GPT-2: each raw byte is represented in
    the vocab strings by a printable unicode character.  Non-printable bytes
    are remapped to U+0100.. and up.  For decoding we only need the reverse
    map from printable chars back to raw bytes.

    Returns:
        dict[str, bytes]: single-character string -> the one raw byte it encodes.
    """
    # Bytes that GPT-2 keeps as themselves (printable and unambiguous).
    bs = (list(range(ord('!'), ord('~') + 1))
          + list(range(ord('¡'), ord('¬') + 1))
          + list(range(ord('®'), ord('ÿ') + 1)))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            # Everything else (control chars, 0x7f-0xa0, 0xad) is shifted
            # into the U+0100.. range so every byte has a printable stand-in.
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    return {chr(c): bytes([b]) for b, c in zip(bs, cs)}


def build_id_to_bytes(tok):
    """Merge model vocab + added_tokens into an {id: utf8_bytes} lookup.

    Args:
        tok: parsed tokenizer.json dict with tok['model']['vocab']
            ({token_str: id}) and optionally tok['added_tokens']
            (list of {id, content, ...}).

    Returns:
        dict[int, bytes]: token id -> raw utf8 bytes of the token.
    """
    byte_decoder = build_byte_decoder()
    vocab = tok['model']['vocab']        # {token_str: id}
    added = tok.get('added_tokens', [])  # list of {id, content, ...}
    id_to_bytes = {}
    for token, tid in vocab.items():
        # Undo the byte-level encoding back to raw utf8 bytes.  Chars not in
        # the decoder map (shouldn't normally occur) pass through as utf-8.
        raw = b''
        for ch in token:
            raw += byte_decoder[ch] if ch in byte_decoder else ch.encode('utf-8')
        id_to_bytes[int(tid)] = raw
    for a in added:
        # Special tokens are stored as raw utf8 text, not byte-level encoded.
        id_to_bytes[int(a['id'])] = a['content'].encode('utf-8')
    return id_to_bytes


def export(model_dir, out_dir):
    """Read tokenizer.json from model_dir and write vocab.bin + special_tokens.txt.

    Returns the {id: bytes} map so callers can reuse it (e.g. sanity check).
    """
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(model_dir, 'tokenizer.json'), 'r') as f:
        tok = json.load(f)
    added = tok.get('added_tokens', [])
    id_to_bytes = build_id_to_bytes(tok)

    max_id = max(id_to_bytes.keys())
    num = max_id + 1
    print(f"max_id = {max_id}, num_tokens = {num}")
    print(f"num_special_tokens = {len(added)}")

    # Write vocab.bin in the format documented at the top of this file.
    vocab_path = os.path.join(out_dir, 'vocab.bin')
    with open(vocab_path, 'wb') as f:
        f.write(struct.pack('<I', num))
        for i in range(num):
            # Ids with no assigned token (gaps) are written as zero-length.
            raw = id_to_bytes.get(i, b'')
            f.write(struct.pack('<I', len(raw)))
            f.write(raw)
    print(f"wrote {vocab_path}")

    # special_tokens.txt: id + content pairs, for human reference.
    # NOTE(review): original emission was truncated in the source I reviewed;
    # tab-separated "id\tcontent" is a reconstruction — confirm desired format.
    sp_path = os.path.join(out_dir, 'special_tokens.txt')
    with open(sp_path, 'w') as f:
        for a in added:
            f.write(f"{a['id']}\t{a['content']}\n")
    print(f"wrote {sp_path}")
    return id_to_bytes


def sanity_check(model_dir, id_to_bytes, text="Hello, world!"):
    """Compare our byte-level decode against HF's decode on a sample string.

    Optional: requires the third-party `transformers` package; skipped with a
    message when it is not installed.
    """
    try:
        from transformers import AutoTokenizer
    except ImportError:
        print("transformers not installed; skipping sanity check")
        return
    atok = AutoTokenizer.from_pretrained(model_dir)
    ids = atok.encode(text)
    print(f"Test ids -> {ids}")
    # Join the raw bytes FIRST, then utf-8 decode once: a multi-byte character
    # can be split across token boundaries, so per-token decoding would mangle it.
    decoded = b''.join(id_to_bytes.get(i, b'?') for i in ids).decode('utf-8', errors='replace')
    print(f"Our decode: '{decoded}'")
    print(f"HF decode: '{atok.decode(ids)}'")


def main(argv=None):
    """CLI entry point: main(argv) or `script.py [model_dir] [out_dir]`."""
    argv = sys.argv if argv is None else argv
    model_dir = argv[1] if len(argv) > 1 else '/path/to/Qwen3-235B-A22B-Instruct-2507-BF16'
    out_dir = argv[2] if len(argv) > 2 else 'tokenizer_data'
    id_to_bytes = export(model_dir, out_dir)
    sanity_check(model_dir, id_to_bytes)


if __name__ == '__main__':
    main()