llm_mutil_npu / scripts /export_vocab.py
xianglarry's picture
Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU
4b9fefd
#!/usr/bin/env python3
"""Export Qwen3 tokenizer vocab to a simple binary format.
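Format (little-endian):
    u32 num_tokens                   (highest token id + 1)
    for each id in [0, num_tokens):
        u32 byte_length              (0 for ids that have no token)
        u8[byte_length] utf8_bytes   (raw token bytes; an individual token need not be valid UTF-8 on its own)

Also emits special_tokens.txt with one "id<TAB>content" line per added token, for reference.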
"""
import json, sys, struct, os
# Command-line arguments: [1] model directory containing tokenizer.json,
# [2] output directory (created if missing). Both fall back to defaults.
model_dir = sys.argv[1] if len(sys.argv) > 1 else '/path/to/Qwen3-235B-A22B-Instruct-2507-BF16'
out_dir = sys.argv[2] if len(sys.argv) > 2 else 'tokenizer_data'
os.makedirs(out_dir, exist_ok=True)

# The vocab keys use non-ASCII byte-level symbols that may appear unescaped in
# the JSON, so read the file as UTF-8 explicitly instead of relying on the
# locale's default encoding.
with open(os.path.join(model_dir, 'tokenizer.json'), 'r', encoding='utf-8') as f:
    tok = json.load(f)
# Byte-level decoder map: HF Qwen uses byte-level BPE like GPT-2.
# Vocab strings spell every raw byte as one printable character: the 188
# printable Latin-1 bytes map to themselves, the other 68 map to U+0100..U+0143.
# For decode we just need the reverse map from those characters to raw bytes.
def build_byte_decoder():
    """Build the character -> raw byte table for byte-level BPE vocab strings.

    Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' are represented by the
    character with the same code point. Every other byte value is represented,
    in ascending byte order, by consecutive code points starting at U+0100.

    Returns:
        dict with 256 entries mapping a one-character string to the one-byte
        ``bytes`` object it stands for.
    """
    self_mapped = (
        list(range(ord('!'), ord('~') + 1))
        + list(range(ord('¡'), ord('¬') + 1))
        + list(range(ord('®'), ord('ÿ') + 1))
    )
    # A set gives O(1) membership tests instead of rescanning the list per byte.
    self_mapped_set = set(self_mapped)
    remapped = [b for b in range(2**8) if b not in self_mapped_set]

    decoder = {chr(b): bytes([b]) for b in self_mapped}
    # The n-th remaining byte is spelled as code point 256 + n.
    decoder.update((chr(2**8 + n), bytes([b])) for n, b in enumerate(remapped))
    return decoder
byte_decoder = build_byte_decoder()

# Merge vocab + added_tokens into id -> utf8_bytes lookup
vocab = tok['model']['vocab']        # {token_str: id}
added = tok.get('added_tokens', [])  # list of {id, content, ...}

# Regular vocab entries: translate each byte-level character back to its raw
# byte; any character outside the table is kept as its own UTF-8 encoding.
id_to_bytes = {
    int(token_id): b''.join(
        byte_decoder[ch] if ch in byte_decoder else ch.encode('utf-8')
        for ch in token_str
    )
    for token_str, token_id in vocab.items()
}

# Added tokens are stored as plain text, so they are encoded directly and
# take precedence over a vocab entry with the same id.
id_to_bytes.update(
    (int(entry['id']), entry['content'].encode('utf-8')) for entry in added
)

max_id = max(id_to_bytes)
num = max_id + 1
print(f"max_id = {max_id}, num_tokens = {num}")
print(f"num_special_tokens = {len(added)}")
# Write vocab.bin: a u32 entry count, then for every id a u32 length prefix
# followed by that many raw bytes (all integers little-endian).
vocab_path = os.path.join(out_dir, 'vocab.bin')
u32 = struct.Struct('<I')
with open(vocab_path, 'wb') as out:
    out.write(u32.pack(num))
    for token_id in range(num):
        # Ids missing from the table are emitted as zero-length entries.
        payload = id_to_bytes.get(token_id, b'')
        out.write(u32.pack(len(payload)))
        out.write(payload)
print(f"Wrote {vocab_path} ({os.path.getsize(vocab_path)} bytes)")
# Write special tokens: one "<id>\t<content>" line per added token.
# UTF-8 is set explicitly so non-ASCII token content does not depend on the
# locale's default encoding.
with open(os.path.join(out_dir, 'special_tokens.txt'), 'w', encoding='utf-8') as f:
    for a in added:
        f.write(f"{a['id']}\t{a['content']}\n")
print("Wrote special_tokens.txt")
# Verify via a known prompt: encode with the HF tokenizer, then compare our
# table-based decode against HF's own decode of the same ids.
from transformers import AutoTokenizer

atok = AutoTokenizer.from_pretrained(model_dir)
test = "The capital of France is"
ids = atok.encode(test)
print(f"\nTest encode '{test}' -> {ids}")
# Join the token bytes before decoding: a multi-byte UTF-8 character can be
# split across tokens, and decoding each token separately would replace the
# fragments with U+FFFD even though the table is correct.
decoded = b''.join(id_to_bytes.get(i, b'?') for i in ids).decode('utf-8', errors='replace')
print(f"Our decode: '{decoded}'")
print(f"HF decode: '{atok.decode(ids)}'")