llm_mutil_npu / scripts /export_vocab.py

Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU

4b9fefd 19 days ago

2.85 kB

	#!/usr/bin/env python3
	"""Export Qwen3 tokenizer vocab to a simple binary format.

	Format (little-endian):
	u32 num_tokens
	for each id in [0, num_tokens):
	u32 byte_length
	u8[byte_length] utf8_bytes

	Also emits special_tokens.txt with id + content pairs for reference.
	"""
	import json, sys, struct, os

	# Positional CLI arguments, both optional: [model_dir] [out_dir].
	model_dir = sys.argv[1] if len(sys.argv) > 1 else '/path/to/Qwen3-235B-A22B-Instruct-2507-BF16'
	out_dir = sys.argv[2] if len(sys.argv) > 2 else 'tokenizer_data'
	os.makedirs(out_dir, exist_ok=True)

	# tokenizer.json is UTF-8 and full of non-ASCII byte-level characters, so
	# decode it explicitly instead of relying on the platform's locale default.
	with open(os.path.join(model_dir, 'tokenizer.json'), 'r', encoding='utf-8') as f:
	    tok = json.load(f)

	# Byte-level decoder map: HF Qwen uses byte-level BPE like GPT-2.
	# In vocab strings every raw byte is written as one printable Unicode character:
	# printable bytes stand for themselves, the other 68 bytes become U+0100..U+0143.
	# For decoding we only need the reverse map from those characters back to raw bytes.
	def build_byte_decoder():
	    """Build the char -> byte table that undoes the byte-level vocab encoding.

	    Bytes in the printable ranges 0x21..0x7E, 0xA1..0xAC and 0xAE..0xFF are
	    represented by the character with the same code point. Every other byte
	    is represented by the next free code point starting at U+0100, assigned
	    in ascending byte order.

	    Returns:
	        dict mapping each single-character string to a one-byte ``bytes``
	        object; all 256 byte values are covered.
	    """
	    printable = [*range(0x21, 0x7F), *range(0xA1, 0xAD), *range(0xAE, 0x100)]
	    byte_values = list(printable)
	    code_points = list(printable)
	    for byte in range(256):
	        if byte in printable:
	            continue
	        # Number of bytes already remapped == offset of the next free slot.
	        code_points.append(256 + len(code_points) - len(printable))
	        byte_values.append(byte)
	    return {chr(cp): bytes([bv]) for bv, cp in zip(byte_values, code_points)}

	byte_decoder = build_byte_decoder()

	# Combine the base vocab and added_tokens into one id -> raw UTF-8 bytes table
	vocab = tok['model']['vocab']  # {token_str: id}
	added = tok.get('added_tokens', [])  # list of {id, content, ...}

	# Base vocab strings are byte-level encoded: translate each mapped character
	# back to its byte; a character outside the map is kept as its own UTF-8.
	id_to_bytes = {
	    int(token_id): b''.join(
	        byte_decoder[ch] if ch in byte_decoder else ch.encode('utf-8')
	        for ch in token_text
	    )
	    for token_text, token_id in vocab.items()
	}

	# Added/special tokens are stored as plain text; they are applied last, so
	# they replace any base-vocab entry that shares their id.
	id_to_bytes.update(
	    (int(entry['id']), entry['content'].encode('utf-8')) for entry in added
	)

	# Ids may be sparse, so the table length is the highest id plus one.
	max_id = max(id_to_bytes)
	num = 1 + max_id
	print(f"max_id = {max_id}, num_tokens = {num}")
	print(f"num_special_tokens = {len(added)}")

	# Serialize the table to vocab.bin: a little-endian u32 token count, then one
	# length-prefixed record per id (ids without a token become empty records).
	vocab_path = os.path.join(out_dir, 'vocab.bin')
	u32_le = struct.Struct('<I')
	with open(vocab_path, 'wb') as out:
	    out.write(u32_le.pack(num))
	    for token_id in range(num):
	        payload = id_to_bytes.get(token_id, b'')
	        out.write(u32_le.pack(len(payload)) + payload)
	print(f"Wrote {vocab_path} ({os.path.getsize(vocab_path)} bytes)")

	# Write special tokens as "<id>\t<content>" lines for human reference.
	# Token text may contain non-ASCII characters, so pin the encoding to UTF-8
	# rather than depending on the platform's locale default.
	with open(os.path.join(out_dir, 'special_tokens.txt'), 'w', encoding='utf-8') as f:
	    for a in added:
	        f.write(f"{a['id']}\t{a['content']}\n")
	print("Wrote special_tokens.txt")

	# Verify via a known prompt: encode with the HF tokenizer, then compare our
	# table-based decode against HF's own decode of the same ids.
	# Requires the third-party `transformers` package and the tokenizer files
	# in model_dir.
	from transformers import AutoTokenizer
	atok = AutoTokenizer.from_pretrained(model_dir)
	test = "The capital of France is"
	ids = atok.encode(test)
	print(f"\nTest encode '{test}' -> {ids}")
	# Join the token bytes BEFORE decoding: a multi-byte UTF-8 character can be
	# split across adjacent tokens, and decoding token by token would turn each
	# fragment into U+FFFD even though the full byte stream is valid.
	decoded = b''.join(id_to_bytes.get(i, b'?') for i in ids).decode('utf-8', errors='replace')
	print(f"Our decode: '{decoded}'")
	print(f"HF decode: '{atok.decode(ids)}'")