"""Export Qwen3 tokenizer vocab to a simple binary format.

Format (little-endian):
  u32 num_tokens
  for each id in [0, num_tokens):
    u32 byte_length
    u8[byte_length] utf8_bytes

Also emits special_tokens.txt with id/content pairs for reference.
A reference reader (read_vocab_bin, later in this script) shows one
way to parse vocab.bin back.
"""
import json
import os
import struct
import sys
|
|
model_dir = sys.argv[1] if len(sys.argv) > 1 else '/path/to/Qwen3-235B-A22B-Instruct-2507-BF16'
out_dir = sys.argv[2] if len(sys.argv) > 2 else 'tokenizer_data'
os.makedirs(out_dir, exist_ok=True)
|
|
with open(os.path.join(model_dir, 'tokenizer.json'), 'r', encoding='utf-8') as f:
    tok = json.load(f)
|
|
def build_byte_decoder():
    """Build the inverse of the GPT-2 byte-level BPE byte->char table.

    Byte-level BPE stores raw bytes as printable unicode characters;
    this maps each such character back to the original byte.
    """
    # Printable byte values that map to themselves.
    bs = list(range(ord('!'), ord('~') + 1)) + list(range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            # Remaining bytes are shifted into unused code points at 256+.
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return {chr(c): bytes([b]) for b, c in zip(bs, cs)}
|
|
byte_decoder = build_byte_decoder()
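
# Quick self-check: two well-known mappings from the GPT-2 byte table
# (space 0x20 is stored as 'Ġ', newline 0x0A as 'Ċ').
assert byte_decoder['Ġ'] == b' '
assert byte_decoder['Ċ'] == b'\n'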
|
|
# Core BPE vocab: token string -> id.
vocab = tok['model']['vocab']
added = tok.get('added_tokens', [])

# Map each id to the token's raw bytes by undoing the byte-level encoding.
id_to_bytes = {}
for token, tid in vocab.items():
    raw = b''
    for ch in token:
        if ch in byte_decoder:
            raw += byte_decoder[ch]
        else:
            # Shouldn't occur in a byte-level vocab, but fall back to UTF-8.
            raw += ch.encode('utf-8')
    id_to_bytes[int(tid)] = raw
|
|
# Added/special tokens are stored as literal text, not byte-encoded.
for a in added:
    id_to_bytes[int(a['id'])] = a['content'].encode('utf-8')
|
|
max_id = max(id_to_bytes.keys())
num = max_id + 1
print(f"max_id = {max_id}, num_tokens = {num}")
print(f"num_special_tokens = {len(added)}")
|
|
# Write vocab.bin per the format in the module docstring; ids missing from
# the map (holes in the id space) become zero-length entries.
vocab_path = os.path.join(out_dir, 'vocab.bin')
with open(vocab_path, 'wb') as f:
    f.write(struct.pack('<I', num))
    for i in range(num):
        b = id_to_bytes.get(i, b'')
        f.write(struct.pack('<I', len(b)))
        f.write(b)
print(f"Wrote {vocab_path} ({os.path.getsize(vocab_path)} bytes)")
|
|
# Dump the special tokens as a human-readable sidecar file.
special_path = os.path.join(out_dir, 'special_tokens.txt')
with open(special_path, 'w', encoding='utf-8') as f:
    for a in added:
        f.write(f"{a['id']}\t{a['content']}\n")
print(f"Wrote {special_path}")
|
|
# Cross-check against the reference HF tokenizer (imported lazily; it is
# only needed for this check).
from transformers import AutoTokenizer

atok = AutoTokenizer.from_pretrained(model_dir)
test = "The capital of France is"
ids = atok.encode(test)
print(f"\nTest encode '{test}' -> {ids}")
# Join raw bytes across tokens *before* decoding, so multi-byte UTF-8
# sequences split over token boundaries decode correctly.
decoded = b''.join(id_to_bytes.get(i, b'?') for i in ids).decode('utf-8', errors='replace')
print(f"Our decode: '{decoded}'")
print(f"HF decode:  '{atok.decode(ids)}'")
|
|