File size: 3,284 Bytes
"""
MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens
Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific
special tokens, saves the updated tokenizer, and reports vocab changes.
"""
import sys
from pathlib import Path
# Directory one level above the folder that contains this file
# (parents[0] is the containing folder, parents[1] is that folder's parent).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Put the project root at the front of the import search path.
# NOTE(review): presumably so project-local packages resolve when this file is
# run directly as a script; nothing in the visible code imports one -- confirm.
sys.path.insert(0, str(PROJECT_ROOT))
# Marker tokens registered on the tokenizer by main(): one
# "<|name_start|>" / "<|name_end|>" pair per section name, in the order the
# names are listed (start before end), 22 tokens in total.
MINDI_SPECIAL_TOKENS = [
    f"<|{section}_{edge}|>"
    for section in (
        "mindi",
        "code",
        "vision",
        "critique",
        "suggest",
        "think",
        "file",
        "search",
        "sandbox",
        "error",
        "fix",
    )
    for edge in ("start", "end")
]
def _print_section(title):
    """Print a blank line, a 60-character ``=`` rule, *title*, and another rule."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)


def main():
    """Add the MINDI special tokens to the base tokenizer and save the result.

    Loads the tokenizer stored under ``data/tokenizer/base_tokenizer``,
    registers every entry of ``MINDI_SPECIAL_TOKENS`` as an additional special
    token, writes the updated tokenizer to ``data/tokenizer/mindi_tokenizer``,
    then prints the token-to-ID mapping, a decode round-trip check for each
    token, and a summary.

    Returns:
        int: ``0`` when every MINDI token decodes back to itself, ``1``
        otherwise, so the value can be used directly as a process exit status.
    """
    # Imported here rather than at module level, so merely importing this
    # module does not load transformers.
    from transformers import AutoTokenizer

    # NOTE(review): the status glyphs (✅ / ❌ / →) in the messages below were
    # reconstructed from mis-decoded bytes -- confirm against the original file.

    base_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

    _print_section("Step 4: Adding MINDI Special Tokens")

    # Load base tokenizer.
    # NOTE(review): trust_remote_code=True allows transformers to run custom
    # Python shipped alongside the tokenizer files; base_dir is a local project
    # directory here -- keep it that way or drop the flag.
    print(f"\n Loading base tokenizer from: {base_dir}")
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
    original_vocab_size = len(tokenizer)
    print(f" ✅ Base vocab size: {original_vocab_size:,}")

    # Add special tokens. By default add_special_tokens() *replaces* the
    # tokenizer's existing "additional_special_tokens" list, which would strip
    # the special flag from the base tokenizer's own entries. Passing the
    # merged list keeps them flagged; the return value still counts only
    # tokens that were new to the vocabulary.
    print(f"\n Adding {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens...")
    inherited = list(getattr(tokenizer, "additional_special_tokens", None) or [])
    merged = inherited + [tok for tok in MINDI_SPECIAL_TOKENS if tok not in inherited]
    num_added = tokenizer.add_special_tokens({"additional_special_tokens": merged})
    new_vocab_size = len(tokenizer)
    print(f" ✅ Tokens added: {num_added}")
    print(f" ✅ New vocab size: {new_vocab_size:,}")
    print(f" ✅ Delta: +{new_vocab_size - original_vocab_size}")

    # Save updated tokenizer.
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"\n ✅ Saved MINDI tokenizer to: {save_dir}")

    # Resolve each token's ID once; both reports below reuse the result.
    token_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in MINDI_SPECIAL_TOKENS]

    # Show token ID mapping.
    _print_section("Special Token ID Mapping")
    for token, token_id in zip(MINDI_SPECIAL_TOKENS, token_ids):
        print(f" {token:<25} → ID {token_id}")

    # Verify round-trip for each special token: ID -> text must give the token back.
    _print_section("Round-trip Verification")
    all_pass = True
    for token, token_id in zip(MINDI_SPECIAL_TOKENS, token_ids):
        decoded = tokenizer.decode([token_id])
        match = decoded == token
        if not match:
            all_pass = False
        status = "✅" if match else "❌"
        print(f' {status} {token} → {token_id} → "{decoded}"')

    # Summary.
    _print_section("SUMMARY")
    print(f" Original vocab size: {original_vocab_size:,}")
    print(f" New vocab size: {new_vocab_size:,}")
    print(f" Special tokens added: {num_added}")
    if all_pass:
        print(f" Round-trip test: ✅ ALL {len(MINDI_SPECIAL_TOKENS)} PASSED")
    else:
        print(" Round-trip test: ❌ SOME FAILED")
    print("=" * 60 + "\n")

    # Report failure to the caller instead of only printing it.
    return 0 if all_pass else 1


if __name__ == "__main__":
    # Propagate the verification result as the process exit status.
    raise SystemExit(main())
|