MINDI-1.5-Vision-Coder / scripts /add_special_tokens.py
Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89
"""
MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens
Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific
special tokens, saves the updated tokenizer, and reports vocab changes.
"""
import sys
from pathlib import Path
# Repository root: the parent of the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Prepend the root to the module search path so it is consulted before any
# other entry when imports are resolved.
sys.path[:0] = [str(PROJECT_ROOT)]
# Every MINDI section is delimited by a "<|name_start|>" / "<|name_end|>" pair.
# The list is generated from the section names so the two halves of a pair can
# never drift apart; order is start-then-end for each name, in the order below.
# NOTE(review): "<|vision_start|>" / "<|vision_end|>" look like they may already
# exist in the Qwen2.5 base vocabulary — confirm; if so they are reused rather
# than added as new IDs.
MINDI_SPECIAL_TOKENS = [
    f"<|{section}_{edge}|>"
    for section in (
        "mindi",
        "code",
        "vision",
        "critique",
        "suggest",
        "think",
        "file",
        "search",
        "sandbox",
        "error",
        "fix",
    )
    for edge in ("start", "end")
]
def main():
    """Add the MINDI special tokens to the base tokenizer and verify them.

    Loads the tokenizer stored in ``data/tokenizer/base_tokenizer`` (relative
    to ``PROJECT_ROOT``), registers every entry of ``MINDI_SPECIAL_TOKENS`` as
    an additional special token, saves the result to
    ``data/tokenizer/mindi_tokenizer`` and prints the vocabulary change, the
    token -> ID mapping and a per-token round-trip check.

    Returns:
        int: ``0`` if every token maps to an ID that decodes back to the same
        string, ``1`` otherwise, so the value can be used as a process exit
        status.
    """
    # Imported lazily so that importing this module does not require
    # transformers to be installed.
    from transformers import AutoTokenizer

    base_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"
    banner = "=" * 60

    print(f"\n{banner}")
    print(" Step 4: Adding MINDI Special Tokens")
    print(banner)

    # Load base tokenizer
    print(f"\n Loading base tokenizer from: {base_dir}")
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
    original_vocab_size = len(tokenizer)
    print(f" ✅ Base vocab size: {original_vocab_size:,}")

    # Tokens that are already part of the base vocabulary receive no new ID,
    # which is why "Tokens added" can be smaller than the list length.
    base_vocab = tokenizer.get_vocab()
    preexisting = [tok for tok in MINDI_SPECIAL_TOKENS if tok in base_vocab]

    # Add special tokens.
    # Passing "additional_special_tokens" replaces the tokenizer's current list
    # by default, which would drop the special flag from the entries the base
    # tokenizer shipped with. Re-submit those together with the MINDI tokens
    # (order-preserving de-duplication) so nothing is lost.
    inherited = list(getattr(tokenizer, "additional_special_tokens", None) or [])
    merged_tokens = list(dict.fromkeys(inherited + MINDI_SPECIAL_TOKENS))

    print(f"\n Adding {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens...")
    num_added = tokenizer.add_special_tokens(
        {"additional_special_tokens": merged_tokens}
    )
    new_vocab_size = len(tokenizer)
    print(f" ✅ Tokens added: {num_added}")
    print(f" ✅ New vocab size: {new_vocab_size:,}")
    print(f" ✅ Delta: +{new_vocab_size - original_vocab_size}")
    if preexisting:
        print(
            f" Already in base vocab ({len(preexisting)}): "
            f"{', '.join(preexisting)}"
        )

    # Save updated tokenizer
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"\n ✅ Saved MINDI tokenizer to: {save_dir}")

    # Resolve every token once; both report sections below reuse the result.
    token_ids = {
        tok: tokenizer.convert_tokens_to_ids(tok) for tok in MINDI_SPECIAL_TOKENS
    }

    # Show token ID mapping
    print(f"\n{banner}")
    print(" Special Token ID Mapping")
    print(banner)
    for token, token_id in token_ids.items():
        print(f" {token:<25} → ID {token_id}")

    # Verify round-trip for each special token
    print(f"\n{banner}")
    print(" Round-trip Verification")
    print(banner)
    all_pass = True
    for token, token_id in token_ids.items():
        # An unresolved token can come back as None (tokenizer without an
        # unknown-token fallback); treat that as a failure instead of letting
        # decode() raise on it.
        decoded = tokenizer.decode([token_id]) if token_id is not None else ""
        match = decoded == token
        all_pass = all_pass and match
        status = "✅" if match else "❌"
        print(f" {status} {token} → {token_id} → \"{decoded}\"")

    # Summary
    print(f"\n{banner}")
    print(" SUMMARY")
    print(banner)
    print(f" Original vocab size: {original_vocab_size:,}")
    print(f" New vocab size: {new_vocab_size:,}")
    print(f" Special tokens added: {num_added}")
    if all_pass:
        print(f" Round-trip test: ✅ ALL {len(MINDI_SPECIAL_TOKENS)} PASSED")
    else:
        print(" Round-trip test: ❌ SOME FAILED")
    print(f"{banner}\n")

    return 0 if all_pass else 1


if __name__ == "__main__":
    # Propagate the verification result so shells and CI can detect failure.
    sys.exit(main())