Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89 | """ | |
| MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens | |
| Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific | |
| special tokens, saves the updated tokenizer, and reports vocab changes. | |
| """ | |
| import sys | |
| from pathlib import Path | |
# Absolute location of this script, with symlinks resolved.
_SCRIPT_PATH = Path(__file__).resolve()

# Two levels above the script file, i.e. the directory that contains the
# script's own folder.
PROJECT_ROOT = _SCRIPT_PATH.parents[1]

# Put the project root at the front of the import search path
# (presumably so project-local packages resolve when the script is run
# directly -- confirm against the repository layout).
sys.path.insert(0, str(PROJECT_ROOT))
# Every MINDI section is delimited by a "<|NAME_start|>" / "<|NAME_end|>"
# pair.  The list is built from the section names so each pair stays
# adjacent and in a fixed order (start first, then end): 11 names -> 22 tokens.
# NOTE(review): some of these names (e.g. the "vision" pair) may already be
# defined by the base tokenizer's vocabulary -- confirm before relying on the
# number of newly added tokens being 22.
MINDI_SPECIAL_TOKENS = [
    f"<|{section}_{edge}|>"
    for section in (
        "mindi",
        "code",
        "vision",
        "critique",
        "suggest",
        "think",
        "file",
        "search",
        "sandbox",
        "error",
        "fix",
    )
    for edge in ("start", "end")
]
def _print_banner(title):
    """Print a blank line, then *title* framed by two 60-character '=' rules."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)


def main():
    """Add the MINDI special tokens to the base tokenizer and save the result.

    Steps, in order:

    1. Load the tokenizer stored under ``data/tokenizer/base_tokenizer``.
    2. Note which entries of ``MINDI_SPECIAL_TOKENS`` the loaded vocabulary
       already contains, so a "tokens added" count lower than the list
       length is explained in the output.
    3. Register ``MINDI_SPECIAL_TOKENS`` as additional special tokens.
    4. Save the tokenizer under ``data/tokenizer/mindi_tokenizer``.
    5. Print the token -> ID mapping and check that every token decodes
       back to itself from its single ID.
    6. Print a summary.

    Returns:
        int: ``0`` when every token passed the round-trip check, ``1``
        otherwise.  The value is suitable as a process exit status.
    """
    # Imported here rather than at module level, so importing this module
    # does not itself import transformers.
    from transformers import AutoTokenizer

    tokenizer_root = PROJECT_ROOT / "data" / "tokenizer"
    base_dir = tokenizer_root / "base_tokenizer"
    save_dir = tokenizer_root / "mindi_tokenizer"

    _print_banner("Step 4: Adding MINDI Special Tokens")

    # Load base tokenizer.
    print(f"\n Loading base tokenizer from: {base_dir}")
    # NOTE(review): trust_remote_code=True permits tokenizer code shipped in
    # base_dir to execute -- confirm it is actually required for this model.
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
    original_vocab_size = len(tokenizer)
    print(f" - Base vocab size: {original_vocab_size:,}")

    # Tokens the base vocabulary already defines are not counted as "added";
    # record them up front so the numbers printed below can be reconciled.
    base_vocab = tokenizer.get_vocab()
    preexisting = [tok for tok in MINDI_SPECIAL_TOKENS if tok in base_vocab]

    # Add special tokens.
    # NOTE(review): depending on the transformers version, passing
    # "additional_special_tokens" may replace the tokenizer's existing list of
    # additional special tokens instead of extending it -- confirm this is
    # the intended effect on the base model's own special tokens.
    print(f"\n Adding {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens...")
    num_added = tokenizer.add_special_tokens(
        {"additional_special_tokens": MINDI_SPECIAL_TOKENS}
    )
    new_vocab_size = len(tokenizer)
    print(f" - Tokens added: {num_added}")
    print(f" - New vocab size: {new_vocab_size:,}")
    print(f" - Delta: +{new_vocab_size - original_vocab_size}")
    if preexisting:
        print(
            f" - Already in base vocab ({len(preexisting)}): "
            f"{', '.join(preexisting)}"
        )

    # Save updated tokenizer.
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"\n - Saved MINDI tokenizer to: {save_dir}")

    # Resolve each token's ID once; both reports below reuse the result.
    token_ids = [
        (token, tokenizer.convert_tokens_to_ids(token))
        for token in MINDI_SPECIAL_TOKENS
    ]

    # Show token ID mapping.
    _print_banner("Special Token ID Mapping")
    for token, token_id in token_ids:
        print(f" {token:<25} -> ID {token_id}")

    # Verify round-trip for each special token.
    _print_banner("Round-trip Verification")
    failed = []
    for token, token_id in token_ids:
        # A token that did not resolve to an ID cannot be decoded; count it
        # as a failure instead of passing None to decode().
        decoded = tokenizer.decode([token_id]) if token_id is not None else None
        if decoded == token:
            status = "PASS"
        else:
            status = "FAIL"
            failed.append(token)
        print(f' {status} {token} -> {token_id} -> "{decoded}"')

    # Summary.
    _print_banner("SUMMARY")
    print(f" Original vocab size: {original_vocab_size:,}")
    print(f" New vocab size: {new_vocab_size:,}")
    print(f" Special tokens added: {num_added}")
    if preexisting:
        print(f" Already present in base: {len(preexisting)}")
    if not failed:
        print(f" Round-trip test: ALL {len(MINDI_SPECIAL_TOKENS)} PASSED")
    else:
        print(f" Round-trip test: SOME FAILED ({len(failed)}): {', '.join(failed)}")
    print("=" * 60 + "\n")

    return 1 if failed else 0


if __name__ == "__main__":
    # Exit with main()'s status so a failed round-trip check is visible to
    # shells and CI, not only in the printed summary.
    raise SystemExit(main())