File size: 3,284 Bytes

11e0d89

"""
MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens

Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific
special tokens, saves the updated tokenizer, and reports vocab changes.
"""

import sys
from pathlib import Path

# Resolve this script's absolute location first, then take the parent of the
# directory that contains it as the project root.
_SCRIPT_PATH = Path(__file__).resolve()
PROJECT_ROOT = _SCRIPT_PATH.parents[1]

# Put the project root at the front of the import search path
# (presumably so project-local modules can be imported — confirm against callers).
sys.path.insert(0, str(PROJECT_ROOT))


# Tag names used by MINDI. Each name expands to a "<|name_start|>" and a
# "<|name_end|>" marker, in the order listed here.
_MINDI_TAG_NAMES = (
    "mindi",
    "code",
    "vision",
    "critique",
    "suggest",
    "think",
    "file",
    "search",
    "sandbox",
    "error",
    "fix",
)

# Flat list of the 22 special tokens: for every tag, the start marker is
# immediately followed by its end marker.
MINDI_SPECIAL_TOKENS = [
    f"<|{tag}_{edge}|>"
    for tag in _MINDI_TAG_NAMES
    for edge in ("start", "end")
]


def _print_banner(title):
    """Print ``title`` between two 60-character ``=`` rules, after a blank line."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"  {title}")
    print(rule)


def _round_trip(tokenizer, token):
    """Map ``token`` to its ID and decode that ID back to text.

    Args:
        tokenizer: Object providing ``convert_tokens_to_ids`` and ``decode``.
        token: The token string to check.

    Returns:
        A ``(token_id, decoded, ok)`` tuple. ``ok`` is True only when the
        decoded text equals ``token`` exactly. When the tokenizer reports no
        ID for the token (``None``), nothing is decoded and ``ok`` is False.
    """
    token_id = tokenizer.convert_tokens_to_ids(token)
    if token_id is None:
        # Passing None to decode() would raise; report it as a failed check.
        return None, None, False
    decoded = tokenizer.decode([token_id])
    return token_id, decoded, decoded == token


def main():
    """Add the MINDI special tokens to the base tokenizer and save the result.

    Loads the tokenizer stored under ``data/tokenizer/base_tokenizer``,
    registers every entry of ``MINDI_SPECIAL_TOKENS`` as an additional special
    token (keeping the special tokens the base tokenizer already declares),
    writes the updated tokenizer to ``data/tokenizer/mindi_tokenizer``, and
    prints the token-ID mapping, a per-token round-trip check, and a summary.

    Returns:
        ``0`` when every MINDI token round-trips through
        ``convert_tokens_to_ids`` / ``decode`` unchanged, ``1`` otherwise.
        The value is suitable for ``sys.exit``.

    Raises:
        FileNotFoundError: If the base tokenizer directory does not exist.
    """
    # Imported lazily so importing this module does not require transformers.
    from transformers import AutoTokenizer

    base_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

    _print_banner("Step 4: Adding MINDI Special Tokens")

    # Load base tokenizer
    print(f"\n  Loading base tokenizer from: {base_dir}")
    if not base_dir.is_dir():
        # Fail early with a clear message instead of letting from_pretrained
        # try to interpret the missing path as something else.
        raise FileNotFoundError(f"Base tokenizer directory not found: {base_dir}")
    # NOTE(review): trust_remote_code=True allows Python code shipped with the
    # tokenizer files to run. Confirm it is actually required for this
    # tokenizer; if not, drop the flag.
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
    original_vocab_size = len(tokenizer)
    print(f"  ✅ Base vocab size: {original_vocab_size:,}")

    # Tokens that already exist are not counted by add_special_tokens, so
    # record them up front to explain any gap between requested and added.
    base_vocab = tokenizer.get_vocab()
    preexisting = [tok for tok in MINDI_SPECIAL_TOKENS if tok in base_vocab]

    # Add special tokens. add_special_tokens() replaces the current
    # "additional_special_tokens" list by default, so pass the base
    # tokenizer's own entries along with the new ones to keep them registered.
    print(f"\n  Adding {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens...")
    existing_special = list(tokenizer.additional_special_tokens)
    merged_special = existing_special + [
        tok for tok in MINDI_SPECIAL_TOKENS if tok not in existing_special
    ]
    num_added = tokenizer.add_special_tokens({
        "additional_special_tokens": merged_special
    })
    new_vocab_size = len(tokenizer)
    print(f"  ✅ Tokens added: {num_added}")
    print(f"  ✅ New vocab size: {new_vocab_size:,}")
    print(f"  ✅ Delta: +{new_vocab_size - original_vocab_size}")
    if preexisting:
        print(f"  ⚠️  Already in base vocab (not re-added): {', '.join(preexisting)}")

    # Save updated tokenizer
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"\n  ✅ Saved MINDI tokenizer to: {save_dir}")

    # Look every token up once; both report sections below reuse the result.
    checks = [(token, _round_trip(tokenizer, token)) for token in MINDI_SPECIAL_TOKENS]

    # Show token ID mapping
    _print_banner("Special Token ID Mapping")
    for token, (token_id, _, _) in checks:
        print(f"    {token:<25} → ID {token_id}")

    # Verify round-trip for each special token
    _print_banner("Round-trip Verification")
    for token, (token_id, decoded, ok) in checks:
        status = "✅" if ok else "❌"
        print(f"    {status} {token} → {token_id} → \"{decoded}\"")
    all_pass = all(ok for _, (_, _, ok) in checks)

    # Summary
    _print_banner("SUMMARY")
    print(f"  Original vocab size:  {original_vocab_size:,}")
    print(f"  New vocab size:       {new_vocab_size:,}")
    print(f"  Special tokens added: {num_added}")
    if all_pass:
        print(f"  Round-trip test:      ✅ ALL {len(MINDI_SPECIAL_TOKENS)} PASSED")
    else:
        print("  Round-trip test:      ❌ SOME FAILED")
    print(f"{'=' * 60}\n")

    return 0 if all_pass else 1


if __name__ == "__main__":
    # Exit non-zero when verification fails so calling scripts can detect it.
    sys.exit(main())