File size: 3,284 Bytes
11e0d89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens

Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific
special tokens, saves the updated tokenizer, and reports vocab changes.
"""

import sys
from pathlib import Path

# Project root: the directory one level above the folder that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Put the project root at the front of sys.path so it is searched before
# every other entry when modules are imported.
sys.path.insert(0, str(PROJECT_ROOT))


# Section names in emission order; each one contributes a start/end pair.
_MINDI_TOKEN_SECTIONS = (
    "mindi",
    "code",
    "vision",
    "critique",
    "suggest",
    "think",
    "file",
    "search",
    "sandbox",
    "error",
    "fix",
)

# Flat list of delimiter tokens: "<|name_start|>" immediately followed by
# "<|name_end|>" for every section above (11 sections -> 22 tokens).
MINDI_SPECIAL_TOKENS = [
    f"<|{section}_{edge}|>"
    for section in _MINDI_TOKEN_SECTIONS
    for edge in ("start", "end")
]


def _print_banner(title):
    """Print a blank line, then *title* framed by two 60-character rules."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"  {title}")
    print(rule)


def main():
    """Add the MINDI special tokens to the base tokenizer and save the result.

    Loads the tokenizer stored under ``data/tokenizer/base_tokenizer``,
    registers every entry of ``MINDI_SPECIAL_TOKENS`` as an additional
    special token, writes the updated tokenizer to
    ``data/tokenizer/mindi_tokenizer`` and prints a report: vocab sizes,
    the token -> id mapping, and a decode round-trip check per token.

    Returns:
        int: ``0`` when every token survives the round-trip check,
        ``1`` otherwise (suitable as a process exit code).

    Raises:
        FileNotFoundError: if the base tokenizer directory does not exist.
    """
    # Imported lazily so the module can be imported without transformers.
    from transformers import AutoTokenizer

    base_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

    _print_banner("Step 4: Adding MINDI Special Tokens")

    # Load base tokenizer
    print(f"\n  Loading base tokenizer from: {base_dir}")
    if not base_dir.is_dir():
        # Fail with an explicit message instead of letting from_pretrained
        # try to interpret a missing local path as something else.
        raise FileNotFoundError(f"Base tokenizer directory not found: {base_dir}")
    # NOTE(review): trust_remote_code=True permits executing Python code
    # shipped alongside the tokenizer files; confirm it is really required
    # for this local directory.
    tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
    original_vocab_size = len(tokenizer)
    print(f"  ✅ Base vocab size: {original_vocab_size:,}")

    # Add special tokens
    print(f"\n  Adding {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens...")
    # add_special_tokens() replaces the "additional_special_tokens" list by
    # default, which would un-flag any additional special tokens the base
    # tokenizer already registers. Merge with the existing list so those
    # stay flagged as special.
    existing = list(getattr(tokenizer, "additional_special_tokens", None) or [])
    merged = existing + [tok for tok in MINDI_SPECIAL_TOKENS if tok not in existing]
    num_added = tokenizer.add_special_tokens({"additional_special_tokens": merged})
    new_vocab_size = len(tokenizer)
    print(f"  ✅ Tokens added: {num_added}")
    print(f"  ✅ New vocab size: {new_vocab_size:,}")
    print(f"  ✅ Delta: +{new_vocab_size - original_vocab_size}")
    if num_added < len(MINDI_SPECIAL_TOKENS):
        # Tokens already in the vocab are not counted as added; presumably
        # the base tokenizer defines some of these strings itself.
        already_present = len(MINDI_SPECIAL_TOKENS) - num_added
        print(f"  Note: {already_present} token(s) were already in the base vocab")

    # Save updated tokenizer
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"\n  ✅ Saved MINDI tokenizer to: {save_dir}")

    # Resolve each token's id once; both report sections below reuse it.
    token_ids = {
        token: tokenizer.convert_tokens_to_ids(token)
        for token in MINDI_SPECIAL_TOKENS
    }

    # Show token ID mapping
    _print_banner("Special Token ID Mapping")
    for token, token_id in token_ids.items():
        print(f"    {token:<25} → ID {token_id}")

    # Verify round-trip for each special token
    _print_banner("Round-trip Verification")
    all_pass = True
    for token, token_id in token_ids.items():
        # A token with no id cannot be decoded; count it as a failure
        # instead of passing None to decode().
        decoded = tokenizer.decode([token_id]) if token_id is not None else None
        match = decoded == token
        if not match:
            all_pass = False
        status = "✅" if match else "❌"
        print(f"    {status} {token} → {token_id} → \"{decoded}\"")

    # Summary
    _print_banner("SUMMARY")
    print(f"  Original vocab size:  {original_vocab_size:,}")
    print(f"  New vocab size:       {new_vocab_size:,}")
    print(f"  Special tokens added: {num_added}")
    if all_pass:
        print(f"  Round-trip test:      ✅ ALL {len(MINDI_SPECIAL_TOKENS)} PASSED")
    else:
        print("  Round-trip test:      ❌ SOME FAILED")
    print(f"{'=' * 60}\n")

    return 0 if all_pass else 1


if __name__ == "__main__":
    # Propagate the verification result so callers and pipelines see failures.
    sys.exit(main())