"""
MINDI 1.5 Vision-Coder — Step 6: Smoke-test MindiTokenizer wrapper & generate test report.
"""

import sys
import datetime
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
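# The path insert above makes the in-repo `src/` package importable, so
# the wrapper import below works without installing the project.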

from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS

print("=" * 70)
print("STEP 6: SAVE EVERYTHING β€” WRAPPER SMOKE TEST + REPORT")
print("=" * 70)

# ── 1. Load via wrapper class ────────────────────────────────────────
print("\n1️⃣  Loading MindiTokenizer wrapper...")
tok = MindiTokenizer()
print(f"   βœ… Loaded from: {tok.tokenizer_path}")
print(f"   Vocab size: {tok.get_vocab_size():,}")

# ── 2. Test encode / decode ──────────────────────────────────────────
print("\n2️⃣  encode() / decode()...")
text = "export default function Hero() { return <h1>Hello</h1>; }"
ids = tok.encode(text)
decoded = tok.decode(ids)
assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}"
print(f"   βœ… Round-trip OK β€” {len(ids)} tokens")

# ── 3. Test encode_with_special_tokens ───────────────────────────────
print("\n3️⃣  encode_with_special_tokens()...")
special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>"
ids2 = tok.encode_with_special_tokens(special_text)
decoded2 = tok.decode(ids2)
assert decoded2.strip() == special_text.strip(), f"Special round-trip failed: {decoded2!r}"
code_start_id = tok.get_special_token_id("code_start")
code_end_id = tok.get_special_token_id("code_end")
assert code_start_id in ids2, "code_start token not found"
assert code_end_id in ids2, "code_end token not found"
print(f"   βœ… Special tokens preserved β€” {len(ids2)} tokens")

# ── 4. Test encode_conversation ──────────────────────────────────────
print("\n4️⃣  encode_conversation()...")
messages = [
    {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."},
    {"role": "user", "content": "Build a navbar."},
    {"role": "assistant", "content": "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n<|code_start|>\nexport default function Navbar() { return <nav>Nav</nav>; }\n<|code_end|>"},
]
conv_ids = tok.encode_conversation(messages, wrap_mindi=True)
conv_decoded = tok.decode(conv_ids)
assert "<|mindi_start|>" in conv_decoded, "mindi_start missing"
assert "<|mindi_end|>" in conv_decoded, "mindi_end missing"
assert "<|im_start|>" in conv_decoded, "im_start missing"
assert "<|think_start|>" in conv_decoded, "think_start missing"
assert "<|code_start|>" in conv_decoded, "code_start missing"
print(f"   βœ… Conversation encoded β€” {len(conv_ids)} tokens, mindi/im/think/code all present")

# ── 5. Test get_special_token_ids ────────────────────────────────────
print("\n5️⃣  get_special_token_ids()...")
all_ids = tok.get_special_token_ids()
assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}"
for name, tid in all_ids.items():
    assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}"
print(f"   βœ… 22 special token IDs returned, all valid integers")

# ── 6. Test get_vocab_size ───────────────────────────────────────────
print("\n6️⃣  get_vocab_size()...")
vs = tok.get_vocab_size()
assert vs == 151685, f"Expected 151685, got {vs}"
print(f"   βœ… Vocab size: {vs:,}")

# ── Generate test report ─────────────────────────────────────────────
print("\n" + "─" * 70)
print("πŸ“„ Generating test report...")

report_lines = [
    "=" * 70,
    "MINDI 1.5 VISION-CODER β€” TOKENIZER TEST REPORT",
    f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 70,
    "",
    "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct",
    f"VOCAB SIZE: {vs:,}",
    f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)",
    f"TOKENIZER PATH: data/tokenizer/mindi_tokenizer/",
    "",
    "─" * 70,
    "SPECIAL TOKEN REGISTRY",
    "─" * 70,
]

for name, tid in sorted(all_ids.items(), key=lambda x: x[1]):
    token_str = MINDI_SPECIAL_TOKENS[name]
    report_lines.append(f"  {token_str:<25} → ID {tid}")
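# Sorting by ID groups the MINDI tokens together; tokens added on top of
# a base vocabulary are expected to occupy the highest IDs.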

report_lines += [
    "",
    "─" * 70,
    "WRAPPER CLASS API TESTS",
    "─" * 70,
    "  βœ… encode()                    β€” round-trip plain text",
    "  βœ… decode()                    β€” reconstructs original text",
    "  βœ… encode_with_special_tokens() β€” preserves special tokens as single IDs",
    "  βœ… encode_conversation()        β€” formats system/user/assistant with im_start/end + mindi wrapper",
    "  βœ… get_vocab_size()            β€” returns 151,685",
    "  βœ… get_special_token_ids()     β€” returns all 22 MINDI token IDs",
    "  βœ… get_special_token_id(name)  β€” individual token lookup",
    "",
    "─" * 70,
    "CONVERSATION FORMAT TEST (from Step 5)",
    "─" * 70,
    "  Total tokens:       971",
    "  Round-trip:         PERFECT MATCH",
    "  Special tokens:     22/22 preserved as single tokens",
    "  Qwen chat tokens:   im_start Γ—3, im_end Γ—3",
    "  Context usage:      971 / 32,768 = 3.0%",
    "",
    "─" * 70,
    "FILES SAVED",
    "─" * 70,
    "  data/tokenizer/base_tokenizer/     β€” Original Qwen tokenizer (3 files)",
    "  data/tokenizer/mindi_tokenizer/    β€” MINDI tokenizer with 22 special tokens",
    "  src/tokenizer/tokenizer.py         β€” MindiTokenizer wrapper class",
    "  logs/tokenizer_test.txt            β€” This report",
    "  scripts/download_tokenizer.py      β€” Tokenizer download script",
    "  scripts/add_special_tokens.py      β€” Special token addition script",
    "  scripts/test_mindi_format.py       β€” Conversation format test script",
    "",
    "=" * 70,
    "STATUS: ALL TESTS PASSED βœ…",
    "=" * 70,
]

report_text = "\n".join(report_lines)

logs_dir = PROJECT_ROOT / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
report_path = logs_dir / "tokenizer_test.txt"
report_path.write_text(report_text, encoding="utf-8")
print(f"   βœ… Saved to: {report_path}")

# ── Final summary ────────────────────────────────────────────────────
print("\n" + "=" * 70)
print("βœ… STEP 6 COMPLETE: Everything saved!")
print("   β€’ MindiTokenizer wrapper class β€” 6/6 API methods tested")
print("   β€’ Test report β€” logs/tokenizer_test.txt")
print(f"   β€’ Tokenizer files β€” data/tokenizer/mindi_tokenizer/")
print("=" * 70)