""" MINDI 1.5 Vision-Coder — Step 6: Smoke-test MindiTokenizer wrapper & generate test report. """ import sys import datetime from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT / "src")) from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS print("=" * 70) print("STEP 6: SAVE EVERYTHING — WRAPPER SMOKE TEST + REPORT") print("=" * 70) # ── 1. Load via wrapper class ──────────────────────────────────────── print("\n1️⃣ Loading MindiTokenizer wrapper...") tok = MindiTokenizer() print(f" ✅ Loaded from: {tok.tokenizer_path}") print(f" Vocab size: {tok.get_vocab_size():,}") # ── 2. Test encode / decode ────────────────────────────────────────── print("\n2️⃣ encode() / decode()...") text = "export default function Hero() { return

Hello

; }" ids = tok.encode(text) decoded = tok.decode(ids) assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}" print(f" ✅ Round-trip OK — {len(ids)} tokens") # ── 3. Test encode_with_special_tokens ─────────────────────────────── print("\n3️⃣ encode_with_special_tokens()...") special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>" ids2 = tok.encode_with_special_tokens(special_text) decoded2 = tok.decode(ids2) assert decoded2.strip() == special_text.strip(), f"Special round-trip failed" code_start_id = tok.get_special_token_id("code_start") code_end_id = tok.get_special_token_id("code_end") assert code_start_id in ids2, "code_start token not found" assert code_end_id in ids2, "code_end token not found" print(f" ✅ Special tokens preserved — {len(ids2)} tokens") # ── 4. Test encode_conversation ────────────────────────────────────── print("\n4️⃣ encode_conversation()...") messages = [ {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."}, {"role": "user", "content": "Build a navbar."}, {"role": "assistant", "content": "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n<|code_start|>\nexport default function Navbar() { return ; }\n<|code_end|>"}, ] conv_ids = tok.encode_conversation(messages, wrap_mindi=True) conv_decoded = tok.decode(conv_ids) assert "<|mindi_start|>" in conv_decoded, "mindi_start missing" assert "<|mindi_end|>" in conv_decoded, "mindi_end missing" assert "<|im_start|>" in conv_decoded, "im_start missing" assert "<|think_start|>" in conv_decoded, "think_start missing" assert "<|code_start|>" in conv_decoded, "code_start missing" print(f" ✅ Conversation encoded — {len(conv_ids)} tokens, mindi/im/think/code all present") # ── 5. Test get_special_token_ids ──────────────────────────────────── print("\n5️⃣ get_special_token_ids()...") all_ids = tok.get_special_token_ids() assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}" for name, tid in all_ids.items(): assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}" print(f" ✅ 22 special token IDs returned, all valid integers") # ── 6. Test get_vocab_size ─────────────────────────────────────────── print("\n6️⃣ get_vocab_size()...") vs = tok.get_vocab_size() assert vs == 151685, f"Expected 151685, got {vs}" print(f" ✅ Vocab size: {vs:,}") # ── Generate test report ───────────────────────────────────────────── print("\n" + "─" * 70) print("📄 Generating test report...") report_lines = [ "=" * 70, "MINDI 1.5 VISION-CODER — TOKENIZER TEST REPORT", f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", "=" * 70, "", "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct", f"VOCAB SIZE: {vs:,}", f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)", f"TOKENIZER PATH: data/tokenizer/mindi_tokenizer/", "", "─" * 70, "SPECIAL TOKEN REGISTRY", "─" * 70, ] for name, tid in sorted(all_ids.items(), key=lambda x: x[1]): token_str = MINDI_SPECIAL_TOKENS[name] report_lines.append(f" {token_str:<25} → ID {tid}") report_lines += [ "", "─" * 70, "WRAPPER CLASS API TESTS", "─" * 70, " ✅ encode() — round-trip plain text", " ✅ decode() — reconstructs original text", " ✅ encode_with_special_tokens() — preserves special tokens as single IDs", " ✅ encode_conversation() — formats system/user/assistant with im_start/end + mindi wrapper", " ✅ get_vocab_size() — returns 151,685", " ✅ get_special_token_ids() — returns all 22 MINDI token IDs", " ✅ get_special_token_id(name) — individual token lookup", "", "─" * 70, "CONVERSATION FORMAT TEST (from Step 5)", "─" * 70, " Total tokens: 971", " Round-trip: PERFECT MATCH", " Special tokens: 22/22 preserved as single tokens", " Qwen chat tokens: im_start ×3, im_end ×3", " Context usage: 971 / 32,768 = 3.0%", "", "─" * 70, "FILES SAVED", "─" * 70, " data/tokenizer/base_tokenizer/ — Original Qwen tokenizer (3 files)", " data/tokenizer/mindi_tokenizer/ — MINDI tokenizer with 22 special tokens", " src/tokenizer/tokenizer.py — MindiTokenizer wrapper class", " logs/tokenizer_test.txt — This report", " scripts/download_tokenizer.py — Tokenizer download script", " scripts/add_special_tokens.py — Special token addition script", " scripts/test_mindi_format.py — Conversation format test script", "", "=" * 70, "STATUS: ALL TESTS PASSED ✅", "=" * 70, ] report_text = "\n".join(report_lines) logs_dir = PROJECT_ROOT / "logs" logs_dir.mkdir(parents=True, exist_ok=True) report_path = logs_dir / "tokenizer_test.txt" report_path.write_text(report_text, encoding="utf-8") print(f" ✅ Saved to: {report_path}") # ── Final summary ──────────────────────────────────────────────────── print("\n" + "=" * 70) print("✅ STEP 6 COMPLETE: Everything saved!") print(" • MindiTokenizer wrapper class — 6/6 API methods tested") print(" • Test report — logs/tokenizer_test.txt") print(f" • Tokenizer files — data/tokenizer/mindi_tokenizer/") print("=" * 70)