Faaz
Day 1 Complete: Tokenizer setup – Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
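For context: the special-token step this summary refers to (scripts/add_special_tokens.py) presumably amounts to something like the sketch below, assuming the Hugging Face transformers API. Only the six MINDI tokens the smoke test in the commit actually exercises are spelled out; the remaining 16 names are placeholders, and the paths simply mirror the ones listed in the report.

from transformers import AutoTokenizer

# Sketch only: token names beyond the six used in the smoke test are assumptions.
MINDI_SPECIAL_TOKENS = {
    "mindi_start": "<|mindi_start|>",
    "mindi_end": "<|mindi_end|>",
    "think_start": "<|think_start|>",
    "think_end": "<|think_end|>",
    "code_start": "<|code_start|>",
    "code_end": "<|code_end|>",
    # ... remaining MINDI tokens (22 total)
}

base = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")
added = base.add_special_tokens(
    {"additional_special_tokens": list(MINDI_SPECIAL_TOKENS.values())}
)
print(f"Added {added} special tokens, new size: {len(base):,}")
base.save_pretrained("data/tokenizer/mindi_tokenizer")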
11e0d89
"""
MINDI 1.5 Vision-Coder – Step 6: Smoke-test MindiTokenizer wrapper & generate test report.
"""
import sys
import datetime
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS
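# NOTE (assumption): MINDI_SPECIAL_TOKENS is expected to map logical names
# (e.g. "code_start") to the literal token strings (e.g. "<|code_start|>");
# the smoke test uses the name form via get_special_token_id() and the
# string form when printing the token registry.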
| print("=" * 70) | |
| print("STEP 6: SAVE EVERYTHING β WRAPPER SMOKE TEST + REPORT") | |
| print("=" * 70) | |
| # ββ 1. Load via wrapper class ββββββββββββββββββββββββββββββββββββββββ | |
| print("\n1οΈβ£ Loading MindiTokenizer wrapper...") | |
| tok = MindiTokenizer() | |
| print(f" β Loaded from: {tok.tokenizer_path}") | |
| print(f" Vocab size: {tok.get_vocab_size():,}") | |
# ── 2. Test encode / decode ──────────────────────────────────────────
print("\n2️⃣ encode() / decode()...")
text = "export default function Hero() { return <h1>Hello</h1>; }"
ids = tok.encode(text)
decoded = tok.decode(ids)
assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}"
print(f"   ✅ Round-trip OK → {len(ids)} tokens")
# ── 3. Test encode_with_special_tokens ───────────────────────────────
print("\n3️⃣ encode_with_special_tokens()...")
special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>"
ids2 = tok.encode_with_special_tokens(special_text)
decoded2 = tok.decode(ids2)
assert decoded2.strip() == special_text.strip(), "Special round-trip failed"
code_start_id = tok.get_special_token_id("code_start")
code_end_id = tok.get_special_token_id("code_end")
assert code_start_id in ids2, "code_start token not found"
assert code_end_id in ids2, "code_end token not found"
print(f"   ✅ Special tokens preserved → {len(ids2)} tokens")
# ── 4. Test encode_conversation ──────────────────────────────────────
print("\n4️⃣ encode_conversation()...")
messages = [
    {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."},
    {"role": "user", "content": "Build a navbar."},
    {"role": "assistant", "content": "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n<|code_start|>\nexport default function Navbar() { return <nav>Nav</nav>; }\n<|code_end|>"},
]
conv_ids = tok.encode_conversation(messages, wrap_mindi=True)
conv_decoded = tok.decode(conv_ids)
assert "<|mindi_start|>" in conv_decoded, "mindi_start missing"
assert "<|mindi_end|>" in conv_decoded, "mindi_end missing"
assert "<|im_start|>" in conv_decoded, "im_start missing"
assert "<|think_start|>" in conv_decoded, "think_start missing"
assert "<|code_start|>" in conv_decoded, "code_start missing"
print(f"   ✅ Conversation encoded → {len(conv_ids)} tokens, mindi/im/think/code all present")
# ── 5. Test get_special_token_ids ────────────────────────────────────
print("\n5️⃣ get_special_token_ids()...")
all_ids = tok.get_special_token_ids()
assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}"
for name, tid in all_ids.items():
    assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}"
print("   ✅ 22 special token IDs returned, all valid integers")

# ── 6. Test get_vocab_size ───────────────────────────────────────────
print("\n6️⃣ get_vocab_size()...")
vs = tok.get_vocab_size()
assert vs == 151685, f"Expected 151685, got {vs}"
print(f"   ✅ Vocab size: {vs:,}")
# ── Generate test report ─────────────────────────────────────────────
print("\n" + "─" * 70)
print("Generating test report...")
report_lines = [
    "=" * 70,
    "MINDI 1.5 VISION-CODER – TOKENIZER TEST REPORT",
    f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 70,
    "",
    "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct",
    f"VOCAB SIZE: {vs:,}",
    f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)",
    "TOKENIZER PATH: data/tokenizer/mindi_tokenizer/",
    "",
    "─" * 70,
    "SPECIAL TOKEN REGISTRY",
    "─" * 70,
]
for name, tid in sorted(all_ids.items(), key=lambda x: x[1]):
    token_str = MINDI_SPECIAL_TOKENS[name]
    report_lines.append(f"  {token_str:<25} → ID {tid}")
report_lines += [
    "",
    "─" * 70,
    "WRAPPER CLASS API TESTS",
    "─" * 70,
    "  ✅ encode() – round-trip plain text",
    "  ✅ decode() – reconstructs original text",
    "  ✅ encode_with_special_tokens() – preserves special tokens as single IDs",
    "  ✅ encode_conversation() – formats system/user/assistant with im_start/end + mindi wrapper",
    "  ✅ get_vocab_size() – returns 151,685",
    "  ✅ get_special_token_ids() – returns all 22 MINDI token IDs",
    "  ✅ get_special_token_id(name) – individual token lookup",
    "",
    "─" * 70,
    "CONVERSATION FORMAT TEST (from Step 5)",
    "─" * 70,
    "  Total tokens: 971",
    "  Round-trip: PERFECT MATCH",
    "  Special tokens: 22/22 preserved as single tokens",
    "  Qwen chat tokens: im_start ×3, im_end ×3",
    "  Context usage: 971 / 32,768 = 3.0%",
    "",
    "─" * 70,
    "FILES SAVED",
    "─" * 70,
    "  data/tokenizer/base_tokenizer/  → Original Qwen tokenizer (3 files)",
    "  data/tokenizer/mindi_tokenizer/ → MINDI tokenizer with 22 special tokens",
    "  src/tokenizer/tokenizer.py      → MindiTokenizer wrapper class",
    "  logs/tokenizer_test.txt         → This report",
    "  scripts/download_tokenizer.py   → Tokenizer download script",
    "  scripts/add_special_tokens.py   → Special token addition script",
    "  scripts/test_mindi_format.py    → Conversation format test script",
    "",
    "=" * 70,
    "STATUS: ALL TESTS PASSED ✅",
    "=" * 70,
]
report_text = "\n".join(report_lines)
logs_dir = PROJECT_ROOT / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
report_path = logs_dir / "tokenizer_test.txt"
report_path.write_text(report_text, encoding="utf-8")
print(f"   ✅ Saved to: {report_path}")

# ── Final summary ────────────────────────────────────────────────────
print("\n" + "=" * 70)
print("✅ STEP 6 COMPLETE: Everything saved!")
print("  • MindiTokenizer wrapper class – 6/6 API methods tested")
print("  • Test report → logs/tokenizer_test.txt")
print("  • Tokenizer files → data/tokenizer/mindi_tokenizer/")
print("=" * 70)