MINDI-1.5-Vision-Coder / scripts /save_everything.py
Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89
"""
MINDI 1.5 Vision-Coder — Step 6: Smoke-test MindiTokenizer wrapper & generate test report.
"""
import sys
import datetime
from pathlib import Path
# Two directory levels above this file's resolved location; used below as the
# base for both the "src" import path and the "logs" output directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put <PROJECT_ROOT>/src at the front of sys.path so the `tokenizer` package
# imported on the next line is resolved from there first.
sys.path.insert(0, str(PROJECT_ROOT / "src"))
from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS
# ── Banner ───────────────────────────────────────────────────────────
print("=" * 70)
print("STEP 6: SAVE EVERYTHING — WRAPPER SMOKE TEST + REPORT")
print("=" * 70)

# ── 1. Load via wrapper class ────────────────────────────────────────
# The wrapper is constructed with no arguments; whatever location it loaded
# from is echoed back through `tok.tokenizer_path`. `tok` is reused by every
# check that follows.
print("\n1️⃣ Loading MindiTokenizer wrapper...")
tok = MindiTokenizer()
print(f" ✅ Loaded from: {tok.tokenizer_path}")
print(f" Vocab size: {tok.get_vocab_size():,}")
# ── 2. Test encode / decode ──────────────────────────────────────────
# Plain source text must survive encode() -> decode(). The comparison
# ignores leading/trailing whitespace only; the failure message shows the
# decoded text so a mismatch can be diagnosed from the traceback.
print("\n2️⃣ encode() / decode()...")
text = "export default function Hero() { return <h1>Hello</h1>; }"
ids = tok.encode(text)
decoded = tok.decode(ids)
assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}"
print(f" ✅ Round-trip OK — {len(ids)} tokens")
# ── 3. Test encode_with_special_tokens ───────────────────────────────
# Text wrapped in <|code_start|> / <|code_end|> must round-trip, and the IDs
# registered for both markers must appear in the encoded sequence.
print("\n3️⃣ encode_with_special_tokens()...")
special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>"
ids2 = tok.encode_with_special_tokens(special_text)
decoded2 = tok.decode(ids2)
# Report the decoded text on failure, matching the plain round-trip check.
assert decoded2.strip() == special_text.strip(), (
    f"Special round-trip failed: {decoded2!r}"
)
code_start_id = tok.get_special_token_id("code_start")
code_end_id = tok.get_special_token_id("code_end")
assert code_start_id in ids2, "code_start token not found"
assert code_end_id in ids2, "code_end token not found"
print(f" ✅ Special tokens preserved — {len(ids2)} tokens")
# ── 4. Test encode_conversation ──────────────────────────────────────
# Encode a three-turn system/user/assistant exchange with wrap_mindi=True,
# decode it again, and confirm the expected markers are present in the
# decoded text.
print("\n4️⃣ encode_conversation()...")
messages = [
    {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."},
    {"role": "user", "content": "Build a navbar."},
    {
        "role": "assistant",
        "content": (
            "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n"
            "<|code_start|>\n"
            "export default function Navbar() { return <nav>Nav</nav>; }\n"
            "<|code_end|>"
        ),
    },
]
conv_ids = tok.encode_conversation(messages, wrap_mindi=True)
conv_decoded = tok.decode(conv_ids)
# Each name is checked in its <|name|> form; the failure message names the
# first marker that is absent.
for marker_name in (
    "mindi_start",
    "mindi_end",
    "im_start",
    "think_start",
    "code_start",
):
    assert f"<|{marker_name}|>" in conv_decoded, f"{marker_name} missing"
print(
    f" ✅ Conversation encoded — {len(conv_ids)} tokens, "
    "mindi/im/think/code all present"
)
# ── 5. Test get_special_token_ids ────────────────────────────────────
# The wrapper must report exactly 22 name -> ID entries, and every ID must be
# a positive integer. `all_ids` is reused when the report is generated.
print("\n5️⃣ get_special_token_ids()...")
all_ids = tok.get_special_token_ids()
assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}"
for name, tid in all_ids.items():
    assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}"
# Print the measured count rather than a literal so the message always
# reflects what was actually returned.
print(f" ✅ {len(all_ids)} special token IDs returned, all valid integers")
# ── 6. Test get_vocab_size ───────────────────────────────────────────
# The vocabulary size must be exactly 151,685. `vs` is reused when the
# report is generated.
print("\n6️⃣ get_vocab_size()...")
vs = tok.get_vocab_size()
assert vs == 151685, f"Expected 151685, got {vs}"
print(f" ✅ Vocab size: {vs:,}")
# ── Generate test report ─────────────────────────────────────────────
# Assemble a plain-text report and write it to <PROJECT_ROOT>/logs/.
# Uses `vs` and `all_ids` computed by the checks earlier in this script.
print("\n" + "─" * 70)
print("📄 Generating test report...")

heavy_rule = "=" * 70
light_rule = "─" * 70

report_lines = [
    heavy_rule,
    "MINDI 1.5 VISION-CODER — TOKENIZER TEST REPORT",
    f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    heavy_rule,
    "",
    "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct",
    f"VOCAB SIZE: {vs:,}",
    f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)",
    "TOKENIZER PATH: data/tokenizer/mindi_tokenizer/",
    "",
    light_rule,
    "SPECIAL TOKEN REGISTRY",
    light_rule,
]

# One registry row per special token, ordered by ascending token ID; the
# literal token string is looked up by name in MINDI_SPECIAL_TOKENS.
report_lines.extend(
    f" {MINDI_SPECIAL_TOKENS[name]:<25} → ID {tid}"
    for name, tid in sorted(all_ids.items(), key=lambda item: item[1])
)

report_lines += [
    "",
    light_rule,
    "WRAPPER CLASS API TESTS",
    light_rule,
    " ✅ encode() — round-trip plain text",
    " ✅ decode() — reconstructs original text",
    " ✅ encode_with_special_tokens() — preserves special tokens as single IDs",
    " ✅ encode_conversation() — formats system/user/assistant with im_start/end + mindi wrapper",
    " ✅ get_vocab_size() — returns 151,685",
    " ✅ get_special_token_ids() — returns all 22 MINDI token IDs",
    " ✅ get_special_token_id(name) — individual token lookup",
    "",
    light_rule,
    "CONVERSATION FORMAT TEST (from Step 5)",
    light_rule,
    # NOTE: the figures in this section are fixed literals carried over from
    # Step 5; nothing in this script recomputes them.
    " Total tokens: 971",
    " Round-trip: PERFECT MATCH",
    " Special tokens: 22/22 preserved as single tokens",
    " Qwen chat tokens: im_start ×3, im_end ×3",
    " Context usage: 971 / 32,768 = 3.0%",
    "",
    light_rule,
    "FILES SAVED",
    light_rule,
    " data/tokenizer/base_tokenizer/ — Original Qwen tokenizer (3 files)",
    " data/tokenizer/mindi_tokenizer/ — MINDI tokenizer with 22 special tokens",
    " src/tokenizer/tokenizer.py — MindiTokenizer wrapper class",
    " logs/tokenizer_test.txt — This report",
    " scripts/download_tokenizer.py — Tokenizer download script",
    " scripts/add_special_tokens.py — Special token addition script",
    " scripts/test_mindi_format.py — Conversation format test script",
    "",
    heavy_rule,
    "STATUS: ALL TESTS PASSED ✅",
    heavy_rule,
]

report_text = "\n".join(report_lines)

# Create the logs directory on first run, then write the report as UTF-8
# because it contains non-ASCII characters.
logs_dir = PROJECT_ROOT / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
report_path = logs_dir / "tokenizer_test.txt"
report_path.write_text(report_text, encoding="utf-8")
print(f" ✅ Saved to: {report_path}")
# ── Final summary ────────────────────────────────────────────────────
# Closing banner recapping what this step produced. All lines are fixed
# text; nothing here is computed.
print("\n" + "=" * 70)
print("✅ STEP 6 COMPLETE: Everything saved!")
print(" ‒ MindiTokenizer wrapper class — 6/6 API methods tested")
print(" ‒ Test report — logs/tokenizer_test.txt")
print(" ‒ Tokenizer files — data/tokenizer/mindi_tokenizer/")
print("=" * 70)