Faaz
Day 1 Complete: Tokenizer setup – Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
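For context: the special-token step this summary refers to (scripts/add_special_tokens.py) presumably amounts to something like the sketch below, assuming the Hugging Face transformers API. Only the six MINDI tokens the smoke test in the commit actually exercises are spelled out; the remaining 16 names are placeholders, and the paths simply mirror the ones listed in the report.

from transformers import AutoTokenizer

# Sketch only: token names beyond the six used in the smoke test are assumptions.
MINDI_SPECIAL_TOKENS = {
    "mindi_start": "<|mindi_start|>",
    "mindi_end": "<|mindi_end|>",
    "think_start": "<|think_start|>",
    "think_end": "<|think_end|>",
    "code_start": "<|code_start|>",
    "code_end": "<|code_end|>",
    # ... remaining MINDI tokens (22 total)
}

base = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")
added = base.add_special_tokens(
    {"additional_special_tokens": list(MINDI_SPECIAL_TOKENS.values())}
)
print(f"Added {added} special tokens, new size: {len(base):,}")
base.save_pretrained("data/tokenizer/mindi_tokenizer")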
11e0d89
"""
MINDI 1.5 Vision-Coder – Step 6: Smoke-test MindiTokenizer wrapper & generate test report.
"""
import sys
import datetime
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS
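# NOTE (assumption): MINDI_SPECIAL_TOKENS is expected to map logical names
# (e.g. "code_start") to the literal token strings (e.g. "<|code_start|>");
# the smoke test uses the name form via get_special_token_id() and the
# string form when printing the token registry.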
| print("=" * 70) | |
| print("STEP 6: SAVE EVERYTHING β WRAPPER SMOKE TEST + REPORT") | |
| print("=" * 70) | |
| # ββ 1. Load via wrapper class ββββββββββββββββββββββββββββββββββββββββ | |
| print("\n1οΈβ£ Loading MindiTokenizer wrapper...") | |
| tok = MindiTokenizer() | |
| print(f" β Loaded from: {tok.tokenizer_path}") | |
| print(f" Vocab size: {tok.get_vocab_size():,}") | |
# ── 2. Test encode / decode ──────────────────────────────────────────
print("\n2️⃣ encode() / decode()...")
text = "export default function Hero() { return <h1>Hello</h1>; }"
ids = tok.encode(text)
decoded = tok.decode(ids)
assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}"
print(f"   ✅ Round-trip OK → {len(ids)} tokens")
# ── 3. Test encode_with_special_tokens ───────────────────────────────
print("\n3️⃣ encode_with_special_tokens()...")
special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>"
ids2 = tok.encode_with_special_tokens(special_text)
decoded2 = tok.decode(ids2)
assert decoded2.strip() == special_text.strip(), "Special round-trip failed"
code_start_id = tok.get_special_token_id("code_start")
code_end_id = tok.get_special_token_id("code_end")
assert code_start_id in ids2, "code_start token not found"
assert code_end_id in ids2, "code_end token not found"
print(f"   ✅ Special tokens preserved → {len(ids2)} tokens")
# ── 4. Test encode_conversation ──────────────────────────────────────
print("\n4️⃣ encode_conversation()...")
messages = [
    {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."},
    {"role": "user", "content": "Build a navbar."},
    {"role": "assistant", "content": "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n<|code_start|>\nexport default function Navbar() { return <nav>Nav</nav>; }\n<|code_end|>"},
]
conv_ids = tok.encode_conversation(messages, wrap_mindi=True)
conv_decoded = tok.decode(conv_ids)
assert "<|mindi_start|>" in conv_decoded, "mindi_start missing"
assert "<|mindi_end|>" in conv_decoded, "mindi_end missing"
assert "<|im_start|>" in conv_decoded, "im_start missing"
assert "<|think_start|>" in conv_decoded, "think_start missing"
assert "<|code_start|>" in conv_decoded, "code_start missing"
print(f"   ✅ Conversation encoded → {len(conv_ids)} tokens, mindi/im/think/code all present")
# ── 5. Test get_special_token_ids ────────────────────────────────────
print("\n5️⃣ get_special_token_ids()...")
all_ids = tok.get_special_token_ids()
assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}"
for name, tid in all_ids.items():
    assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}"
print("   ✅ 22 special token IDs returned, all valid integers")

# ── 6. Test get_vocab_size ───────────────────────────────────────────
print("\n6️⃣ get_vocab_size()...")
vs = tok.get_vocab_size()
assert vs == 151685, f"Expected 151685, got {vs}"
print(f"   ✅ Vocab size: {vs:,}")
# ── Generate test report ─────────────────────────────────────────────
print("\n" + "─" * 70)
print("Generating test report...")
report_lines = [
    "=" * 70,
    "MINDI 1.5 VISION-CODER – TOKENIZER TEST REPORT",
    f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 70,
    "",
    "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct",
    f"VOCAB SIZE: {vs:,}",
    f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)",
    "TOKENIZER PATH: data/tokenizer/mindi_tokenizer/",
    "",
    "─" * 70,
    "SPECIAL TOKEN REGISTRY",
    "─" * 70,
]
for name, tid in sorted(all_ids.items(), key=lambda x: x[1]):
    token_str = MINDI_SPECIAL_TOKENS[name]
    report_lines.append(f"  {token_str:<25} → ID {tid}")
report_lines += [
    "",
    "─" * 70,
    "WRAPPER CLASS API TESTS",
    "─" * 70,
    "  ✅ encode() – round-trip plain text",
    "  ✅ decode() – reconstructs original text",
    "  ✅ encode_with_special_tokens() – preserves special tokens as single IDs",
    "  ✅ encode_conversation() – formats system/user/assistant with im_start/end + mindi wrapper",
    "  ✅ get_vocab_size() – returns 151,685",
    "  ✅ get_special_token_ids() – returns all 22 MINDI token IDs",
    "  ✅ get_special_token_id(name) – individual token lookup",
    "",
    "─" * 70,
    "CONVERSATION FORMAT TEST (from Step 5)",
    "─" * 70,
    "  Total tokens: 971",
    "  Round-trip: PERFECT MATCH",
    "  Special tokens: 22/22 preserved as single tokens",
    "  Qwen chat tokens: im_start ×3, im_end ×3",
    "  Context usage: 971 / 32,768 = 3.0%",
    "",
    "─" * 70,
    "FILES SAVED",
    "─" * 70,
    "  data/tokenizer/base_tokenizer/  → Original Qwen tokenizer (3 files)",
    "  data/tokenizer/mindi_tokenizer/ → MINDI tokenizer with 22 special tokens",
    "  src/tokenizer/tokenizer.py      → MindiTokenizer wrapper class",
    "  logs/tokenizer_test.txt         → This report",
    "  scripts/download_tokenizer.py   → Tokenizer download script",
    "  scripts/add_special_tokens.py   → Special token addition script",
    "  scripts/test_mindi_format.py    → Conversation format test script",
    "",
    "=" * 70,
    "STATUS: ALL TESTS PASSED ✅",
    "=" * 70,
]
report_text = "\n".join(report_lines)
logs_dir = PROJECT_ROOT / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
report_path = logs_dir / "tokenizer_test.txt"
report_path.write_text(report_text, encoding="utf-8")
print(f"   ✅ Saved to: {report_path}")

# ── Final summary ────────────────────────────────────────────────────
print("\n" + "=" * 70)
print("✅ STEP 6 COMPLETE: Everything saved!")
print("  • MindiTokenizer wrapper class – 6/6 API methods tested")
print("  • Test report → logs/tokenizer_test.txt")
print("  • Tokenizer files → data/tokenizer/mindi_tokenizer/")
print("=" * 70)