"""
MINDI 1.5 Vision-Coder - Step 6: Smoke-test MindiTokenizer wrapper & generate test report.
"""
import sys
import datetime
from pathlib import Path
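# Make the project's src/ directory importable so `tokenizer.tokenizer` resolves
# regardless of where this script is launched from.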
PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))
from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS
print("=" * 70)
print("STEP 6: SAVE EVERYTHING β WRAPPER SMOKE TEST + REPORT")
print("=" * 70)
# ── 1. Load via wrapper class ────────────────────────────────────────
print("\n1οΈβ£ Loading MindiTokenizer wrapper...")
tok = MindiTokenizer()
print(f" β
Loaded from: {tok.tokenizer_path}")
print(f" Vocab size: {tok.get_vocab_size():,}")
# ── 2. Test encode / decode ──────────────────────────────────────────
print("\n2οΈβ£ encode() / decode()...")
text = "export default function Hero() { return <h1>Hello</h1>; }"
ids = tok.encode(text)
decoded = tok.decode(ids)
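# decode(encode(x)) should reproduce x; strip() tolerates any incidental
# leading/trailing whitespace the underlying tokenizer may add or drop.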
assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}"
print(f" β
Round-trip OK β {len(ids)} tokens")
# ── 3. Test encode_with_special_tokens ───────────────────────────────
print("\n3οΈβ£ encode_with_special_tokens()...")
special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>"
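# Each <|...|> marker must survive the round trip and be encoded as a single
# token ID rather than being split into sub-word pieces.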
ids2 = tok.encode_with_special_tokens(special_text)
decoded2 = tok.decode(ids2)
assert decoded2.strip() == special_text.strip(), f"Special round-trip failed: {decoded2!r}"
code_start_id = tok.get_special_token_id("code_start")
code_end_id = tok.get_special_token_id("code_end")
assert code_start_id in ids2, "code_start token not found"
assert code_end_id in ids2, "code_end token not found"
print(f" β
Special tokens preserved β {len(ids2)} tokens")
# ── 4. Test encode_conversation ──────────────────────────────────────
print("\n4οΈβ£ encode_conversation()...")
messages = [
    {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."},
    {"role": "user", "content": "Build a navbar."},
    {"role": "assistant", "content": "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n<|code_start|>\nexport default function Navbar() { return <nav>Nav</nav>; }\n<|code_end|>"},
]
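# wrap_mindi=True wraps the whole exchange in <|mindi_start|>/<|mindi_end|>
# on top of the per-message <|im_start|>/<|im_end|> framing (asserted below).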
conv_ids = tok.encode_conversation(messages, wrap_mindi=True)
conv_decoded = tok.decode(conv_ids)
assert "<|mindi_start|>" in conv_decoded, "mindi_start missing"
assert "<|mindi_end|>" in conv_decoded, "mindi_end missing"
assert "<|im_start|>" in conv_decoded, "im_start missing"
assert "<|think_start|>" in conv_decoded, "think_start missing"
assert "<|code_start|>" in conv_decoded, "code_start missing"
print(f" β
Conversation encoded β {len(conv_ids)} tokens, mindi/im/think/code all present")
# ── 5. Test get_special_token_ids ────────────────────────────────────
print("\n5οΈβ£ get_special_token_ids()...")
all_ids = tok.get_special_token_ids()
assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}"
for name, tid in all_ids.items():
    assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}"
print(f" β
22 special token IDs returned, all valid integers")
# ── 6. Test get_vocab_size ───────────────────────────────────────────
print("\n6οΈβ£ get_vocab_size()...")
vs = tok.get_vocab_size()
assert vs == 151685, f"Expected 151685, got {vs}"
print(f" β
Vocab size: {vs:,}")
# ── Generate test report ─────────────────────────────────────────────
print("\n" + "β" * 70)
print("π Generating test report...")
report_lines = [
    "=" * 70,
    "MINDI 1.5 VISION-CODER - TOKENIZER TEST REPORT",
    f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 70,
    "",
    "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct",
    f"VOCAB SIZE: {vs:,}",
    f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)",
    "TOKENIZER PATH: data/tokenizer/mindi_tokenizer/",
    "",
    "─" * 70,
    "SPECIAL TOKEN REGISTRY",
    "─" * 70,
]
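# List every special token sorted by ID so the registry reads in assignment order.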
for name, tid in sorted(all_ids.items(), key=lambda x: x[1]):
    token_str = MINDI_SPECIAL_TOKENS[name]
    report_lines.append(f"  {token_str:<25} → ID {tid}")
report_lines += [
    "",
    "─" * 70,
    "WRAPPER CLASS API TESTS",
    "─" * 70,
    "  ✅ encode() - round-trip plain text",
    "  ✅ decode() - reconstructs original text",
    "  ✅ encode_with_special_tokens() - preserves special tokens as single IDs",
    "  ✅ encode_conversation() - formats system/user/assistant with im_start/end + mindi wrapper",
    "  ✅ get_vocab_size() - returns 151,685",
    "  ✅ get_special_token_ids() - returns all 22 MINDI token IDs",
    "  ✅ get_special_token_id(name) - individual token lookup",
    "",
    "─" * 70,
    "CONVERSATION FORMAT TEST (from Step 5)",
    "─" * 70,
    "  Total tokens: 971",
    "  Round-trip: PERFECT MATCH",
    "  Special tokens: 22/22 preserved as single tokens",
    "  Qwen chat tokens: im_start ×3, im_end ×3",
    "  Context usage: 971 / 32,768 = 3.0%",
    "",
    "─" * 70,
    "FILES SAVED",
    "─" * 70,
    "  data/tokenizer/base_tokenizer/  - Original Qwen tokenizer (3 files)",
    "  data/tokenizer/mindi_tokenizer/ - MINDI tokenizer with 22 special tokens",
    "  src/tokenizer/tokenizer.py      - MindiTokenizer wrapper class",
    "  logs/tokenizer_test.txt         - This report",
    "  scripts/download_tokenizer.py   - Tokenizer download script",
    "  scripts/add_special_tokens.py   - Special token addition script",
    "  scripts/test_mindi_format.py    - Conversation format test script",
    "",
    "=" * 70,
    "STATUS: ALL TESTS PASSED ✅",
    "=" * 70,
]
report_text = "\n".join(report_lines)
logs_dir = PROJECT_ROOT / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
report_path = logs_dir / "tokenizer_test.txt"
report_path.write_text(report_text, encoding="utf-8")
print(f" β
Saved to: {report_path}")
# ── Final summary ────────────────────────────────────────────────────
print("\n" + "=" * 70)
print("β
STEP 6 COMPLETE: Everything saved!")
print(" β’ MindiTokenizer wrapper class β 6/6 API methods tested")
print(" β’ Test report β logs/tokenizer_test.txt")
print(f" β’ Tokenizer files β data/tokenizer/mindi_tokenizer/")
print("=" * 70)