"""
MINDI 1.5 Vision-Coder — Step 5: Test MINDI Conversation Format

Tests full conversation tokenization with all special tokens.

Run as a script. The process exit status is 0 when every check passes and 1
otherwise, so the step can be used from automation as well as read by eye.
"""

import sys
from itertools import zip_longest
from pathlib import Path
from typing import List, Optional, Tuple

PROJECT_ROOT = Path(__file__).resolve().parent.parent
TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

# Horizontal rule used between report sections.
RULE = "─" * 70

# Context window size (in tokens) the conversation is checked against.
CONTEXT_LENGTH = 32768

# ── Define MINDI special tokens ──────────────────────────────────────
MINDI_SPECIAL_TOKENS = [
    "<|mindi_start|>", "<|mindi_end|>",
    "<|code_start|>", "<|code_end|>",
    "<|vision_start|>", "<|vision_end|>",
    "<|critique_start|>", "<|critique_end|>",
    "<|suggest_start|>", "<|suggest_end|>",
    "<|think_start|>", "<|think_end|>",
    "<|file_start|>", "<|file_end|>",
    "<|search_start|>", "<|search_end|>",
    "<|sandbox_start|>", "<|sandbox_end|>",
    "<|error_start|>", "<|error_end|>",
    "<|fix_start|>", "<|fix_end|>",
]

# ── Qwen chat template tokens ────────────────────────────────────────
QWEN_CHAT_TOKENS = ["<|im_start|>", "<|im_end|>"]

# ── Build test conversation ──────────────────────────────────────────
system_message = (
    "You are MINDI 1.5 Vision-Coder, a multimodal agentic AI coding assistant "
    "created by MINDIGENOUS.AI. You generate production-ready Next.js 14 + "
    "Tailwind CSS + TypeScript code, see your own output via vision, critique "
    "UI/UX, search the web for best practices, test in a sandbox, and self-fix errors."
)

user_message = "Build me a hero section for a SaaS landing page with a gradient background, animated headline, and a CTA button."

# NOTE(review): the copy of this file under review had its whitespace
# collapsed, so the line breaks inside this literal are reconstructed. The TSX
# sample between <|code_start|> and <|code_end|> also contains no JSX elements,
# which looks like markup that was stripped — confirm against the original.
assistant_message = """<|think_start|>
The user wants a hero section for a SaaS landing page. I need to:
1. Search for current hero section best practices
2. Create a Next.js 14 component with Tailwind CSS
3. Include gradient background, animated headline, and CTA button
4. Critique the result for UI/UX quality
<|think_end|>

<|search_start|>
query: "best SaaS hero section design patterns 2025 Next.js Tailwind"
results: [
  {"title": "Modern Hero Sections", "url": "https://example.com/hero-patterns"},
  {"title": "SaaS Landing Best Practices", "url": "https://example.com/saas-landing"}
]
<|search_end|>

<|file_start|>
path: src/components/HeroSection.tsx
language: typescript
framework: next.js 14
<|file_end|>

<|code_start|>
'use client';

import { motion } from 'framer-motion';

export default function HeroSection() {
  return (
    Ship faster with{' '}
    AI-powered code
    Build production-ready web apps in minutes. MINDI writes, tests, and deploys your code while you focus on what matters.
    Get Started Free →
  );
}
<|code_end|>

<|critique_start|>
Visual Analysis:
- ✅ Gradient background: Strong indigo→purple→pink gradient creates visual depth
- ✅ Animated headline: Framer Motion fade-up provides smooth entrance
- ✅ CTA button: High contrast white on gradient, rounded-full shape draws attention
- ⚠️ Accessibility: Need to verify color contrast ratios for text on gradient
- ⚠️ Mobile: Font sizes may need adjustment below md breakpoint
- ✅ Performance: Motion animations are hardware-accelerated transforms
<|critique_end|>

<|suggest_start|>
Improvements for next iteration:
1. Add aria-label to the CTA link for screen readers
2. Consider adding a secondary CTA (e.g., "Watch Demo") for users not ready to commit
3. Add a subtle particle or floating shape animation in the background
4. Include social proof (e.g., "Trusted by 10,000+ developers") below the CTA
<|suggest_end|>"""


def build_conversation(system: str, user: str, assistant: str) -> str:
    """Wrap three messages in the MINDI + ChatML conversation envelope.

    Args:
        system: Text of the system turn.
        user: Text of the user turn.
        assistant: Text of the assistant turn.

    Returns:
        The conversation string: ``<|mindi_start|>``, one
        ``<|im_start|>role ... <|im_end|>`` section per turn, then
        ``<|mindi_end|>``.
    """
    return f"""<|mindi_start|>
<|im_start|>system
{system}<|im_end|>
<|im_start|>user
{user}<|im_end|>
<|im_start|>assistant
{assistant}<|im_end|>
<|mindi_end|>"""


def diff_lines(original: str, decoded: str) -> List[Tuple[int, Optional[str], Optional[str]]]:
    """List the lines on which two texts differ.

    The texts are compared line by line over the longer of the two, so lines
    present in only one text are reported rather than silently dropped.

    Args:
        original: Reference text.
        decoded: Text to compare against the reference.

    Returns:
        ``(index, original_line, decoded_line)`` tuples with a 0-based line
        index. A side is ``None`` when that text has no line at that index.
    """
    paired = zip_longest(original.splitlines(), decoded.splitlines())
    return [
        (index, orig_line, dec_line)
        for index, (orig_line, dec_line) in enumerate(paired)
        if orig_line != dec_line
    ]


def check_single_token(tokenizer, token_str: str):
    """Check that ``token_str`` encodes to exactly its own vocabulary ID.

    Args:
        tokenizer: Object providing ``convert_tokens_to_ids`` and ``encode``
            (here, the tokenizer loaded from ``TOKENIZER_PATH``).
        token_str: The special-token text to check.

    Returns:
        ``(token_id, is_single)``: the value returned by
        ``convert_tokens_to_ids`` (which may not be an int if the token is
        unknown) and whether encoding ``token_str`` produced exactly that one
        ID.
    """
    token_id = tokenizer.convert_tokens_to_ids(token_str)
    encoded = tokenizer.encode(token_str, add_special_tokens=False)
    return token_id, len(encoded) == 1 and encoded[0] == token_id


def _preview(line: Optional[str]) -> str:
    """Render one side of a line diff: first 100 chars, or a missing marker."""
    return "<missing>" if line is None else repr(line[:100])


def main() -> int:
    """Run every Step 5 check and print the report.

    Returns:
        0 if the round trip, the MINDI special tokens and the Qwen chat
        template tokens all pass; 1 otherwise.
    """
    # Imported here rather than at module top so the pure helpers above can be
    # imported without the third-party transformers package installed.
    from transformers import AutoTokenizer

    # ── Load MINDI tokenizer ─────────────────────────────────────────
    print("=" * 70)
    print("STEP 5: TEST MINDI CONVERSATION FORMAT")
    print("=" * 70)

    print(f"\n📂 Loading MINDI tokenizer from: {TOKENIZER_PATH}")
    tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
    print(f" Vocab size: {len(tokenizer):,}")

    # ── Build full conversation string ───────────────────────────────
    conversation = build_conversation(system_message, user_message, assistant_message)

    print("\n" + RULE)
    print("FULL MINDI CONVERSATION (raw text)")
    print(RULE)
    print(conversation)
    print(RULE)

    # ── Tokenize the full conversation ───────────────────────────────
    print("\n📊 TOKENIZATION RESULTS")
    print(RULE)

    token_ids = tokenizer.encode(conversation, add_special_tokens=False)
    print(f" Total tokens: {len(token_ids):,}")

    decoded = tokenizer.decode(token_ids)
    print(f" Decoded length (chars): {len(decoded):,}")

    # ── Round-trip verification ──────────────────────────────────────
    print("\n🔄 ROUND-TRIP VERIFICATION")
    print(RULE)

    # Leading/trailing whitespace is deliberately ignored in the comparison.
    original_text = conversation.strip()
    decoded_text = decoded.strip()
    round_trip_pass = decoded_text == original_text
    if round_trip_pass:
        print(" ✅ PERFECT MATCH — decoded text matches original conversation exactly")
    else:
        # Show differences for debugging
        print(" ❌ MISMATCH detected!")
        print(
            f" Original lines: {len(original_text.splitlines())}, "
            f"Decoded lines: {len(decoded_text.splitlines())}"
        )
        for index, orig_line, dec_line in diff_lines(original_text, decoded_text):
            print(f" Line {index}: DIFF")
            print(f" Original: {_preview(orig_line)}")
            print(f" Decoded: {_preview(dec_line)}")

    # ── Verify all MINDI special tokens are preserved as single tokens ─
    print("\n🔍 SPECIAL TOKEN PRESERVATION")
    print(RULE)

    all_passed = True
    for token_str in MINDI_SPECIAL_TOKENS:
        token_id, is_single = check_single_token(tokenizer, token_str)
        status = "✅" if is_single else "❌"
        all_passed = all_passed and is_single

        # Check this token_id appears in the full conversation encoding
        count_in_conv = token_ids.count(token_id)
        # str() keeps the row printable when the tokenizer returns None for an
        # unknown token; for ints the rendered column is unchanged.
        print(
            f" {status} {token_str:<25} ID={str(token_id):<8} "
            f"single_token={is_single} occurrences_in_conv={count_in_conv}"
        )

    # ── Qwen chat template tokens ────────────────────────────────────
    print("\n🔍 QWEN CHAT TEMPLATE TOKENS")
    print(RULE)

    qwen_passed = True
    for token_str in QWEN_CHAT_TOKENS:
        token_id, is_single = check_single_token(tokenizer, token_str)
        status = "✅" if is_single else "❌"
        # Counted toward the verdict because the success banner claims these work.
        qwen_passed = qwen_passed and is_single
        count_in_conv = token_ids.count(token_id)
        print(f" {status} {token_str:<25} ID={str(token_id):<8} occurrences_in_conv={count_in_conv}")

    # ── Token distribution analysis ──────────────────────────────────
    print("\n📈 TOKEN DISTRIBUTION")
    print(RULE)

    # Count special vs regular tokens
    special_ids = {
        tokenizer.convert_tokens_to_ids(t) for t in MINDI_SPECIAL_TOKENS + QWEN_CHAT_TOKENS
    }
    special_count = sum(1 for tid in token_ids if tid in special_ids)
    regular_count = len(token_ids) - special_count
    # Guard the ratio so an empty encoding is reported instead of crashing.
    special_ratio = special_count / len(token_ids) * 100 if token_ids else 0.0

    print(f" Special tokens: {special_count}")
    print(f" Regular tokens: {regular_count}")
    print(f" Total tokens: {len(token_ids):,}")
    print(f" Special ratio: {special_ratio:.1f}%")

    # ── Estimate tokens per message ──────────────────────────────────
    print("\n📏 TOKENS PER MESSAGE")
    print(RULE)

    sys_tokens = tokenizer.encode(system_message, add_special_tokens=False)
    usr_tokens = tokenizer.encode(user_message, add_special_tokens=False)
    ast_tokens = tokenizer.encode(assistant_message, add_special_tokens=False)
    overhead = len(token_ids) - len(sys_tokens) - len(usr_tokens) - len(ast_tokens)

    print(f" System message: {len(sys_tokens):>5} tokens ({len(system_message):>5} chars)")
    print(f" User message: {len(usr_tokens):>5} tokens ({len(user_message):>5} chars)")
    print(f" Assistant message: {len(ast_tokens):>5} tokens ({len(assistant_message):>5} chars)")
    print(f" Wrapper overhead: ~{overhead:>5} tokens (mindi_start/end, im_start/end, roles)")

    # ── Context window fit check ─────────────────────────────────────
    print("\n📐 CONTEXT WINDOW FIT")
    print(RULE)

    remaining = CONTEXT_LENGTH - len(token_ids)
    print(f" Context window: {CONTEXT_LENGTH:>6} tokens")
    print(f" This conversation: {len(token_ids):>6} tokens")
    print(f" Remaining: {remaining:>6} tokens ({remaining / CONTEXT_LENGTH * 100:.1f}%)")
    if remaining >= 0:
        print(" ✅ Fits easily within context window")
    else:
        print(" ❌ Exceeds context window")

    # ── Final verdict ────────────────────────────────────────────────
    print("\n" + "=" * 70)
    passed = round_trip_pass and all_passed and qwen_passed
    if passed:
        print("✅ STEP 5 PASSED: MINDI conversation format works perfectly!")
        print(" • Full conversation tokenizes and decodes with perfect fidelity")
        print(f" • All {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens preserved as single tokens")
        print(" • Qwen chat template tokens (im_start/im_end) working correctly")
        print(f" • Total: {len(token_ids):,} tokens for a realistic conversation")
    else:
        print("❌ STEP 5 FAILED — issues detected above")
    print("=" * 70)

    return 0 if passed else 1


if __name__ == "__main__":
    sys.exit(main())