Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89 | """ | |
| MINDI 1.5 Vision-Coder — Step 5: Test MINDI Conversation Format | |
| Tests full conversation tokenization with all special tokens. | |
| """ | |
| from pathlib import Path | |
| from transformers import AutoTokenizer | |
# The directory that contains this script's own folder.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
TOKENIZER_PATH = PROJECT_ROOT.joinpath("data", "tokenizer", "mindi_tokenizer")

# -- Load MINDI tokenizer --
banner_rule = "=" * 70
print(banner_rule)
print("STEP 5: TEST MINDI CONVERSATION FORMAT")
print(banner_rule)
print(f"\nπ Loading MINDI tokenizer from: {TOKENIZER_PATH}")
# NOTE(review): trust_remote_code=True allows code shipped alongside the
# tokenizer files to run at load time -- confirm it is actually required here.
tokenizer = AutoTokenizer.from_pretrained(
    str(TOKENIZER_PATH),
    trust_remote_code=True,
)
vocab_size = len(tokenizer)
print(f" Vocab size: {vocab_size:,}")
# -- Define MINDI special tokens --
# Every family below contributes a "<|name_start|>" / "<|name_end|>" pair;
# the resulting list keeps the families in this order, start before end.
# NOTE(review): "<|vision_start|>" / "<|vision_end|>" may already exist in the
# base Qwen2.5 vocabulary -- confirm whether these are new or reused IDs.
_MINDI_TOKEN_FAMILIES = (
    "mindi",
    "code",
    "vision",
    "critique",
    "suggest",
    "think",
    "file",
    "search",
    "sandbox",
    "error",
    "fix",
)
MINDI_SPECIAL_TOKENS = [
    f"<|{family}_{edge}|>"
    for family in _MINDI_TOKEN_FAMILIES
    for edge in ("start", "end")
]
# -- Build test conversation --
# System prompt: the fragments are joined with single spaces into one line.
system_message = " ".join(
    [
        "You are MINDI 1.5 Vision-Coder, a multimodal agentic AI coding assistant",
        "created by MINDIGENOUS.AI. You generate production-ready Next.js 14 +",
        "Tailwind CSS + TypeScript code, see your own output via vision, critique",
        "UI/UX, search the web for best practices, test in a sandbox, and self-fix errors.",
    ]
)

# The user request that the sample assistant turn responds to.
user_message = (
    "Build me a hero section for a SaaS landing page with a gradient background, "
    "animated headline, and a CTA button."
)
# Sample assistant turn, one literal per MINDI section (think, search, file,
# code, critique, suggest). The sections are joined with single newlines.
# NOTE(review): the embedded TSX has no indentation and several glyphs look
# mis-encoded; the text is kept verbatim because it is what gets tokenized --
# confirm against the intended fixture.
_ASSISTANT_SECTIONS = (
    """<|think_start|>
The user wants a hero section for a SaaS landing page. I need to:
1. Search for current hero section best practices
2. Create a Next.js 14 component with Tailwind CSS
3. Include gradient background, animated headline, and CTA button
4. Critique the result for UI/UX quality
<|think_end|>""",
    """<|search_start|>
query: "best SaaS hero section design patterns 2025 Next.js Tailwind"
results: [
{"title": "Modern Hero Sections", "url": "https://example.com/hero-patterns"},
{"title": "SaaS Landing Best Practices", "url": "https://example.com/saas-landing"}
]
<|search_end|>""",
    """<|file_start|>
path: src/components/HeroSection.tsx
language: typescript
framework: next.js 14
<|file_end|>""",
    """<|code_start|>
'use client';
import { motion } from 'framer-motion';
export default function HeroSection() {
return (
<section className="relative min-h-screen flex items-center justify-center bg-gradient-to-br from-indigo-600 via-purple-600 to-pink-500 overflow-hidden">
<div className="absolute inset-0 bg-[url('/grid.svg')] opacity-10" />
<div className="relative z-10 text-center px-4 max-w-4xl mx-auto">
<motion.h1
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.8, ease: 'easeOut' }}
className="text-5xl md:text-7xl font-bold text-white mb-6 leading-tight"
>
Ship faster with{' '}
<span className="bg-clip-text text-transparent bg-gradient-to-r from-yellow-200 to-yellow-400">
AI-powered code
</span>
</motion.h1>
<motion.p
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.8, delay: 0.2 }}
className="text-xl text-white/80 mb-10 max-w-2xl mx-auto"
>
Build production-ready web apps in minutes. MINDI writes, tests,
and deploys your code while you focus on what matters.
</motion.p>
<motion.div
initial={{ opacity: 0, scale: 0.9 }}
animate={{ opacity: 1, scale: 1 }}
transition={{ duration: 0.5, delay: 0.4 }}
>
<a
href="/get-started"
className="inline-block px-8 py-4 text-lg font-semibold text-indigo-900 bg-white rounded-full shadow-lg hover:shadow-xl hover:scale-105 transition-all duration-300"
>
Get Started Free β
</a>
</motion.div>
</div>
</section>
);
}
<|code_end|>""",
    """<|critique_start|>
Visual Analysis:
- β Gradient background: Strong indigoβpurpleβpink gradient creates visual depth
- β Animated headline: Framer Motion fade-up provides smooth entrance
- β CTA button: High contrast white on gradient, rounded-full shape draws attention
- β οΈ Accessibility: Need to verify color contrast ratios for text on gradient
- β οΈ Mobile: Font sizes may need adjustment below md breakpoint
- β Performance: Motion animations are hardware-accelerated transforms
<|critique_end|>""",
    """<|suggest_start|>
Improvements for next iteration:
1. Add aria-label to the CTA link for screen readers
2. Consider adding a secondary CTA (e.g., "Watch Demo") for users not ready to commit
3. Add a subtle particle or floating shape animation in the background
4. Include social proof (e.g., "Trusted by 10,000+ developers") below the CTA
<|suggest_end|>""",
)
assistant_message = "\n".join(_ASSISTANT_SECTIONS)
# -- Build full conversation string --
# Layout: <|mindi_start|>, then one "<|im_start|>{role}\n{text}<|im_end|>"
# turn per role, then <|mindi_end|>; each piece sits on its own line.
chat_turns = (
    ("system", system_message),
    ("user", user_message),
    ("assistant", assistant_message),
)
conversation = "\n".join(
    [
        "<|mindi_start|>",
        *(f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in chat_turns),
        "<|mindi_end|>",
    ]
)

section_rule = "β" * 70
print("\n" + section_rule)
print("FULL MINDI CONVERSATION (raw text)")
print(section_rule)
print(conversation)
print(section_rule)
# -- Tokenize the full conversation --
print("\nπ TOKENIZATION RESULTS")
print("β" * 70)
# Encode the raw string as-is, without asking the tokenizer to add tokens of
# its own; the conversation text already contains its wrapper markers.
token_ids = tokenizer.encode(conversation, add_special_tokens=False)
print(f" Total tokens: {len(token_ids):,}")
decoded = tokenizer.decode(token_ids)
print(f" Decoded length (chars): {len(decoded):,}")

# -- Round-trip verification --
print("\nπ ROUND-TRIP VERIFICATION")
print("β" * 70)
# Leading/trailing whitespace is stripped from both sides before comparing.
round_trip_pass = decoded.strip() == conversation.strip()
if round_trip_pass:
    print(" β PERFECT MATCH β decoded text matches original conversation exactly")
else:
    # Show differences for debugging (line numbers are 0-based).
    print(" β MISMATCH detected!")
    orig_lines = conversation.strip().splitlines()
    dec_lines = decoded.strip().splitlines()
    print(f" Original lines: {len(orig_lines)}, Decoded lines: {len(dec_lines)}")
    for i, (o, d) in enumerate(zip(orig_lines, dec_lines)):
        if o != d:
            print(f" Line {i}: DIFF")
            print(f" Original: {repr(o[:100])}")
            print(f" Decoded: {repr(d[:100])}")
    # zip() stops at the shorter side, so lines beyond the common length
    # would otherwise go unreported; list them explicitly.
    shared = min(len(orig_lines), len(dec_lines))
    for i, o in enumerate(orig_lines[shared:], start=shared):
        print(f" Line {i}: MISSING from decoded")
        print(f" Original: {repr(o[:100])}")
    for i, d in enumerate(dec_lines[shared:], start=shared):
        print(f" Line {i}: EXTRA in decoded")
        print(f" Decoded: {repr(d[:100])}")
# -- Verify all MINDI special tokens are preserved as single tokens --
print("\nπ SPECIAL TOKEN PRESERVATION")
print("β" * 70)
all_passed = True
for token_str in MINDI_SPECIAL_TOKENS:
    token_id = tokenizer.convert_tokens_to_ids(token_str)
    # The token passes only if it encodes to exactly one ID and that ID is
    # the one the vocabulary lookup returned.
    encoded = tokenizer.encode(token_str, add_special_tokens=False)
    is_single_token = len(encoded) == 1 and encoded[0] == token_id
    if is_single_token:
        status = "β "
    else:
        status = "β"
        all_passed = False
    # How often this token's ID appears in the full conversation encoding.
    count_in_conv = token_ids.count(token_id)
    # str(token_id): the lookup may return None for a token missing from the
    # vocabulary (verify against the tokenizer in use); None does not accept
    # the "<8" format spec, and the report should show the failure, not crash.
    print(
        f" {status} {token_str:<25} ID={str(token_id):<8} "
        f"single_token={is_single_token} occurrences_in_conv={count_in_conv}"
    )
# -- Qwen chat template tokens --
print("\nπ QWEN CHAT TEMPLATE TOKENS")
print("β" * 70)
qwen_tokens = ["<|im_start|>", "<|im_end|>"]
for token_str in qwen_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token_str)
    encoded = tokenizer.encode(token_str, add_special_tokens=False)
    count_in_conv = token_ids.count(token_id)
    # Same criterion as for the MINDI tokens: exactly one ID, and it must be
    # the ID the vocabulary lookup returned.
    is_single_token = len(encoded) == 1 and encoded[0] == token_id
    if is_single_token:
        status = "β "
    else:
        status = "β"
        # The final verdict claims these tokens work, so a failure here must
        # be reflected in the flag that verdict reads.
        all_passed = False
    # str(token_id) keeps the report printable if the lookup returns None.
    print(f" {status} {token_str:<25} ID={str(token_id):<8} occurrences_in_conv={count_in_conv}")
# -- Token distribution analysis --
print("\nπ TOKEN DISTRIBUTION")
print("β" * 70)
# IDs of every marker token (MINDI tokens plus the Qwen chat-template pair).
special_ids = {
    tokenizer.convert_tokens_to_ids(marker)
    for marker in MINDI_SPECIAL_TOKENS + qwen_tokens
}
# Split the conversation's token IDs into marker tokens and everything else.
total_count = len(token_ids)
special_count = len([tid for tid in token_ids if tid in special_ids])
regular_count = total_count - special_count
print(f" Special tokens: {special_count}")
print(f" Regular tokens: {regular_count}")
print(f" Total tokens: {total_count:,}")
print(f" Special ratio: {special_count / total_count * 100:.1f}%")
# -- Estimate tokens per message --
print("\nπ TOKENS PER MESSAGE")
print("β" * 70)
# Encode each message body on its own, in system / user / assistant order.
sys_tokens, usr_tokens, ast_tokens = (
    tokenizer.encode(body, add_special_tokens=False)
    for body in (system_message, user_message, assistant_message)
)
per_message = (
    ("System message", sys_tokens, system_message),
    ("User message", usr_tokens, user_message),
    ("Assistant message", ast_tokens, assistant_message),
)
for label, ids, text in per_message:
    print(f" {label}: {len(ids):>5} tokens ({len(text):>5} chars)")
# Whatever the full encoding has beyond the three bodies is reported as
# wrapper overhead (an approximation, hence the "~" in the output).
content_tokens = sum(len(ids) for _, ids, _ in per_message)
print(f" Wrapper overhead: ~{len(token_ids) - content_tokens:>5} tokens (mindi_start/end, im_start/end, roles)")
# -- Context window fit check --
print("\nπ CONTEXT WINDOW FIT")
print("β" * 70)
# NOTE(review): 32768 is presumably the target model's context size -- confirm.
context_length = 32768
remaining = context_length - len(token_ids)
print(f" Context window: {context_length:>6} tokens")
print(f" This conversation: {len(token_ids):>6} tokens")
print(f" Remaining: {remaining:>6} tokens ({remaining / context_length * 100:.1f}%)")
# Only claim a fit when the conversation really is within the window.
if remaining >= 0:
    print(" β Fits easily within context window")
else:
    print(f" WARNING: exceeds context window by {-remaining:,} tokens")
# -- Final verdict --
# The step passes only if the round-trip comparison and the special-token
# checks both succeeded.
print("\n" + "=" * 70)
step_passed = round_trip_pass and all_passed
if step_passed:
    print("β STEP 5 PASSED: MINDI conversation format works perfectly!")
    print(" β’ Full conversation tokenizes and decodes with perfect fidelity")
    # Report the real list length rather than a hard-coded count.
    print(f" β’ All {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens preserved as single tokens")
    print(" β’ Qwen chat template tokens (im_start/im_end) working correctly")
    print(f" β’ Total: {len(token_ids):,} tokens for a realistic conversation")
else:
    print("β STEP 5 FAILED β issues detected above")
print("=" * 70)
# Signal failure through the exit status too, so callers that only check the
# return code (shell scripts, CI) do not treat a failed step as success.
if not step_passed:
    raise SystemExit(1)