"""
MINDI 1.5 Vision-Coder — Step 5: Test MINDI Conversation Format
Tests full conversation tokenization with all special tokens.
"""
from pathlib import Path
from transformers import AutoTokenizer
# Resolve the project root relative to this script so it runs from any CWD.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# On-disk location of the MINDI tokenizer loaded below.
TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"
# ── Load MINDI tokenizer ──────────────────────────────────────────────
print("=" * 70)
print("STEP 5: TEST MINDI CONVERSATION FORMAT")
print("=" * 70)
print(f"\n📁 Loading MINDI tokenizer from: {TOKENIZER_PATH}")
# NOTE(review): trust_remote_code executes code shipped with the tokenizer
# directory — confirm that directory is generated locally by earlier steps.
tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
print(f" Vocab size: {len(tokenizer):,}")
# ── Define MINDI special tokens ───────────────────────────────────────
# Each MINDI capability channel is delimited by a paired start/end marker;
# generating the pairs from one tag list keeps the two halves in sync.
_MINDI_TAGS = (
    "mindi", "code", "vision", "critique", "suggest",
    "think", "file", "search", "sandbox", "error", "fix",
)
MINDI_SPECIAL_TOKENS = [
    f"<|{tag}_{edge}|>" for tag in _MINDI_TAGS for edge in ("start", "end")
]
# ── Build test conversation ───────────────────────────────────────────
# System prompt describing the assistant persona; used as the system turn.
system_message = (
    "You are MINDI 1.5 Vision-Coder, a multimodal agentic AI coding assistant "
    "created by MINDIGENOUS.AI. You generate production-ready Next.js 14 + "
    "Tailwind CSS + TypeScript code, see your own output via vision, critique "
    "UI/UX, search the web for best practices, test in a sandbox, and self-fix errors."
)
# Single realistic user request for the user turn.
user_message = "Build me a hero section for a SaaS landing page with a gradient background, animated headline, and a CTA button."
# Realistic assistant turn that exercises every MINDI tag pair:
# think → search → file → code → critique → suggest.
assistant_message = """<|think_start|>
The user wants a hero section for a SaaS landing page. I need to:
1. Search for current hero section best practices
2. Create a Next.js 14 component with Tailwind CSS
3. Include gradient background, animated headline, and CTA button
4. Critique the result for UI/UX quality
<|think_end|>
<|search_start|>
query: "best SaaS hero section design patterns 2025 Next.js Tailwind"
results: [
{"title": "Modern Hero Sections", "url": "https://example.com/hero-patterns"},
{"title": "SaaS Landing Best Practices", "url": "https://example.com/saas-landing"}
]
<|search_end|>
<|file_start|>
path: src/components/HeroSection.tsx
language: typescript
framework: next.js 14
<|file_end|>
<|code_start|>
'use client';
import { motion } from 'framer-motion';
export default function HeroSection() {
return (
<section className="relative min-h-screen flex items-center justify-center bg-gradient-to-br from-indigo-600 via-purple-600 to-pink-500 overflow-hidden">
<div className="absolute inset-0 bg-[url('/grid.svg')] opacity-10" />
<div className="relative z-10 text-center px-4 max-w-4xl mx-auto">
<motion.h1
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.8, ease: 'easeOut' }}
className="text-5xl md:text-7xl font-bold text-white mb-6 leading-tight"
>
Ship faster with{' '}
<span className="bg-clip-text text-transparent bg-gradient-to-r from-yellow-200 to-yellow-400">
AI-powered code
</span>
</motion.h1>
<motion.p
initial={{ opacity: 0, y: 20 }}
animate={{ opacity: 1, y: 0 }}
transition={{ duration: 0.8, delay: 0.2 }}
className="text-xl text-white/80 mb-10 max-w-2xl mx-auto"
>
Build production-ready web apps in minutes. MINDI writes, tests,
and deploys your code while you focus on what matters.
</motion.p>
<motion.div
initial={{ opacity: 0, scale: 0.9 }}
animate={{ opacity: 1, scale: 1 }}
transition={{ duration: 0.5, delay: 0.4 }}
>
<a
href="/get-started"
className="inline-block px-8 py-4 text-lg font-semibold text-indigo-900 bg-white rounded-full shadow-lg hover:shadow-xl hover:scale-105 transition-all duration-300"
>
Get Started Free →
</a>
</motion.div>
</div>
</section>
);
}
<|code_end|>
<|critique_start|>
Visual Analysis:
- ✅ Gradient background: Strong indigo→purple→pink gradient creates visual depth
- ✅ Animated headline: Framer Motion fade-up provides smooth entrance
- ✅ CTA button: High contrast white on gradient, rounded-full shape draws attention
- ⚠️ Accessibility: Need to verify color contrast ratios for text on gradient
- ⚠️ Mobile: Font sizes may need adjustment below md breakpoint
- ✅ Performance: Motion animations are hardware-accelerated transforms
<|critique_end|>
<|suggest_start|>
Improvements for next iteration:
1. Add aria-label to the CTA link for screen readers
2. Consider adding a secondary CTA (e.g., "Watch Demo") for users not ready to commit
3. Add a subtle particle or floating shape animation in the background
4. Include social proof (e.g., "Trusted by 10,000+ developers") below the CTA
<|suggest_end|>"""
# ── Build full conversation string ────────────────────────────────────
# Qwen chat template (<|im_start|>role ... <|im_end|>) wrapped in the
# MINDI document delimiters.
conversation = f"""<|mindi_start|>
<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{user_message}<|im_end|>
<|im_start|>assistant
{assistant_message}<|im_end|>
<|mindi_end|>"""
print("\n" + "─" * 70)
print("FULL MINDI CONVERSATION (raw text)")
print("─" * 70)
print(conversation)
print("─" * 70)
# ── Tokenize the full conversation ────────────────────────────────────
print("\n📝 TOKENIZATION RESULTS")
print("─" * 70)
# add_special_tokens=False: the conversation already embeds its own markers.
token_ids = tokenizer.encode(conversation, add_special_tokens=False)
print(f" Total tokens: {len(token_ids):,}")
decoded = tokenizer.decode(token_ids)
print(f" Decoded length (chars): {len(decoded):,}")
# ── Round-trip verification ───────────────────────────────────────────
# encode→decode must reproduce the input, otherwise training data would
# silently drift from what the model sees.
print("\n🔄 ROUND-TRIP VERIFICATION")
print("─" * 70)
if decoded.strip() == conversation.strip():
    print(" ✅ PERFECT MATCH — decoded text matches original conversation exactly")
    round_trip_pass = True
else:
    # Show differences for debugging
    print(" ❌ MISMATCH detected!")
    orig_lines = conversation.strip().splitlines()
    dec_lines = decoded.strip().splitlines()
    print(f" Original lines: {len(orig_lines)}, Decoded lines: {len(dec_lines)}")
    for i, (o, d) in enumerate(zip(orig_lines, dec_lines)):
        if o != d:
            print(f" Line {i}: DIFF")
            print(f" Original: {repr(o[:100])}")
            print(f" Decoded: {repr(d[:100])}")
    round_trip_pass = False
# ── Verify all MINDI special tokens are preserved as single tokens ────
print("\n🔍 SPECIAL TOKEN PRESERVATION")
print("─" * 70)
all_passed = True
for token_str in MINDI_SPECIAL_TOKENS:
    token_id = tokenizer.convert_tokens_to_ids(token_str)
    # Check the token encodes to a single ID
    encoded = tokenizer.encode(token_str, add_special_tokens=False)
    is_single = len(encoded) == 1 and encoded[0] == token_id
    if is_single:
        status = "✅"
    else:
        status = "❌"
        all_passed = False
    # Check this token_id appears in the full conversation encoding
    count_in_conv = token_ids.count(token_id)
    # Bug fix: previously printed the literal text "single_token=True" even
    # for tokens that failed the single-ID check.
    print(f" {status} {token_str:<25} ID={token_id:<8} single_token={is_single} occurrences_in_conv={count_in_conv}")
# ── Qwen chat template tokens ─────────────────────────────────────────
print("\n💬 QWEN CHAT TEMPLATE TOKENS")
print("─" * 70)
qwen_tokens = ["<|im_start|>", "<|im_end|>"]
for token_str in qwen_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token_str)
    encoded = tokenizer.encode(token_str, add_special_tokens=False)
    count_in_conv = token_ids.count(token_id)
    status = "✅" if len(encoded) == 1 else "❌"
    print(f" {status} {token_str:<25} ID={token_id:<8} occurrences_in_conv={count_in_conv}")
# ── Token distribution analysis ───────────────────────────────────────
print("\n📊 TOKEN DISTRIBUTION")
print("─" * 70)
# IDs of every special token (MINDI set plus Qwen chat-template set).
special_ids = {
    tokenizer.convert_tokens_to_ids(t) for t in MINDI_SPECIAL_TOKENS + qwen_tokens
}
special_count = sum(1 for tid in token_ids if tid in special_ids)
regular_count = len(token_ids) - special_count
print(f" Special tokens: {special_count}")
print(f" Regular tokens: {regular_count}")
print(f" Total tokens: {len(token_ids):,}")
print(f" Special ratio: {special_count / len(token_ids) * 100:.1f}%")
# ── Estimate tokens per message ───────────────────────────────────────
# Per-role token counts; the remainder vs. the full encoding approximates
# the cost of the wrapper markers and role headers.
print("\n📏 TOKENS PER MESSAGE")
print("─" * 70)
sys_tokens = tokenizer.encode(system_message, add_special_tokens=False)
usr_tokens = tokenizer.encode(user_message, add_special_tokens=False)
ast_tokens = tokenizer.encode(assistant_message, add_special_tokens=False)
print(f" System message: {len(sys_tokens):>5} tokens ({len(system_message):>5} chars)")
print(f" User message: {len(usr_tokens):>5} tokens ({len(user_message):>5} chars)")
print(f" Assistant message: {len(ast_tokens):>5} tokens ({len(assistant_message):>5} chars)")
print(f" Wrapper overhead: ~{len(token_ids) - len(sys_tokens) - len(usr_tokens) - len(ast_tokens):>5} tokens (mindi_start/end, im_start/end, roles)")
# ── Context window fit check ──────────────────────────────────────────
print("\n📐 CONTEXT WINDOW FIT")
print("─" * 70)
context_length = 32768
print(f" Context window: {context_length:>6} tokens")
print(f" This conversation: {len(token_ids):>6} tokens")
print(f" Remaining: {context_length - len(token_ids):>6} tokens ({(context_length - len(token_ids)) / context_length * 100:.1f}%)")
# Bug fix: report fit conditionally instead of unconditionally claiming success.
if len(token_ids) <= context_length:
    print(" ✅ Fits easily within context window")
else:
    print(" ❌ Conversation exceeds the context window")
# ── Final verdict ─────────────────────────────────────────────────────
print("\n" + "=" * 70)
if round_trip_pass and all_passed:
    print("✅ STEP 5 PASSED: MINDI conversation format works perfectly!")
    print(" • Full conversation tokenizes and decodes with perfect fidelity")
    print(" • All 22 MINDI special tokens preserved as single tokens")
    print(" • Qwen chat template tokens (im_start/im_end) working correctly")
    print(f" • Total: {len(token_ids):,} tokens for a realistic conversation")
else:
    print("❌ STEP 5 FAILED — issues detected above")
print("=" * 70)