File size: 11,386 Bytes
11e0d89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
"""
MINDI 1.5 Vision-Coder β€” Step 5: Test MINDI Conversation Format
Tests full conversation tokenization with all special tokens.
"""

from pathlib import Path
from transformers import AutoTokenizer

# Resolve repo-relative paths: this script lives one directory below the project root.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"

# ── Load MINDI tokenizer ──────────────────────────────────────────────
BANNER = "=" * 70
print(BANNER)
print("STEP 5: TEST MINDI CONVERSATION FORMAT")
print(BANNER)

print(f"\nπŸ“‚ Loading MINDI tokenizer from: {TOKENIZER_PATH}")
# trust_remote_code is required because the tokenizer ships custom code.
tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
print(f"   Vocab size: {len(tokenizer):,}")

# ── Define MINDI special tokens ──────────────────────────────────────
# Every MINDI capability is delimited by a <|tag_start|>/<|tag_end|> pair;
# generate the pairs from the tag names so the list stays in lockstep.
_MINDI_TAGS = [
    "mindi", "code", "vision", "critique", "suggest", "think",
    "file", "search", "sandbox", "error", "fix",
]
MINDI_SPECIAL_TOKENS = [
    f"<|{tag}_{edge}|>" for tag in _MINDI_TAGS for edge in ("start", "end")
]

# ── Build test conversation ──────────────────────────────────────────
# System prompt describing the model's persona and capabilities; tokenized
# both inside the full conversation and standalone (for per-message counts).
system_message = (
    "You are MINDI 1.5 Vision-Coder, a multimodal agentic AI coding assistant "
    "created by MINDIGENOUS.AI. You generate production-ready Next.js 14 + "
    "Tailwind CSS + TypeScript code, see your own output via vision, critique "
    "UI/UX, search the web for best practices, test in a sandbox, and self-fix errors."
)

# Single realistic user request exercising a typical coding task.
user_message = "Build me a hero section for a SaaS landing page with a gradient background, animated headline, and a CTA button."

# Synthetic assistant turn exercising the MINDI tag vocabulary end-to-end:
# think β†’ search β†’ file β†’ code β†’ critique β†’ suggest. The embedded TSX is
# payload text only β€” it is never executed, just tokenized.
assistant_message = """<|think_start|>
The user wants a hero section for a SaaS landing page. I need to:
1. Search for current hero section best practices
2. Create a Next.js 14 component with Tailwind CSS
3. Include gradient background, animated headline, and CTA button
4. Critique the result for UI/UX quality
<|think_end|>

<|search_start|>
query: "best SaaS hero section design patterns 2025 Next.js Tailwind"
results: [
  {"title": "Modern Hero Sections", "url": "https://example.com/hero-patterns"},
  {"title": "SaaS Landing Best Practices", "url": "https://example.com/saas-landing"}
]
<|search_end|>

<|file_start|>
path: src/components/HeroSection.tsx
language: typescript
framework: next.js 14
<|file_end|>

<|code_start|>
'use client';

import { motion } from 'framer-motion';

export default function HeroSection() {
  return (
    <section className="relative min-h-screen flex items-center justify-center bg-gradient-to-br from-indigo-600 via-purple-600 to-pink-500 overflow-hidden">
      <div className="absolute inset-0 bg-[url('/grid.svg')] opacity-10" />
      <div className="relative z-10 text-center px-4 max-w-4xl mx-auto">
        <motion.h1
          initial={{ opacity: 0, y: 20 }}
          animate={{ opacity: 1, y: 0 }}
          transition={{ duration: 0.8, ease: 'easeOut' }}
          className="text-5xl md:text-7xl font-bold text-white mb-6 leading-tight"
        >
          Ship faster with{' '}
          <span className="bg-clip-text text-transparent bg-gradient-to-r from-yellow-200 to-yellow-400">
            AI-powered code
          </span>
        </motion.h1>
        <motion.p
          initial={{ opacity: 0, y: 20 }}
          animate={{ opacity: 1, y: 0 }}
          transition={{ duration: 0.8, delay: 0.2 }}
          className="text-xl text-white/80 mb-10 max-w-2xl mx-auto"
        >
          Build production-ready web apps in minutes. MINDI writes, tests,
          and deploys your code while you focus on what matters.
        </motion.p>
        <motion.div
          initial={{ opacity: 0, scale: 0.9 }}
          animate={{ opacity: 1, scale: 1 }}
          transition={{ duration: 0.5, delay: 0.4 }}
        >
          <a
            href="/get-started"
            className="inline-block px-8 py-4 text-lg font-semibold text-indigo-900 bg-white rounded-full shadow-lg hover:shadow-xl hover:scale-105 transition-all duration-300"
          >
            Get Started Free β†’
          </a>
        </motion.div>
      </div>
    </section>
  );
}
<|code_end|>

<|critique_start|>
Visual Analysis:
- βœ… Gradient background: Strong indigoβ†’purpleβ†’pink gradient creates visual depth
- βœ… Animated headline: Framer Motion fade-up provides smooth entrance
- βœ… CTA button: High contrast white on gradient, rounded-full shape draws attention
- ⚠️ Accessibility: Need to verify color contrast ratios for text on gradient
- ⚠️ Mobile: Font sizes may need adjustment below md breakpoint
- βœ… Performance: Motion animations are hardware-accelerated transforms
<|critique_end|>

<|suggest_start|>
Improvements for next iteration:
1. Add aria-label to the CTA link for screen readers
2. Consider adding a secondary CTA (e.g., "Watch Demo") for users not ready to commit
3. Add a subtle particle or floating shape animation in the background
4. Include social proof (e.g., "Trusted by 10,000+ developers") below the CTA
<|suggest_end|>"""

# ── Build full conversation string ───────────────────────────────────
# Wrap each turn in Qwen-style <|im_start|>role ... <|im_end|> markers,
# then enclose the whole exchange in <|mindi_start|>/<|mindi_end|>.
_turns = [
    ("system", system_message),
    ("user", user_message),
    ("assistant", assistant_message),
]
_body = "\n".join(f"<|im_start|>{role}\n{text}<|im_end|>" for role, text in _turns)
conversation = f"<|mindi_start|>\n{_body}\n<|mindi_end|>"

print("\n" + "─" * 70)
print("FULL MINDI CONVERSATION (raw text)")
print("─" * 70)
print(conversation)
print("─" * 70)

# ── Tokenize the full conversation ───────────────────────────────────
print("\nπŸ“Š TOKENIZATION RESULTS")
print("─" * 70)

token_ids = tokenizer.encode(conversation, add_special_tokens=False)
print(f"   Total tokens: {len(token_ids):,}")

decoded = tokenizer.decode(token_ids)
print(f"   Decoded length (chars): {len(decoded):,}")

# ── Round-trip verification ──────────────────────────────────────────
# encode β†’ decode must reproduce the original text (modulo edge whitespace).
print("\nπŸ”„ ROUND-TRIP VERIFICATION")
print("─" * 70)

round_trip_pass = decoded.strip() == conversation.strip()
if round_trip_pass:
    print("   βœ… PERFECT MATCH β€” decoded text matches original conversation exactly")
else:
    print("   ❌ MISMATCH detected!")
    # Dump the first 100 chars of each differing line pair for debugging.
    orig_lines = conversation.strip().splitlines()
    dec_lines = decoded.strip().splitlines()
    print(f"   Original lines: {len(orig_lines)}, Decoded lines: {len(dec_lines)}")
    for idx, (expected, actual) in enumerate(zip(orig_lines, dec_lines)):
        if expected == actual:
            continue
        print(f"   Line {idx}: DIFF")
        print(f"     Original: {repr(expected[:100])}")
        print(f"     Decoded:  {repr(actual[:100])}")

# ── Verify all MINDI special tokens are preserved as single tokens ───
print("\nπŸ” SPECIAL TOKEN PRESERVATION")
print("─" * 70)

all_passed = True
for token_str in MINDI_SPECIAL_TOKENS:
    token_id = tokenizer.convert_tokens_to_ids(token_str)
    # A properly-registered special token must encode to exactly one ID,
    # and that ID must match the vocabulary lookup.
    encoded = tokenizer.encode(token_str, add_special_tokens=False)
    is_single = len(encoded) == 1 and encoded[0] == token_id

    if is_single:
        status = "βœ…"
    else:
        status = "❌"
        all_passed = False

    # How often this token ID appears in the full conversation encoding.
    count_in_conv = token_ids.count(token_id)
    # BUG FIX: previously printed a hard-coded "single_token=True" even when
    # the check failed; report the actual result instead.
    print(f"   {status} {token_str:<25} ID={token_id:<8} single_token={is_single}  occurrences_in_conv={count_in_conv}")

# ── Qwen chat template tokens ──────────────────────────────────────
# The base chat markers are inherited from the Qwen template and must also
# survive as single tokens.
print("\nπŸ” QWEN CHAT TEMPLATE TOKENS")
print("─" * 70)

qwen_tokens = ["<|im_start|>", "<|im_end|>"]
for tok in qwen_tokens:
    tok_id = tokenizer.convert_tokens_to_ids(tok)
    pieces = tokenizer.encode(tok, add_special_tokens=False)
    hits = token_ids.count(tok_id)
    mark = "βœ…" if len(pieces) == 1 else "❌"
    print(f"   {mark} {tok:<25} ID={tok_id:<8} occurrences_in_conv={hits}")

# ── Token distribution analysis ──────────────────────────────────────
print("\nπŸ“ˆ TOKEN DISTRIBUTION")
print("─" * 70)

# Partition the conversation's token IDs into special vs regular.
special_ids = {
    tokenizer.convert_tokens_to_ids(t) for t in MINDI_SPECIAL_TOKENS + qwen_tokens
}

special_count = sum(tid in special_ids for tid in token_ids)
regular_count = len(token_ids) - special_count

print(f"   Special tokens: {special_count}")
print(f"   Regular tokens: {regular_count}")
print(f"   Total tokens:   {len(token_ids):,}")
print(f"   Special ratio:  {special_count / len(token_ids) * 100:.1f}%")

# ── Estimate tokens per message ──────────────────────────────────────
print("\nπŸ“ TOKENS PER MESSAGE")
print("─" * 70)

sys_tokens = tokenizer.encode(system_message, add_special_tokens=False)
usr_tokens = tokenizer.encode(user_message, add_special_tokens=False)
ast_tokens = tokenizer.encode(assistant_message, add_special_tokens=False)

print(f"   System message:    {len(sys_tokens):>5} tokens ({len(system_message):>5} chars)")
print(f"   User message:      {len(usr_tokens):>5} tokens ({len(user_message):>5} chars)")
print(f"   Assistant message:  {len(ast_tokens):>5} tokens ({len(assistant_message):>5} chars)")
# Overhead is whatever the full encoding adds beyond the three raw messages
# (mindi_start/end, im_start/end, role names, newlines).
print(f"   Wrapper overhead:  ~{len(token_ids) - len(sys_tokens) - len(usr_tokens) - len(ast_tokens):>5} tokens (mindi_start/end, im_start/end, roles)")

# ── Context window fit check ─────────────────────────────────────────
print("\nπŸ“ CONTEXT WINDOW FIT")
print("─" * 70)
context_length = 32768
remaining = context_length - len(token_ids)
print(f"   Context window:   {context_length:>6} tokens")
print(f"   This conversation: {len(token_ids):>6} tokens")
print(f"   Remaining:        {remaining:>6} tokens ({remaining / context_length * 100:.1f}%)")
# BUG FIX: the success line was printed unconditionally; only claim a fit
# when the conversation actually fits in the window.
if remaining >= 0:
    print("   βœ… Fits easily within context window")
else:
    print("   ❌ Conversation EXCEEDS the context window")

# ── Final verdict ────────────────────────────────────────────────────
print("\n" + "=" * 70)
if round_trip_pass and all_passed:
    print("βœ… STEP 5 PASSED: MINDI conversation format works perfectly!")
    print("   β€’ Full conversation tokenizes and decodes with perfect fidelity")
    # BUG FIX: the token count was hard-coded as "22"; derive it from the
    # list so the summary cannot drift if tokens are added or removed.
    print(f"   β€’ All {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens preserved as single tokens")
    print("   β€’ Qwen chat template tokens (im_start/im_end) working correctly")
    print(f"   β€’ Total: {len(token_ids):,} tokens for a realistic conversation")
else:
    print("❌ STEP 5 FAILED β€” issues detected above")
print("=" * 70)