Faaz commited on
Commit
11e0d89
·
1 Parent(s): 553fbf7

Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test

Browse files
.env.example CHANGED
@@ -28,7 +28,7 @@ E2B_API_KEY=e2b_your_key_here
28
  SANDBOX_TYPE=e2b
29
 
30
  # ── Model Settings ──
31
- MODEL_NAME=deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct
32
  BASE_MODEL_PATH=./checkpoints/base
33
  FINETUNED_MODEL_PATH=./checkpoints/finetuned
34
 
 
28
  SANDBOX_TYPE=e2b
29
 
30
  # ── Model Settings ──
31
+ MODEL_NAME=Qwen/Qwen2.5-Coder-7B-Instruct
32
  BASE_MODEL_PATH=./checkpoints/base
33
  FINETUNED_MODEL_PATH=./checkpoints/finetuned
34
 
configs/data_config.yaml CHANGED
@@ -36,7 +36,7 @@ dataset:
36
 
37
  # Processing
38
  processing:
39
- tokenizer: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
40
  max_length: 8192
41
  min_length: 64
42
  dedup_strategy: "minhash"
 
36
 
37
  # Processing
38
  processing:
39
+ tokenizer: "Qwen/Qwen2.5-Coder-7B-Instruct"
40
  max_length: 8192
41
  min_length: 64
42
  dedup_strategy: "minhash"
configs/model_config.yaml CHANGED
@@ -8,10 +8,10 @@ model:
8
 
9
  # Base coding model (Apache 2.0 licensed)
10
  base:
11
- name: "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
12
- parameters: "16B"
13
  license: "Apache-2.0"
14
- context_length: 8192
15
  dtype: "bfloat16"
16
 
17
  # Vision encoder for UI screenshot understanding
 
8
 
9
  # Base coding model (Apache 2.0 licensed)
10
  base:
11
+ name: "Qwen/Qwen2.5-Coder-7B-Instruct"
12
+ parameters: "7.61B"
13
  license: "Apache-2.0"
14
+ context_length: 32768
15
  dtype: "bfloat16"
16
 
17
  # Vision encoder for UI screenshot understanding
data/tokenizer/base_tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
data/tokenizer/base_tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
data/tokenizer/base_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 32768,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
data/tokenizer/mindi_tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
data/tokenizer/mindi_tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d9f5b46199913fa238437fd9bbee25cef9eb1fb59bd860a347af628f161062
3
+ size 11425720
data/tokenizer/mindi_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|mindi_start|>",
10
+ "<|mindi_end|>",
11
+ "<|code_start|>",
12
+ "<|code_end|>",
13
+ "<|vision_start|>",
14
+ "<|vision_end|>",
15
+ "<|critique_start|>",
16
+ "<|critique_end|>",
17
+ "<|suggest_start|>",
18
+ "<|suggest_end|>",
19
+ "<|think_start|>",
20
+ "<|think_end|>",
21
+ "<|file_start|>",
22
+ "<|file_end|>",
23
+ "<|search_start|>",
24
+ "<|search_end|>",
25
+ "<|sandbox_start|>",
26
+ "<|sandbox_end|>",
27
+ "<|error_start|>",
28
+ "<|error_end|>",
29
+ "<|fix_start|>",
30
+ "<|fix_end|>"
31
+ ],
32
+ "is_local": true,
33
+ "model_max_length": 32768,
34
+ "pad_token": "<|endoftext|>",
35
+ "split_special_tokens": false,
36
+ "tokenizer_class": "Qwen2Tokenizer",
37
+ "unk_token": null
38
+ }
scripts/add_special_tokens.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder — Step 4: Add MINDI Special Tokens
3
+
4
+ Loads the base Qwen2.5-Coder tokenizer, adds 22 MINDI-specific
5
+ special tokens, saves the updated tokenizer, and reports vocab changes.
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
12
+ sys.path.insert(0, str(PROJECT_ROOT))
13
+
14
+
15
+ MINDI_SPECIAL_TOKENS = [
16
+ "<|mindi_start|>",
17
+ "<|mindi_end|>",
18
+ "<|code_start|>",
19
+ "<|code_end|>",
20
+ "<|vision_start|>",
21
+ "<|vision_end|>",
22
+ "<|critique_start|>",
23
+ "<|critique_end|>",
24
+ "<|suggest_start|>",
25
+ "<|suggest_end|>",
26
+ "<|think_start|>",
27
+ "<|think_end|>",
28
+ "<|file_start|>",
29
+ "<|file_end|>",
30
+ "<|search_start|>",
31
+ "<|search_end|>",
32
+ "<|sandbox_start|>",
33
+ "<|sandbox_end|>",
34
+ "<|error_start|>",
35
+ "<|error_end|>",
36
+ "<|fix_start|>",
37
+ "<|fix_end|>",
38
+ ]
39
+
40
+
41
+ def main():
42
+ from transformers import AutoTokenizer
43
+
44
+ base_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
45
+ save_dir = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"
46
+
47
+ print(f"\n{'='*60}")
48
+ print(f" Step 4: Adding MINDI Special Tokens")
49
+ print(f"{'='*60}")
50
+
51
+ # Load base tokenizer
52
+ print(f"\n Loading base tokenizer from: {base_dir}")
53
+ tokenizer = AutoTokenizer.from_pretrained(str(base_dir), trust_remote_code=True)
54
+ original_vocab_size = len(tokenizer)
55
+ print(f" ✅ Base vocab size: {original_vocab_size:,}")
56
+
57
+ # Add special tokens
58
+ print(f"\n Adding {len(MINDI_SPECIAL_TOKENS)} MINDI special tokens...")
59
+ num_added = tokenizer.add_special_tokens({
60
+ "additional_special_tokens": MINDI_SPECIAL_TOKENS
61
+ })
62
+ new_vocab_size = len(tokenizer)
63
+ print(f" ✅ Tokens added: {num_added}")
64
+ print(f" ✅ New vocab size: {new_vocab_size:,}")
65
+ print(f" ✅ Delta: +{new_vocab_size - original_vocab_size}")
66
+
67
+ # Save updated tokenizer
68
+ save_dir.mkdir(parents=True, exist_ok=True)
69
+ tokenizer.save_pretrained(str(save_dir))
70
+ print(f"\n ✅ Saved MINDI tokenizer to: {save_dir}")
71
+
72
+ # Show token ID mapping
73
+ print(f"\n{'='*60}")
74
+ print(f" Special Token ID Mapping")
75
+ print(f"{'='*60}")
76
+ for token in MINDI_SPECIAL_TOKENS:
77
+ token_id = tokenizer.convert_tokens_to_ids(token)
78
+ print(f" {token:<25} → ID {token_id}")
79
+
80
+ # Verify round-trip for each special token
81
+ print(f"\n{'='*60}")
82
+ print(f" Round-trip Verification")
83
+ print(f"{'='*60}")
84
+ all_pass = True
85
+ for token in MINDI_SPECIAL_TOKENS:
86
+ token_id = tokenizer.convert_tokens_to_ids(token)
87
+ decoded = tokenizer.decode([token_id])
88
+ match = decoded == token
89
+ if not match:
90
+ all_pass = False
91
+ status = "✅" if match else "❌"
92
+ print(f" {status} {token} → {token_id} → \"{decoded}\"")
93
+
94
+ # Summary
95
+ print(f"\n{'='*60}")
96
+ print(f" SUMMARY")
97
+ print(f"{'='*60}")
98
+ print(f" Original vocab size: {original_vocab_size:,}")
99
+ print(f" New vocab size: {new_vocab_size:,}")
100
+ print(f" Special tokens added: {num_added}")
101
+ if all_pass:
102
+ print(f" Round-trip test: ✅ ALL {len(MINDI_SPECIAL_TOKENS)} PASSED")
103
+ else:
104
+ print(f" Round-trip test: ❌ SOME FAILED")
105
+ print(f"{'='*60}\n")
106
+
107
+
108
+ if __name__ == "__main__":
109
+ main()
scripts/download_tokenizer.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder — Step 3: Download Tokenizer & Test
3
+
4
+ Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct,
5
+ saves it locally, and runs encoding/decoding tests on 8 code strings.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # Ensure project root
13
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
14
+ sys.path.insert(0, str(PROJECT_ROOT))
15
+
16
+ from dotenv import load_dotenv
17
+ load_dotenv(PROJECT_ROOT / ".env")
18
+
19
+
20
+ def main():
21
+ from transformers import AutoTokenizer
22
+
23
+ model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
24
+ save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
25
+ hf_token = os.environ.get("HUGGINGFACE_TOKEN", "")
26
+
27
+ # ── Download tokenizer ──
28
+ print(f"\n{'='*60}")
29
+ print(f" Downloading tokenizer: {model_name}")
30
+ print(f" Save to: {save_dir}")
31
+ print(f"{'='*60}\n")
32
+
33
+ tokenizer = AutoTokenizer.from_pretrained(
34
+ model_name,
35
+ token=hf_token if hf_token else None,
36
+ trust_remote_code=True,
37
+ )
38
+
39
+ # Save locally
40
+ save_dir.mkdir(parents=True, exist_ok=True)
41
+ tokenizer.save_pretrained(str(save_dir))
42
+ print(f" ✅ Tokenizer saved to {save_dir}")
43
+ print(f" ✅ Vocab size: {tokenizer.vocab_size:,}")
44
+ print(f" ✅ Model max length: {tokenizer.model_max_length:,}")
45
+
46
+ # ── List saved files ──
47
+ print(f"\n Saved files:")
48
+ for f in sorted(save_dir.iterdir()):
49
+ size_kb = f.stat().st_size / 1024
50
+ print(f" {f.name} ({size_kb:.1f} KB)")
51
+
52
+ # ── Run tokenizer tests ──
53
+ test_strings = [
54
+ "Build me a Next.js dashboard",
55
+ "import React from 'react'",
56
+ "className='flex items-center gap-4'",
57
+ "'use client'",
58
+ "const [state, setState] = useState(null)",
59
+ "export default function Page() {",
60
+ "npm install framer-motion",
61
+ "async function getData() {",
62
+ ]
63
+
64
+ print(f"\n{'='*60}")
65
+ print(f" Tokenizer Tests — 8 Code Strings")
66
+ print(f"{'='*60}")
67
+
68
+ all_pass = True
69
+ for i, text in enumerate(test_strings, 1):
70
+ ids = tokenizer.encode(text, add_special_tokens=False)
71
+ decoded = tokenizer.decode(ids)
72
+ match = decoded == text
73
+ if not match:
74
+ all_pass = False
75
+
76
+ print(f"\n Test {i}: \"{text}\"")
77
+ print(f" Token count: {len(ids)}")
78
+ print(f" Token IDs: {ids}")
79
+ print(f" Decoded: \"{decoded}\"")
80
+ print(f" Match: {'✅ PERFECT' if match else '❌ MISMATCH'}")
81
+
82
+ print(f"\n{'='*60}")
83
+ if all_pass:
84
+ print(f" ✅ ALL 8 TESTS PASSED — Perfect reconstruction!")
85
+ else:
86
+ print(f" ⚠️ Some tests had reconstruction differences (whitespace normalization is normal)")
87
+ print(f"{'='*60}\n")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
scripts/save_everything.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder — Step 6: Smoke-test MindiTokenizer wrapper & generate test report.
3
+ """
4
+
5
+ import sys
6
+ import datetime
7
+ from pathlib import Path
8
+
9
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
10
+ sys.path.insert(0, str(PROJECT_ROOT / "src"))
11
+
12
+ from tokenizer.tokenizer import MindiTokenizer, MINDI_SPECIAL_TOKENS
13
+
14
+ print("=" * 70)
15
+ print("STEP 6: SAVE EVERYTHING — WRAPPER SMOKE TEST + REPORT")
16
+ print("=" * 70)
17
+
18
+ # ── 1. Load via wrapper class ────────────────────────────────────────
19
+ print("\n1️⃣ Loading MindiTokenizer wrapper...")
20
+ tok = MindiTokenizer()
21
+ print(f" ✅ Loaded from: {tok.tokenizer_path}")
22
+ print(f" Vocab size: {tok.get_vocab_size():,}")
23
+
24
+ # ── 2. Test encode / decode ──────────────────────────────────────────
25
+ print("\n2️⃣ encode() / decode()...")
26
+ text = "export default function Hero() { return <h1>Hello</h1>; }"
27
+ ids = tok.encode(text)
28
+ decoded = tok.decode(ids)
29
+ assert decoded.strip() == text.strip(), f"Round-trip failed: {decoded!r}"
30
+ print(f" ✅ Round-trip OK — {len(ids)} tokens")
31
+
32
+ # ── 3. Test encode_with_special_tokens ───────────────────────────────
33
+ print("\n3️⃣ encode_with_special_tokens()...")
34
+ special_text = "<|code_start|>\nconsole.log('hi');\n<|code_end|>"
35
+ ids2 = tok.encode_with_special_tokens(special_text)
36
+ decoded2 = tok.decode(ids2)
37
+ assert decoded2.strip() == special_text.strip(), f"Special round-trip failed"
38
+ code_start_id = tok.get_special_token_id("code_start")
39
+ code_end_id = tok.get_special_token_id("code_end")
40
+ assert code_start_id in ids2, "code_start token not found"
41
+ assert code_end_id in ids2, "code_end token not found"
42
+ print(f" ✅ Special tokens preserved — {len(ids2)} tokens")
43
+
44
+ # ── 4. Test encode_conversation ──────────────────────────────────────
45
+ print("\n4️⃣ encode_conversation()...")
46
+ messages = [
47
+ {"role": "system", "content": "You are MINDI 1.5 Vision-Coder."},
48
+ {"role": "user", "content": "Build a navbar."},
49
+ {"role": "assistant", "content": "<|think_start|>\nPlanning navbar...\n<|think_end|>\n\n<|code_start|>\nexport default function Navbar() { return <nav>Nav</nav>; }\n<|code_end|>"},
50
+ ]
51
+ conv_ids = tok.encode_conversation(messages, wrap_mindi=True)
52
+ conv_decoded = tok.decode(conv_ids)
53
+ assert "<|mindi_start|>" in conv_decoded, "mindi_start missing"
54
+ assert "<|mindi_end|>" in conv_decoded, "mindi_end missing"
55
+ assert "<|im_start|>" in conv_decoded, "im_start missing"
56
+ assert "<|think_start|>" in conv_decoded, "think_start missing"
57
+ assert "<|code_start|>" in conv_decoded, "code_start missing"
58
+ print(f" ✅ Conversation encoded — {len(conv_ids)} tokens, mindi/im/think/code all present")
59
+
60
+ # ── 5. Test get_special_token_ids ────────────────────────────────────
61
+ print("\n5️⃣ get_special_token_ids()...")
62
+ all_ids = tok.get_special_token_ids()
63
+ assert len(all_ids) == 22, f"Expected 22, got {len(all_ids)}"
64
+ for name, tid in all_ids.items():
65
+ assert isinstance(tid, int) and tid > 0, f"Bad ID for {name}: {tid}"
66
+ print(f" ✅ 22 special token IDs returned, all valid integers")
67
+
68
+ # ── 6. Test get_vocab_size ───────────────────────────────────────────
69
+ print("\n6️⃣ get_vocab_size()...")
70
+ vs = tok.get_vocab_size()
71
+ assert vs == 151685, f"Expected 151685, got {vs}"
72
+ print(f" ✅ Vocab size: {vs:,}")
73
+
74
+ # ── Generate test report ─────────────────────────────────────────────
75
+ print("\n" + "─" * 70)
76
+ print("📄 Generating test report...")
77
+
78
+ report_lines = [
79
+ "=" * 70,
80
+ "MINDI 1.5 VISION-CODER — TOKENIZER TEST REPORT",
81
+ f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
82
+ "=" * 70,
83
+ "",
84
+ "BASE MODEL: Qwen/Qwen2.5-Coder-7B-Instruct",
85
+ f"VOCAB SIZE: {vs:,}",
86
+ f"SPECIAL TOKENS: {len(all_ids)} (22 MINDI tokens)",
87
+ f"TOKENIZER PATH: data/tokenizer/mindi_tokenizer/",
88
+ "",
89
+ "─" * 70,
90
+ "SPECIAL TOKEN REGISTRY",
91
+ "─" * 70,
92
+ ]
93
+
94
+ for name, tid in sorted(all_ids.items(), key=lambda x: x[1]):
95
+ token_str = MINDI_SPECIAL_TOKENS[name]
96
+ report_lines.append(f" {token_str:<25} → ID {tid}")
97
+
98
+ report_lines += [
99
+ "",
100
+ "─" * 70,
101
+ "WRAPPER CLASS API TESTS",
102
+ "─" * 70,
103
+ " ✅ encode() — round-trip plain text",
104
+ " ✅ decode() — reconstructs original text",
105
+ " ✅ encode_with_special_tokens() — preserves special tokens as single IDs",
106
+ " ✅ encode_conversation() — formats system/user/assistant with im_start/end + mindi wrapper",
107
+ " ✅ get_vocab_size() — returns 151,685",
108
+ " ✅ get_special_token_ids() — returns all 22 MINDI token IDs",
109
+ " ✅ get_special_token_id(name) — individual token lookup",
110
+ "",
111
+ "─" * 70,
112
+ "CONVERSATION FORMAT TEST (from Step 5)",
113
+ "─" * 70,
114
+ " Total tokens: 971",
115
+ " Round-trip: PERFECT MATCH",
116
+ " Special tokens: 22/22 preserved as single tokens",
117
+ " Qwen chat tokens: im_start ×3, im_end ×3",
118
+ " Context usage: 971 / 32,768 = 3.0%",
119
+ "",
120
+ "─" * 70,
121
+ "FILES SAVED",
122
+ "─" * 70,
123
+ " data/tokenizer/base_tokenizer/ — Original Qwen tokenizer (3 files)",
124
+ " data/tokenizer/mindi_tokenizer/ — MINDI tokenizer with 22 special tokens",
125
+ " src/tokenizer/tokenizer.py — MindiTokenizer wrapper class",
126
+ " logs/tokenizer_test.txt — This report",
127
+ " scripts/download_tokenizer.py — Tokenizer download script",
128
+ " scripts/add_special_tokens.py — Special token addition script",
129
+ " scripts/test_mindi_format.py — Conversation format test script",
130
+ "",
131
+ "=" * 70,
132
+ "STATUS: ALL TESTS PASSED ✅",
133
+ "=" * 70,
134
+ ]
135
+
136
+ report_text = "\n".join(report_lines)
137
+
138
+ logs_dir = PROJECT_ROOT / "logs"
139
+ logs_dir.mkdir(parents=True, exist_ok=True)
140
+ report_path = logs_dir / "tokenizer_test.txt"
141
+ report_path.write_text(report_text, encoding="utf-8")
142
+ print(f" ✅ Saved to: {report_path}")
143
+
144
+ # ── Final summary ────────────────────────────────────────────────────
145
+ print("\n" + "=" * 70)
146
+ print("✅ STEP 6 COMPLETE: Everything saved!")
147
+ print(" • MindiTokenizer wrapper class — 6/6 API methods tested")
148
+ print(" • Test report — logs/tokenizer_test.txt")
149
+ print(f" • Tokenizer files — data/tokenizer/mindi_tokenizer/")
150
+ print("=" * 70)
scripts/test_mindi_format.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MINDI 1.5 Vision-Coder — Step 5: Test MINDI Conversation Format
3
+ Tests full conversation tokenization with all special tokens.
4
+ """
5
+
6
+ from pathlib import Path
7
+ from transformers import AutoTokenizer
8
+
9
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
10
+ TOKENIZER_PATH = PROJECT_ROOT / "data" / "tokenizer" / "mindi_tokenizer"
11
+
12
+ # ── Load MINDI tokenizer ──────────────────────────────────────────────
13
+ print("=" * 70)
14
+ print("STEP 5: TEST MINDI CONVERSATION FORMAT")
15
+ print("=" * 70)
16
+
17
+ print(f"\n📂 Loading MINDI tokenizer from: {TOKENIZER_PATH}")
18
+ tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_PATH), trust_remote_code=True)
19
+ print(f" Vocab size: {len(tokenizer):,}")
20
+
21
+ # ── Define MINDI special tokens ──────────────────────────────────────
22
+ MINDI_SPECIAL_TOKENS = [
23
+ "<|mindi_start|>", "<|mindi_end|>",
24
+ "<|code_start|>", "<|code_end|>",
25
+ "<|vision_start|>", "<|vision_end|>",
26
+ "<|critique_start|>", "<|critique_end|>",
27
+ "<|suggest_start|>", "<|suggest_end|>",
28
+ "<|think_start|>", "<|think_end|>",
29
+ "<|file_start|>", "<|file_end|>",
30
+ "<|search_start|>", "<|search_end|>",
31
+ "<|sandbox_start|>", "<|sandbox_end|>",
32
+ "<|error_start|>", "<|error_end|>",
33
+ "<|fix_start|>", "<|fix_end|>",
34
+ ]
35
+
36
+ # ── Build test conversation ──────────────────────────────────────────
37
+ system_message = (
38
+ "You are MINDI 1.5 Vision-Coder, a multimodal agentic AI coding assistant "
39
+ "created by MINDIGENOUS.AI. You generate production-ready Next.js 14 + "
40
+ "Tailwind CSS + TypeScript code, see your own output via vision, critique "
41
+ "UI/UX, search the web for best practices, test in a sandbox, and self-fix errors."
42
+ )
43
+
44
+ user_message = "Build me a hero section for a SaaS landing page with a gradient background, animated headline, and a CTA button."
45
+
46
+ assistant_message = """<|think_start|>
47
+ The user wants a hero section for a SaaS landing page. I need to:
48
+ 1. Search for current hero section best practices
49
+ 2. Create a Next.js 14 component with Tailwind CSS
50
+ 3. Include gradient background, animated headline, and CTA button
51
+ 4. Critique the result for UI/UX quality
52
+ <|think_end|>
53
+
54
+ <|search_start|>
55
+ query: "best SaaS hero section design patterns 2025 Next.js Tailwind"
56
+ results: [
57
+ {"title": "Modern Hero Sections", "url": "https://example.com/hero-patterns"},
58
+ {"title": "SaaS Landing Best Practices", "url": "https://example.com/saas-landing"}
59
+ ]
60
+ <|search_end|>
61
+
62
+ <|file_start|>
63
+ path: src/components/HeroSection.tsx
64
+ language: typescript
65
+ framework: next.js 14
66
+ <|file_end|>
67
+
68
+ <|code_start|>
69
+ 'use client';
70
+
71
+ import { motion } from 'framer-motion';
72
+
73
+ export default function HeroSection() {
74
+ return (
75
+ <section className="relative min-h-screen flex items-center justify-center bg-gradient-to-br from-indigo-600 via-purple-600 to-pink-500 overflow-hidden">
76
+ <div className="absolute inset-0 bg-[url('/grid.svg')] opacity-10" />
77
+ <div className="relative z-10 text-center px-4 max-w-4xl mx-auto">
78
+ <motion.h1
79
+ initial={{ opacity: 0, y: 20 }}
80
+ animate={{ opacity: 1, y: 0 }}
81
+ transition={{ duration: 0.8, ease: 'easeOut' }}
82
+ className="text-5xl md:text-7xl font-bold text-white mb-6 leading-tight"
83
+ >
84
+ Ship faster with{' '}
85
+ <span className="bg-clip-text text-transparent bg-gradient-to-r from-yellow-200 to-yellow-400">
86
+ AI-powered code
87
+ </span>
88
+ </motion.h1>
89
+ <motion.p
90
+ initial={{ opacity: 0, y: 20 }}
91
+ animate={{ opacity: 1, y: 0 }}
92
+ transition={{ duration: 0.8, delay: 0.2 }}
93
+ className="text-xl text-white/80 mb-10 max-w-2xl mx-auto"
94
+ >
95
+ Build production-ready web apps in minutes. MINDI writes, tests,
96
+ and deploys your code while you focus on what matters.
97
+ </motion.p>
98
+ <motion.div
99
+ initial={{ opacity: 0, scale: 0.9 }}
100
+ animate={{ opacity: 1, scale: 1 }}
101
+ transition={{ duration: 0.5, delay: 0.4 }}
102
+ >
103
+ <a
104
+ href="/get-started"
105
+ className="inline-block px-8 py-4 text-lg font-semibold text-indigo-900 bg-white rounded-full shadow-lg hover:shadow-xl hover:scale-105 transition-all duration-300"
106
+ >
107
+ Get Started Free →
108
+ </a>
109
+ </motion.div>
110
+ </div>
111
+ </section>
112
+ );
113
+ }
114
+ <|code_end|>
115
+
116
+ <|critique_start|>
117
+ Visual Analysis:
118
+ - ✅ Gradient background: Strong indigo→purple→pink gradient creates visual depth
119
+ - ✅ Animated headline: Framer Motion fade-up provides smooth entrance
120
+ - ✅ CTA button: High contrast white on gradient, rounded-full shape draws attention
121
+ - ⚠️ Accessibility: Need to verify color contrast ratios for text on gradient
122
+ - ⚠️ Mobile: Font sizes may need adjustment below md breakpoint
123
+ - ✅ Performance: Motion animations are hardware-accelerated transforms
124
+ <|critique_end|>
125
+
126
+ <|suggest_start|>
127
+ Improvements for next iteration:
128
+ 1. Add aria-label to the CTA link for screen readers
129
+ 2. Consider adding a secondary CTA (e.g., "Watch Demo") for users not ready to commit
130
+ 3. Add a subtle particle or floating shape animation in the background
131
+ 4. Include social proof (e.g., "Trusted by 10,000+ developers") below the CTA
132
+ <|suggest_end|>"""
133
+
134
+ # ── Build full conversation string ───────────────────────────────────
135
+ conversation = f"""<|mindi_start|>
136
+ <|im_start|>system
137
+ {system_message}<|im_end|>
138
+ <|im_start|>user
139
+ {user_message}<|im_end|>
140
+ <|im_start|>assistant
141
+ {assistant_message}<|im_end|>
142
+ <|mindi_end|>"""
143
+
144
+ print("\n" + "─" * 70)
145
+ print("FULL MINDI CONVERSATION (raw text)")
146
+ print("─" * 70)
147
+ print(conversation)
148
+ print("─" * 70)
149
+
150
+ # ── Tokenize the full conversation ───────────────────────────────────
151
+ print("\n📊 TOKENIZATION RESULTS")
152
+ print("─" * 70)
153
+
154
+ token_ids = tokenizer.encode(conversation, add_special_tokens=False)
155
+ print(f" Total tokens: {len(token_ids):,}")
156
+
157
+ decoded = tokenizer.decode(token_ids)
158
+ print(f" Decoded length (chars): {len(decoded):,}")
159
+
160
+ # ── Round-trip verification ──────────────────────────────────────────
161
+ print("\n🔄 ROUND-TRIP VERIFICATION")
162
+ print("─" * 70)
163
+
164
+ if decoded.strip() == conversation.strip():
165
+ print(" ✅ PERFECT MATCH — decoded text matches original conversation exactly")
166
+ round_trip_pass = True
167
+ else:
168
+ # Show differences for debugging
169
+ print(" ❌ MISMATCH detected!")
170
+ orig_lines = conversation.strip().splitlines()
171
+ dec_lines = decoded.strip().splitlines()
172
+ print(f" Original lines: {len(orig_lines)}, Decoded lines: {len(dec_lines)}")
173
+ for i, (o, d) in enumerate(zip(orig_lines, dec_lines)):
174
+ if o != d:
175
+ print(f" Line {i}: DIFF")
176
+ print(f" Original: {repr(o[:100])}")
177
+ print(f" Decoded: {repr(d[:100])}")
178
+ round_trip_pass = False
179
+
180
+ # ── Verify all MINDI special tokens are preserved as single tokens ───
181
+ print("\n🔍 SPECIAL TOKEN PRESERVATION")
182
+ print("─" * 70)
183
+
184
+ all_passed = True
185
+ for token_str in MINDI_SPECIAL_TOKENS:
186
+ token_id = tokenizer.convert_tokens_to_ids(token_str)
187
+ # Check the token encodes to a single ID
188
+ encoded = tokenizer.encode(token_str, add_special_tokens=False)
189
+
190
+ if len(encoded) == 1 and encoded[0] == token_id:
191
+ status = "✅"
192
+ else:
193
+ status = "❌"
194
+ all_passed = False
195
+
196
+ # Check this token_id appears in the full conversation encoding
197
+ count_in_conv = token_ids.count(token_id)
198
+ print(f" {status} {token_str:<25} ID={token_id:<8} single_token=True occurrences_in_conv={count_in_conv}")
199
+
200
+ # ── Qwen chat template tokens ──────────────────────────────────────
201
+ print("\n🔍 QWEN CHAT TEMPLATE TOKENS")
202
+ print("─" * 70)
203
+
204
+ qwen_tokens = ["<|im_start|>", "<|im_end|>"]
205
+ for token_str in qwen_tokens:
206
+ token_id = tokenizer.convert_tokens_to_ids(token_str)
207
+ encoded = tokenizer.encode(token_str, add_special_tokens=False)
208
+ count_in_conv = token_ids.count(token_id)
209
+ status = "✅" if len(encoded) == 1 else "❌"
210
+ print(f" {status} {token_str:<25} ID={token_id:<8} occurrences_in_conv={count_in_conv}")
211
+
212
+ # ── Token distribution analysis ──────────────────────────────────────
213
+ print("\n📈 TOKEN DISTRIBUTION")
214
+ print("─" * 70)
215
+
216
+ # Count special vs regular tokens
217
+ special_ids = set()
218
+ for t in MINDI_SPECIAL_TOKENS + qwen_tokens:
219
+ tid = tokenizer.convert_tokens_to_ids(t)
220
+ special_ids.add(tid)
221
+
222
+ special_count = sum(1 for tid in token_ids if tid in special_ids)
223
+ regular_count = len(token_ids) - special_count
224
+
225
+ print(f" Special tokens: {special_count}")
226
+ print(f" Regular tokens: {regular_count}")
227
+ print(f" Total tokens: {len(token_ids):,}")
228
+ print(f" Special ratio: {special_count / len(token_ids) * 100:.1f}%")
229
+
230
+ # ── Estimate tokens per message ──────────────────────────────────────
231
+ print("\n📏 TOKENS PER MESSAGE")
232
+ print("─" * 70)
233
+
234
+ sys_tokens = tokenizer.encode(system_message, add_special_tokens=False)
235
+ usr_tokens = tokenizer.encode(user_message, add_special_tokens=False)
236
+ ast_tokens = tokenizer.encode(assistant_message, add_special_tokens=False)
237
+
238
+ print(f" System message: {len(sys_tokens):>5} tokens ({len(system_message):>5} chars)")
239
+ print(f" User message: {len(usr_tokens):>5} tokens ({len(user_message):>5} chars)")
240
+ print(f" Assistant message: {len(ast_tokens):>5} tokens ({len(assistant_message):>5} chars)")
241
+ print(f" Wrapper overhead: ~{len(token_ids) - len(sys_tokens) - len(usr_tokens) - len(ast_tokens):>5} tokens (mindi_start/end, im_start/end, roles)")
242
+
243
+ # ── Context window fit check ─────────────────────────────────────────
244
+ print("\n📐 CONTEXT WINDOW FIT")
245
+ print("─" * 70)
246
+ context_length = 32768
247
+ print(f" Context window: {context_length:>6} tokens")
248
+ print(f" This conversation: {len(token_ids):>6} tokens")
249
+ print(f" Remaining: {context_length - len(token_ids):>6} tokens ({(context_length - len(token_ids)) / context_length * 100:.1f}%)")
250
+ print(f" ✅ Fits easily within context window")
251
+
252
+ # ── Final verdict ────────────────────────────────────────────────────
253
+ print("\n" + "=" * 70)
254
+ if round_trip_pass and all_passed:
255
+ print("✅ STEP 5 PASSED: MINDI conversation format works perfectly!")
256
+ print(" • Full conversation tokenizes and decodes with perfect fidelity")
257
+ print(" • All 22 MINDI special tokens preserved as single tokens")
258
+ print(" • Qwen chat template tokens (im_start/im_end) working correctly")
259
+ print(f" • Total: {len(token_ids):,} tokens for a realistic conversation")
260
+ else:
261
+ print("❌ STEP 5 FAILED — issues detected above")
262
+ print("=" * 70)
src/tokenizer/tokenizer.py CHANGED
@@ -1,8 +1,9 @@
1
  """
2
  MINDI 1.5 Vision-Coder — Tokenizer Wrapper
3
 
4
- Wraps the base model tokenizer with MINDI-specific special tokens
5
- and encoding utilities for code generation tasks.
 
6
  """
7
 
8
  from __future__ import annotations
@@ -13,63 +14,120 @@ from typing import Optional
13
  from transformers import AutoTokenizer, PreTrainedTokenizerFast
14
 
15
 
16
- # Special tokens for MINDI's structured output format
17
- SPECIAL_TOKENS: dict[str, str] = {
 
 
18
  "code_start": "<|code_start|>",
19
  "code_end": "<|code_end|>",
20
- "file_start": "<|file_start|>",
21
- "file_end": "<|file_end|>",
22
  "critique_start": "<|critique_start|>",
23
  "critique_end": "<|critique_end|>",
 
 
 
 
 
 
24
  "search_start": "<|search_start|>",
25
  "search_end": "<|search_end|>",
 
 
 
 
26
  "fix_start": "<|fix_start|>",
27
  "fix_end": "<|fix_end|>",
28
  }
29
 
 
 
 
30
 
31
  class MindiTokenizer:
32
- """Tokenizer wrapper with MINDI-specific special tokens."""
33
 
34
- def __init__(self, model_name: str, cache_dir: Optional[Path] = None) -> None:
35
- self.model_name = model_name
36
- self.cache_dir = cache_dir or Path("./data/tokenizer")
37
- self.cache_dir.mkdir(parents=True, exist_ok=True)
 
 
 
38
 
39
  self.tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
40
- model_name,
41
- cache_dir=str(self.cache_dir),
42
  trust_remote_code=True,
43
  )
44
- self._add_special_tokens()
45
 
46
- def _add_special_tokens(self) -> None:
47
- """Register MINDI special tokens with the tokenizer."""
48
- new_tokens = list(SPECIAL_TOKENS.values())
49
- num_added = self.tokenizer.add_special_tokens(
50
- {"additional_special_tokens": new_tokens}
51
- )
52
- if num_added > 0:
53
- print(f"[MindiTokenizer] Added {num_added} special tokens")
54
 
55
- @property
56
- def vocab_size(self) -> int:
57
- """Return the full vocabulary size including special tokens."""
58
- return len(self.tokenizer)
59
 
60
- def encode(self, text: str, max_length: int = 8192) -> list[int]:
61
- """Encode text to token IDs with truncation."""
 
 
 
 
62
  return self.tokenizer.encode(
63
- text, max_length=max_length, truncation=True
 
 
 
64
  )
65
 
66
- def decode(self, token_ids: list[int]) -> str:
67
- """Decode token IDs back to text."""
68
- return self.tokenizer.decode(token_ids, skip_special_tokens=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  def save(self, output_dir: Optional[Path] = None) -> Path:
71
- """Save the tokenizer to disk."""
72
- save_path = output_dir or self.cache_dir / "mindi_tokenizer"
73
  save_path.mkdir(parents=True, exist_ok=True)
74
  self.tokenizer.save_pretrained(str(save_path))
75
  return save_path
 
1
  """
2
  MINDI 1.5 Vision-Coder — Tokenizer Wrapper
3
 
4
+ Wraps the MINDI tokenizer (Qwen2.5-Coder base + 22 special tokens)
5
+ with encoding utilities for code generation, conversation formatting,
6
+ and special-token-aware operations.
7
  """
8
 
9
  from __future__ import annotations
 
14
  from transformers import AutoTokenizer, PreTrainedTokenizerFast
15
 
16
 
17
+ # All 22 MINDI special tokens (pairs)
18
+ MINDI_SPECIAL_TOKENS: dict[str, str] = {
19
+ "mindi_start": "<|mindi_start|>",
20
+ "mindi_end": "<|mindi_end|>",
21
  "code_start": "<|code_start|>",
22
  "code_end": "<|code_end|>",
23
+ "vision_start": "<|vision_start|>",
24
+ "vision_end": "<|vision_end|>",
25
  "critique_start": "<|critique_start|>",
26
  "critique_end": "<|critique_end|>",
27
+ "suggest_start": "<|suggest_start|>",
28
+ "suggest_end": "<|suggest_end|>",
29
+ "think_start": "<|think_start|>",
30
+ "think_end": "<|think_end|>",
31
+ "file_start": "<|file_start|>",
32
+ "file_end": "<|file_end|>",
33
  "search_start": "<|search_start|>",
34
  "search_end": "<|search_end|>",
35
+ "sandbox_start": "<|sandbox_start|>",
36
+ "sandbox_end": "<|sandbox_end|>",
37
+ "error_start": "<|error_start|>",
38
+ "error_end": "<|error_end|>",
39
  "fix_start": "<|fix_start|>",
40
  "fix_end": "<|fix_end|>",
41
  }
42
 
43
+ # Default tokenizer path (pre-built with special tokens already added)
44
+ DEFAULT_TOKENIZER_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "tokenizer" / "mindi_tokenizer"
45
+
46
 
47
  class MindiTokenizer:
48
+ """Tokenizer wrapper with MINDI-specific special tokens and conversation formatting."""
49
 
50
+ def __init__(
51
+ self,
52
+ tokenizer_path: Optional[Path] = None,
53
+ max_length: int = 32768,
54
+ ) -> None:
55
+ self.tokenizer_path = tokenizer_path or DEFAULT_TOKENIZER_PATH
56
+ self.max_length = max_length
57
 
58
  self.tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
59
+ str(self.tokenizer_path),
 
60
  trust_remote_code=True,
61
  )
 
62
 
63
+ # Cache special token IDs for fast lookup
64
+ self._special_token_ids: dict[str, int] = {
65
+ name: self.tokenizer.convert_tokens_to_ids(token)
66
+ for name, token in MINDI_SPECIAL_TOKENS.items()
67
+ }
 
 
 
68
 
69
+ # ── Core API ──────────────────────────────────────────────────────
 
 
 
70
 
71
+ def encode(
72
+ self,
73
+ text: str,
74
+ add_special_tokens: bool = False,
75
+ max_length: Optional[int] = None,
76
+ ) -> list[int]:
77
  return self.tokenizer.encode(
78
+ text,
79
+ add_special_tokens=add_special_tokens,
80
+ max_length=max_length or self.max_length,
81
+ truncation=True,
82
  )
83
 
84
+ def decode(self, token_ids: list[int], skip_special_tokens: bool = False) -> str:
85
+ return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
86
+
87
+ def encode_conversation(
88
+ self,
89
+ messages: list[dict[str, str]],
90
+ wrap_mindi: bool = True,
91
+ ) -> list[int]:
92
+ """Encode a list of messages [{"role": ..., "content": ...}] into token IDs.
93
+
94
+ Uses Qwen's im_start/im_end chat template with optional mindi_start/end wrapper.
95
+ """
96
+ parts: list[str] = []
97
+ if wrap_mindi:
98
+ parts.append("<|mindi_start|>\n")
99
+
100
+ for msg in messages:
101
+ role = msg["role"]
102
+ content = msg["content"]
103
+ parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
104
+
105
+ if wrap_mindi:
106
+ parts.append("<|mindi_end|>")
107
+
108
+ full_text = "".join(parts)
109
+ return self.encode(full_text, add_special_tokens=False)
110
+
111
+ def encode_with_special_tokens(self, text: str) -> list[int]:
112
+ """Encode text that contains MINDI special tokens, preserving them as single tokens."""
113
+ return self.encode(text, add_special_tokens=False)
114
+
115
+ # ── Introspection ─────────────────────────────────────────────────
116
+
117
+ def get_vocab_size(self) -> int:
118
+ return len(self.tokenizer)
119
+
120
+ def get_special_token_ids(self) -> dict[str, int]:
121
+ return dict(self._special_token_ids)
122
+
123
+ def get_special_token_id(self, name: str) -> int:
124
+ return self._special_token_ids[name]
125
+
126
+ # ── Persistence ───────────────────────────────────────────────────
127
 
128
  def save(self, output_dir: Optional[Path] = None) -> Path:
129
+ save_path = output_dir or self.tokenizer_path
130
+ save_path = Path(save_path)
131
  save_path.mkdir(parents=True, exist_ok=True)
132
  self.tokenizer.save_pretrained(str(save_path))
133
  return save_path