Faaz
Day 1 Complete: Tokenizer setup - Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89
"""
MINDI 1.5 Vision-Coder - Tokenizer Wrapper

Wraps the MINDI tokenizer (Qwen2.5-Coder base + 22 special tokens)
with encoding utilities for code generation, conversation formatting,
and special-token-aware operations.
"""
from __future__ import annotations

from pathlib import Path
from typing import Optional

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# All 22 MINDI special tokens (11 start/end pairs)
MINDI_SPECIAL_TOKENS: dict[str, str] = {
    "mindi_start": "<|mindi_start|>",
    "mindi_end": "<|mindi_end|>",
    "code_start": "<|code_start|>",
    "code_end": "<|code_end|>",
    "vision_start": "<|vision_start|>",
    "vision_end": "<|vision_end|>",
    "critique_start": "<|critique_start|>",
    "critique_end": "<|critique_end|>",
    "suggest_start": "<|suggest_start|>",
    "suggest_end": "<|suggest_end|>",
    "think_start": "<|think_start|>",
    "think_end": "<|think_end|>",
    "file_start": "<|file_start|>",
    "file_end": "<|file_end|>",
    "search_start": "<|search_start|>",
    "search_end": "<|search_end|>",
    "sandbox_start": "<|sandbox_start|>",
    "sandbox_end": "<|sandbox_end|>",
    "error_start": "<|error_start|>",
    "error_end": "<|error_end|>",
    "fix_start": "<|fix_start|>",
    "fix_end": "<|fix_end|>",
}

# Default tokenizer path (pre-built with special tokens already added)
DEFAULT_TOKENIZER_PATH = (
    Path(__file__).resolve().parent.parent.parent
    / "data" / "tokenizer" / "mindi_tokenizer"
)
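
# Hypothetical build step (not part of the original file): a minimal sketch of
# how the pre-built directory above could be produced. The base checkpoint
# name is an assumption taken from the log line, not from this module.
def build_mindi_tokenizer(base_model: str = "Qwen/Qwen2.5-Coder-7B") -> Path:
    tok = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # Register all 22 MINDI markers as special tokens so BPE never splits them.
    tok.add_special_tokens(
        {"additional_special_tokens": list(MINDI_SPECIAL_TOKENS.values())}
    )
    tok.save_pretrained(str(DEFAULT_TOKENIZER_PATH))
    return DEFAULT_TOKENIZER_PATH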


class MindiTokenizer:
    """Tokenizer wrapper with MINDI-specific special tokens and conversation formatting."""

    def __init__(
        self,
        tokenizer_path: Optional[Path] = None,
        max_length: int = 32768,
    ) -> None:
        self.tokenizer_path = tokenizer_path or DEFAULT_TOKENIZER_PATH
        self.max_length = max_length
        self.tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
            str(self.tokenizer_path),
            trust_remote_code=True,
        )
        # Cache special token IDs for fast lookup
        self._special_token_ids: dict[str, int] = {
            name: self.tokenizer.convert_tokens_to_ids(token)
            for name, token in MINDI_SPECIAL_TOKENS.items()
        }
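        # Caveat: for tokens missing from the vocab, convert_tokens_to_ids
        # falls back to the unk id (None for Qwen2-style tokenizers, which set
        # no unk token), so a directory built without the MINDI additions
        # would surface here as unk/None ids rather than raising.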

    # ── Core API ──────────────────────────────────────────────────────
    def encode(
        self,
        text: str,
        add_special_tokens: bool = False,
        max_length: Optional[int] = None,
    ) -> list[int]:
        return self.tokenizer.encode(
            text,
            add_special_tokens=add_special_tokens,
            max_length=max_length or self.max_length,
            truncation=True,
        )

    def decode(self, token_ids: list[int], skip_special_tokens: bool = False) -> str:
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def encode_conversation(
        self,
        messages: list[dict[str, str]],
        wrap_mindi: bool = True,
    ) -> list[int]:
        """Encode a list of messages [{"role": ..., "content": ...}] into token IDs.

        Uses Qwen's im_start/im_end chat template with an optional
        mindi_start/mindi_end wrapper.
        """
        parts: list[str] = []
        if wrap_mindi:
            parts.append("<|mindi_start|>\n")
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            parts.append(f"<|im_start|>{role}\n{content}<|im_end|>\n")
        if wrap_mindi:
            parts.append("<|mindi_end|>")
        full_text = "".join(parts)
        return self.encode(full_text, add_special_tokens=False)
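    # Illustrative rendering for wrap_mindi=True (hypothetical two-turn chat):
    #   <|mindi_start|>
    #   <|im_start|>user
    #   How do I reverse a list?<|im_end|>
    #   <|im_start|>assistant
    #   Use reversed() or slicing: xs[::-1]<|im_end|>
    #   <|mindi_end|>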

    def encode_with_special_tokens(self, text: str) -> list[int]:
        """Encode text that contains MINDI special tokens, preserving them as single tokens."""
        return self.encode(text, add_special_tokens=False)
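    # Why this works: tokens registered via add_special_tokens live in the
    # fast tokenizer's added vocabulary, which is matched before BPE runs, so
    # a literal <|critique_start|> in the text encodes to a single id.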

    # ── Introspection ─────────────────────────────────────────────────
    def get_vocab_size(self) -> int:
        return len(self.tokenizer)

    def get_special_token_ids(self) -> dict[str, int]:
        return dict(self._special_token_ids)

    def get_special_token_id(self, name: str) -> int:
        return self._special_token_ids[name]

    # ── Persistence ───────────────────────────────────────────────────
    def save(self, output_dir: Optional[Path] = None) -> Path:
        save_path = Path(output_dir or self.tokenizer_path)
        save_path.mkdir(parents=True, exist_ok=True)
        self.tokenizer.save_pretrained(str(save_path))
        return save_path
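

# Hedged usage sketch (not in the original file): a minimal version of the
# "full format test" mentioned in the log. Assumes the pre-built tokenizer
# directory exists at DEFAULT_TOKENIZER_PATH.
if __name__ == "__main__":
    tok = MindiTokenizer()
    print("vocab size:", tok.get_vocab_size())
    ids = tok.encode_conversation(
        [
            {"role": "user", "content": "Write hello world in Python."},
            {"role": "assistant", "content": '<|code_start|>print("hi")<|code_end|>'},
        ]
    )
    # The code markers should survive as single ids and round-trip on decode.
    assert tok.get_special_token_id("code_start") in ids
    print(tok.decode(ids))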