Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89 | """ | |
| MINDI 1.5 Vision-Coder — Step 3: Download Tokenizer & Test | |
| Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct, | |
| saves it locally, and runs encoding/decoding tests on 8 code strings. | |
| """ | |
| import os | |
| import sys | |
| from pathlib import Path | |
# The project root is the second ancestor of this file's resolved path
# (the directory containing this script's folder). It is pushed to the
# front of sys.path so project-local packages can be imported.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, os.fspath(PROJECT_ROOT))

from dotenv import load_dotenv

# Read environment variables from the .env file at the project root.
load_dotenv(PROJECT_ROOT.joinpath(".env"))
def main() -> None:
    """Download the Qwen2.5-Coder tokenizer, save it locally, and round-trip test it.

    Steps:
      1. Load only the tokenizer for ``Qwen/Qwen2.5-Coder-7B-Instruct`` via
         ``AutoTokenizer.from_pretrained``, passing the ``HUGGINGFACE_TOKEN``
         environment variable as the access token when it is set.
      2. Save the tokenizer under ``PROJECT_ROOT/data/tokenizer/base_tokenizer``
         and list the files in that directory with their sizes.
      3. Encode and decode a fixed list of code strings and report, per string,
         whether the decoded text equals the input exactly.

    All results are printed to stdout; nothing is returned.
    """
    # Imported inside the function, so transformers is only loaded when main() runs.
    from transformers import AutoTokenizer

    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    # An unset or empty variable is passed on as None rather than as an empty string.
    hf_token = os.environ.get("HUGGINGFACE_TOKEN") or None
    banner = "=" * 60

    # ── Download tokenizer ──
    print(f"\n{banner}")
    print(f" Downloading tokenizer: {model_name}")
    print(f" Save to: {save_dir}")
    print(f"{banner}\n")
    # trust_remote_code is deliberately not enabled: the Qwen2.5 tokenizer is
    # implemented in transformers itself, so there is no reason to allow code
    # from the Hub repository to be executed locally.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

    # Save locally
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f" ✓ Tokenizer saved to {save_dir}")
    print(f" ✓ Vocab size: {tokenizer.vocab_size:,}")
    print(f" ✓ Model max length: {tokenizer.model_max_length:,}")

    # ── List saved files ──
    print("\n Saved files:")
    for path in sorted(save_dir.iterdir()):
        size_kb = path.stat().st_size / 1024
        print(f" {path.name} ({size_kb:.1f} KB)")

    # ── Run tokenizer tests ──
    test_strings = [
        "Build me a Next.js dashboard",
        "import React from 'react'",
        "className='flex items-center gap-4'",
        "'use client'",
        "const [state, setState] = useState(null)",
        "export default function Page() {",
        "npm install framer-motion",
        "async function getData() {",
    ]
    # Derived from the list so the messages stay correct if cases are added/removed.
    total = len(test_strings)

    print(f"\n{banner}")
    print(f" Tokenizer Tests — {total} Code Strings")
    print(f"{banner}")

    all_pass = True
    for i, text in enumerate(test_strings, 1):
        ids = tokenizer.encode(text, add_special_tokens=False)
        # Disable decode-time cleanup explicitly: when enabled it rewrites
        # sequences such as " 're" or " ." and would make the exact-match
        # comparison fail for reasons unrelated to the tokenizer's encoding.
        decoded = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
        match = decoded == text
        all_pass = all_pass and match

        print(f"\n Test {i}: \"{text}\"")
        print(f" Token count: {len(ids)}")
        print(f" Token IDs: {ids}")
        print(f" Decoded: \"{decoded}\"")
        print(f" Match: {'✓ PERFECT' if match else '✗ MISMATCH'}")

    print(f"\n{banner}")
    if all_pass:
        print(f" ✓ ALL {total} TESTS PASSED — Perfect reconstruction!")
    else:
        print(" ⚠️ Some tests had reconstruction differences (whitespace normalization is normal)")
    print(f"{banner}\n")
# Run the download-and-test routine only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()