MINDI-1.5-Vision-Coder / scripts /download_tokenizer.py
Faaz
Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test
11e0d89
"""
MINDI 1.5 Vision-Coder — Step 3: Download Tokenizer & Test
Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct,
saves it locally, and runs encoding/decoding tests on 8 code strings.
"""
import os
import sys
from pathlib import Path
# Resolve the project root as the parent of the directory containing this
# script, and put it at the front of sys.path so it is searched first on import.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))
from dotenv import load_dotenv
# Load the project's .env file before main() runs; main() reads
# HUGGINGFACE_TOKEN from os.environ.
load_dotenv(PROJECT_ROOT / ".env")
def main():
    """Download the base tokenizer, save it locally, and round-trip test it.

    Steps:
      1. Fetch the tokenizer for ``Qwen/Qwen2.5-Coder-7B-Instruct``, passing
         the ``HUGGINGFACE_TOKEN`` environment variable when it is set.
      2. Save it under ``PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"``.
      3. Print every saved file with its size in KB.
      4. Encode and decode a fixed list of code strings and report, for each
         one, whether the decoded text equals the input.

    All results are printed to stdout. A reconstruction mismatch is only
    reported; it does not raise or change the process exit status.
    """
    # Kept function-local: transformers is only needed when the script runs.
    from transformers import AutoTokenizer

    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    # An unset or empty token is passed as None (anonymous access).
    hf_token = os.environ.get("HUGGINGFACE_TOKEN") or None
    banner = "=" * 60

    # ── Download tokenizer ──
    print(f"\n{banner}")
    print(f" Downloading tokenizer: {model_name}")
    print(f" Save to: {save_dir}")
    print(f"{banner}\n")
    # NOTE(review): trust_remote_code=True lets code shipped in the Hub repo
    # execute locally. Kept to preserve existing behavior; confirm whether
    # this tokenizer actually needs it and drop the flag if not.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
        trust_remote_code=True,
    )

    # Save locally
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f" ✅ Tokenizer saved to {save_dir}")
    print(f" ✅ Vocab size: {tokenizer.vocab_size:,}")
    print(f" ✅ Model max length: {tokenizer.model_max_length:,}")

    # ── List saved files ──
    print("\n Saved files:")
    for entry in sorted(save_dir.iterdir()):
        size_kb = entry.stat().st_size / 1024
        print(f" {entry.name} ({size_kb:.1f} KB)")

    # ── Run tokenizer tests ──
    test_strings = [
        "Build me a Next.js dashboard",
        "import React from 'react'",
        "className='flex items-center gap-4'",
        "'use client'",
        "const [state, setState] = useState(null)",
        "export default function Page() {",
        "npm install framer-motion",
        "async function getData() {",
    ]
    # Derive the count from the list so the messages stay correct if it changes.
    total = len(test_strings)

    print(f"\n{banner}")
    print(f" Tokenizer Tests — {total} Code Strings")
    print(banner)

    all_pass = True
    for i, text in enumerate(test_strings, 1):
        # Encode without special tokens so decoding can reproduce the raw text.
        ids = tokenizer.encode(text, add_special_tokens=False)
        decoded = tokenizer.decode(ids)
        match = decoded == text
        all_pass = all_pass and match

        print(f'\n Test {i}: "{text}"')
        print(f" Token count: {len(ids)}")
        print(f" Token IDs: {ids}")
        print(f' Decoded: "{decoded}"')
        print(f" Match: {'✅ PERFECT' if match else '❌ MISMATCH'}")

    print(f"\n{banner}")
    if all_pass:
        print(f" ✅ ALL {total} TESTS PASSED — Perfect reconstruction!")
    else:
        print(" ⚠️ Some tests had reconstruction differences (whitespace normalization is normal)")
    print(f"{banner}\n")
# Script entry point: run the download and round-trip tests only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()