MINDI-1.5-Vision-Coder / scripts /download_tokenizer.py

Faaz

Day 1 Complete: Tokenizer setup — Qwen2.5-Coder-7B base + 22 MINDI special tokens (vocab 151,685), wrapper class, full format test

11e0d89 about 1 month ago

raw

history blame contribute delete

2.8 kB

	"""
	MINDI 1.5 Vision-Coder — Step 3: Download Tokenizer & Test

	Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct,
	saves it locally, and runs encoding/decoding tests on 8 code strings.
	"""

	import os
	import sys
	from pathlib import Path

	# Resolve the project root as parents[1] of this file's absolute path, i.e. the
	# directory one level above the folder that contains this script, and put it at
	# the front of sys.path.
	# NOTE(review): presumably this makes project-local packages importable when the
	# script is run directly; no such import is visible in this file — confirm.
	PROJECT_ROOT = Path(__file__).resolve().parents[1]
	sys.path.insert(0, str(PROJECT_ROOT))

	# Read the project's .env file at import time; main() later looks up
	# HUGGINGFACE_TOKEN in os.environ.
	from dotenv import load_dotenv
	load_dotenv(PROJECT_ROOT / ".env")


	def main() -> None:
	    """Download the base tokenizer, save it locally, and run round-trip tests.

	    Steps:
	      1. Load the tokenizer for ``Qwen/Qwen2.5-Coder-7B-Instruct`` via
	         ``AutoTokenizer.from_pretrained`` (no model class is instantiated).
	         The optional ``HUGGINGFACE_TOKEN`` environment variable is passed
	         as the auth token when it is set and non-empty.
	      2. Save it under ``PROJECT_ROOT/data/tokenizer/base_tokenizer`` and
	         print the vocab size, model max length, and the saved files.
	      3. Encode and decode each sample string and report whether the decoded
	         text equals the input exactly.

	    A mismatch is only reported, never raised: the function returns ``None``
	    in both cases. Errors from ``from_pretrained`` / ``save_pretrained``
	    (e.g. network or filesystem failures) are not caught here.
	    """
	    # Imported lazily so that importing this module does not pull in
	    # transformers.
	    from transformers import AutoTokenizer

	    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
	    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
	    hf_token = os.environ.get("HUGGINGFACE_TOKEN", "")
	    rule = "=" * 60

	    # ── Download tokenizer ──
	    print(f"\n{rule}")
	    print(f" Downloading tokenizer: {model_name}")
	    print(f" Save to: {save_dir}")
	    print(f"{rule}\n")

	    # trust_remote_code is intentionally NOT enabled: it would allow Python
	    # code shipped in the Hub repository to execute on this machine.
	    # NOTE(review): assumes the installed transformers supports the Qwen2
	    # tokenizer natively, so no remote code is needed — confirm.
	    tokenizer = AutoTokenizer.from_pretrained(
	        model_name,
	        token=hf_token or None,  # empty string means "no token"
	    )

	    # Save locally
	    save_dir.mkdir(parents=True, exist_ok=True)
	    tokenizer.save_pretrained(str(save_dir))
	    print(f" ✅ Tokenizer saved to {save_dir}")
	    # NOTE(review): vocab_size is believed to exclude added/special tokens
	    # (len(tokenizer) would include them) — confirm against the
	    # transformers documentation.
	    print(f" ✅ Vocab size: {tokenizer.vocab_size:,}")
	    print(f" ✅ Model max length: {tokenizer.model_max_length:,}")

	    # ── List saved files ──
	    print("\n Saved files:")
	    for entry in sorted(save_dir.iterdir()):
	        size_kb = entry.stat().st_size / 1024
	        print(f" {entry.name} ({size_kb:.1f} KB)")

	    # ── Run tokenizer tests ──
	    test_strings = [
	        "Build me a Next.js dashboard",
	        "import React from 'react'",
	        "className='flex items-center gap-4'",
	        "'use client'",
	        "const [state, setState] = useState(null)",
	        "export default function Page() {",
	        "npm install framer-motion",
	        "async function getData() {",
	    ]
	    # Derive the count from the list so the messages stay correct if the
	    # samples are edited.
	    total = len(test_strings)

	    print(f"\n{rule}")
	    print(f" Tokenizer Tests — {total} Code Strings")
	    print(rule)

	    results = []
	    for i, text in enumerate(test_strings, 1):
	        # add_special_tokens=False keeps the encoding to the text itself so
	        # the decoded string can be compared to the input verbatim.
	        ids = tokenizer.encode(text, add_special_tokens=False)
	        decoded = tokenizer.decode(ids)
	        match = decoded == text
	        results.append(match)

	        print(f'\n Test {i}: "{text}"')
	        print(f" Token count: {len(ids)}")
	        print(f" Token IDs: {ids}")
	        print(f' Decoded: "{decoded}"')
	        print(f" Match: {'✅ PERFECT' if match else '❌ MISMATCH'}")

	    print(f"\n{rule}")
	    if all(results):
	        print(f" ✅ ALL {total} TESTS PASSED — Perfect reconstruction!")
	    else:
	        print(" ⚠️ Some tests had reconstruction differences (whitespace normalization is normal)")
	    print(f"{rule}\n")


	# Run main() only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	main()