File size: 2,803 Bytes
11e0d89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
MINDI 1.5 Vision-Coder β€” Step 3: Download Tokenizer & Test

Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct,
saves it locally, and runs encoding/decoding tests on 8 code strings.
"""

import os
import sys
from pathlib import Path

# Ensure the project root is importable no matter which directory the
# script is launched from: this file lives one level below the repo root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))

# Load environment variables (e.g. HUGGINGFACE_TOKEN) from the repo's .env
# before main() reads os.environ.
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")


def _download_tokenizer(model_name: str, save_dir: Path, hf_token: str | None):
    """Download ONLY the tokenizer for *model_name*, save it under *save_dir*,
    and return the loaded tokenizer instance."""
    from transformers import AutoTokenizer

    print(f"\n{'='*60}")
    print(f"  Downloading tokenizer: {model_name}")
    print(f"  Save to: {save_dir}")
    print(f"{'='*60}\n")

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
        trust_remote_code=True,
    )

    # Persist locally so later steps can load offline.
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"  βœ… Tokenizer saved to {save_dir}")
    # NOTE: vocab_size excludes added special tokens; full size is len(tokenizer).
    print(f"  βœ… Vocab size: {tokenizer.vocab_size:,}")
    print(f"  βœ… Model max length: {tokenizer.model_max_length:,}")
    return tokenizer


def _list_saved_files(save_dir: Path) -> None:
    """Print every file written by save_pretrained, with its size in KB."""
    print("\n  Saved files:")
    for f in sorted(save_dir.iterdir()):
        size_kb = f.stat().st_size / 1024
        print(f"    {f.name} ({size_kb:.1f} KB)")


def _run_roundtrip_tests(tokenizer) -> bool:
    """Encode/decode a set of representative code strings and report whether
    every one reconstructs exactly. Returns True when all strings round-trip."""
    test_strings = [
        "Build me a Next.js dashboard",
        "import React from 'react'",
        "className='flex items-center gap-4'",
        "'use client'",
        "const [state, setState] = useState(null)",
        "export default function Page() {",
        "npm install framer-motion",
        "async function getData() {",
    ]

    print(f"\n{'='*60}")
    # Count derived from the list itself so adding a string updates the banner.
    print(f"  Tokenizer Tests β€” {len(test_strings)} Code Strings")
    print(f"{'='*60}")

    all_pass = True
    for i, text in enumerate(test_strings, 1):
        # add_special_tokens=False so BOS/EOS markers don't break reconstruction.
        ids = tokenizer.encode(text, add_special_tokens=False)
        decoded = tokenizer.decode(ids)
        match = decoded == text
        if not match:
            all_pass = False

        print(f"\n  Test {i}: \"{text}\"")
        print(f"    Token count: {len(ids)}")
        print(f"    Token IDs:   {ids}")
        print(f"    Decoded:     \"{decoded}\"")
        print(f"    Match:       {'βœ… PERFECT' if match else '❌ MISMATCH'}")

    print(f"\n{'='*60}")
    if all_pass:
        print(f"  βœ… ALL {len(test_strings)} TESTS PASSED β€” Perfect reconstruction!")
    else:
        print("  ⚠️  Some tests had reconstruction differences (whitespace normalization is normal)")
    print(f"{'='*60}\n")
    return all_pass


def main():
    """Download the Qwen2.5-Coder tokenizer, save it locally, and verify it
    round-trips a set of code strings through encode/decode."""
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    # Empty string (var set but blank) must become None so transformers falls
    # back to anonymous access rather than sending an empty token.
    hf_token = os.environ.get("HUGGINGFACE_TOKEN") or None

    tokenizer = _download_tokenizer(model_name, save_dir, hf_token)
    _list_saved_files(save_dir)
    _run_roundtrip_tests(tokenizer)


# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()