File size: 2,803 Bytes
11e0d89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
MINDI 1.5 Vision-Coder β€” Step 3: Download Tokenizer & Test

Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct,
saves it locally, and runs encoding/decoding tests on 8 code strings.
"""

import os
import sys
from pathlib import Path

# Ensure the project root is importable no matter which directory the
# script is launched from: this file lives one level below the repo root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))

# Load environment variables (e.g. HUGGINGFACE_TOKEN) from the repo's .env
# before main() reads os.environ.
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")


def _download_tokenizer(model_name: str, save_dir: Path, hf_token: str | None):
    """Download ONLY the tokenizer for *model_name*, save it under *save_dir*,
    and return the loaded tokenizer instance."""
    from transformers import AutoTokenizer

    print(f"\n{'='*60}")
    print(f"  Downloading tokenizer: {model_name}")
    print(f"  Save to: {save_dir}")
    print(f"{'='*60}\n")

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token,
        trust_remote_code=True,
    )

    # Persist locally so later steps can load offline.
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f"  βœ… Tokenizer saved to {save_dir}")
    # NOTE: vocab_size excludes added special tokens; full size is len(tokenizer).
    print(f"  βœ… Vocab size: {tokenizer.vocab_size:,}")
    print(f"  βœ… Model max length: {tokenizer.model_max_length:,}")
    return tokenizer


def _list_saved_files(save_dir: Path) -> None:
    """Print every file written by save_pretrained, with its size in KB."""
    print("\n  Saved files:")
    for f in sorted(save_dir.iterdir()):
        size_kb = f.stat().st_size / 1024
        print(f"    {f.name} ({size_kb:.1f} KB)")


def _run_roundtrip_tests(tokenizer) -> bool:
    """Encode/decode a set of representative code strings and report whether
    every one reconstructs exactly. Returns True when all strings round-trip."""
    test_strings = [
        "Build me a Next.js dashboard",
        "import React from 'react'",
        "className='flex items-center gap-4'",
        "'use client'",
        "const [state, setState] = useState(null)",
        "export default function Page() {",
        "npm install framer-motion",
        "async function getData() {",
    ]

    print(f"\n{'='*60}")
    # Count derived from the list itself so adding a string updates the banner.
    print(f"  Tokenizer Tests β€” {len(test_strings)} Code Strings")
    print(f"{'='*60}")

    all_pass = True
    for i, text in enumerate(test_strings, 1):
        # add_special_tokens=False so BOS/EOS markers don't break reconstruction.
        ids = tokenizer.encode(text, add_special_tokens=False)
        decoded = tokenizer.decode(ids)
        match = decoded == text
        if not match:
            all_pass = False

        print(f"\n  Test {i}: \"{text}\"")
        print(f"    Token count: {len(ids)}")
        print(f"    Token IDs:   {ids}")
        print(f"    Decoded:     \"{decoded}\"")
        print(f"    Match:       {'βœ… PERFECT' if match else '❌ MISMATCH'}")

    print(f"\n{'='*60}")
    if all_pass:
        print(f"  βœ… ALL {len(test_strings)} TESTS PASSED β€” Perfect reconstruction!")
    else:
        print("  ⚠️  Some tests had reconstruction differences (whitespace normalization is normal)")
    print(f"{'='*60}\n")
    return all_pass


def main():
    """Download the Qwen2.5-Coder tokenizer, save it locally, and verify it
    round-trips a set of code strings through encode/decode."""
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    # Empty string (var set but blank) must become None so transformers falls
    # back to anonymous access rather than sending an empty token.
    hf_token = os.environ.get("HUGGINGFACE_TOKEN") or None

    tokenizer = _download_tokenizer(model_name, save_dir, hf_token)
    _list_saved_files(save_dir)
    _run_roundtrip_tests(tokenizer)


# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()