"""
MINDI 1.5 Vision-Coder - Step 3: Download Tokenizer & Test
Downloads ONLY the tokenizer (not model weights) from Qwen/Qwen2.5-Coder-7B-Instruct,
saves it locally, and runs encoding/decoding tests on 8 code strings.
"""
import os
import sys
from pathlib import Path
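
# Note: the first run needs network access to huggingface.co; afterwards the
# files saved under data/tokenizer/base_tokenizer can be loaded fully offline.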
# Ensure the project root is on sys.path for project-local imports
PROJECT_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))
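
# Pull HUGGINGFACE_TOKEN (optional) from .env; public repos work without it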
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")


def main():
    from transformers import AutoTokenizer

    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    save_dir = PROJECT_ROOT / "data" / "tokenizer" / "base_tokenizer"
    hf_token = os.environ.get("HUGGINGFACE_TOKEN", "")
    # ── Download tokenizer ──
    print(f"\n{'='*60}")
    print(f" Downloading tokenizer: {model_name}")
    print(f" Save to: {save_dir}")
    print(f"{'='*60}\n")
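
    # token=None lets huggingface_hub fall back to any cached CLI login; a
    # token is only strictly required for gated repos. trust_remote_code is
    # kept as a precaution, though recent transformers releases ship the
    # Qwen2 tokenizer natively.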
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=hf_token if hf_token else None,
        trust_remote_code=True,
    )
    # Save locally
    save_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(str(save_dir))
    print(f" ✅ Tokenizer saved to {save_dir}")
    print(f" ✅ Vocab size: {tokenizer.vocab_size:,}")
    print(f" ✅ Model max length: {tokenizer.model_max_length:,}")
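    # Note: vocab_size counts only the base BPE vocabulary; len(tokenizer)
    # would also include any added special tokens.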
    # ── List saved files ──
    print(f"\n Saved files:")
    for f in sorted(save_dir.iterdir()):
        size_kb = f.stat().st_size / 1024
        print(f" {f.name} ({size_kb:.1f} KB)")
    # ── Run tokenizer tests ──
    test_strings = [
        "Build me a Next.js dashboard",
        "import React from 'react'",
        "className='flex items-center gap-4'",
        "'use client'",
        "const [state, setState] = useState(null)",
        "export default function Page() {",
        "npm install framer-motion",
        "async function getData() {",
    ]
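    # The strings above sample the web-stack syntax (JSX imports, Tailwind
    # class strings, npm commands) the tests expect to round-trip exactly.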
    print(f"\n{'='*60}")
    print(f" Tokenizer Tests - 8 Code Strings")
    print(f"{'='*60}")
    all_pass = True
    for i, text in enumerate(test_strings, 1):
        ids = tokenizer.encode(text, add_special_tokens=False)
        decoded = tokenizer.decode(ids)
        match = decoded == text
        if not match:
            all_pass = False
        print(f"\n Test {i}: \"{text}\"")
        print(f" Token count: {len(ids)}")
        print(f" Token IDs: {ids}")
        print(f" Decoded: \"{decoded}\"")
        print(f" Match: {'✅ PERFECT' if match else '❌ MISMATCH'}")
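
    # Qwen's tokenizer is byte-level BPE, so decode(encode(x)) is normally
    # byte-exact; a mismatch here usually traces to decode-time cleanup
    # (e.g. clean_up_tokenization_spaces) rather than lost information.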
    print(f"\n{'='*60}")
    if all_pass:
        print(f" ✅ ALL 8 TESTS PASSED - Perfect reconstruction!")
    else:
        print(f" ⚠️ Some tests had reconstruction differences (whitespace normalization is normal)")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    main()
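
# A minimal sketch (not part of this step) of how a later step might reload
# the saved tokenizer offline; the relative path assumes the current working
# directory is the project root:
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("data/tokenizer/base_tokenizer")
#   ids = tok.encode("'use client'", add_special_tokens=False)
#   assert tok.decode(ids) == "'use client'"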