# Source: rag-latency-optimization/scripts/download_advanced_models.py
# Author: Ariyan-Pro — "Deploy RAG Latency Optimization v1.0" (commit 04ab625)
#!/usr/bin/env python3
"""
Download cutting-edge CPU-optimized models for production.
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import requests
from huggingface_hub import snapshot_download, HfApi
# All models are downloaded into ./models/<model_name>/.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)

# CPU-optimized models (small, fast, quantized).
# Entry schema:
#   repo_id        -- Hugging Face repository to pull from
#   filename       -- preferred GGUF file (GGUF entries only)
#   files          -- file-name substrings to fetch (ONNX entries only)
#   type           -- "onnx" switches the download strategy; absent => GGUF
#   size_gb / tokens_per_sec / description -- informational only
MODELS_TO_DOWNLOAD = {
    # Ultra-fast CPU models
    "phi-2-gguf": {
        "repo_id": "microsoft/phi-2",
        "filename": "phi-2.Q4_K_M.gguf",  # 4-bit quantization
        "size_gb": 1.6,
        "tokens_per_sec": "~30-50",
        "description": "Microsoft Phi-2 GGUF (4-bit)"
    },
    "tinyllama-gguf": {
        "repo_id": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        "filename": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "size_gb": 0.8,
        "tokens_per_sec": "~50-80",
        "description": "TinyLlama 1.1B GGUF (4-bit)"
    },
    "qwen2-0.5b-gguf": {
        "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
        "filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
        "size_gb": 0.3,
        "tokens_per_sec": "~100-150",
        "description": "Qwen 2.5 0.5B GGUF (4-bit)"
    },
    # ONNX Runtime optimized models
    "bert-tiny-onnx": {
        "repo_id": "microsoft/bert-tiny",
        "files": ["model.onnx", "vocab.txt"],
        "type": "onnx",
        "description": "BERT-Tiny ONNX for ultra-fast embeddings"
    }
}
def _fetch(repo_id, file, target_dir, show_progress=False):
    """Stream one file from a Hugging Face repo into *target_dir*.

    Returns the local Path of the downloaded file. Raises
    requests.HTTPError (via raise_for_status) on a failed response.
    """
    url = f"https://huggingface.co/{repo_id}/resolve/main/{file}"
    # Timeout prevents the script hanging forever on a stalled connection.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    filepath = target_dir / file
    # The repo path may contain subfolders (e.g. "onnx/model.onnx").
    filepath.parent.mkdir(parents=True, exist_ok=True)
    total_size = int(response.headers.get('content-length', 0))
    downloaded = 0
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if not chunk:  # skip keep-alive chunks
                continue
            f.write(chunk)
            downloaded += len(chunk)
            if show_progress and total_size > 0:
                percent = (downloaded / total_size) * 100
                print(f" Progress: {percent:.1f}%", end='\r')
    return filepath


def download_model(model_name, model_info):
    """Download a specific model into MODELS_DIR/<model_name>.

    model_info follows the MODELS_TO_DOWNLOAD schema: "type" == "onnx"
    downloads every repo file matching a substring in "files"; otherwise
    a single GGUF file is downloaded ("filename" preferred, falling back
    to the first .gguf listed in the repo).

    Errors are reported but never raised, so one failed model does not
    abort the rest of the batch.
    """
    print(f"\n📥 Downloading {model_name}...")
    print(f" Description: {model_info['description']}")
    target_dir = MODELS_DIR / model_name
    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        api = HfApi()
        files = api.list_repo_files(model_info["repo_id"])
        if model_info.get("type") == "onnx":
            # Substring match so e.g. "model.onnx" also catches files
            # nested in subfolders of the repo.
            wanted = model_info.get("files", [])
            for file in files:
                if any(w in file for w in wanted):
                    print(f" Downloading {file}...")
                    filepath = _fetch(model_info["repo_id"], file, target_dir)
                    print(f" ✓ Downloaded {file} ({filepath.stat().st_size / 1024 / 1024:.1f}MB)")
        else:
            print(f" Looking for {model_info['filename']}...")
            gguf_files = [f for f in files if f.endswith('.gguf')]
            if not gguf_files:
                print(f" ⚠ No GGUF files found in repo")
                return
            # Prefer the configured quantization; otherwise fall back to
            # the first listed GGUF (NOTE: list order is NOT by size).
            target_file = model_info.get('filename')
            if target_file and target_file in gguf_files:
                file_to_download = target_file
            else:
                file_to_download = gguf_files[0]
            print(f" Found: {file_to_download}")
            filepath = _fetch(model_info["repo_id"], file_to_download,
                              target_dir, show_progress=True)
            print(f"\n ✓ Downloaded {file_to_download} ({filepath.stat().st_size / 1024 / 1024:.1f}MB)")
    except Exception as e:
        # Best-effort batch download: report and continue with next model.
        print(f" ❌ Error downloading {model_name}: {e}")
def main():
    """Download the essential model set and write a JSON registry of
    every model directory present under MODELS_DIR."""
    print("=" * 60)
    print("🚀 DOWNLOADING CUTTING-EDGE CPU-OPTIMIZED MODELS")
    print("=" * 60)

    # Download selected models — start with the essentials only.
    models_to_get = ["qwen2-0.5b-gguf", "bert-tiny-onnx"]
    for model_name in models_to_get:
        if model_name in MODELS_TO_DOWNLOAD:
            download_model(model_name, MODELS_TO_DOWNLOAD[model_name])

    # Build a registry describing every model directory actually on disk
    # (not just the ones fetched in this run).
    registry = {
        "models": {},
        # Record the real run date instead of a hard-coded constant.
        "download_timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "total_size_gb": 0
    }
    for model_dir in MODELS_DIR.iterdir():
        if model_dir.is_dir():
            total_size = sum(f.stat().st_size for f in model_dir.rglob('*') if f.is_file())
            registry["models"][model_dir.name] = {
                "path": str(model_dir.relative_to(MODELS_DIR)),
                "size_mb": total_size / 1024 / 1024,
                # Only top-level files; nested files still count in size_mb.
                "files": [f.name for f in model_dir.iterdir() if f.is_file()]
            }
            registry["total_size_gb"] += total_size / 1024 / 1024 / 1024

    # Save registry
    registry_file = MODELS_DIR / "model_registry.json"
    with open(registry_file, 'w') as f:
        json.dump(registry, f, indent=2)

    print(f"\n📋 Model registry saved to: {registry_file}")
    print(f"📦 Total models size: {registry['total_size_gb']:.2f} GB")
    print("\n✅ Model download complete!")
    print("\nNext steps:")
    print("1. Update config.py to use downloaded models")
    print("2. Run: python -c \"from app.llm_integration import CPUOptimizedLLM; llm = CPUOptimizedLLM(); llm.initialize()\"")
    print("3. Test with: python test_real_llm.py")
# Script entry point: run the download batch when executed directly.
if __name__ == "__main__":
    main()