File size: 6,162 Bytes
04ab625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
"""
Download cutting-edge CPU-optimized models for production.
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import requests
from huggingface_hub import snapshot_download, HfApi

# All models land under ./models, relative to the current working directory.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)  # NOTE: module-level side effect — dir is created on import

# CPU-optimized models (small, fast, quantized).
# Each entry is either a GGUF download ("repo_id" + "filename") or an ONNX
# download ("repo_id" + "files" + "type": "onnx"); download_model() branches
# on the "type" key. "size_gb"/"tokens_per_sec" are informational only —
# nothing in this script reads them.
MODELS_TO_DOWNLOAD = {
    # Ultra-fast CPU models
    "phi-2-gguf": {
        # NOTE(review): microsoft/phi-2 is the original fp16 repo and likely
        # hosts no .gguf files — download_model() would then fall back to
        # "No GGUF files found". Verify, or point at a TheBloke GGUF mirror.
        "repo_id": "microsoft/phi-2",
        "filename": "phi-2.Q4_K_M.gguf",  # 4-bit quantization
        "size_gb": 1.6,
        "tokens_per_sec": "~30-50",
        "description": "Microsoft Phi-2 GGUF (4-bit)"
    },
    "tinyllama-gguf": {
        "repo_id": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        "filename": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "size_gb": 0.8,
        "tokens_per_sec": "~50-80",
        "description": "TinyLlama 1.1B GGUF (4-bit)"
    },
    "qwen2-0.5b-gguf": {
        "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
        "filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
        "size_gb": 0.3,
        "tokens_per_sec": "~100-150",
        "description": "Qwen 2.5 0.5B GGUF (4-bit)"
    },
    # ONNX Runtime optimized models
    "bert-tiny-onnx": {
        "repo_id": "microsoft/bert-tiny",
        "files": ["model.onnx", "vocab.txt"],  # substring-matched against repo file paths
        "type": "onnx",
        "description": "BERT-Tiny ONNX for ultra-fast embeddings"
    }
}

def download_model(model_name, model_info):
    """Download one model described by *model_info* into MODELS_DIR/<model_name>.

    Args:
        model_name: Key from MODELS_TO_DOWNLOAD; used as the target
            sub-directory name.
        model_info: Dict with "repo_id" plus either "filename" (GGUF repo)
            or "files" and "type": "onnx" (ONNX artifacts).

    Never raises: any error is reported and swallowed so main() can continue
    with the remaining models (deliberate best-effort behavior).
    """
    print(f"\n📥 Downloading {model_name}...")
    print(f"   Description: {model_info['description']}")

    target_dir = MODELS_DIR / model_name
    target_dir.mkdir(parents=True, exist_ok=True)

    try:
        if model_info.get("type") == "onnx":
            _download_onnx_files(model_info, target_dir)
        else:
            _download_gguf_file(model_info, target_dir)
    except Exception as e:
        # Best-effort: report and let the caller move on to the next model.
        print(f"   ❌ Error downloading {model_name}: {e}")


def _stream_to_file(url, filepath, show_progress=False):
    """Stream *url* to *filepath* in 8 KiB chunks, optionally printing progress."""
    # Repo files can live in sub-folders (e.g. "onnx/model.onnx"); the
    # original open() crashed when the parent directory didn't exist.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    # timeout= prevents hanging forever on a stalled connection; the context
    # manager guarantees the streamed connection is released.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        downloaded = 0
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
                downloaded += len(chunk)
                if show_progress and total_size > 0:
                    percent = (downloaded / total_size) * 100
                    print(f"   Progress: {percent:.1f}%", end='\r')


def _download_onnx_files(model_info, target_dir):
    """Download every repo file whose path contains one of model_info["files"]."""
    api = HfApi()
    files = api.list_repo_files(model_info["repo_id"])

    for file in files:
        # Substring match: "model.onnx" also matches e.g. "onnx/model.onnx".
        if any(f in file for f in model_info.get("files", [])):
            print(f"   Downloading {file}...")
            url = f"https://huggingface.co/{model_info['repo_id']}/resolve/main/{file}"
            filepath = target_dir / file
            _stream_to_file(url, filepath)
            print(f"   ✓ Downloaded {file} ({filepath.stat().st_size / 1024 / 1024:.1f}MB)")


def _download_gguf_file(model_info, target_dir):
    """Download the requested .gguf file, falling back to the first one listed."""
    print(f"   Looking for {model_info['filename']}...")

    api = HfApi()
    files = api.list_repo_files(model_info["repo_id"])

    gguf_files = [f for f in files if f.endswith('.gguf')]
    if not gguf_files:
        print(f"   ⚠ No GGUF files found in repo")
        return

    # Prefer the exact filename; otherwise take the first GGUF listed.
    # NOTE: listing order is arbitrary — this is NOT necessarily the smallest
    # file, despite what the original comment claimed.
    target_file = model_info.get('filename')
    file_to_download = target_file if target_file in gguf_files else gguf_files[0]

    print(f"   Found: {file_to_download}")
    url = f"https://huggingface.co/{model_info['repo_id']}/resolve/main/{file_to_download}"
    filepath = target_dir / file_to_download
    _stream_to_file(url, filepath, show_progress=True)
    print(f"\n   ✓ Downloaded {file_to_download} ({filepath.stat().st_size / 1024 / 1024:.1f}MB)")

def main():
    """Download the essential models, then write a JSON registry of every
    model directory found under MODELS_DIR (not just this run's downloads)."""
    print("=" * 60)
    print("🚀 DOWNLOADING CUTTING-EDGE CPU-OPTIMIZED MODELS")
    print("=" * 60)

    # Download selected models — start with the essentials only.
    models_to_get = ["qwen2-0.5b-gguf", "bert-tiny-onnx"]

    for model_name in models_to_get:
        if model_name in MODELS_TO_DOWNLOAD:
            download_model(model_name, MODELS_TO_DOWNLOAD[model_name])

    # Create model registry.
    registry = {
        "models": {},
        # Record the actual run date (UTC) instead of a hard-coded string.
        "download_timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "total_size_gb": 0,
    }

    for model_dir in MODELS_DIR.iterdir():
        if model_dir.is_dir():
            model_files = [f for f in model_dir.rglob('*') if f.is_file()]
            total_size = sum(f.stat().st_size for f in model_files)
            registry["models"][model_dir.name] = {
                "path": str(model_dir.relative_to(MODELS_DIR)),
                "size_mb": total_size / 1024 / 1024,
                # Recursive relative paths so files in sub-folders (e.g. ONNX
                # repos) appear; the original iterdir() listing missed them
                # even though size_mb counted them.
                "files": [str(f.relative_to(model_dir)) for f in model_files],
            }
            registry["total_size_gb"] += total_size / 1024 / 1024 / 1024

    # Save registry.
    registry_file = MODELS_DIR / "model_registry.json"
    with open(registry_file, 'w') as f:
        json.dump(registry, f, indent=2)

    print(f"\n📋 Model registry saved to: {registry_file}")
    print(f"📦 Total models size: {registry['total_size_gb']:.2f} GB")
    print("\n✅ Model download complete!")
    print("\nNext steps:")
    print("1. Update config.py to use downloaded models")
    print("2. Run: python -c \"from app.llm_integration import CPUOptimizedLLM; llm = CPUOptimizedLLM(); llm.initialize()\"")
    print("3. Test with: python test_real_llm.py")

# Run downloads only when executed as a script, not when imported.
if __name__ == "__main__":
    main()