# Source: rag-latency-optimization/scripts/download_advanced_models.py
# Author: Ariyan-Pro — "Deploy RAG Latency Optimization v1.0" (commit 04ab625)
#!/usr/bin/env python3
"""
Download cutting-edge CPU-optimized models for production.
"""
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import requests
from huggingface_hub import snapshot_download, HfApi
# All models are downloaded into ./models/<model_name>/.
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)

# CPU-optimized models (small, fast, quantized).
# Entry schema:
#   repo_id        -- Hugging Face repository to pull from
#   filename       -- preferred GGUF file (GGUF entries only)
#   files          -- file-name substrings to fetch (ONNX entries only)
#   type           -- "onnx" switches the download strategy; absent => GGUF
#   size_gb / tokens_per_sec / description -- informational only
MODELS_TO_DOWNLOAD = {
    # Ultra-fast CPU models
    "phi-2-gguf": {
        "repo_id": "microsoft/phi-2",
        "filename": "phi-2.Q4_K_M.gguf",  # 4-bit quantization
        "size_gb": 1.6,
        "tokens_per_sec": "~30-50",
        "description": "Microsoft Phi-2 GGUF (4-bit)"
    },
    "tinyllama-gguf": {
        "repo_id": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        "filename": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "size_gb": 0.8,
        "tokens_per_sec": "~50-80",
        "description": "TinyLlama 1.1B GGUF (4-bit)"
    },
    "qwen2-0.5b-gguf": {
        "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
        "filename": "qwen2.5-0.5b-instruct-q4_0.gguf",
        "size_gb": 0.3,
        "tokens_per_sec": "~100-150",
        "description": "Qwen 2.5 0.5B GGUF (4-bit)"
    },
    # ONNX Runtime optimized models
    "bert-tiny-onnx": {
        "repo_id": "microsoft/bert-tiny",
        "files": ["model.onnx", "vocab.txt"],
        "type": "onnx",
        "description": "BERT-Tiny ONNX for ultra-fast embeddings"
    }
}
def _fetch(repo_id, file, target_dir, show_progress=False):
    """Stream one file from a Hugging Face repo into *target_dir*.

    Returns the local Path of the downloaded file. Raises
    requests.HTTPError (via raise_for_status) on a failed response.
    """
    url = f"https://huggingface.co/{repo_id}/resolve/main/{file}"
    # Timeout prevents the script hanging forever on a stalled connection.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    filepath = target_dir / file
    # The repo path may contain subfolders (e.g. "onnx/model.onnx").
    filepath.parent.mkdir(parents=True, exist_ok=True)
    total_size = int(response.headers.get('content-length', 0))
    downloaded = 0
    with open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if not chunk:  # skip keep-alive chunks
                continue
            f.write(chunk)
            downloaded += len(chunk)
            if show_progress and total_size > 0:
                percent = (downloaded / total_size) * 100
                print(f" Progress: {percent:.1f}%", end='\r')
    return filepath


def download_model(model_name, model_info):
    """Download a specific model into MODELS_DIR/<model_name>.

    model_info follows the MODELS_TO_DOWNLOAD schema: "type" == "onnx"
    downloads every repo file matching a substring in "files"; otherwise
    a single GGUF file is downloaded ("filename" preferred, falling back
    to the first .gguf listed in the repo).

    Errors are reported but never raised, so one failed model does not
    abort the rest of the batch.
    """
    print(f"\n📥 Downloading {model_name}...")
    print(f" Description: {model_info['description']}")
    target_dir = MODELS_DIR / model_name
    target_dir.mkdir(parents=True, exist_ok=True)
    try:
        api = HfApi()
        files = api.list_repo_files(model_info["repo_id"])
        if model_info.get("type") == "onnx":
            # Substring match so e.g. "model.onnx" also catches files
            # nested in subfolders of the repo.
            wanted = model_info.get("files", [])
            for file in files:
                if any(w in file for w in wanted):
                    print(f" Downloading {file}...")
                    filepath = _fetch(model_info["repo_id"], file, target_dir)
                    print(f" ✓ Downloaded {file} ({filepath.stat().st_size / 1024 / 1024:.1f}MB)")
        else:
            print(f" Looking for {model_info['filename']}...")
            gguf_files = [f for f in files if f.endswith('.gguf')]
            if not gguf_files:
                print(f" ⚠ No GGUF files found in repo")
                return
            # Prefer the configured quantization; otherwise fall back to
            # the first listed GGUF (NOTE: list order is NOT by size).
            target_file = model_info.get('filename')
            if target_file and target_file in gguf_files:
                file_to_download = target_file
            else:
                file_to_download = gguf_files[0]
            print(f" Found: {file_to_download}")
            filepath = _fetch(model_info["repo_id"], file_to_download,
                              target_dir, show_progress=True)
            print(f"\n ✓ Downloaded {file_to_download} ({filepath.stat().st_size / 1024 / 1024:.1f}MB)")
    except Exception as e:
        # Best-effort batch download: report and continue with next model.
        print(f" ❌ Error downloading {model_name}: {e}")
def main():
    """Download the essential model set and write a JSON registry of
    every model directory present under MODELS_DIR."""
    print("=" * 60)
    print("🚀 DOWNLOADING CUTTING-EDGE CPU-OPTIMIZED MODELS")
    print("=" * 60)

    # Download selected models — start with the essentials only.
    models_to_get = ["qwen2-0.5b-gguf", "bert-tiny-onnx"]
    for model_name in models_to_get:
        if model_name in MODELS_TO_DOWNLOAD:
            download_model(model_name, MODELS_TO_DOWNLOAD[model_name])

    # Build a registry describing every model directory actually on disk
    # (not just the ones fetched in this run).
    registry = {
        "models": {},
        # Record the real run date instead of a hard-coded constant.
        "download_timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
        "total_size_gb": 0
    }
    for model_dir in MODELS_DIR.iterdir():
        if model_dir.is_dir():
            total_size = sum(f.stat().st_size for f in model_dir.rglob('*') if f.is_file())
            registry["models"][model_dir.name] = {
                "path": str(model_dir.relative_to(MODELS_DIR)),
                "size_mb": total_size / 1024 / 1024,
                # Only top-level files; nested files still count in size_mb.
                "files": [f.name for f in model_dir.iterdir() if f.is_file()]
            }
            registry["total_size_gb"] += total_size / 1024 / 1024 / 1024

    # Save registry
    registry_file = MODELS_DIR / "model_registry.json"
    with open(registry_file, 'w') as f:
        json.dump(registry, f, indent=2)

    print(f"\n📋 Model registry saved to: {registry_file}")
    print(f"📦 Total models size: {registry['total_size_gb']:.2f} GB")
    print("\n✅ Model download complete!")
    print("\nNext steps:")
    print("1. Update config.py to use downloaded models")
    print("2. Run: python -c \"from app.llm_integration import CPUOptimizedLLM; llm = CPUOptimizedLLM(); llm.initialize()\"")
    print("3. Test with: python test_real_llm.py")
# Script entry point: run the download batch when executed directly.
if __name__ == "__main__":
    main()