# MLOps-Platforms / src/mlops/system_check.py
# (Hugging Face page header removed: uploaded by songhieng,
#  commit 9aa4daf "Update src/mlops/system_check.py", verified)
"""
System Prerequisites Checker Module
This module provides functionality to check system prerequisites including:
- CUDA/GPU availability
- Environment dependencies
- Model download with progress tracking
"""
import importlib.metadata
import os
import platform
import sys
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Tuple

import torch
from huggingface_hub import hf_hub_download, snapshot_download
from tqdm import tqdm
class SystemChecker:
    """Check system prerequisites for the MLOps platform.

    Covers CUDA/GPU detection, Python environment / dependency version
    checks, and downloading or inspecting HuggingFace models kept in a
    local cache directory.
    """

    def __init__(self, models_dir: str = "models"):
        """
        Initialize system checker.

        Args:
            models_dir: Directory to store downloaded models
                (created, with parents, if it does not exist).
        """
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def check_cuda(self) -> Dict[str, Any]:
        """
        Check CUDA/GPU availability and information.

        Returns:
            Dict with keys: available (bool), device_count (int),
            devices (list of per-GPU dicts with id/name/memory_total/
            compute_capability), cuda_version, cudnn_version.
        """
        result: Dict[str, Any] = {
            "available": torch.cuda.is_available(),
            "device_count": 0,
            "devices": [],
            "cuda_version": None,
            "cudnn_version": None,
        }
        if result["available"]:
            result["device_count"] = torch.cuda.device_count()
            result["cuda_version"] = torch.version.cuda
            result["cudnn_version"] = torch.backends.cudnn.version()
            for i in range(result["device_count"]):
                # Fetch the properties struct once per device instead of
                # querying it three times.
                props = torch.cuda.get_device_properties(i)
                result["devices"].append({
                    "id": i,
                    "name": torch.cuda.get_device_name(i),
                    "memory_total": props.total_memory / 1024**3,  # GB
                    "compute_capability": f"{props.major}.{props.minor}",
                })
        return result

    @staticmethod
    def _version_tuple(version: str) -> Tuple[int, ...]:
        """
        Best-effort parse of a dotted version string into a tuple of ints.

        Each component keeps only its leading digits ("0+cu118" -> 0,
        "0rc1" -> 0); parsing stops at the first component with no leading
        digits. Returns an empty tuple if nothing parses.
        """
        parts = []
        for component in version.split("."):
            digits = ""
            for ch in component:
                if not ch.isdigit():
                    break
                digits += ch
            if not digits:
                break
            parts.append(int(digits))
        return tuple(parts)

    def check_environment(self) -> Dict[str, Any]:
        """
        Check Python environment and required dependencies.

        Returns:
            Dict with Python version, platform info, per-package status
            (installed version, required spec, satisfied flag), a list of
            missing packages, and an overall all_satisfied flag that is
            False when any package is missing or below its minimum version.
        """
        result: Dict[str, Any] = {
            "python_version": sys.version,
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "packages": {},
            "missing_packages": [],
            "all_satisfied": True,
        }

        # Required packages with minimum versions.
        required_packages = {
            "torch": "2.0.0",
            "transformers": "4.36.0",
            "streamlit": "1.28.0",
            "pandas": "2.0.0",
            "numpy": "1.24.0",
            "plotly": "5.18.0",
            "scikit-learn": "1.3.0",
        }

        for package, min_version in required_packages.items():
            try:
                version = importlib.metadata.version(package)
            except importlib.metadata.PackageNotFoundError:
                result["packages"][package] = {
                    "installed": None,
                    "required": f">={min_version}",
                    "satisfied": False,
                }
                result["missing_packages"].append(package)
                result["all_satisfied"] = False
                continue

            # Compare numeric version tuples, zero-padded to equal length so
            # that e.g. "2.0" >= "2.0.0" holds. An unparseable installed
            # version gets the benefit of the doubt.
            installed_t = self._version_tuple(version)
            required_t = self._version_tuple(min_version)
            if installed_t:
                width = max(len(installed_t), len(required_t))
                satisfied = (
                    installed_t + (0,) * (width - len(installed_t))
                    >= required_t + (0,) * (width - len(required_t))
                )
            else:
                satisfied = True
            result["packages"][package] = {
                "installed": version,
                "required": f">={min_version}",
                "satisfied": satisfied,
            }
            if not satisfied:
                result["all_satisfied"] = False

        return result

    def download_model(
        self,
        model_name: str,
        progress_callback: Optional[Callable[[str, float], None]] = None,
    ) -> Tuple[bool, str, str]:
        """
        Download model from HuggingFace Hub to local cache.

        Args:
            model_name: HuggingFace model identifier (e.g., "roberta-base")
            progress_callback: Optional callable(message, fraction) invoked
                with coarse progress updates (0.1 at start, 1.0 on success,
                0.0 on failure).

        Returns:
            Tuple of (success: bool, model_path: str, message: str); on
            failure the path is "" and the message carries the error text.
        """
        try:
            model_cache_path = self.models_dir / model_name.replace("/", "_")

            # Reuse the cached copy when the directory exists and is
            # non-empty. is_dir() (rather than exists()) guards against a
            # stray file at the cache path, which would make iterdir() raise.
            if model_cache_path.is_dir() and any(model_cache_path.iterdir()):
                return True, str(model_cache_path), f"Model '{model_name}' already exists in cache"

            if progress_callback:
                progress_callback(f"Downloading {model_name}...", 0.1)

            # snapshot_download fetches all model files. Passing only
            # local_dir materializes real files there; the deprecated
            # local_dir_use_symlinks flag and the redundant cache_dir (whose
            # return value was previously bound to an unused local) are gone.
            snapshot_download(
                repo_id=model_name,
                local_dir=str(model_cache_path),
            )

            if progress_callback:
                progress_callback(f"Downloaded {model_name} successfully", 1.0)
            return True, str(model_cache_path), f"Model '{model_name}' downloaded successfully"
        except Exception as e:
            error_msg = f"Failed to download model '{model_name}': {str(e)}"
            if progress_callback:
                progress_callback(error_msg, 0.0)
            return False, "", error_msg

    def get_model_info(self, model_name: str) -> Dict[str, Any]:
        """
        Get information about a model (local or remote).

        Args:
            model_name: Model identifier

        Returns:
            Dict with the model name, its expected local path, whether it
            exists locally (non-empty directory), and its on-disk size in MB
            (0 when not present).
        """
        model_cache_path = self.models_dir / model_name.replace("/", "_")
        info: Dict[str, Any] = {
            "name": model_name,
            "local_path": str(model_cache_path),
            # is_dir() avoids NotADirectoryError if a plain file sits here.
            "exists_locally": model_cache_path.is_dir() and any(model_cache_path.iterdir()),
            "size_mb": 0,
        }
        if info["exists_locally"]:
            # Sum every regular file under the model directory.
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob('*')
                if f.is_file()
            )
            info["size_mb"] = total_size / (1024 * 1024)
        return info
def format_bytes(bytes_size: float) -> str:
    """Format bytes to a human-readable string (B, KB, MB, GB, or TB)."""
    units = ("B", "KB", "MB", "GB", "TB")
    value = bytes_size
    idx = 0
    # Divide down by 1024 until the value fits the current unit or we
    # reach TB, which is the largest unit reported.
    while value >= 1024.0 and idx < len(units) - 1:
        value /= 1024.0
        idx += 1
    return f"{value:.2f} {units[idx]}"
def get_system_summary() -> str:
    """Get a formatted summary of system capabilities."""
    checker = SystemChecker()
    gpu_info = checker.check_cuda()
    env = checker.check_environment()

    bar = "=" * 60
    lines = [bar, "SYSTEM SUMMARY", bar]

    # Interpreter and host platform.
    lines += [
        f"\nPython: {env['python_version'].split()[0]}",
        f"Platform: {env['platform']}",
        f"Architecture: {env['architecture']}",
    ]

    # GPU availability and per-device details.
    lines.append(f"\nCUDA Available: {'Yes' if gpu_info['available'] else 'No (CPU only)'}")
    if gpu_info["available"]:
        lines.append(f"CUDA Version: {gpu_info['cuda_version']}")
        lines.append(f"Number of GPUs: {gpu_info['device_count']}")
        for dev in gpu_info["devices"]:
            lines += [
                f" - GPU {dev['id']}: {dev['name']}",
                f" Memory: {dev['memory_total']:.2f} GB",
                f" Compute: {dev['compute_capability']}",
            ]

    # Dependency status.
    lines.append(f"\n📦 Required Packages: {'✅ All Satisfied' if env['all_satisfied'] else '⚠️ Missing Packages'}")
    if env["missing_packages"]:
        lines.append(f" Missing: {', '.join(env['missing_packages'])}")

    lines.append(bar)
    return "\n".join(lines)
if __name__ == "__main__":
    # Manual smoke test: print the system summary, then exercise a download.
    print(get_system_summary())

    checker = SystemChecker()

    # Use a compact model so the download finishes quickly.
    print("\n\nTesting model download...")

    def _report(msg: str, progress: float) -> None:
        # Echo coarse download progress to stdout.
        print(f"Progress: {progress*100:.0f}% - {msg}")

    ok, _model_path, message = checker.download_model(
        "distilbert-base-uncased",
        progress_callback=_report,
    )
    print(f"Result: {message}")