# MLOps-Platforms / src/mlops/system_check.py
# (Hugging Face page header removed: uploaded by songhieng,
#  commit 9aa4daf "Update src/mlops/system_check.py", verified)
"""
System Prerequisites Checker Module
This module provides functionality to check system prerequisites including:
- CUDA/GPU availability
- Environment dependencies
- Model download with progress tracking
"""
import importlib.metadata
import os
import platform
import sys
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Tuple

import torch
from huggingface_hub import hf_hub_download, snapshot_download
from tqdm import tqdm
class SystemChecker:
    """Check system prerequisites for the MLOps platform.

    Covers CUDA/GPU detection, Python environment / dependency version
    checks, and downloading or inspecting HuggingFace models kept in a
    local cache directory.
    """

    def __init__(self, models_dir: str = "models"):
        """
        Initialize system checker.

        Args:
            models_dir: Directory to store downloaded models
                (created, with parents, if it does not exist).
        """
        self.models_dir = Path(models_dir)
        self.models_dir.mkdir(parents=True, exist_ok=True)

    def check_cuda(self) -> Dict[str, Any]:
        """
        Check CUDA/GPU availability and information.

        Returns:
            Dict with keys: available (bool), device_count (int),
            devices (list of per-GPU dicts with id/name/memory_total/
            compute_capability), cuda_version, cudnn_version.
        """
        result: Dict[str, Any] = {
            "available": torch.cuda.is_available(),
            "device_count": 0,
            "devices": [],
            "cuda_version": None,
            "cudnn_version": None,
        }
        if result["available"]:
            result["device_count"] = torch.cuda.device_count()
            result["cuda_version"] = torch.version.cuda
            result["cudnn_version"] = torch.backends.cudnn.version()
            for i in range(result["device_count"]):
                # Fetch the properties struct once per device instead of
                # querying it three times.
                props = torch.cuda.get_device_properties(i)
                result["devices"].append({
                    "id": i,
                    "name": torch.cuda.get_device_name(i),
                    "memory_total": props.total_memory / 1024**3,  # GB
                    "compute_capability": f"{props.major}.{props.minor}",
                })
        return result

    @staticmethod
    def _version_tuple(version: str) -> Tuple[int, ...]:
        """
        Best-effort parse of a dotted version string into a tuple of ints.

        Each component keeps only its leading digits ("0+cu118" -> 0,
        "0rc1" -> 0); parsing stops at the first component with no leading
        digits. Returns an empty tuple if nothing parses.
        """
        parts = []
        for component in version.split("."):
            digits = ""
            for ch in component:
                if not ch.isdigit():
                    break
                digits += ch
            if not digits:
                break
            parts.append(int(digits))
        return tuple(parts)

    def check_environment(self) -> Dict[str, Any]:
        """
        Check Python environment and required dependencies.

        Returns:
            Dict with Python version, platform info, per-package status
            (installed version, required spec, satisfied flag), a list of
            missing packages, and an overall all_satisfied flag that is
            False when any package is missing or below its minimum version.
        """
        result: Dict[str, Any] = {
            "python_version": sys.version,
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "packages": {},
            "missing_packages": [],
            "all_satisfied": True,
        }

        # Required packages with minimum versions.
        required_packages = {
            "torch": "2.0.0",
            "transformers": "4.36.0",
            "streamlit": "1.28.0",
            "pandas": "2.0.0",
            "numpy": "1.24.0",
            "plotly": "5.18.0",
            "scikit-learn": "1.3.0",
        }

        for package, min_version in required_packages.items():
            try:
                version = importlib.metadata.version(package)
            except importlib.metadata.PackageNotFoundError:
                result["packages"][package] = {
                    "installed": None,
                    "required": f">={min_version}",
                    "satisfied": False,
                }
                result["missing_packages"].append(package)
                result["all_satisfied"] = False
                continue

            # Compare numeric version tuples, zero-padded to equal length so
            # that e.g. "2.0" >= "2.0.0" holds. An unparseable installed
            # version gets the benefit of the doubt.
            installed_t = self._version_tuple(version)
            required_t = self._version_tuple(min_version)
            if installed_t:
                width = max(len(installed_t), len(required_t))
                satisfied = (
                    installed_t + (0,) * (width - len(installed_t))
                    >= required_t + (0,) * (width - len(required_t))
                )
            else:
                satisfied = True
            result["packages"][package] = {
                "installed": version,
                "required": f">={min_version}",
                "satisfied": satisfied,
            }
            if not satisfied:
                result["all_satisfied"] = False

        return result

    def download_model(
        self,
        model_name: str,
        progress_callback: Optional[Callable[[str, float], None]] = None,
    ) -> Tuple[bool, str, str]:
        """
        Download model from HuggingFace Hub to local cache.

        Args:
            model_name: HuggingFace model identifier (e.g., "roberta-base")
            progress_callback: Optional callable(message, fraction) invoked
                with coarse progress updates (0.1 at start, 1.0 on success,
                0.0 on failure).

        Returns:
            Tuple of (success: bool, model_path: str, message: str); on
            failure the path is "" and the message carries the error text.
        """
        try:
            model_cache_path = self.models_dir / model_name.replace("/", "_")

            # Reuse the cached copy when the directory exists and is
            # non-empty. is_dir() (rather than exists()) guards against a
            # stray file at the cache path, which would make iterdir() raise.
            if model_cache_path.is_dir() and any(model_cache_path.iterdir()):
                return True, str(model_cache_path), f"Model '{model_name}' already exists in cache"

            if progress_callback:
                progress_callback(f"Downloading {model_name}...", 0.1)

            # snapshot_download fetches all model files. Passing only
            # local_dir materializes real files there; the deprecated
            # local_dir_use_symlinks flag and the redundant cache_dir (whose
            # return value was previously bound to an unused local) are gone.
            snapshot_download(
                repo_id=model_name,
                local_dir=str(model_cache_path),
            )

            if progress_callback:
                progress_callback(f"Downloaded {model_name} successfully", 1.0)
            return True, str(model_cache_path), f"Model '{model_name}' downloaded successfully"
        except Exception as e:
            error_msg = f"Failed to download model '{model_name}': {str(e)}"
            if progress_callback:
                progress_callback(error_msg, 0.0)
            return False, "", error_msg

    def get_model_info(self, model_name: str) -> Dict[str, Any]:
        """
        Get information about a model (local or remote).

        Args:
            model_name: Model identifier

        Returns:
            Dict with the model name, its expected local path, whether it
            exists locally (non-empty directory), and its on-disk size in MB
            (0 when not present).
        """
        model_cache_path = self.models_dir / model_name.replace("/", "_")
        info: Dict[str, Any] = {
            "name": model_name,
            "local_path": str(model_cache_path),
            # is_dir() avoids NotADirectoryError if a plain file sits here.
            "exists_locally": model_cache_path.is_dir() and any(model_cache_path.iterdir()),
            "size_mb": 0,
        }
        if info["exists_locally"]:
            # Sum every regular file under the model directory.
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob('*')
                if f.is_file()
            )
            info["size_mb"] = total_size / (1024 * 1024)
        return info
def format_bytes(bytes_size: float) -> str:
    """Format bytes to a human-readable string (B, KB, MB, GB, or TB)."""
    units = ("B", "KB", "MB", "GB", "TB")
    value = bytes_size
    idx = 0
    # Divide down by 1024 until the value fits the current unit or we
    # reach TB, which is the largest unit reported.
    while value >= 1024.0 and idx < len(units) - 1:
        value /= 1024.0
        idx += 1
    return f"{value:.2f} {units[idx]}"
def get_system_summary() -> str:
    """Get a formatted summary of system capabilities."""
    checker = SystemChecker()
    gpu_info = checker.check_cuda()
    env = checker.check_environment()

    bar = "=" * 60
    lines = [bar, "SYSTEM SUMMARY", bar]

    # Interpreter and host platform.
    lines += [
        f"\nPython: {env['python_version'].split()[0]}",
        f"Platform: {env['platform']}",
        f"Architecture: {env['architecture']}",
    ]

    # GPU availability and per-device details.
    lines.append(f"\nCUDA Available: {'Yes' if gpu_info['available'] else 'No (CPU only)'}")
    if gpu_info["available"]:
        lines.append(f"CUDA Version: {gpu_info['cuda_version']}")
        lines.append(f"Number of GPUs: {gpu_info['device_count']}")
        for dev in gpu_info["devices"]:
            lines += [
                f" - GPU {dev['id']}: {dev['name']}",
                f" Memory: {dev['memory_total']:.2f} GB",
                f" Compute: {dev['compute_capability']}",
            ]

    # Dependency status.
    lines.append(f"\n📦 Required Packages: {'✅ All Satisfied' if env['all_satisfied'] else '⚠️ Missing Packages'}")
    if env["missing_packages"]:
        lines.append(f" Missing: {', '.join(env['missing_packages'])}")

    lines.append(bar)
    return "\n".join(lines)
if __name__ == "__main__":
    # Manual smoke test: print the system summary, then exercise a download.
    print(get_system_summary())

    checker = SystemChecker()

    # Use a compact model so the download finishes quickly.
    print("\n\nTesting model download...")

    def _report(msg: str, progress: float) -> None:
        # Echo coarse download progress to stdout.
        print(f"Progress: {progress*100:.0f}% - {msg}")

    ok, _model_path, message = checker.download_model(
        "distilbert-base-uncased",
        progress_callback=_report,
    )
    print(f"Result: {message}")