"""
GPU Monitoring Example for SPARKNET
Demonstrates GPU management and monitoring capabilities
"""
| |
|
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.utils.gpu_manager import get_gpu_manager
from src.utils.logging import setup_logging
from loguru import logger
import time
| |
|
| |
|
def main():
    """Run the GPU monitoring walkthrough.

    Exercises the project's GPU manager end to end: status dump, per-GPU
    details, best-GPU selection, the allocation context manager, and a
    final summary. Output goes to the configured logger (the raw monitor
    table is printed directly to stdout).
    """
    setup_logging(log_level="INFO")

    # Section separators are emitted through the logger; the opening banner
    # has no leading newline, every later one does.
    def banner(title, first=False):
        # One framed section header.
        logger.info("=" * 70 if first else "\n" + "=" * 70)
        logger.info(title)
        logger.info("=" * 70)

    banner("SPARKNET GPU Monitoring Example", first=True)

    gpu_manager = get_gpu_manager()

    # --- Raw status table (multiline, so printed rather than logged) ---
    banner("All GPUs Status")
    print(gpu_manager.monitor())

    # --- Per-device details; entries carrying an "error" key are skipped ---
    banner("Detailed GPU Information")
    for info in gpu_manager.get_all_gpu_info():
        if "error" in info:
            continue
        logger.info(f"\nGPU {info['gpu_id']}: {info['name']}")
        logger.info(f"  Total Memory: {info['memory_total'] / 1024**3:.2f} GB")
        logger.info(f"  Used Memory: {info['memory_used'] / 1024**3:.2f} GB")
        logger.info(f"  Free Memory: {info['memory_free'] / 1024**3:.2f} GB")
        logger.info(f"  Memory Usage: {info['memory_percent']:.1f}%")
        logger.info(f"  GPU Utilization: {info['gpu_utilization']}%")
        logger.info(f"  Memory Util: {info['memory_utilization']}%")
        logger.info(f"  Temperature: {info['temperature']}°C")

    # --- Pick the best device with at least `min_memory` GB free ---
    banner("GPU Selection")
    min_memory = 2.0
    best_gpu = gpu_manager.select_best_gpu(min_memory_gb=min_memory)
    if best_gpu is None:
        logger.warning(f"\nNo GPU found with {min_memory} GB free memory")
    else:
        logger.info(f"\nBest GPU for {min_memory} GB requirement: GPU {best_gpu}")
        gpu_info = gpu_manager.get_gpu_info(best_gpu)
        logger.info(f"Free memory: {gpu_info['memory_free'] / 1024**3:.2f} GB")

    # --- Context manager: acquire a GPU, simulate work, release on exit ---
    banner("GPU Context Manager Test")
    try:
        with gpu_manager.gpu_context(min_memory_gb=1.0) as gpu_id:
            logger.info(f"\nUsing GPU {gpu_id} in context")
            logger.info("This would be where you load and run your model")
            time.sleep(1)  # stand-in for real model work
        logger.info("GPU context released and cache cleared")
    except RuntimeError as e:
        # Raised when no device satisfies the memory requirement.
        logger.error(f"Could not allocate GPU: {e}")

    # --- Topology summary ---
    banner("Available GPUs Summary")
    available = gpu_manager.available_gpus
    logger.info(f"\nTotal GPUs detected: {len(available)}")
    logger.info(f"GPU IDs: {available}")
    logger.info(f"Primary GPU: {gpu_manager.primary_gpu}")
    logger.info(f"Fallback GPUs: {gpu_manager.fallback_gpus}")

    banner("GPU Monitoring Example Completed")
|
| |
|
# Script entry point: run the monitoring demo only when executed directly,
# not when imported.
if __name__ == "__main__":
    main()
| |
|