Spaces:
Running
Running
| """ | |
| Compute Resource Tracker | |
| ========================= | |
| Track GPU hours, costs, and resource usage for experiments. | |
| Author: UW MSIM Team | |
| Date: November 2025 | |
| """ | |
| import time | |
| import numpy as np | |
| from typing import Dict, Optional, List | |
| try: | |
| import psutil | |
| HAS_PSUTIL = True | |
| except ImportError: | |
| HAS_PSUTIL = False | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| class ComputeTracker: | |
| """ | |
| Track compute resources and costs. | |
| Parameters | |
| ---------- | |
| cost_per_hour : float | |
| Cost per GPU-hour in USD | |
| gpu_type : str | |
| GPU type (e.g., 'H200', 'A100', 'L40S') | |
| """ | |
| def __init__(self, cost_per_hour: float = 0.90, gpu_type: str = 'H200'): | |
| self.cost_per_hour = cost_per_hour | |
| self.gpu_type = gpu_type | |
| self.start_time: Optional[float] = None | |
| self.end_time: Optional[float] = None | |
| self.gpu_usage_log: List[Dict] = [] | |
| def start(self): | |
| """Start tracking.""" | |
| self.start_time = time.time() | |
| self.gpu_usage_log = [] | |
| logger.info(f"Compute tracking started (GPU: {self.gpu_type}, ${self.cost_per_hour}/hr)") | |
| def log_gpu_usage(self): | |
| """Log current GPU usage.""" | |
| try: | |
| import GPUtil | |
| gpus = GPUtil.getGPUs() | |
| for gpu in gpus: | |
| self.gpu_usage_log.append({ | |
| 'timestamp': time.time(), | |
| 'gpu_id': gpu.id, | |
| 'gpu_load': gpu.load * 100, | |
| 'memory_used_mb': gpu.memoryUsed, | |
| 'memory_total_mb': gpu.memoryTotal, | |
| 'memory_util': (gpu.memoryUsed / gpu.memoryTotal) * 100, | |
| 'temperature': getattr(gpu, 'temperature', None) | |
| }) | |
| except ImportError: | |
| logger.warning("GPUtil not installed, GPU tracking unavailable") | |
| except Exception as e: | |
| logger.warning(f"GPU logging failed: {e}") | |
| def stop(self) -> Dict: | |
| """ | |
| Stop tracking and calculate costs. | |
| Returns | |
| ------- | |
| summary : dict | |
| Elapsed time, costs, and GPU usage summary | |
| """ | |
| self.end_time = time.time() | |
| elapsed_hours = (self.end_time - self.start_time) / 3600 | |
| total_cost = elapsed_hours * self.cost_per_hour | |
| # CPU usage | |
| if HAS_PSUTIL: | |
| cpu_percent = psutil.cpu_percent(interval=1) | |
| memory_info = psutil.virtual_memory() | |
| memory_percent = memory_info.percent | |
| memory_used_gb = memory_info.used / (1024 ** 3) | |
| else: | |
| cpu_percent = 0.0 | |
| memory_percent = 0.0 | |
| memory_used_gb = 0.0 | |
| summary = { | |
| 'elapsed_hours': elapsed_hours, | |
| 'cost_usd': total_cost, | |
| 'cost_per_hour': self.cost_per_hour, | |
| 'gpu_type': self.gpu_type, | |
| 'cpu_percent': cpu_percent, | |
| 'memory_percent': memory_percent, | |
| 'memory_used_gb': memory_used_gb, | |
| 'gpu_logs_count': len(self.gpu_usage_log) | |
| } | |
| # Average GPU utilization | |
| if self.gpu_usage_log: | |
| summary['avg_gpu_load'] = np.mean([log['gpu_load'] for log in self.gpu_usage_log]) | |
| summary['avg_gpu_memory_util'] = np.mean([log['memory_util'] for log in self.gpu_usage_log]) | |
| logger.info(f"Compute tracking stopped: {elapsed_hours:.2f} hours, ${total_cost:.2f}") | |
| return summary | |