"""
Compute Resource Tracker
=========================
Track GPU hours, costs, and resource usage for experiments.
Author: UW MSIM Team
Date: November 2025
"""
import time
import numpy as np
from typing import Dict, Optional, List
try:
import psutil
HAS_PSUTIL = True
except ImportError:
HAS_PSUTIL = False
import logging
logger = logging.getLogger(__name__)
class ComputeTracker:
    """
    Track compute resources and costs.

    The wall-clock time between :meth:`start` and :meth:`stop` is converted
    to hours and multiplied by ``cost_per_hour``. The number of GPUs is not
    factored into the cost. GPU utilization samples are collected only when
    :meth:`log_gpu_usage` is called explicitly.

    Parameters
    ----------
    cost_per_hour : float
        Cost per GPU-hour in USD
    gpu_type : str
        GPU type (e.g., 'H200', 'A100', 'L40S')
    """

    def __init__(self, cost_per_hour: float = 0.90, gpu_type: str = 'H200'):
        self.cost_per_hour = cost_per_hour
        self.gpu_type = gpu_type
        # Epoch timestamps (time.time()); None until start()/stop() is called.
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None
        # One dict per GPU per log_gpu_usage() call.
        self.gpu_usage_log: List[Dict] = []

    def start(self):
        """Start (or restart) tracking, discarding any previous samples."""
        self.start_time = time.time()
        # Clear the end marker so a restarted tracker never exposes a stale
        # end_time that belongs to an earlier run.
        self.end_time = None
        self.gpu_usage_log = []
        logger.info(
            "Compute tracking started (GPU: %s, $%s/hr)",
            self.gpu_type,
            self.cost_per_hour,
        )

    @staticmethod
    def _gpu_sample(gpu, timestamp: float) -> Dict:
        """Build one usage record from a GPUtil GPU object."""
        used = gpu.memoryUsed
        total = gpu.memoryTotal
        return {
            'timestamp': timestamp,
            'gpu_id': gpu.id,
            'gpu_load': gpu.load * 100,
            'memory_used_mb': used,
            'memory_total_mb': total,
            # A zero total would otherwise raise ZeroDivisionError and abort
            # logging for every remaining GPU.
            'memory_util': (used / total) * 100 if total else 0.0,
            'temperature': getattr(gpu, 'temperature', None),
        }

    def log_gpu_usage(self):
        """
        Log current GPU usage.

        Appends one record per visible GPU to ``gpu_usage_log``. This is
        best-effort: if GPUtil is missing or the query fails, a warning is
        logged and no exception propagates to the caller.
        """
        try:
            import GPUtil
        except ImportError:
            logger.warning("GPUtil not installed, GPU tracking unavailable")
            return

        try:
            for gpu in GPUtil.getGPUs():
                self.gpu_usage_log.append(self._gpu_sample(gpu, time.time()))
        except Exception as e:
            # Deliberately broad: monitoring must never break the experiment.
            logger.warning("GPU logging failed: %s", e)

    def _average_gpu_utilization(self) -> Dict:
        """Return mean GPU load / memory utilization, or {} without samples."""
        if not self.gpu_usage_log:
            return {}
        loads = [entry['gpu_load'] for entry in self.gpu_usage_log]
        memory_utils = [entry['memory_util'] for entry in self.gpu_usage_log]
        # Cast to built-in float so the summary holds plain Python numbers.
        return {
            'avg_gpu_load': float(np.mean(loads)),
            'avg_gpu_memory_util': float(np.mean(memory_utils)),
        }

    def stop(self) -> Dict:
        """
        Stop tracking and calculate costs.

        Returns
        -------
        summary : dict
            Elapsed time, costs, and GPU usage summary. The keys
            ``avg_gpu_load`` and ``avg_gpu_memory_util`` are present only
            when at least one GPU sample was logged.

        Raises
        ------
        RuntimeError
            If called before :meth:`start`.
        """
        # Fail early with a clear message instead of a TypeError from
        # subtracting None, and before any state is modified.
        if self.start_time is None:
            raise RuntimeError("ComputeTracker.stop() called before start()")

        self.end_time = time.time()
        elapsed_hours = (self.end_time - self.start_time) / 3600
        total_cost = elapsed_hours * self.cost_per_hour

        # CPU usage
        if HAS_PSUTIL:
            # NOTE: interval=1 makes psutil sample over a one-second window,
            # so stop() blocks for roughly that long.
            cpu_percent = psutil.cpu_percent(interval=1)
            memory_info = psutil.virtual_memory()
            memory_percent = memory_info.percent
            memory_used_gb = memory_info.used / (1024 ** 3)
        else:
            # Without psutil the host metrics are reported as zeros.
            cpu_percent = 0.0
            memory_percent = 0.0
            memory_used_gb = 0.0

        summary = {
            'elapsed_hours': elapsed_hours,
            'cost_usd': total_cost,
            'cost_per_hour': self.cost_per_hour,
            'gpu_type': self.gpu_type,
            'cpu_percent': cpu_percent,
            'memory_percent': memory_percent,
            'memory_used_gb': memory_used_gb,
            'gpu_logs_count': len(self.gpu_usage_log),
        }

        # Average GPU utilization
        summary.update(self._average_gpu_utilization())

        logger.info(
            "Compute tracking stopped: %.2f hours, $%.2f",
            elapsed_hours,
            total_cost,
        )
        return summary
|