ModelMatrix / matrix /code /evaluation /compute_tracker.py
Akshay4506's picture
Fix deployment entry point and merge requirements
c4ff02d
"""
Compute Resource Tracker
=========================
Track GPU hours, costs, and resource usage for experiments.
Author: UW MSIM Team
Date: November 2025
"""
import time
import numpy as np
from typing import Dict, Optional, List
try:
import psutil
HAS_PSUTIL = True
except ImportError:
HAS_PSUTIL = False
import logging
logger = logging.getLogger(__name__)
class ComputeTracker:
    """
    Track compute resources and costs for an experiment run.

    Intended call order is ``start()``, then zero or more calls to
    ``log_gpu_usage()`` while the work is running, then ``stop()``,
    which returns a summary dictionary.

    Parameters
    ----------
    cost_per_hour : float
        Cost per GPU-hour in USD
    gpu_type : str
        GPU type (e.g., 'H200', 'A100', 'L40S')

    Attributes
    ----------
    start_time : float or None
        ``time.time()`` value captured by ``start()``; None until then.
    end_time : float or None
        ``time.time()`` value captured by ``stop()``; None until then.
    gpu_usage_log : list of dict
        One entry per GPU per ``log_gpu_usage()`` call.
    """

    def __init__(self, cost_per_hour: float = 0.90, gpu_type: str = 'H200'):
        self.cost_per_hour = cost_per_hour
        self.gpu_type = gpu_type
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None
        self.gpu_usage_log: List[Dict] = []

    def start(self) -> None:
        """Start (or restart) tracking, discarding any previous run's state."""
        self.start_time = time.time()
        # Clear the previous run's end marker so a reused tracker never
        # exposes an end_time that is older than its start_time.
        self.end_time = None
        self.gpu_usage_log = []
        logger.info(f"Compute tracking started (GPU: {self.gpu_type}, ${self.cost_per_hour}/hr)")

    def log_gpu_usage(self) -> None:
        """
        Append one usage sample per visible GPU to ``gpu_usage_log``.

        This is best-effort: if GPUtil is missing or querying the GPUs
        fails, a warning is logged and the method returns without raising.
        """
        try:
            import GPUtil
        except ImportError:
            logger.warning("GPUtil not installed, GPU tracking unavailable")
            return

        try:
            for gpu in GPUtil.getGPUs():
                mem_total = gpu.memoryTotal
                # Guard the ratio: a device reporting zero total memory would
                # otherwise raise ZeroDivisionError and abort the loop,
                # dropping the samples for every remaining GPU.
                if mem_total:
                    memory_util = (gpu.memoryUsed / mem_total) * 100
                else:
                    memory_util = 0.0
                self.gpu_usage_log.append({
                    'timestamp': time.time(),
                    'gpu_id': gpu.id,
                    'gpu_load': gpu.load * 100,
                    'memory_used_mb': gpu.memoryUsed,
                    'memory_total_mb': mem_total,
                    'memory_util': memory_util,
                    'temperature': getattr(gpu, 'temperature', None),
                })
        except Exception as e:
            # Deliberately broad: monitoring must never take down the
            # experiment it is observing.
            logger.warning(f"GPU logging failed: {e}")

    def stop(self) -> Dict:
        """
        Stop tracking and calculate costs.

        If ``start()` was never called, a warning is logged and the elapsed
        time (and therefore the cost) is reported as zero instead of raising.

        Returns
        -------
        summary : dict
            Always contains ``elapsed_hours``, ``cost_usd``,
            ``cost_per_hour``, ``gpu_type``, ``cpu_percent``,
            ``memory_percent``, ``memory_used_gb`` and ``gpu_logs_count``.
            ``avg_gpu_load`` and ``avg_gpu_memory_util`` are added only when
            at least one GPU sample was logged.
        """
        self.end_time = time.time()

        if self.start_time is None:
            logger.warning(
                "ComputeTracker.stop() called before start(); "
                "reporting zero elapsed time"
            )
            elapsed_hours = 0.0
        else:
            # time.time() is wall-clock and may step backwards (e.g. NTP
            # adjustment); clamp so elapsed time and cost are never negative.
            elapsed_hours = max(0.0, self.end_time - self.start_time) / 3600
        total_cost = elapsed_hours * self.cost_per_hour

        # CPU / system memory usage.
        # NOTE: cpu_percent(interval=1) blocks for about one second, and both
        # values are point-in-time samples taken at stop time, not averages
        # over the tracked run.
        if HAS_PSUTIL:
            cpu_percent = psutil.cpu_percent(interval=1)
            memory_info = psutil.virtual_memory()
            memory_percent = memory_info.percent
            memory_used_gb = memory_info.used / (1024 ** 3)
        else:
            cpu_percent = 0.0
            memory_percent = 0.0
            memory_used_gb = 0.0

        summary = {
            'elapsed_hours': elapsed_hours,
            'cost_usd': total_cost,
            'cost_per_hour': self.cost_per_hour,
            'gpu_type': self.gpu_type,
            'cpu_percent': cpu_percent,
            'memory_percent': memory_percent,
            'memory_used_gb': memory_used_gb,
            'gpu_logs_count': len(self.gpu_usage_log),
        }

        # Average GPU utilization across every logged sample (all GPUs pooled).
        if self.gpu_usage_log:
            # Cast to built-in float so the summary holds plain Python numbers.
            summary['avg_gpu_load'] = float(
                np.mean([log['gpu_load'] for log in self.gpu_usage_log])
            )
            summary['avg_gpu_memory_util'] = float(
                np.mean([log['memory_util'] for log in self.gpu_usage_log])
            )

        logger.info(f"Compute tracking stopped: {elapsed_hours:.2f} hours, ${total_cost:.2f}")
        return summary