File size: 3,444 Bytes
e057d08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Compute Resource Tracker
=========================

Track GPU hours, costs, and resource usage for experiments.

Author: UW MSIM Team
Date: November 2025
"""

import time
import numpy as np
from typing import Dict, Optional, List

try:
    import psutil
    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False
import logging

logger = logging.getLogger(__name__)


class ComputeTracker:
    """
    Track elapsed wall-clock time, estimated cost, and resource usage.

    Intended call order is ``start()``, then any number of
    ``log_gpu_usage()`` calls while the work runs, then ``stop()`` to
    obtain a summary dict.

    Parameters
    ----------
    cost_per_hour : float
        Cost per GPU-hour in USD
    gpu_type : str
        GPU type (e.g., 'H200', 'A100', 'L40S'). Stored and echoed in the
        log message and summary; it does not affect any calculation.

    Attributes
    ----------
    start_time, end_time : float or None
        ``time.time()`` stamps set by ``start()`` and ``stop()``;
        ``None`` until the corresponding method has been called.
    gpu_usage_log : list of dict
        One entry per GPU per successful ``log_gpu_usage()`` call.
    """

    def __init__(self, cost_per_hour: float = 0.90, gpu_type: str = 'H200'):
        self.cost_per_hour = cost_per_hour
        self.gpu_type = gpu_type
        self.start_time: Optional[float] = None
        self.end_time: Optional[float] = None
        self.gpu_usage_log: List[Dict] = []

    def start(self):
        """Start tracking: record the start time and clear the GPU log."""
        self.start_time = time.time()
        self.gpu_usage_log = []
        logger.info(f"Compute tracking started (GPU: {self.gpu_type}, ${self.cost_per_hour}/hr)")

    def log_gpu_usage(self):
        """
        Append one usage sample per GPU reported by GPUtil.

        Best-effort: if GPUtil is not installed, or querying it fails, a
        warning is logged and no exception propagates to the caller.
        """
        try:
            import GPUtil
            gpus = GPUtil.getGPUs()

            # All GPUs sampled in one call share a single timestamp so the
            # entries can be grouped per sample afterwards.
            sampled_at = time.time()

            for gpu in gpus:
                memory_total = gpu.memoryTotal
                # Guard the division: a zero total would otherwise raise,
                # be swallowed below, and drop every remaining GPU too.
                if memory_total:
                    memory_util = (gpu.memoryUsed / memory_total) * 100
                else:
                    memory_util = 0.0

                self.gpu_usage_log.append({
                    'timestamp': sampled_at,
                    'gpu_id': gpu.id,
                    'gpu_load': gpu.load * 100,
                    'memory_used_mb': gpu.memoryUsed,
                    'memory_total_mb': memory_total,
                    'memory_util': memory_util,
                    'temperature': getattr(gpu, 'temperature', None)
                })
        except ImportError:
            logger.warning("GPUtil not installed, GPU tracking unavailable")
        except Exception as e:
            logger.warning(f"GPU logging failed: {e}")

    def stop(self) -> Dict:
        """
        Stop tracking and calculate costs.

        Returns
        -------
        summary : dict
            Elapsed time, costs, and GPU usage summary. The keys
            ``avg_gpu_load`` and ``avg_gpu_memory_util`` are present only
            when at least one GPU sample was logged.

        Raises
        ------
        RuntimeError
            If ``start()`` has not been called, since there is no start
            time to measure from.
        """
        # Fail early with a clear message instead of a TypeError from
        # subtracting None; end_time is left untouched in that case.
        if self.start_time is None:
            raise RuntimeError("ComputeTracker.stop() called before start()")

        self.end_time = time.time()

        # time.time() follows the system clock, which can be stepped
        # backwards; clamp so elapsed time and cost are never negative.
        elapsed_hours = max(self.end_time - self.start_time, 0.0) / 3600
        total_cost = elapsed_hours * self.cost_per_hour

        # CPU and memory usage: a snapshot taken now, not an average over
        # the tracked period.
        if HAS_PSUTIL:
            # interval=1 makes psutil sample for one second, so this call
            # blocks for roughly that long.
            cpu_percent = psutil.cpu_percent(interval=1)
            memory_info = psutil.virtual_memory()
            memory_percent = memory_info.percent
            memory_used_gb = memory_info.used / (1024 ** 3)
        else:
            # psutil is unavailable: report zeros rather than omit the keys.
            cpu_percent = 0.0
            memory_percent = 0.0
            memory_used_gb = 0.0

        summary = {
            'elapsed_hours': elapsed_hours,
            'cost_usd': total_cost,
            'cost_per_hour': self.cost_per_hour,
            'gpu_type': self.gpu_type,
            'cpu_percent': cpu_percent,
            'memory_percent': memory_percent,
            'memory_used_gb': memory_used_gb,
            'gpu_logs_count': len(self.gpu_usage_log),
        }

        # Average GPU utilization over every logged entry (all GPUs and
        # samples pooled); converted to plain floats for the summary.
        if self.gpu_usage_log:
            summary['avg_gpu_load'] = float(
                np.mean([entry['gpu_load'] for entry in self.gpu_usage_log])
            )
            summary['avg_gpu_memory_util'] = float(
                np.mean([entry['memory_util'] for entry in self.gpu_usage_log])
            )

        logger.info(f"Compute tracking stopped: {elapsed_hours:.2f} hours, ${total_cost:.2f}")

        return summary