File size: 3,307 Bytes
54c5666
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""System resource monitoring"""
import psutil
import time
import logging
from typing import Dict, Optional
from dataclasses import dataclass

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


@dataclass
class SystemMetrics:
    """System resource metrics captured in a single sample.

    Instances are plain value records: one reading each for CPU, memory
    and disk, plus the time at which the reading was taken.
    """
    # CPU utilization, as a percentage.
    cpu_percent: float
    # Memory utilization, as a percentage.
    memory_percent: float
    # Available memory in decimal gigabytes (bytes / 1e9).
    memory_available_gb: float
    # Used memory in decimal gigabytes (bytes / 1e9).
    memory_used_gb: float
    # Disk utilization of the monitored filesystem, as a percentage.
    disk_usage_percent: float
    # Time of the sample, as returned by time.time() (seconds since the epoch).
    timestamp: float


class SystemMonitor:
    """Monitor system resources during training.

    Samples CPU, memory and disk usage through ``psutil``, logs a one-line
    summary at most once per ``log_interval`` seconds (unless forced), and
    keeps every logged sample in ``metrics_history`` so that
    ``get_summary`` can aggregate them afterwards.
    """

    def __init__(
        self,
        log_interval: int = 60,
        *,
        disk_path: str = "/",
        memory_warn_percent: float = 90.0,
        cpu_warn_percent: float = 95.0,
        disk_warn_percent: float = 90.0,
    ):
        """Create a monitor.

        Args:
            log_interval: Seconds between system metric logs.
            disk_path: Filesystem path whose disk usage is sampled.
            memory_warn_percent: A warning is logged when memory usage is
                strictly above this percentage.
            cpu_warn_percent: A warning is logged when CPU usage is
                strictly above this percentage.
            disk_warn_percent: A warning is logged when disk usage is
                strictly above this percentage.
        """
        self.log_interval = log_interval
        self.disk_path = disk_path
        self.memory_warn_percent = memory_warn_percent
        self.cpu_warn_percent = cpu_warn_percent
        self.disk_warn_percent = disk_warn_percent
        # Starting from 0 means the first check_and_log() call sees a very
        # large elapsed time and therefore logs for any realistic interval.
        self.last_log_time = 0
        # Every sample that was logged, oldest first.
        self.metrics_history = []

    def get_current_metrics(self) -> "SystemMetrics":
        """Take a fresh sample of CPU, memory and disk usage.

        The CPU reading is taken with ``interval=0.1``, so the call waits
        roughly a tenth of a second while psutil measures utilization.

        Returns:
            A ``SystemMetrics`` record with memory sizes converted to
            decimal gigabytes (bytes / 1e9) and a ``time.time()`` timestamp.
        """
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage(self.disk_path)

        return SystemMetrics(
            cpu_percent=psutil.cpu_percent(interval=0.1),
            memory_percent=memory.percent,
            memory_available_gb=memory.available / 1e9,
            memory_used_gb=memory.used / 1e9,
            disk_usage_percent=disk.percent,
            timestamp=time.time(),
        )

    def check_and_log(self, force: bool = False) -> Optional["SystemMetrics"]:
        """Sample and log system metrics if the log interval has elapsed.

        Args:
            force: When true, sample and log regardless of how recently
                the last log happened.

        Returns:
            The sample that was logged, or ``None`` when the interval has
            not elapsed yet and ``force`` is false (nothing is sampled,
            stored or logged in that case).
        """
        current_time = time.time()
        due = force or (current_time - self.last_log_time) >= self.log_interval
        if not due:
            return None

        metrics = self.get_current_metrics()
        self.metrics_history.append(metrics)
        self.last_log_time = current_time

        # SystemMetrics carries no total-memory field, so used + available
        # stands in for the total shown here; treat it as an approximation.
        approx_total_gb = metrics.memory_used_gb + metrics.memory_available_gb
        logger.info(
            "System Metrics | CPU: %.1f%% | RAM: %.1f/%.1fGB (%.1f%%) | Disk: %.1f%%",
            metrics.cpu_percent,
            metrics.memory_used_gb,
            approx_total_gb,
            metrics.memory_percent,
            metrics.disk_usage_percent,
        )

        # Warn if resources are high
        if metrics.memory_percent > self.memory_warn_percent:
            logger.warning("High memory usage: %.1f%%", metrics.memory_percent)

        if metrics.cpu_percent > self.cpu_warn_percent:
            logger.warning("High CPU usage: %.1f%%", metrics.cpu_percent)

        if metrics.disk_usage_percent > self.disk_warn_percent:
            logger.warning("High disk usage: %.1f%%", metrics.disk_usage_percent)

        return metrics

    def get_summary(self) -> Dict[str, float]:
        """Aggregate the samples recorded so far.

        Returns:
            An empty dict when no sample has been recorded. Otherwise a
            dict with the average and maximum CPU percentage, the average
            and maximum memory percentage, and the maximum used memory in
            gigabytes across ``metrics_history``.
        """
        history = self.metrics_history
        if not history:
            return {}

        cpu_values = [m.cpu_percent for m in history]
        mem_values = [m.memory_percent for m in history]

        return {
            'avg_cpu_percent': sum(cpu_values) / len(cpu_values),
            'max_cpu_percent': max(cpu_values),
            'avg_memory_percent': sum(mem_values) / len(mem_values),
            'max_memory_percent': max(mem_values),
            'max_memory_used_gb': max(m.memory_used_gb for m in history),
        }