Spaces:
Running
Running
Create monitoring_system.py
Browse files- src/monitoring_system.py +70 -0
src/monitoring_system.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import psutil
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
from typing import Dict, List, Optional
|
| 5 |
+
import logging
|
| 6 |
+
from threading import Thread
|
| 7 |
+
|
| 8 |
+
class ComprehensiveMonitor:
    """In-memory monitor for inference metrics and host resource usage.

    Inference records are plain dicts supplied by the caller. Only the most
    recent ``max_records`` of them are retained in ``inference_metrics``;
    a separate lifetime counter tracks how many were ever recorded so that
    the reported request total does not saturate at the retention limit.

    Keys read from each record by this class:
        ``timestamp`` (set by :meth:`record_inference`), ``success`` and
        ``processing_time_ms``.
    """

    def __init__(self, max_records: int = 1000):
        """Create an empty monitor.

        Args:
            max_records: Maximum number of inference records kept in memory.
                Defaults to 1000.

        Raises:
            ValueError: If ``max_records`` is smaller than 1.
        """
        if max_records < 1:
            raise ValueError("max_records must be at least 1")

        self.max_records = max_records
        self.inference_metrics = []
        self.system_metrics = []
        # Wall-clock start, kept as a public attribute for display purposes.
        self.start_time = datetime.now()
        # Monotonic start used for uptime: unaffected by DST or clock resets.
        self._start_monotonic = time.monotonic()
        # Lifetime count of recorded inferences (not capped by max_records).
        self._total_requests = 0
        self.setup_logging()

    def setup_logging(self):
        """Attach a module-named logger to the instance as ``self.logger``."""
        self.logger = logging.getLogger(__name__)

    def record_inference(self, metrics: Dict) -> None:
        """Store one inference record.

        The passed dict is stamped in place with a ``'timestamp'`` key
        (current local time, replacing any existing value) and is stored by
        reference, not copied.

        Args:
            metrics: Record describing a single inference.
        """
        metrics['timestamp'] = datetime.now()
        self.inference_metrics.append(metrics)
        self._total_requests += 1

        # Keep only the newest max_records entries.
        if len(self.inference_metrics) > self.max_records:
            self.inference_metrics = self.inference_metrics[-self.max_records:]

    def get_recent_metrics(self, minutes: int = 5) -> List[Dict]:
        """Return retained records stamped within the last ``minutes`` minutes.

        Args:
            minutes: Size of the look-back window in minutes.

        Returns:
            The matching records, in the order they were recorded.
        """
        cutoff = datetime.now() - timedelta(minutes=minutes)
        return [m for m in self.inference_metrics if m['timestamp'] > cutoff]

    def get_average_response_time(self, minutes: int = 30) -> float:
        """Return the mean ``processing_time_ms`` of recent successful records.

        A record counts as successful only when its ``'success'`` value is
        truthy. Successful records that lack ``'processing_time_ms'`` are
        skipped instead of raising ``KeyError``.

        Args:
            minutes: Size of the look-back window in minutes (default 30).

        Returns:
            The mean processing time, or ``0.0`` when no record qualifies.
        """
        durations = [
            m['processing_time_ms']
            for m in self.get_recent_metrics(minutes)
            if m.get('success', False) and 'processing_time_ms' in m
        ]
        if not durations:
            return 0.0

        return sum(durations) / len(durations)

    def get_error_rate(self, minutes: int = 30) -> float:
        """Return the percentage of recent records that failed.

        A record counts as an error only when it carries a falsy
        ``'success'`` value; records without that key are not errors.

        Args:
            minutes: Size of the look-back window in minutes (default 30).

        Returns:
            Error percentage in the range 0-100, or ``0.0`` when the window
            holds no records.
        """
        recent_metrics = self.get_recent_metrics(minutes)
        if not recent_metrics:
            return 0.0

        errors = sum(1 for m in recent_metrics if not m.get('success', True))
        return (errors / len(recent_metrics)) * 100

    def get_uptime(self) -> float:
        """Return seconds elapsed since this monitor was created.

        Measured with a monotonic clock so wall-clock adjustments cannot
        produce a skewed or negative value.
        """
        return time.monotonic() - self._start_monotonic

    def get_total_requests(self) -> int:
        """Return how many inferences were recorded over the monitor's life.

        Unlike ``len(self.inference_metrics)`` this is not limited by the
        retention cap.
        """
        return self._total_requests

    def get_system_health(self) -> Dict:
        """Return a snapshot of host resource usage and request statistics.

        Returns:
            A dict with the keys ``cpu_percent``, ``memory_percent``,
            ``disk_percent`` (for the ``'/'`` mount), ``uptime_seconds``,
            ``total_requests``, ``error_rate`` and ``avg_response_time``.
        """
        return {
            # NOTE(review): psutil documents that cpu_percent() without an
            # interval measures since the previous call, so the first call
            # may report 0.0 — confirm whether callers need a priming call.
            "cpu_percent": psutil.cpu_percent(),
            "memory_percent": psutil.virtual_memory().percent,
            "disk_percent": psutil.disk_usage('/').percent,
            "uptime_seconds": self.get_uptime(),
            # Lifetime total, not the (capped) number of retained records.
            "total_requests": self.get_total_requests(),
            "error_rate": self.get_error_rate(),
            "avg_response_time": self.get_average_response_time(),
        }
|