saemstunes committed
Commit add553a · verified · Parent: 5e1e0b6

Update src/monitoring_system.py

Files changed (1):
  src/monitoring_system.py +3 -34
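
In short: the commit strips the module's inline docstrings (the -34) and makes one behavioral change (the +3), moving self.setup_logging() ahead of the Prometheus setup in ComprehensiveMonitor.__init__; see the note after the relevant hunk below.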
src/monitoring_system.py CHANGED
@@ -20,7 +20,6 @@ except ImportError:
 
 @dataclass
 class InferenceMetrics:
-    """Data class for inference metrics"""
     model_name: str
     processing_time_ms: float
     input_tokens: int
@@ -38,7 +37,6 @@ class InferenceMetrics:
 
 @dataclass
 class SystemMetrics:
-    """Data class for system metrics"""
     timestamp: datetime
     cpu_percent: float
     memory_percent: float
@@ -52,11 +50,6 @@ class SystemMetrics:
     active_threads: int
 
 class ComprehensiveMonitor:
-    """
-    Comprehensive monitoring system for performance tracking, metrics, and alerts.
-    Provides real-time monitoring of AI system performance and health.
-    """
-
     def __init__(self, prometheus_port: int = 8001, metrics_retention_hours: int = 24):
         self.inference_metrics: List[InferenceMetrics] = []
         self.system_metrics: List[SystemMetrics] = []
@@ -70,19 +63,19 @@ class ComprehensiveMonitor:
         self.alert_callbacks = []
 
         self.prometheus_metrics = {}
+
+        self.setup_logging()
+
         if PROMETHEUS_AVAILABLE:
             self.setup_prometheus_metrics()
 
-        self.setup_logging()
         self.start_monitoring()
 
     def setup_logging(self):
-        """Setup monitoring logging"""
         self.logger = logging.getLogger(__name__)
         self.logger.setLevel(logging.INFO)
 
     def setup_prometheus_metrics(self):
-        """Setup Prometheus metrics"""
         try:
             self.prometheus_metrics = {
                 'inference_requests_total': Counter(
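
The hunk above is the behavioral part of the commit. With the old ordering, setup_prometheus_metrics() ran before setup_logging(), and its error path (the self.logger.warning(...) call visible at the top of the next hunk) would touch a logger that did not exist yet. A minimal sketch of that failure mode, using a hypothetical stand-in class rather than the repo's code:

import logging

class OldOrder:
    # Hypothetical stand-in mirroring the pre-commit __init__ ordering.
    def __init__(self):
        self.setup_prometheus_metrics()  # error path logs before the logger exists
        self.setup_logging()

    def setup_logging(self):
        self.logger = logging.getLogger(__name__)

    def setup_prometheus_metrics(self):
        try:
            raise RuntimeError("port already in use")  # simulated setup failure
        except Exception as e:
            # In the old ordering, this line raises: self.logger is not set yet.
            self.logger.warning(f"Could not start Prometheus server: {e}")

OldOrder()  # AttributeError: 'OldOrder' object has no attribute 'logger'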
@@ -146,14 +139,12 @@
             self.logger.warning(f"Could not start Prometheus server: {e}")
 
     def start_monitoring(self):
-        """Start background monitoring"""
         self.monitoring_active = True
         self.monitoring_thread = threading.Thread(target=self._monitoring_loop, daemon=True)
         self.monitoring_thread.start()
         self.logger.info("Background monitoring started")
 
     def _monitoring_loop(self):
-        """Background monitoring loop"""
         iteration = 0
         while self.monitoring_active:
             try:
@@ -178,7 +169,6 @@
                 time.sleep(60)
 
     def get_system_metrics(self) -> SystemMetrics:
-        """Get current system metrics"""
         try:
             cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -235,7 +225,6 @@
         )
 
     def update_prometheus_gauges(self, system_metrics: SystemMetrics):
-        """Update Prometheus gauges with system metrics"""
         try:
             self.prometheus_metrics['system_cpu_percent'].set(system_metrics.cpu_percent)
             self.prometheus_metrics['system_memory_percent'].set(system_metrics.memory_percent)
@@ -258,7 +247,6 @@
             self.logger.error(f"Error updating Prometheus gauges: {e}")
 
     def record_inference(self, metrics: Dict):
-        """Record inference metrics"""
         try:
             inference_metrics = InferenceMetrics(
                 model_name=metrics.get('model_name', 'unknown'),
@@ -307,12 +295,10 @@
             self.logger.error(f"Error recording inference metrics: {e}")
 
     def get_recent_metrics(self, minutes: int = 5) -> List[InferenceMetrics]:
-        """Get metrics from recent time window"""
         cutoff = datetime.now() - timedelta(minutes=minutes)
         return [m for m in self.inference_metrics if m.timestamp > cutoff]
 
     def get_average_response_time(self, minutes: int = 30) -> float:
-        """Get average response time for successful requests"""
         recent_metrics = self.get_recent_metrics(minutes)
         successful_metrics = [m for m in recent_metrics if m.success]
 
@@ -322,7 +308,6 @@
         return sum(m.processing_time_ms for m in successful_metrics) / len(successful_metrics)
 
     def get_response_time_percentile(self, percentile: float, minutes: int = 30) -> float:
-        """Get percentile response time"""
         recent_metrics = self.get_recent_metrics(minutes)
         successful_metrics = [m for m in recent_metrics if m.success]
 
@@ -336,7 +321,6 @@
         return processing_times[index] if index < len(processing_times) else processing_times[-1]
 
     def get_error_rate(self, minutes: int = 30) -> float:
-        """Get error rate percentage"""
         recent_metrics = self.get_recent_metrics(minutes)
         if not recent_metrics:
             return 0.0
@@ -345,7 +329,6 @@
         return (errors / len(recent_metrics)) * 100
 
     def get_throughput(self, minutes: int = 5) -> float:
-        """Get requests per minute"""
         recent_metrics = self.get_recent_metrics(minutes)
         if not recent_metrics or minutes == 0:
             return 0.0
@@ -353,7 +336,6 @@
         return len(recent_metrics) / minutes
 
     def get_cache_hit_rate(self, minutes: int = 30) -> float:
-        """Get cache hit rate percentage"""
         recent_metrics = self.get_recent_metrics(minutes)
         if not recent_metrics:
             return 0.0
@@ -362,11 +344,9 @@
         return (cache_hits / len(recent_metrics)) * 100
 
     def get_uptime(self) -> float:
-        """Get system uptime in seconds"""
         return (datetime.now() - self.start_time).total_seconds()
 
     def check_alerts(self, system_metrics: SystemMetrics):
-        """Check system metrics against alert thresholds"""
         current_alerts = []
 
         if system_metrics.cpu_percent > 85:
@@ -432,7 +412,6 @@
             self.alerts.append(alert)
 
     def is_new_alert(self, alert: Dict) -> bool:
-        """Check if this is a new alert (not recently triggered)"""
         recent_threshold = datetime.now() - timedelta(minutes=5)
         recent_alerts = [a for a in self.alerts
                          if a['metric'] == alert['metric']
@@ -440,7 +419,6 @@
         return len(recent_alerts) == 0
 
     def trigger_alert(self, alert: Dict):
-        """Trigger alert notification"""
         alert['timestamp'] = datetime.now()
         alert['alert_id'] = hashlib.md5(f"{alert['metric']}_{alert['timestamp']}".encode()).hexdigest()[:8]
 
@@ -453,11 +431,9 @@
                 self.logger.error(f"Error in alert callback: {e}")
 
     def add_alert_callback(self, callback):
-        """Add callback function for alert notifications"""
         self.alert_callbacks.append(callback)
 
     def log_system_summary(self):
-        """Log periodic system summary"""
         summary = self.get_performance_summary(timedelta(minutes=5))
 
         if summary:
@@ -471,7 +447,6 @@
             )
 
     def get_performance_summary(self, time_window: timedelta) -> Dict[str, Any]:
-        """Get comprehensive performance summary"""
         recent_metrics = self.get_recent_metrics(time_window.total_seconds() / 60)
         recent_system = [m for m in self.system_metrics
                          if m.timestamp > datetime.now() - time_window]
@@ -508,7 +483,6 @@
         return summary
 
     def cleanup_old_metrics(self):
-        """Clean up old metrics to prevent memory issues"""
         cutoff = datetime.now() - timedelta(hours=self.metrics_retention_hours)
 
         self.inference_metrics = [m for m in self.inference_metrics if m.timestamp > cutoff]
@@ -516,7 +490,6 @@
         self.alerts = [a for a in self.alerts if a.get('timestamp', datetime.min) > cutoff - timedelta(hours=24)]
 
     def get_system_health(self) -> Dict[str, Any]:
-        """Get comprehensive system health status"""
         performance_summary = self.get_performance_summary(timedelta(minutes=30))
 
         health_status = "healthy"
@@ -539,14 +512,12 @@
         }
 
     def stop_monitoring(self):
-        """Stop the monitoring system"""
         self.monitoring_active = False
         if self.monitoring_thread:
             self.monitoring_thread.join(timeout=5)
         self.logger.info("Monitoring system stopped")
 
     def export_metrics(self, filename: str, time_window: timedelta = timedelta(hours=24)):
-        """Export metrics to JSON file for analysis"""
         try:
             metrics_data = {
                 'export_timestamp': datetime.now().isoformat(),
@@ -589,7 +560,6 @@
             self.logger.error(f"Error exporting metrics: {e}")
 
     def get_prometheus_metrics(self) -> str:
-        """Get Prometheus metrics as string"""
         if not PROMETHEUS_AVAILABLE:
             return "# Prometheus client not available\n"
 
@@ -600,7 +570,6 @@
         return f"# Error generating metrics: {e}\n"
 
     def reset_metrics(self):
-        """Reset all metrics (for testing)"""
         self.inference_metrics.clear()
         self.system_metrics.clear()
         self.alerts.clear()
 
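
For orientation, a minimal usage sketch of the class as it stands after this commit. It assumes the module imports as src.monitoring_system; the record_inference() keys mirror the InferenceMetrics fields visible in the diff, and the sample values are made up:

from datetime import timedelta
from src.monitoring_system import ComprehensiveMonitor  # assumed import path

# Constructing the monitor starts the background thread (and Prometheus, if available).
monitor = ComprehensiveMonitor(prometheus_port=8001, metrics_retention_hours=24)

# Record one request; missing keys fall back to defaults via metrics.get(...).
monitor.record_inference({
    'model_name': 'demo-model',
    'processing_time_ms': 123.4,
    'input_tokens': 42,
    'success': True,
})

print(monitor.get_average_response_time(minutes=30))  # mean latency in ms
print(monitor.get_error_rate(minutes=30))             # failure percentage
print(monitor.get_system_health())                    # health status dict

monitor.export_metrics('metrics.json', time_window=timedelta(hours=1))
monitor.stop_monitoring()  # joins the background monitoring thread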