snikhilesh committed on
Commit
e942eb2
·
verified ·
1 Parent(s): b0bc699

Deploy monitoring_service.py to backend/ directory

Browse files
Files changed (1) hide show
  1. backend/monitoring_service.py +1102 -0
backend/monitoring_service.py ADDED
@@ -0,0 +1,1102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enterprise Monitoring Service for Medical AI Platform
3
+ Comprehensive monitoring, metrics tracking, and alerting system
4
+
5
+ Features:
6
+ - Real-time performance monitoring
7
+ - Error rate tracking with automated alerts
8
+ - Latency analysis across pipeline stages
9
+ - Resource utilization monitoring
10
+ - Model performance tracking
11
+ - System health indicators
12
+
13
+ Author: MiniMax Agent
14
+ Date: 2025-10-29
15
+ Version: 1.0.0
16
+ """
17
+
18
+ import logging
19
+ import time
20
+ import hashlib
21
+ import json
22
+ import pickle
23
+ from typing import Dict, List, Any, Optional, Tuple
24
+ from datetime import datetime, timedelta
25
+ from collections import defaultdict, deque
26
+ from dataclasses import dataclass, asdict
27
+ from enum import Enum
28
+ import asyncio
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class SystemStatus(Enum):
    """System operational status levels.

    Values are lowercase strings so they serialize directly into JSON
    health payloads (see ``MonitoringService.get_system_health``).
    """
    OPERATIONAL = "operational"   # no critical alerts, error rate under threshold
    DEGRADED = "degraded"         # error threshold exceeded or many error-level alerts
    CRITICAL = "critical"         # at least one critical alert is active
    MAINTENANCE = "maintenance"   # reserved for planned downtime (not set automatically here)
39
+
40
+
41
class AlertLevel(Enum):
    """Alert severity levels, from least to most severe.

    Values are lowercase strings for direct JSON serialization
    (``Alert.to_dict`` emits ``level.value``).
    """
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
47
+
48
+
49
@dataclass
class PerformanceMetric:
    """A single time-stamped sample of a named metric."""
    metric_name: str       # series name, e.g. "latency_classification"
    value: float           # measured value, interpreted per `unit`
    unit: str              # e.g. "count", "seconds", "milliseconds"
    timestamp: str         # ISO-8601 string (datetime.utcnow().isoformat())
    tags: Dict[str, str]   # free-form labels, e.g. {"stage": ..., "success": ...}

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of all fields."""
        return asdict(self)
60
+
61
+
62
@dataclass
class Alert:
    """An operational alert with a simple lifecycle (created, optionally resolved)."""
    alert_id: str
    level: AlertLevel
    message: str
    category: str
    timestamp: str
    details: Dict[str, Any]
    resolved: bool = False
    resolved_at: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """JSON-serializable view; the enum `level` is flattened to its string value."""
        return dict(
            alert_id=self.alert_id,
            level=self.level.value,
            message=self.message,
            category=self.category,
            timestamp=self.timestamp,
            details=self.details,
            resolved=self.resolved,
            resolved_at=self.resolved_at,
        )
85
+
86
+
87
class MetricsCollector:
    """Collects and aggregates performance metrics.

    Keeps one bounded time-series (deque of PerformanceMetric) per metric
    name, plus monotonically-increasing counters and last-value gauges.
    Samples older than ``retention_hours`` are pruned on every write.
    """

    def __init__(self, retention_hours: int = 24):
        self.retention_hours = retention_hours
        # One bounded deque of PerformanceMetric per metric name.
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=10000))
        self.counters: Dict[str, int] = defaultdict(int)
        self.gauges: Dict[str, float] = defaultdict(float)

        logger.info(f"Metrics Collector initialized (retention: {retention_hours}h)")

    def record_metric(
        self,
        metric_name: str,
        value: float,
        unit: str = "count",
        tags: Optional[Dict[str, str]] = None
    ):
        """Append one time-stamped sample to the named series, then prune old data."""
        sample = PerformanceMetric(
            metric_name=metric_name,
            value=value,
            unit=unit,
            timestamp=datetime.utcnow().isoformat(),
            tags=tags or {},
        )
        self.metrics[metric_name].append(sample)
        self._cleanup_old_metrics()

    def increment_counter(self, counter_name: str, value: int = 1):
        """Add *value* (default 1) to the named counter."""
        self.counters[counter_name] += value

    def set_gauge(self, gauge_name: str, value: float):
        """Overwrite the named gauge with the latest observed value."""
        self.gauges[gauge_name] = value

    def get_metrics(
        self,
        metric_name: str,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None
    ) -> List[PerformanceMetric]:
        """Return samples for *metric_name*, optionally bounded to [start_time, end_time]."""
        samples = list(self.metrics.get(metric_name, []))
        if start_time is None and end_time is None:
            return samples

        def in_window(sample) -> bool:
            ts = datetime.fromisoformat(sample.timestamp)
            if start_time is not None and ts < start_time:
                return False
            if end_time is not None and ts > end_time:
                return False
            return True

        return [s for s in samples if in_window(s)]

    def get_statistics(
        self,
        metric_name: str,
        window_minutes: int = 60
    ) -> Dict[str, float]:
        """Summary stats (count/mean/min/max/p50/p95/p99) over the last *window_minutes*."""
        cutoff = datetime.utcnow() - timedelta(minutes=window_minutes)
        values = sorted(
            m.value
            for m in self.metrics.get(metric_name, [])
            if datetime.fromisoformat(m.timestamp) > cutoff
        )

        if not values:
            return {"count": 0, "mean": 0.0, "min": 0.0, "max": 0.0,
                    "p50": 0.0, "p95": 0.0, "p99": 0.0}

        n = len(values)
        return {
            "count": n,
            "mean": sum(values) / n,
            "min": values[0],
            "max": values[-1],
            # Nearest-rank style indexing; every index collapses to 0 when n == 1.
            "p50": values[int(n * 0.50)],
            "p95": values[int(n * 0.95)] if n > 1 else values[0],
            "p99": values[int(n * 0.99)] if n > 1 else values[0],
        }

    def _cleanup_old_metrics(self):
        """Drop samples older than the retention horizon from every series."""
        cutoff = datetime.utcnow() - timedelta(hours=self.retention_hours)
        for series in self.metrics.values():
            # Deques are time-ordered, so old samples sit at the left end.
            while series and datetime.fromisoformat(series[0].timestamp) < cutoff:
                series.popleft()

    def get_counter(self, counter_name: str, default: int = 0) -> int:
        """Current value of one counter (without creating it)."""
        return self.counters.get(counter_name, default)

    def get_all_counters(self) -> Dict[str, int]:
        """Snapshot of every counter as a plain dict."""
        return dict(self.counters)

    def get_all_gauges(self) -> Dict[str, float]:
        """Snapshot of every gauge as a plain dict."""
        return dict(self.gauges)
207
+
208
+
209
class ErrorMonitor:
    """Monitors error rates and triggers alerts.

    Keeps bounded rolling logs of error and success events and computes
    the error rate over a sliding time window, overall or per stage.
    """

    def __init__(
        self,
        error_threshold: float = 0.05,  # 5% error rate
        window_minutes: int = 15
    ):
        self.error_threshold = error_threshold
        self.window_minutes = window_minutes
        self.errors: deque = deque(maxlen=10000)
        self.success_count: deque = deque(maxlen=10000)
        self.error_categories: Dict[str, int] = defaultdict(int)

        logger.info(f"Error Monitor initialized (threshold: {error_threshold*100}%, window: {window_minutes}m)")

    def record_error(
        self,
        error_type: str,
        error_message: str,
        stage: str,
        details: Optional[Dict[str, Any]] = None
    ):
        """Log one failed operation and bump its all-time stage:type tally."""
        self.errors.append({
            "error_type": error_type,
            "error_message": error_message,
            "stage": stage,
            "timestamp": datetime.utcnow().isoformat(),
            "details": details or {},
        })
        self.error_categories[f"{stage}:{error_type}"] += 1

        logger.warning(f"Error recorded: {stage} - {error_type}: {error_message}")

    def record_success(self, stage: str):
        """Log one successful operation for *stage*."""
        self.success_count.append(
            {"stage": stage, "timestamp": datetime.utcnow().isoformat()}
        )

    def _window_cutoff(self) -> datetime:
        """Start of the current sliding window."""
        return datetime.utcnow() - timedelta(minutes=self.window_minutes)

    def get_error_rate(self, stage: Optional[str] = None) -> float:
        """Fraction of windowed operations that failed (0.0 when there were none)."""
        cutoff = self._window_cutoff()

        def recent(events):
            return [e for e in events if datetime.fromisoformat(e["timestamp"]) > cutoff]

        failures = recent(self.errors)
        successes = recent(self.success_count)

        if stage:
            failures = [e for e in failures if e["stage"] == stage]
            successes = [s for s in successes if s["stage"] == stage]

        attempts = len(failures) + len(successes)
        return len(failures) / attempts if attempts else 0.0

    def check_threshold_exceeded(self, stage: Optional[str] = None) -> bool:
        """True when the windowed error rate is above the configured threshold."""
        return self.get_error_rate(stage) > self.error_threshold

    def get_error_summary(self) -> Dict[str, Any]:
        """Aggregate windowed error counts, broken down by type and by stage."""
        cutoff = self._window_cutoff()
        recent = [
            e for e in self.errors
            if datetime.fromisoformat(e["timestamp"]) > cutoff
        ]

        by_type: Dict[str, int] = defaultdict(int)
        by_stage: Dict[str, int] = defaultdict(int)
        for event in recent:
            by_type[event["error_type"]] += 1
            by_stage[event["stage"]] += 1

        return {
            "total_errors": len(recent),
            "error_rate": self.get_error_rate(),
            "threshold_exceeded": self.check_threshold_exceeded(),
            "by_category": dict(by_type),
            "by_stage": dict(by_stage),
            "window_minutes": self.window_minutes
        }
312
+
313
+
314
class LatencyTracker:
    """Tracks latency across pipeline stages.

    A trace accumulates per-stage start times and durations; completing a
    trace archives it in a bounded history used for stage statistics.
    """

    def __init__(self):
        self.active_traces: Dict[str, Dict[str, float]] = {}
        self.completed_traces: deque = deque(maxlen=1000)

        logger.info("Latency Tracker initialized")

    def start_trace(self, trace_id: str, stage: str):
        """Record the wall-clock start of *stage* within *trace_id*."""
        trace = self.active_traces.setdefault(trace_id, {})
        trace[f"{stage}_start"] = time.time()

    def end_trace(self, trace_id: str, stage: str) -> float:
        """Close *stage* for *trace_id* and return its duration in seconds.

        Returns 0.0 (after logging a warning) when the trace or the stage
        start time is unknown.
        """
        trace = self.active_traces.get(trace_id)
        if trace is None:
            logger.warning(f"Trace {trace_id} not found")
            return 0.0

        started = trace.get(f"{stage}_start")
        if started is None:
            logger.warning(f"Start time for {stage} not found in trace {trace_id}")
            return 0.0

        elapsed = time.time() - started
        trace[f"{stage}_duration"] = elapsed
        return elapsed

    def complete_trace(self, trace_id: str) -> Dict[str, float]:
        """Finalize *trace_id*: archive it and return its per-stage durations.

        Unknown trace ids yield an empty dict.
        """
        trace = self.active_traces.pop(trace_id, None)
        if trace is None:
            return {}

        # Keep only the finished "<stage>_duration" entries, keyed by stage.
        durations = {
            name.replace("_duration", ""): seconds
            for name, seconds in trace.items()
            if name.endswith("_duration")
        }

        self.completed_traces.append({
            "trace_id": trace_id,
            "timestamp": datetime.utcnow().isoformat(),
            "total_duration": sum(durations.values()),
            "stages": durations,
        })

        return durations

    def get_stage_statistics(
        self,
        stage: str,
        window_minutes: int = 60
    ) -> Dict[str, float]:
        """Latency stats (count/mean/min/max/percentiles) for *stage* over the window."""
        cutoff = datetime.utcnow() - timedelta(minutes=window_minutes)
        samples = sorted(
            trace["stages"][stage]
            for trace in self.completed_traces
            if datetime.fromisoformat(trace["timestamp"]) >= cutoff
            and stage in trace["stages"]
        )

        if not samples:
            return {"count": 0, "mean": 0.0, "min": 0.0, "max": 0.0,
                    "p50": 0.0, "p95": 0.0, "p99": 0.0}

        n = len(samples)
        return {
            "count": n,
            "mean": sum(samples) / n,
            "min": samples[0],
            "max": samples[-1],
            "p50": samples[int(n * 0.50)],
            "p95": samples[int(n * 0.95)] if n > 1 else samples[0],
            "p99": samples[int(n * 0.99)] if n > 1 else samples[0],
        }
416
+
417
+
418
@dataclass
class CacheEntry:
    """A cached value plus bookkeeping metadata (timestamps, size, TTL)."""
    key: str
    value: Any
    created_at: float
    accessed_at: float
    access_count: int
    size_bytes: int
    ttl: Optional[int] = None  # Time to live in seconds

    def is_expired(self) -> bool:
        """True once more than `ttl` seconds have elapsed since creation;
        entries without a TTL never expire."""
        return self.ttl is not None and (time.time() - self.created_at) > self.ttl

    def to_dict(self) -> Dict[str, Any]:
        """Metadata view of the entry (the cached value itself is omitted)."""
        def as_iso(epoch: float) -> str:
            return datetime.fromtimestamp(epoch).isoformat()

        return {
            "key": self.key,
            "created_at": as_iso(self.created_at),
            "accessed_at": as_iso(self.accessed_at),
            "access_count": self.access_count,
            "size_bytes": self.size_bytes,
            "ttl": self.ttl,
            "expired": self.is_expired()
        }
445
+
446
+
447
class CacheService:
    """
    SHA256-based caching service for deduplication and performance optimization

    Features:
    - SHA256 fingerprinting for input deduplication
    - LRU eviction policy
    - TTL support for automatic expiration
    - Cache hit/miss tracking
    - Memory usage monitoring
    - Performance metrics
    """

    def __init__(
        self,
        max_entries: int = 10000,
        max_memory_mb: int = 512,
        default_ttl: Optional[int] = 3600  # 1 hour default
    ):
        """Create an empty cache bounded by entry count and estimated memory."""
        self.max_entries = max_entries
        self.max_memory_mb = max_memory_mb
        self.default_ttl = default_ttl

        self.cache: Dict[str, CacheEntry] = {}
        # LRU tracking: keys in access order (oldest at the left). May hold
        # stale keys after invalidate()/lazy expiry; _evict_lru skips those.
        self.access_order: deque = deque()

        # Metrics
        self.hits = 0
        self.misses = 0
        self.evictions = 0
        self.total_retrieval_time = 0.0
        self.retrieval_count = 0

        logger.info(f"Cache Service initialized (max_entries: {max_entries}, max_memory: {max_memory_mb}MB)")

    def _compute_fingerprint(self, data: Any) -> str:
        """
        Compute SHA256 fingerprint for any data

        Args:
            data: Any serializable data (dict, str, bytes, etc.)

        Returns:
            SHA256 hash as hex string
        """
        if isinstance(data, bytes):
            data_bytes = data
        elif isinstance(data, str):
            data_bytes = data.encode('utf-8')
        elif isinstance(data, (dict, list)):
            # Sorted-key JSON so logically-equal dicts hash identically
            # regardless of insertion order.
            json_str = json.dumps(data, sort_keys=True)
            data_bytes = json_str.encode('utf-8')
        else:
            # Use pickle for other types.
            # NOTE(review): pickle output is not guaranteed stable across
            # Python versions/processes, so fingerprints for non-JSON types
            # may vary between runs — confirm this is acceptable for dedup.
            data_bytes = pickle.dumps(data)

        return hashlib.sha256(data_bytes).hexdigest()

    def _estimate_size(self, obj: Any) -> int:
        """Best-effort size estimate in bytes (pickle length, with fallbacks)."""
        try:
            return len(pickle.dumps(obj))
        except Exception:
            # Fallback estimation for unpicklable objects
            if isinstance(obj, (str, bytes)):
                return len(obj)
            elif isinstance(obj, dict):
                return sum(len(str(k)) + len(str(v)) for k, v in obj.items())
            elif isinstance(obj, list):
                return sum(len(str(item)) for item in obj)
            else:
                return 1024  # Default 1KB estimate

    def _get_memory_usage_mb(self) -> float:
        """Sum of the stored entries' estimated sizes, in MB."""
        total_bytes = sum(entry.size_bytes for entry in self.cache.values())
        return total_bytes / (1024 * 1024)

    def _evict_lru(self):
        """Evict the least recently used entry, skipping stale order keys."""
        while self.access_order:
            lru_key = self.access_order.popleft()
            if lru_key in self.cache:
                del self.cache[lru_key]
                self.evictions += 1
                logger.debug(f"Evicted LRU cache entry: {lru_key[:16]}...")
                break

    def _cleanup_expired(self):
        """Remove all entries whose TTL has elapsed."""
        expired_keys = [
            key for key, entry in self.cache.items()
            if entry.is_expired()
        ]

        for key in expired_keys:
            del self.cache[key]
            logger.debug(f"Removed expired cache entry: {key[:16]}...")

    def _ensure_capacity(self, new_entry_size: int):
        """Evict LRU entries until both the entry-count and memory limits
        can accommodate an entry of *new_entry_size* bytes."""
        while len(self.cache) >= self.max_entries:
            self._evict_lru()

        while self._get_memory_usage_mb() + (new_entry_size / 1024 / 1024) > self.max_memory_mb:
            if len(self.cache) == 0:
                # A single entry larger than the whole budget; store it anyway.
                break
            self._evict_lru()

    def _record_retrieval(self, start_time: float, hit: bool):
        """Update hit/miss counters and retrieval-latency accounting.

        Extracted helper: the original repeated this bookkeeping verbatim
        on all three exit paths of get().
        """
        if hit:
            self.hits += 1
        else:
            self.misses += 1
        self.total_retrieval_time += time.time() - start_time
        self.retrieval_count += 1

    def get(self, key: str) -> Optional[Any]:
        """
        Retrieve value from cache by key

        Args:
            key: Cache key (typically SHA256 fingerprint)

        Returns:
            Cached value if found and not expired, None otherwise
        """
        start_time = time.time()

        # Opportunistic TTL sweep every 100 retrievals (incl. the first).
        if self.retrieval_count % 100 == 0:
            self._cleanup_expired()

        entry = self.cache.get(key)
        if entry is None:
            self._record_retrieval(start_time, hit=False)
            return None

        if entry.is_expired():
            # Lazily drop the expired entry and report a miss.
            del self.cache[key]
            self._record_retrieval(start_time, hit=False)
            return None

        # Update access metadata
        entry.accessed_at = time.time()
        entry.access_count += 1

        # Move the key to the most-recently-used end.
        # NOTE: deque.remove() is O(n); acceptable at max_entries=10000.
        if key in self.access_order:
            self.access_order.remove(key)
        self.access_order.append(key)

        self._record_retrieval(start_time, hit=True)

        logger.debug(f"Cache hit: {key[:16]}... (access_count: {entry.access_count})")

        return entry.value

    def set(self, key: str, value: Any, ttl: Optional[int] = None):
        """
        Store value in cache with key

        Args:
            key: Cache key (typically SHA256 fingerprint)
            value: Value to cache
            ttl: Time to live in seconds (None for default, 0 for no expiration)
        """
        size_bytes = self._estimate_size(value)

        # Resolve TTL: None -> service default; 0 -> never expires.
        if ttl is None:
            ttl = self.default_ttl
        elif ttl == 0:
            ttl = None  # No expiration

        # Make room before inserting
        self._ensure_capacity(size_bytes)

        current_time = time.time()
        entry = CacheEntry(
            key=key,
            value=value,
            created_at=current_time,
            accessed_at=current_time,
            access_count=0,
            size_bytes=size_bytes,
            ttl=ttl
        )

        self.cache[key] = entry
        self.access_order.append(key)

        logger.debug(f"Cached entry: {key[:16]}... (size: {size_bytes} bytes, ttl: {ttl}s)")

    def get_or_compute(
        self,
        data: Any,
        compute_fn: callable,
        ttl: Optional[int] = None
    ) -> Tuple[Any, bool]:
        """
        Get cached value or compute and cache it

        Args:
            data: Input data to fingerprint
            compute_fn: Function to compute value if not cached
            ttl: Time to live for cached result

        Returns:
            Tuple of (result, was_cached)

        NOTE: a computed result of None is stored but can never be observed
        as a hit (get() signals "miss" with None), so None results are
        recomputed on every call.
        """
        fingerprint = self._compute_fingerprint(data)

        cached_value = self.get(fingerprint)
        if cached_value is not None:
            return cached_value, True

        result = compute_fn()
        self.set(fingerprint, result, ttl)

        return result, False

    def invalidate(self, key: str) -> bool:
        """
        Invalidate (remove) a cache entry

        Args:
            key: Cache key to invalidate

        Returns:
            True if entry was removed, False if not found
        """
        if key in self.cache:
            del self.cache[key]
            if key in self.access_order:
                self.access_order.remove(key)
            logger.debug(f"Invalidated cache entry: {key[:16]}...")
            return True
        return False

    def invalidate_by_fingerprint(self, data: Any) -> bool:
        """
        Invalidate cache entry by computing fingerprint of data

        Args:
            data: Data to fingerprint and invalidate

        Returns:
            True if entry was removed, False if not found
        """
        return self.invalidate(self._compute_fingerprint(data))

    def clear(self):
        """Clear all cache entries"""
        self.cache.clear()
        self.access_order.clear()
        logger.info("Cache cleared")

    def get_statistics(self) -> Dict[str, Any]:
        """Get cache performance statistics"""
        total_requests = self.hits + self.misses
        hit_rate = self.hits / total_requests if total_requests > 0 else 0.0
        avg_retrieval_time = (
            self.total_retrieval_time / self.retrieval_count
            if self.retrieval_count > 0 else 0.0
        )

        return {
            "total_entries": len(self.cache),
            "hits": self.hits,
            "misses": self.misses,
            "hit_rate": hit_rate,
            "evictions": self.evictions,
            "memory_usage_mb": self._get_memory_usage_mb(),
            "max_memory_mb": self.max_memory_mb,
            "avg_retrieval_time_ms": avg_retrieval_time * 1000,
            "cache_efficiency": hit_rate * 100  # Percentage
        }

    def get_entry_info(self, key: str) -> Optional[Dict[str, Any]]:
        """Get information about a specific cache entry (None if absent)."""
        if key not in self.cache:
            return None
        return self.cache[key].to_dict()

    def list_entries(self, limit: int = 100) -> List[Dict[str, Any]]:
        """List cache entries with metadata, most recently accessed first."""
        entries = sorted(
            self.cache.values(),
            key=lambda e: e.accessed_at,
            reverse=True
        )[:limit]
        return [entry.to_dict() for entry in entries]
758
+
759
+
760
class AlertManager:
    """
    Manages alerts and notifications
    Handles alert lifecycle and delivery

    Handlers registered via add_handler() may be plain callables or
    coroutine functions; every created alert is delivered to all of them.
    """

    def __init__(self):
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: deque = deque(maxlen=1000)
        self.alert_handlers: List[callable] = []

        logger.info("Alert Manager initialized")

    def create_alert(
        self,
        level: AlertLevel,
        message: str,
        category: str,
        details: Optional[Dict[str, Any]] = None
    ) -> Alert:
        """Create a new alert, register it as active, and dispatch handlers.

        Args:
            level: Alert severity.
            message: Human-readable description.
            category: Grouping key (e.g. "error_rate").
            details: Optional structured context stored with the alert.

        Returns:
            The created Alert (also kept in active_alerts and alert_history).
        """
        # 16-hex-char id derived from category/message/creation time.
        alert_id = hashlib.sha256(
            f"{category}:{message}:{datetime.utcnow().isoformat()}".encode()
        ).hexdigest()[:16]

        alert = Alert(
            alert_id=alert_id,
            level=level,
            message=message,
            category=category,
            timestamp=datetime.utcnow().isoformat(),
            details=details or {}
        )

        self.active_alerts[alert_id] = alert
        self.alert_history.append(alert)

        # Dispatch handlers. BUG FIX: the original unconditionally called
        # asyncio.create_task(), which raises RuntimeError whenever no event
        # loop is running (i.e. any synchronous caller). Schedule on the
        # running loop when there is one, otherwise deliver inline.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            self._trigger_handlers_sync(alert)
        else:
            asyncio.create_task(self._trigger_handlers(alert))

        logger.warning(f"Alert created: [{level.value}] {category} - {message}")

        return alert

    def _trigger_handlers_sync(self, alert: Alert):
        """Invoke handlers from a synchronous context (no running loop)."""
        for handler in self.alert_handlers:
            try:
                if asyncio.iscoroutinefunction(handler):
                    # Run the coroutine handler to completion on a fresh loop.
                    asyncio.run(handler(alert))
                else:
                    handler(alert)
            except Exception as e:
                logger.error(f"Alert handler failed: {str(e)}")

    def resolve_alert(self, alert_id: str):
        """Mark an active alert as resolved and drop it from the active set.

        The alert object remains in alert_history (now flagged resolved);
        unknown ids are silently ignored.
        """
        if alert_id in self.active_alerts:
            alert = self.active_alerts.pop(alert_id)
            alert.resolved = True
            alert.resolved_at = datetime.utcnow().isoformat()

            logger.info(f"Alert resolved: {alert_id}")

    def add_handler(self, handler: callable):
        """Register a callback (sync or async) invoked for each new alert."""
        self.alert_handlers.append(handler)

    async def _trigger_handlers(self, alert: Alert):
        """Trigger all registered alert handlers (async context)."""
        for handler in self.alert_handlers:
            try:
                if asyncio.iscoroutinefunction(handler):
                    await handler(alert)
                else:
                    handler(alert)
            except Exception as e:
                logger.error(f"Alert handler failed: {str(e)}")

    def get_active_alerts(
        self,
        level: Optional[AlertLevel] = None,
        category: Optional[str] = None
    ) -> List[Alert]:
        """Active (unresolved) alerts, optionally filtered by level/category."""
        alerts = list(self.active_alerts.values())

        if level:
            alerts = [a for a in alerts if a.level == level]

        if category:
            alerts = [a for a in alerts if a.category == category]

        return alerts

    def get_alert_summary(self) -> Dict[str, Any]:
        """Counts of active alerts, overall and broken down by level/category."""
        active = list(self.active_alerts.values())

        by_level = defaultdict(int)
        by_category = defaultdict(int)

        for alert in active:
            by_level[alert.level.value] += 1
            by_category[alert.category] += 1

        return {
            "total_active": len(active),
            "by_level": dict(by_level),
            "by_category": dict(by_category),
            "critical_count": by_level[AlertLevel.CRITICAL.value],
            "error_count": by_level[AlertLevel.ERROR.value]
        }
862
+
863
+
864
+ class MonitoringService:
865
+ """
866
+ Central monitoring service coordinating all monitoring components
867
+ Provides unified interface for system monitoring and health checks
868
+ """
869
+
870
    def __init__(
        self,
        error_threshold: float = 0.05,
        window_minutes: int = 15
    ):
        """Wire up all monitoring sub-components.

        Args:
            error_threshold: Error-rate fraction (0-1) above which the
                error monitor reports its threshold as exceeded.
            window_minutes: Sliding-window length for error-rate calculation.
        """
        self.metrics_collector = MetricsCollector()
        self.error_monitor = ErrorMonitor(error_threshold, window_minutes)
        self.latency_tracker = LatencyTracker()
        self.alert_manager = AlertManager()
        self.cache_service = CacheService(
            max_entries=10000,
            max_memory_mb=512,
            default_ttl=3600  # 1 hour default
        )

        self.system_status = SystemStatus.OPERATIONAL
        # Baseline for the uptime reported by get_system_health().
        self.start_time = datetime.utcnow()

        # Setup automatic monitoring (skip background tasks for now)
        # self._setup_automatic_checks()

        logger.info("Monitoring Service initialized")
892
+
893
    def _setup_automatic_checks(self):
        """Setup automatic health checks and alerts.

        NOTE(review): asyncio.create_task() requires an already-running
        event loop; calling this from synchronous startup code raises
        RuntimeError — presumably why the call site in __init__ is
        commented out. Confirm a loop is running before re-enabling.
        """
        async def check_error_rate():
            """Periodically check error rate and create alerts"""
            while True:
                try:
                    error_summary = self.error_monitor.get_error_summary()

                    # Raise an error-level alert whenever the windowed
                    # error rate is above the configured threshold.
                    if error_summary["threshold_exceeded"]:
                        self.alert_manager.create_alert(
                            level=AlertLevel.ERROR,
                            message=f"Error rate ({error_summary['error_rate']*100:.1f}%) exceeds threshold",
                            category="error_rate",
                            details=error_summary
                        )

                    await asyncio.sleep(60)  # Check every minute
                except Exception as e:
                    logger.error(f"Error rate check failed: {str(e)}")
                    await asyncio.sleep(60)

        # Start background task
        asyncio.create_task(check_error_rate())
916
+
917
    def record_processing_stage(
        self,
        trace_id: str,
        stage: str,
        success: bool,
        duration: Optional[float] = None,
        error_details: Optional[Dict[str, Any]] = None
    ):
        """Record the outcome of one pipeline stage.

        Feeds the error monitor (success/error event), optionally records a
        latency sample, and bumps per-stage counters.

        Args:
            trace_id: Request/trace identifier (not used in this method's
                body; per-stage timing is handled by LatencyTracker).
            stage: Pipeline stage name, e.g. "classification".
            success: Whether the stage completed without error.
            duration: Stage duration in seconds, if measured.
            error_details: Optional mapping with "error_type" and "message"
                keys describing the failure.
        """
        # Record success/error
        if success:
            self.error_monitor.record_success(stage)
        else:
            # Missing details degrade gracefully to "unknown"/"No details".
            error_type = error_details.get("error_type", "unknown") if error_details else "unknown"
            error_message = error_details.get("message", "No details") if error_details else "No details"
            self.error_monitor.record_error(error_type, error_message, stage, error_details)

        # Record latency
        if duration is not None:
            self.metrics_collector.record_metric(
                f"latency_{stage}",
                duration,
                unit="seconds",
                tags={"stage": stage, "success": str(success)}
            )

        # Increment counters
        self.metrics_collector.increment_counter(f"stage_{stage}_total")
        if success:
            self.metrics_collector.increment_counter(f"stage_{stage}_success")
        else:
            self.metrics_collector.increment_counter(f"stage_{stage}_error")
949
+
950
    def get_system_health(self) -> Dict[str, Any]:
        """Get comprehensive system health status.

        Derives the overall status from active alerts and the windowed
        error rate, updates ``self.system_status`` as a side effect, and
        returns a JSON-serializable summary.
        """
        error_summary = self.error_monitor.get_error_summary()
        alert_summary = self.alert_manager.get_alert_summary()

        # Determine system status: any critical alert wins; otherwise an
        # exceeded error threshold or more than 5 error-level alerts
        # degrades the system.
        if alert_summary["critical_count"] > 0:
            status = SystemStatus.CRITICAL
        elif error_summary["threshold_exceeded"] or alert_summary["error_count"] > 5:
            status = SystemStatus.DEGRADED
        else:
            status = SystemStatus.OPERATIONAL

        self.system_status = status

        uptime = (datetime.utcnow() - self.start_time).total_seconds()

        return {
            "status": status.value,
            "uptime_seconds": uptime,
            "timestamp": datetime.utcnow().isoformat(),
            "error_rate": error_summary["error_rate"],
            "error_threshold": self.error_monitor.error_threshold,
            "active_alerts": alert_summary["total_active"],
            "critical_alerts": alert_summary["critical_count"],
            "total_requests": self.metrics_collector.get_counter("total_requests", 0),
            "counters": self.metrics_collector.get_all_counters(),
            "gauges": self.metrics_collector.get_all_gauges()
        }
979
+
980
+ def get_performance_dashboard(self) -> Dict[str, Any]:
981
+ """Get performance metrics for dashboard display"""
982
+ # Define key stages
983
+ stages = ["pdf_processing", "classification", "model_routing", "synthesis"]
984
+
985
+ stage_stats = {}
986
+ for stage in stages:
987
+ stage_stats[stage] = self.latency_tracker.get_stage_statistics(stage)
988
+
989
+ return {
990
+ "system_health": self.get_system_health(),
991
+ "error_summary": self.error_monitor.get_error_summary(),
992
+ "latency_by_stage": stage_stats,
993
+ "active_alerts": [a.to_dict() for a in self.alert_manager.get_active_alerts()],
994
+ "timestamp": datetime.utcnow().isoformat()
995
+ }
996
+
997
    def start_monitoring(self):
        """Start monitoring services (placeholder for initialization)"""
        # No background workers are launched here yet; this only logs and
        # flips the status flag so health checks report OPERATIONAL.
        logger.info("Monitoring services started")
        self.system_status = SystemStatus.OPERATIONAL
1001
+
1002
+ def track_request(self, endpoint: str, latency_ms: float, status_code: int):
1003
+ """Track incoming request for monitoring"""
1004
+ # Record latency metric
1005
+ self.metrics_collector.record_metric(
1006
+ f"request_latency_{endpoint}",
1007
+ latency_ms,
1008
+ unit="milliseconds",
1009
+ tags={"endpoint": endpoint, "status_code": str(status_code)}
1010
+ )
1011
+
1012
+ # Increment request counter
1013
+ self.metrics_collector.increment_counter("total_requests")
1014
+ self.metrics_collector.increment_counter(f"requests_{endpoint}")
1015
+
1016
+ # Track status code
1017
+ if status_code >= 500:
1018
+ self.metrics_collector.increment_counter("server_errors")
1019
+ elif status_code >= 400:
1020
+ self.metrics_collector.increment_counter("client_errors")
1021
+ else:
1022
+ self.metrics_collector.increment_counter("successful_requests")
1023
+
1024
+ def track_error(self, endpoint: str, error_type: str, error_message: str):
1025
+ """Track error occurrence"""
1026
+ self.error_monitor.record_error(
1027
+ error_type=error_type,
1028
+ message=error_message,
1029
+ component=endpoint,
1030
+ details={"endpoint": endpoint}
1031
+ )
1032
+
1033
+ # Increment error counter
1034
+ self.metrics_collector.increment_counter("total_errors")
1035
+ self.metrics_collector.increment_counter(f"errors_{error_type}")
1036
+
1037
+ def get_cache_statistics(self) -> Dict[str, Any]:
1038
+ """Get cache performance statistics from real cache service"""
1039
+ return self.cache_service.get_statistics()
1040
+
1041
+ def cache_result(self, data: Any, result: Any, ttl: Optional[int] = None):
1042
+ """
1043
+ Cache a computation result with SHA256 fingerprint
1044
+
1045
+ Args:
1046
+ data: Input data to fingerprint
1047
+ result: Result to cache
1048
+ ttl: Time to live in seconds
1049
+ """
1050
+ fingerprint = self.cache_service._compute_fingerprint(data)
1051
+ self.cache_service.set(fingerprint, result, ttl)
1052
+ logger.debug(f"Cached result for fingerprint: {fingerprint[:16]}...")
1053
+
1054
+ def get_cached_result(self, data: Any) -> Optional[Any]:
1055
+ """
1056
+ Retrieve cached result by computing fingerprint
1057
+
1058
+ Args:
1059
+ data: Input data to fingerprint
1060
+
1061
+ Returns:
1062
+ Cached result if found, None otherwise
1063
+ """
1064
+ fingerprint = self.cache_service._compute_fingerprint(data)
1065
+ return self.cache_service.get(fingerprint)
1066
+
1067
    def get_or_compute_cached(
        self,
        data: Any,
        compute_fn: callable,
        ttl: Optional[int] = None
    ) -> Tuple[Any, bool]:
        """
        Get cached result or compute and cache it

        Thin delegation to the cache service, which handles fingerprinting,
        lookup, and store-on-miss.

        Args:
            data: Input data to fingerprint
            compute_fn: Function to compute result if not cached
                (presumably zero-argument — confirm against
                cache_service.get_or_compute)
            ttl: Time to live for cached result

        Returns:
            Tuple of (result, was_cached)
        """
        # NOTE(review): annotation uses the builtin `callable` rather than
        # typing.Callable; works at runtime but is not a valid PEP 484 type.
        return self.cache_service.get_or_compute(data, compute_fn, ttl)
1085
+
1086
+ def get_recent_alerts(self, limit: int = 10) -> List[Dict[str, Any]]:
1087
+ """Get recent alerts"""
1088
+ alerts = self.alert_manager.get_active_alerts()
1089
+ recent = sorted(alerts, key=lambda a: a.timestamp, reverse=True)[:limit]
1090
+ return [a.to_dict() for a in recent]
1091
+
1092
+
1093
# Global monitoring service instance.
# Lazily created by get_monitoring_service(); do not instantiate directly.
_monitoring_service = None
1095
+
1096
+
1097
def get_monitoring_service() -> MonitoringService:
    """Return the process-wide MonitoringService singleton, creating it lazily."""
    global _monitoring_service
    # NOTE(review): not thread-safe; fine for single-threaded startup —
    # confirm if first access can happen concurrently.
    if _monitoring_service is not None:
        return _monitoring_service
    _monitoring_service = MonitoringService()
    return _monitoring_service