petter2025 commited on
Commit
c9a830d
·
verified ·
1 Parent(s): 11c010f

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -2028
app.py DELETED
@@ -1,2028 +0,0 @@
1
- """
2
- Enterprise Agentic Reliability Framework - Main Application (FIXED VERSION)
3
- Multi-Agent AI System for Production Reliability Monitoring
4
-
5
- CRITICAL FIXES APPLIED:
6
- - Removed event loop creation (uses Gradio native async)
7
- - Fixed FAISS thread safety with single-writer pattern
8
- - ProcessPoolExecutor for CPU-intensive encoding
9
- - Atomic saves with fsync
10
- - Dependency injection
11
- - Rate limiting
12
- - Comprehensive input validation
13
- - Circuit breakers for agent resilience
14
- """
15
-
16
- import os
17
- import json
18
- import numpy as np
19
- import gradio as gr
20
- import requests
21
- import pandas as pd
22
- import datetime
23
- import threading
24
- import logging
25
- import asyncio
26
- import tempfile
27
- from typing import List, Dict, Any, Optional, Tuple
28
- from collections import deque, OrderedDict
29
- from dataclasses import dataclass, asdict
30
- from enum import Enum
31
- from concurrent.futures import ProcessPoolExecutor
32
- from queue import Queue
33
- from circuitbreaker import circuit
34
- import atomicwrites
35
-
36
- # Import our modules
37
- from models import (
38
- ReliabilityEvent, EventSeverity, AnomalyResult,
39
- HealingAction, ForecastResult, PolicyCondition
40
- )
41
- from claude_adapter import get_claude_adapter
42
- from healing_policies import PolicyEngine, DEFAULT_HEALING_POLICIES
43
- from voice_narrator import VoiceHandler
44
-
45
- # === Logging Configuration ===
46
- logging.basicConfig(
47
- level=logging.INFO,
48
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
49
- )
50
- logger = logging.getLogger(__name__)
51
- # Initialize Claude adapter for AI reasoning
52
- claude_adapter = get_claude_adapter()
53
-
54
-
55
- # === CONSTANTS (FIXED: Extracted all magic numbers) ===
56
class Constants:
    """Single home for every tunable threshold and limit in the framework.

    Keeping these in one namespace avoids magic numbers scattered through
    the detection, forecasting and persistence code paths.
    """

    # --- Latency thresholds (milliseconds, p99) ---
    LATENCY_WARNING = 150.0
    LATENCY_CRITICAL = 300.0
    LATENCY_EXTREME = 500.0

    # --- Error-rate thresholds (fraction of requests, 0-1) ---
    ERROR_RATE_WARNING = 0.05
    ERROR_RATE_HIGH = 0.15
    ERROR_RATE_CRITICAL = 0.3

    # --- Resource-utilization thresholds (fraction, 0-1) ---
    CPU_WARNING = 0.8
    CPU_CRITICAL = 0.9
    MEMORY_WARNING = 0.8
    MEMORY_CRITICAL = 0.9

    # --- Trend / forecasting knobs ---
    SLOPE_THRESHOLD_INCREASING = 5.0
    SLOPE_THRESHOLD_DECREASING = -2.0
    FORECAST_MIN_DATA_POINTS = 5
    FORECAST_LOOKAHEAD_MINUTES = 15

    # --- Performance / storage limits ---
    HISTORY_WINDOW = 50
    MAX_EVENTS_STORED = 1000
    AGENT_TIMEOUT_SECONDS = 5
    CACHE_EXPIRY_MINUTES = 15

    # --- FAISS persistence ---
    FAISS_BATCH_SIZE = 10
    FAISS_SAVE_INTERVAL_SECONDS = 30
    VECTOR_DIM = 384

    # --- Business-impact baselines ---
    BASE_REVENUE_PER_MINUTE = 100.0
    BASE_USERS = 1000

    # --- Rate limiting ---
    MAX_REQUESTS_PER_MINUTE = 60
    MAX_REQUESTS_PER_HOUR = 500
99
-
100
-
101
- # === Configuration ===
102
class Config:
    """Environment-driven configuration for the reliability framework.

    All values come from environment variables with sensible defaults,
    so the same code runs locally and in a deployed Space.
    """

    # Hugging Face inference credentials / endpoint.
    HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
    HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"

    # Persistence locations for the FAISS index and its incident texts.
    INDEX_FILE: str = os.getenv("INDEX_FILE", "data/incident_vectors.index")
    TEXTS_FILE: str = os.getenv("TEXTS_FILE", "data/incident_texts.json")
    DATA_DIR: str = os.getenv("DATA_DIR", "data")

    # Side effect at class-definition time: ensure the data directory
    # exists before anything tries to write into it.
    os.makedirs(DATA_DIR, exist_ok=True)
113
-
114
-
115
# Module-level configuration singleton and the default HTTP headers for
# Hugging Face API calls (empty dict when no token is configured).
config = Config()
if config.HF_TOKEN:
    HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"}
else:
    HEADERS = {}
117
-
118
-
119
- # === Input Validation (FIXED: Comprehensive validation) ===
120
def validate_component_id(component_id: str) -> Tuple[bool, str]:
    """Check that *component_id* is a well-formed component identifier.

    Valid IDs are 1-255 characters drawn from lowercase letters, digits
    and hyphens.

    Returns:
        ``(True, "")`` when valid, otherwise ``(False, reason)``.
    """
    if not isinstance(component_id, str):
        return False, "Component ID must be a string"

    if len(component_id) < 1 or len(component_id) > 255:
        return False, "Component ID must be 1-255 characters"

    import re
    if re.match(r"^[a-z0-9-]+$", component_id) is None:
        return False, "Component ID must contain only lowercase letters, numbers, and hyphens"

    return True, ""
133
-
134
-
135
def validate_inputs(
    latency: Any,
    error_rate: Any,
    throughput: Any,
    cpu_util: Any,
    memory_util: Any
) -> Tuple[bool, str]:
    """Validate and range-check raw telemetry inputs.

    Arguments may arrive as strings from the UI; each is converted to
    float before range checks. ``throughput`` defaults to 1000.0 when not
    provided; ``cpu_util``/``memory_util`` are optional (None or the
    empty string means "not provided").

    Returns:
        ``(True, "")`` when everything is valid, otherwise
        ``(False, error_message)``.

    BUG FIX: optional fields are now tested with explicit ``is not None``
    / ``!= ""`` checks instead of plain truthiness, so a legitimate value
    of 0 (zero throughput, 0.0 CPU/memory utilization) is no longer
    silently replaced by the default or skipped during validation.
    """
    try:
        try:
            latency_f = float(latency)
        except (ValueError, TypeError):
            return False, "❌ Invalid latency: must be a number"

        try:
            error_rate_f = float(error_rate)
        except (ValueError, TypeError):
            return False, "❌ Invalid error rate: must be a number"

        # Only None / "" count as "absent"; 0 is a real measurement.
        try:
            if throughput is None or throughput == "":
                throughput_f = 1000.0
            else:
                throughput_f = float(throughput)
        except (ValueError, TypeError):
            return False, "❌ Invalid throughput: must be a number"

        cpu_util_f = None
        if cpu_util is not None and cpu_util != "":
            try:
                cpu_util_f = float(cpu_util)
            except (ValueError, TypeError):
                return False, "❌ Invalid CPU utilization: must be a number"

        memory_util_f = None
        if memory_util is not None and memory_util != "":
            try:
                memory_util_f = float(memory_util)
            except (ValueError, TypeError):
                return False, "❌ Invalid memory utilization: must be a number"

        # Range validation
        if not (0 <= latency_f <= 10000):
            return False, "❌ Invalid latency: must be between 0-10000ms"

        if not (0 <= error_rate_f <= 1):
            return False, "❌ Invalid error rate: must be between 0-1"

        if throughput_f < 0:
            return False, "❌ Invalid throughput: must be positive"

        if cpu_util_f is not None and not (0 <= cpu_util_f <= 1):
            return False, "❌ Invalid CPU utilization: must be between 0-1"

        if memory_util_f is not None and not (0 <= memory_util_f <= 1):
            return False, "❌ Invalid memory utilization: must be between 0-1"

        return True, ""

    except Exception as e:
        # Defensive catch-all so the UI always gets a message, never a trace.
        logger.error(f"Validation error: {e}", exc_info=True)
        return False, f"❌ Validation error: {str(e)}"
200
-
201
-
202
- # === Thread-Safe Data Structures ===
203
class ThreadSafeEventStore:
    """Bounded, lock-protected store of recent ReliabilityEvents."""

    def __init__(self, max_size: int = Constants.MAX_EVENTS_STORED):
        # deque(maxlen=...) silently evicts the oldest event when full.
        self._events = deque(maxlen=max_size)
        self._lock = threading.RLock()
        logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")

    def add(self, event: ReliabilityEvent) -> None:
        """Append a single event (thread-safe)."""
        with self._lock:
            self._events.append(event)
            logger.debug(f"Added event for {event.component}: {event.severity.value}")

    def get_recent(self, n: int = 15) -> List[ReliabilityEvent]:
        """Return up to the *n* newest events, oldest first."""
        with self._lock:
            if not self._events:
                return []
            return list(self._events)[-n:]

    def get_all(self) -> List[ReliabilityEvent]:
        """Return a snapshot copy of every stored event."""
        with self._lock:
            return list(self._events)

    def count(self) -> int:
        """Return how many events are currently stored."""
        with self._lock:
            return len(self._events)
231
-
232
-
233
- # === FAISS Integration (FIXED: Single-writer pattern for thread safety) ===
234
class ProductionFAISSIndex:
    """
    Production-safe FAISS index wrapper using a single-writer pattern.

    FAISS indexes are not safe for concurrent writes, so all mutations
    are funnelled through a queue drained by exactly one background
    thread. Saves are atomic (temp file + fsync + os.replace) so a crash
    can never leave a half-written index on disk.

    BUG FIX: ``shutdown()`` used to set the shutdown event *before*
    calling ``force_save()``, which stopped the writer thread immediately
    and left queued vectors unflushed (force_save then just timed out
    waiting for a queue nobody was draining). The order is now reversed,
    and the writer loop drains any leftovers after the event is set.
    """

    def __init__(self, index, texts: List[str]):
        self.index = index
        self.texts = texts
        self._lock = threading.RLock()

        # Must exist before the writer thread starts, because the thread
        # polls it immediately.
        self._shutdown = threading.Event()

        # Single writer thread: the only thread that touches self.index.
        self._write_queue: Queue = Queue()
        self._writer_thread = threading.Thread(
            target=self._writer_loop,
            daemon=True,
            name="FAISSWriter"
        )
        self._writer_thread.start()

        # Separate processes for CPU-heavy encoding (avoids GIL contention).
        self._encoder_pool = ProcessPoolExecutor(max_workers=2)

        logger.info(
            f"Initialized ProductionFAISSIndex with {len(texts)} vectors, "
            f"single-writer pattern"
        )

    def add_async(self, vector: np.ndarray, text: str) -> None:
        """Queue a (vector, text) pair for indexing; returns immediately."""
        self._write_queue.put((vector, text))
        logger.debug(f"Queued vector for indexing: {text[:50]}...")

    def _writer_loop(self) -> None:
        """Drain the write queue in batches; the sole writer of the index."""
        import queue  # hoisted: was re-imported on every loop iteration

        batch = []
        last_save = datetime.datetime.now()
        save_interval = datetime.timedelta(
            seconds=Constants.FAISS_SAVE_INTERVAL_SECONDS
        )

        while not self._shutdown.is_set():
            try:
                # Block briefly so shutdown is noticed within ~1 second.
                try:
                    batch.append(self._write_queue.get(timeout=1.0))
                except queue.Empty:
                    pass

                # Flush when the batch is full, or it is non-empty and the
                # save interval has elapsed.
                if len(batch) >= Constants.FAISS_BATCH_SIZE or \
                        (batch and datetime.datetime.now() - last_save > save_interval):
                    self._flush_batch(batch)
                    batch = []

                # Periodic durable save.
                if datetime.datetime.now() - last_save > save_interval:
                    self._save_atomic()
                    last_save = datetime.datetime.now()

            except Exception as e:
                logger.error(f"Writer loop error: {e}", exc_info=True)

        # BUG FIX: on shutdown, drain whatever is still queued or batched
        # so accepted vectors are never silently dropped.
        try:
            while True:
                batch.append(self._write_queue.get_nowait())
        except queue.Empty:
            pass
        self._flush_batch(batch)

    def _flush_batch(self, batch: List[Tuple[np.ndarray, str]]) -> None:
        """Append a batch to the index. Only called from the writer thread."""
        if not batch:
            return

        try:
            vectors = np.vstack([v for v, _ in batch])
            texts = [t for _, t in batch]

            # Safe: single writer thread, no concurrent index access.
            self.index.add(vectors)

            # The lock only guards the text list, which readers also touch.
            with self._lock:
                self.texts.extend(texts)

            logger.info(f"Flushed batch of {len(batch)} vectors to FAISS index")

        except Exception as e:
            logger.error(f"Error flushing batch: {e}", exc_info=True)

    def _save_atomic(self) -> None:
        """Persist index + texts durably: temp file, fsync, atomic rename."""
        try:
            import faiss

            # Reserve a temp file on the same filesystem as the target so
            # the os.replace below is an atomic rename.
            with tempfile.NamedTemporaryFile(
                mode='wb',
                delete=False,
                dir=os.path.dirname(config.INDEX_FILE),
                prefix='index_',
                suffix='.tmp'
            ) as tmp:
                temp_path = tmp.name

            faiss.write_index(self.index, temp_path)

            # fsync so the bytes are on disk before we swap names.
            with open(temp_path, 'r+b') as f:
                f.flush()
                os.fsync(f.fileno())

            os.replace(temp_path, config.INDEX_FILE)

            # Snapshot texts under the lock, write them atomically too.
            with self._lock:
                texts_copy = self.texts.copy()

            with atomicwrites.atomic_write(
                config.TEXTS_FILE,
                mode='w',
                overwrite=True
            ) as f:
                json.dump(texts_copy, f)

            logger.info(
                f"Atomically saved FAISS index with {len(texts_copy)} vectors"
            )

        except Exception as e:
            logger.error(f"Error saving index: {e}", exc_info=True)

    def get_count(self) -> int:
        """Approximate total vectors: indexed plus still queued."""
        with self._lock:
            return len(self.texts) + self._write_queue.qsize()

    def force_save(self) -> None:
        """Block (up to 10s) until the queue drains, then save atomically."""
        import time  # hoisted: was re-imported on every poll iteration

        logger.info("Forcing FAISS index save...")

        timeout = 10.0
        start = datetime.datetime.now()

        while not self._write_queue.empty():
            if (datetime.datetime.now() - start).total_seconds() > timeout:
                logger.warning("Force save timeout - queue not empty")
                break
            time.sleep(0.1)

        self._save_atomic()

    def shutdown(self) -> None:
        """Flush pending work, then stop the writer thread and encoder pool."""
        logger.info("Shutting down FAISS index...")
        # Save first while the writer is still draining the queue, THEN
        # signal shutdown (the old order dropped queued vectors).
        self.force_save()
        self._shutdown.set()
        self._writer_thread.join(timeout=5.0)
        self._encoder_pool.shutdown(wait=True)
414
-
415
-
416
# === FAISS & Embeddings Setup ===
# Loads the sentence-transformer encoder and an on-disk FAISS index if one
# exists. Any failure degrades gracefully: semantic search is disabled
# (model/index/thread_safe_index set to None) instead of crashing the app.
try:
    from sentence_transformers import SentenceTransformer
    import faiss

    logger.info("Loading SentenceTransformer model...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    logger.info("SentenceTransformer model loaded successfully")

    if os.path.exists(config.INDEX_FILE):
        logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
        index = faiss.read_index(config.INDEX_FILE)

        if index.d != Constants.VECTOR_DIM:
            # A stale index with the wrong dimensionality is useless - start over.
            logger.warning(
                f"Index dimension mismatch: {index.d} != {Constants.VECTOR_DIM}. "
                f"Creating new index."
            )
            index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
            incident_texts = []
        elif os.path.exists(config.TEXTS_FILE):
            with open(config.TEXTS_FILE, "r") as f:
                incident_texts = json.load(f)
            logger.info(f"Loaded {len(incident_texts)} incident texts")
        else:
            # BUG FIX: an index without its companion texts file used to
            # raise FileNotFoundError (caught below, disabling FAISS
            # entirely). Texts and vectors must stay aligned, so rebuild
            # both empty instead.
            logger.warning(
                f"Texts file {config.TEXTS_FILE} missing; creating new index"
            )
            index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
            incident_texts = []
    else:
        logger.info("Creating new FAISS index")
        index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
        incident_texts = []

    thread_safe_index = ProductionFAISSIndex(index, incident_texts)

except ImportError as e:
    # Optional dependency missing: run without semantic memory.
    logger.warning(f"FAISS or SentenceTransformers not available: {e}")
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
except Exception as e:
    # Corrupt files or any other init failure: also run degraded.
    logger.error(f"Error initializing FAISS: {e}", exc_info=True)
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
459
-
460
# === Predictive Models ===
class SimplePredictiveEngine:
    """
    Lightweight per-service forecasting engine.

    Keeps a bounded telemetry history per service and produces short-term
    forecasts (latency, error rate, CPU, memory) with simple statistical
    models: linear regression, exponential smoothing and recent-window
    means. Results are cached with a time-based expiry.

    BUG FIX: resource forecasting previously computed
    ``np.mean(values[-10:-5])`` which yields NaN (plus a RuntimeWarning)
    when fewer than six samples exist; the trend baseline is now guarded.
    The duplicated CPU/memory logic was also merged into one helper.
    """

    def __init__(self, history_window: int = Constants.HISTORY_WINDOW):
        self.history_window = history_window
        # service name -> deque of telemetry dicts (bounded per service)
        self.service_history: Dict[str, deque] = {}
        # "{service}_{metric}" -> (forecast, cached-at UTC timestamp)
        self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
        self.max_cache_age = datetime.timedelta(minutes=Constants.CACHE_EXPIRY_MINUTES)
        self._lock = threading.RLock()
        logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")

    def add_telemetry(self, service: str, event_data: Dict) -> None:
        """Record one telemetry sample for *service* and prune stale cache."""
        with self._lock:
            if service not in self.service_history:
                self.service_history[service] = deque(maxlen=self.history_window)

            telemetry_point = {
                'timestamp': datetime.datetime.now(datetime.timezone.utc),
                'latency': event_data.get('latency_p99', 0),
                'error_rate': event_data.get('error_rate', 0),
                'throughput': event_data.get('throughput', 0),
                'cpu_util': event_data.get('cpu_util'),
                'memory_util': event_data.get('memory_util')
            }

            self.service_history[service].append(telemetry_point)
            self._clean_cache()

    def _clean_cache(self) -> None:
        """Drop cache entries older than max_cache_age (caller holds lock)."""
        now = datetime.datetime.now(datetime.timezone.utc)
        expired = [k for k, (_, ts) in self.prediction_cache.items()
                   if now - ts > self.max_cache_age]
        for k in expired:
            del self.prediction_cache[k]

        if expired:
            logger.debug(f"Cleaned {len(expired)} expired cache entries")

    def forecast_service_health(
        self,
        service: str,
        lookahead_minutes: int = Constants.FORECAST_LOOKAHEAD_MINUTES
    ) -> List[ForecastResult]:
        """Produce all available forecasts for *service* (may be empty)."""
        with self._lock:
            if service not in self.service_history or \
                    len(self.service_history[service]) < Constants.FORECAST_MIN_DATA_POINTS:
                return []
            # Snapshot under the lock so forecasting can run without it.
            history = list(self.service_history[service])

        forecasts = []

        latency_forecast = self._forecast_latency(history, lookahead_minutes)
        if latency_forecast:
            forecasts.append(latency_forecast)

        error_forecast = self._forecast_error_rate(history, lookahead_minutes)
        if error_forecast:
            forecasts.append(error_forecast)

        forecasts.extend(self._forecast_resources(history, lookahead_minutes))

        # Cache results (RLock makes nested acquisition safe).
        with self._lock:
            for forecast in forecasts:
                cache_key = f"{service}_{forecast.metric}"
                self.prediction_cache[cache_key] = (forecast, datetime.datetime.now(datetime.timezone.utc))

        return forecasts

    def _forecast_latency(
        self,
        history: List,
        lookahead_minutes: int
    ) -> Optional[ForecastResult]:
        """Linear-regression latency forecast over the last 20 samples."""
        try:
            latencies = [point['latency'] for point in history[-20:]]

            if len(latencies) < Constants.FORECAST_MIN_DATA_POINTS:
                return None

            # Fit a straight line and extrapolate one step ahead.
            x = np.arange(len(latencies))
            slope, intercept = np.polyfit(x, latencies, 1)
            predicted_latency = slope * len(latencies) + intercept

            # Confidence: 1 - normalised residual spread (clamped at 0).
            residuals = latencies - (slope * x + intercept)
            confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))

            if slope > Constants.SLOPE_THRESHOLD_INCREASING:
                trend = "increasing"
                risk = "critical" if predicted_latency > Constants.LATENCY_EXTREME else "high"
            elif slope < Constants.SLOPE_THRESHOLD_DECREASING:
                trend = "decreasing"
                risk = "low"
            else:
                trend = "stable"
                risk = "low" if predicted_latency < Constants.LATENCY_WARNING else "medium"

            # Rough ETA until latency crosses the extreme threshold.
            time_to_critical = None
            if slope > 0 and predicted_latency < Constants.LATENCY_EXTREME:
                denominator = predicted_latency - latencies[-1]
                if abs(denominator) > 0.1:
                    minutes_to_critical = lookahead_minutes * \
                        (Constants.LATENCY_EXTREME - predicted_latency) / denominator
                    if minutes_to_critical > 0:
                        time_to_critical = minutes_to_critical

            return ForecastResult(
                metric="latency",
                predicted_value=predicted_latency,
                confidence=confidence,
                trend=trend,
                time_to_threshold=time_to_critical,
                risk_level=risk
            )

        except Exception as e:
            logger.error(f"Latency forecast error: {e}", exc_info=True)
            return None

    def _forecast_error_rate(
        self,
        history: List,
        lookahead_minutes: int
    ) -> Optional[ForecastResult]:
        """Exponential-smoothing error-rate forecast over last 15 samples."""
        try:
            error_rates = [point['error_rate'] for point in history[-15:]]

            if len(error_rates) < Constants.FORECAST_MIN_DATA_POINTS:
                return None

            # Exponential smoothing: newer samples weighted by alpha.
            alpha = 0.3
            forecast = error_rates[0]
            for rate in error_rates[1:]:
                forecast = alpha * rate + (1 - alpha) * forecast

            predicted_rate = forecast

            # Compare the last 3 samples against the previous 3 for trend.
            recent_trend = np.mean(error_rates[-3:]) - np.mean(error_rates[-6:-3])

            if recent_trend > 0.02:
                trend = "increasing"
                risk = "critical" if predicted_rate > Constants.ERROR_RATE_CRITICAL else "high"
            elif recent_trend < -0.01:
                trend = "decreasing"
                risk = "low"
            else:
                trend = "stable"
                risk = "low" if predicted_rate < Constants.ERROR_RATE_WARNING else "medium"

            # Confidence shrinks as volatility grows relative to the mean.
            confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))

            return ForecastResult(
                metric="error_rate",
                predicted_value=predicted_rate,
                confidence=confidence,
                trend=trend,
                risk_level=risk
            )

        except Exception as e:
            logger.error(f"Error rate forecast error: {e}", exc_info=True)
            return None

    def _forecast_utilization(
        self,
        history: List,
        metric_key: str,
        warning_threshold: float,
        critical_threshold: float
    ) -> Optional[ForecastResult]:
        """Mean-based forecast for one utilisation metric.

        *metric_key* is 'cpu_util' or 'memory_util'; thresholds are the
        matching warning/critical constants.
        """
        values = [p[metric_key] for p in history if p.get(metric_key) is not None]
        if len(values) < Constants.FORECAST_MIN_DATA_POINTS:
            return None

        try:
            predicted = np.mean(values[-5:])

            # BUG FIX: with fewer than 6 samples the baseline slice is
            # empty and np.mean would return NaN; treat that as "stable".
            baseline = values[-10:-5]
            if baseline and values[-1] > np.mean(baseline):
                trend = "increasing"
            else:
                trend = "stable"

            risk = "low"
            if predicted > critical_threshold:
                risk = "critical"
            elif predicted > warning_threshold:
                risk = "high"
            elif predicted > 0.7:
                risk = "medium"

            return ForecastResult(
                metric=metric_key,
                predicted_value=predicted,
                confidence=0.7,
                trend=trend,
                risk_level=risk
            )
        except Exception as e:
            logger.error(f"{metric_key} forecast error: {e}", exc_info=True)
            return None

    def _forecast_resources(
        self,
        history: List,
        lookahead_minutes: int
    ) -> List[ForecastResult]:
        """Forecast CPU and memory utilization via the shared helper."""
        forecasts = []

        cpu_forecast = self._forecast_utilization(
            history, 'cpu_util', Constants.CPU_WARNING, Constants.CPU_CRITICAL
        )
        if cpu_forecast:
            forecasts.append(cpu_forecast)

        memory_forecast = self._forecast_utilization(
            history, 'memory_util', Constants.MEMORY_WARNING, Constants.MEMORY_CRITICAL
        )
        if memory_forecast:
            forecasts.append(memory_forecast)

        return forecasts

    def get_predictive_insights(self, service: str) -> Dict[str, Any]:
        """Turn raw forecasts into warnings and actionable recommendations."""
        forecasts = self.forecast_service_health(service)

        critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
        warnings = []
        recommendations = []

        for forecast in critical_risks:
            if forecast.metric == "latency" and forecast.risk_level in ["high", "critical"]:
                warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
                if forecast.time_to_threshold:
                    minutes = int(forecast.time_to_threshold)
                    recommendations.append(f"⏰ Critical latency (~{Constants.LATENCY_EXTREME}ms) in ~{minutes} minutes")
                recommendations.append("🔧 Consider scaling or optimizing dependencies")

            elif forecast.metric == "error_rate" and forecast.risk_level in ["high", "critical"]:
                warnings.append(f"🚨 Errors expected to reach {forecast.predicted_value*100:.1f}%")
                recommendations.append("🐛 Investigate recent deployments or dependency issues")

            elif forecast.metric == "cpu_util" and forecast.risk_level in ["high", "critical"]:
                warnings.append(f"🔥 CPU expected at {forecast.predicted_value*100:.1f}%")
                recommendations.append("⚡ Consider scaling compute resources")

            elif forecast.metric == "memory_util" and forecast.risk_level in ["high", "critical"]:
                warnings.append(f"💾 Memory expected at {forecast.predicted_value*100:.1f}%")
                recommendations.append("🧹 Check for memory leaks or optimize usage")

        return {
            'service': service,
            'forecasts': [
                {
                    'metric': f.metric,
                    'predicted_value': f.predicted_value,
                    'confidence': f.confidence,
                    'trend': f.trend,
                    'risk_level': f.risk_level,
                    'time_to_threshold': f.time_to_threshold
                }
                for f in forecasts
            ],
            'warnings': warnings[:3],
            # dict.fromkeys de-duplicates while preserving order.
            'recommendations': list(dict.fromkeys(recommendations))[:3],
            'critical_risk_count': len(critical_risks),
            'forecast_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
        }
754
-
755
-
756
class BusinessImpactCalculator:
    """Translate reliability events into rough business-impact estimates."""

    def __init__(self, revenue_per_request: float = 0.01):
        self.revenue_per_request = revenue_per_request
        logger.info(f"Initialized BusinessImpactCalculator")

    def calculate_impact(
        self,
        event: ReliabilityEvent,
        duration_minutes: int = 5
    ) -> Dict[str, Any]:
        """Estimate revenue loss, affected users and a severity label."""
        base_revenue_per_minute = Constants.BASE_REVENUE_PER_MINUTE

        # Stack up impact factors from the event's metrics.
        impact_multiplier = 1.0
        if event.latency_p99 > Constants.LATENCY_CRITICAL:
            impact_multiplier += 0.5
        if event.error_rate > 0.1:
            impact_multiplier += 0.8
        if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
            impact_multiplier += 0.3

        revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)

        # Affected users scale with error rate and latency overshoot.
        user_impact_multiplier = (event.error_rate * 10) + \
            (max(0, event.latency_p99 - 100) / 500)
        affected_users = int(Constants.BASE_USERS * user_impact_multiplier)

        # Map the raw numbers onto a coarse severity label.
        if revenue_loss > 500 or affected_users > 5000:
            severity = "CRITICAL"
        elif revenue_loss > 100 or affected_users > 1000:
            severity = "HIGH"
        elif revenue_loss > 50 or affected_users > 500:
            severity = "MEDIUM"
        else:
            severity = "LOW"

        logger.info(
            f"Business impact: ${revenue_loss:.2f} revenue loss, "
            f"{affected_users} users, {severity} severity"
        )

        return {
            'revenue_loss_estimate': round(revenue_loss, 2),
            'affected_users_estimate': affected_users,
            'severity_level': severity,
            'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
        }
809
-
810
-
811
class AdvancedAnomalyDetector:
    """Anomaly detection with a latency threshold that adapts to history."""

    def __init__(self):
        self.historical_data = deque(maxlen=100)
        # Start from the static warning thresholds; latency adapts over time.
        self.adaptive_thresholds = {
            'latency_p99': Constants.LATENCY_WARNING,
            'error_rate': Constants.ERROR_RATE_WARNING
        }
        self._lock = threading.RLock()
        logger.info("Initialized AdvancedAnomalyDetector")

    def detect_anomaly(self, event: ReliabilityEvent) -> bool:
        """Return True when any metric breaches its (adaptive) threshold."""
        with self._lock:
            latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
            error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']

            # Either resource pegged past critical counts as anomalous.
            resource_anomaly = bool(
                (event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL)
                or (event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL)
            )

            # Feed this observation back into the adaptive thresholds.
            self._update_thresholds(event)

            is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
            if is_anomaly:
                logger.info(
                    f"Anomaly detected for {event.component}: "
                    f"latency={latency_anomaly}, error={error_anomaly}, "
                    f"resource={resource_anomaly}"
                )
            return is_anomaly

    def _update_thresholds(self, event: ReliabilityEvent) -> None:
        """Track the event and move the latency threshold to the recent p90."""
        self.historical_data.append(event)

        if len(self.historical_data) > 10:
            recent_latencies = [e.latency_p99 for e in list(self.historical_data)[-20:]]
            new_threshold = np.percentile(recent_latencies, 90)
            self.adaptive_thresholds['latency_p99'] = new_threshold
            logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
857
-
858
- # === Multi-Agent System ===
859
class AgentSpecialization(Enum):
    """The distinct analysis roles a specialized agent can take on."""

    DETECTIVE = "anomaly_detection"        # spots anomalous telemetry
    DIAGNOSTICIAN = "root_cause_analysis"  # explains why it happened
    PREDICTIVE = "predictive_analytics"    # projects what happens next
864
-
865
-
866
class BaseAgent:
    """Common state and interface shared by all specialized agents."""

    def __init__(self, specialization: AgentSpecialization):
        self.specialization = specialization
        # Simple counters each concrete agent can update as it works.
        self.performance_metrics = {
            'processed_events': 0,
            'successful_analyses': 0,
            'average_confidence': 0.0
        }

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Analyze one event; concrete agents must override this."""
        raise NotImplementedError
880
-
881
-
882
- class AnomalyDetectionAgent(BaseAgent):
883
- """Specialized agent for anomaly detection and pattern recognition"""
884
-
885
- def __init__(self):
886
- super().__init__(AgentSpecialization.DETECTIVE)
887
- logger.info("Initialized AnomalyDetectionAgent")
888
-
889
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
890
- """Perform comprehensive anomaly analysis"""
891
- try:
892
- anomaly_score = self._calculate_anomaly_score(event)
893
-
894
- return {
895
- 'specialization': self.specialization.value,
896
- 'confidence': anomaly_score,
897
- 'findings': {
898
- 'anomaly_score': anomaly_score,
899
- 'severity_tier': self._classify_severity(anomaly_score),
900
- 'primary_metrics_affected': self._identify_affected_metrics(event)
901
- },
902
- 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
903
- }
904
- except Exception as e:
905
- logger.error(f"AnomalyDetectionAgent error: {e}", exc_info=True)
906
- return {
907
- 'specialization': self.specialization.value,
908
- 'confidence': 0.0,
909
- 'findings': {},
910
- 'recommendations': [f"Analysis error: {str(e)}"]
911
- }
912
-
913
- def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
914
- """Calculate comprehensive anomaly score (0-1)"""
915
- scores = []
916
-
917
- # Latency anomaly (weighted 40%)
918
- if event.latency_p99 > Constants.LATENCY_WARNING:
919
- latency_score = min(1.0, (event.latency_p99 - Constants.LATENCY_WARNING) / 500)
920
- scores.append(0.4 * latency_score)
921
-
922
- # Error rate anomaly (weighted 30%)
923
- if event.error_rate > Constants.ERROR_RATE_WARNING:
924
- error_score = min(1.0, event.error_rate / 0.3)
925
- scores.append(0.3 * error_score)
926
-
927
- # Resource anomaly (weighted 30%)
928
- resource_score = 0
929
- if event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
930
- resource_score += 0.15 * min(1.0, (event.cpu_util - Constants.CPU_WARNING) / 0.2)
931
- if event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
932
- resource_score += 0.15 * min(1.0, (event.memory_util - Constants.MEMORY_WARNING) / 0.2)
933
- scores.append(resource_score)
934
-
935
- return min(1.0, sum(scores))
936
-
937
- def _classify_severity(self, anomaly_score: float) -> str:
938
- """Classify severity tier based on anomaly score"""
939
- if anomaly_score > 0.8:
940
- return "CRITICAL"
941
- elif anomaly_score > 0.6:
942
- return "HIGH"
943
- elif anomaly_score > 0.4:
944
- return "MEDIUM"
945
- else:
946
- return "LOW"
947
-
948
- def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
949
- """Identify which metrics are outside normal ranges"""
950
- affected = []
951
-
952
- # Latency checks
953
- if event.latency_p99 > Constants.LATENCY_EXTREME:
954
- affected.append({
955
- "metric": "latency",
956
- "value": event.latency_p99,
957
- "severity": "CRITICAL",
958
- "threshold": Constants.LATENCY_WARNING
959
- })
960
- elif event.latency_p99 > Constants.LATENCY_CRITICAL:
961
- affected.append({
962
- "metric": "latency",
963
- "value": event.latency_p99,
964
- "severity": "HIGH",
965
- "threshold": Constants.LATENCY_WARNING
966
- })
967
- elif event.latency_p99 > Constants.LATENCY_WARNING:
968
- affected.append({
969
- "metric": "latency",
970
- "value": event.latency_p99,
971
- "severity": "MEDIUM",
972
- "threshold": Constants.LATENCY_WARNING
973
- })
974
-
975
- # Error rate checks
976
- if event.error_rate > Constants.ERROR_RATE_CRITICAL:
977
- affected.append({
978
- "metric": "error_rate",
979
- "value": event.error_rate,
980
- "severity": "CRITICAL",
981
- "threshold": Constants.ERROR_RATE_WARNING
982
- })
983
- elif event.error_rate > Constants.ERROR_RATE_HIGH:
984
- affected.append({
985
- "metric": "error_rate",
986
- "value": event.error_rate,
987
- "severity": "HIGH",
988
- "threshold": Constants.ERROR_RATE_WARNING
989
- })
990
- elif event.error_rate > Constants.ERROR_RATE_WARNING:
991
- affected.append({
992
- "metric": "error_rate",
993
- "value": event.error_rate,
994
- "severity": "MEDIUM",
995
- "threshold": Constants.ERROR_RATE_WARNING
996
- })
997
-
998
- # CPU checks
999
- if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
1000
- affected.append({
1001
- "metric": "cpu",
1002
- "value": event.cpu_util,
1003
- "severity": "CRITICAL",
1004
- "threshold": Constants.CPU_WARNING
1005
- })
1006
- elif event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
1007
- affected.append({
1008
- "metric": "cpu",
1009
- "value": event.cpu_util,
1010
- "severity": "HIGH",
1011
- "threshold": Constants.CPU_WARNING
1012
- })
1013
-
1014
- # Memory checks
1015
- if event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL:
1016
- affected.append({
1017
- "metric": "memory",
1018
- "value": event.memory_util,
1019
- "severity": "CRITICAL",
1020
- "threshold": Constants.MEMORY_WARNING
1021
- })
1022
- elif event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
1023
- affected.append({
1024
- "metric": "memory",
1025
- "value": event.memory_util,
1026
- "severity": "HIGH",
1027
- "threshold": Constants.MEMORY_WARNING
1028
- })
1029
-
1030
- return affected
1031
-
1032
- def _generate_detection_recommendations(
1033
- self,
1034
- event: ReliabilityEvent,
1035
- anomaly_score: float
1036
- ) -> List[str]:
1037
- """Generate actionable recommendations"""
1038
- recommendations = []
1039
- affected_metrics = self._identify_affected_metrics(event)
1040
-
1041
- for metric in affected_metrics:
1042
- metric_name = metric["metric"]
1043
- severity = metric["severity"]
1044
- value = metric["value"]
1045
- threshold = metric["threshold"]
1046
-
1047
- if metric_name == "latency":
1048
- if severity == "CRITICAL":
1049
- recommendations.append(
1050
- f"🚨 CRITICAL: Latency {value:.0f}ms (>{threshold}ms) - "
1051
- f"Check database & external dependencies"
1052
- )
1053
- elif severity == "HIGH":
1054
- recommendations.append(
1055
- f"⚠️ HIGH: Latency {value:.0f}ms (>{threshold}ms) - "
1056
- f"Investigate service performance"
1057
- )
1058
- else:
1059
- recommendations.append(
1060
- f"📈 Latency elevated: {value:.0f}ms (>{threshold}ms) - Monitor trend"
1061
- )
1062
-
1063
- elif metric_name == "error_rate":
1064
- if severity == "CRITICAL":
1065
- recommendations.append(
1066
- f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
1067
- f"Check recent deployments"
1068
- )
1069
- elif severity == "HIGH":
1070
- recommendations.append(
1071
- f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
1072
- f"Review application logs"
1073
- )
1074
- else:
1075
- recommendations.append(
1076
- f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100:.1f}%)"
1077
- )
1078
-
1079
- elif metric_name == "cpu":
1080
- recommendations.append(
1081
- f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling"
1082
- )
1083
-
1084
- elif metric_name == "memory":
1085
- recommendations.append(
1086
- f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks"
1087
- )
1088
-
1089
- # Overall severity recommendations
1090
- if anomaly_score > 0.8:
1091
- recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
1092
- elif anomaly_score > 0.6:
1093
- recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
1094
- elif anomaly_score > 0.4:
1095
- recommendations.append("📊 MONITOR: Early warning signs detected")
1096
-
1097
- return recommendations[:4]
1098
-
1099
-
1100
class RootCauseAgent(BaseAgent):
    """Specialized agent for root cause analysis.

    Matches the event's metric profile against four known failure
    patterns (dependency failure, resource exhaustion, application bug,
    gradual degradation) and reports candidate causes with per-cause
    confidence and investigation hints.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)
        logger.info("Initialized RootCauseAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Perform root cause analysis.

        Returns the standard agent result dict (specialization,
        confidence, findings, recommendations). On any internal error a
        zero-confidence result is returned instead of raising.
        """
        try:
            causes = self._analyze_potential_causes(event)

            return {
                'specialization': self.specialization.value,
                # NOTE(review): overall confidence is a fixed 0.7 regardless of
                # per-cause confidence values — confirm this is intentional.
                'confidence': 0.7,
                'findings': {
                    'likely_root_causes': causes,
                    'evidence_patterns': self._identify_evidence(event),
                    'investigation_priority': self._prioritize_investigation(causes)
                },
                # Only the two highest-listed causes become recommendations.
                'recommendations': [
                    f"Check {cause['cause']} for issues" for cause in causes[:2]
                ]
            }
        except Exception as e:
            logger.error(f"RootCauseAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"]
            }

    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
        """Analyze potential root causes based on event patterns.

        Each returned dict has keys: cause, confidence, evidence,
        investigation. Always returns at least one entry (an "Unknown"
        placeholder when no pattern matches).
        """
        causes = []

        # Pattern 1: Database/External Dependency Failure
        # (extreme latency combined with a high error rate)
        if event.latency_p99 > Constants.LATENCY_EXTREME and event.error_rate > 0.2:
            causes.append({
                "cause": "Database/External Dependency Failure",
                "confidence": 0.85,
                "evidence": f"Extreme latency ({event.latency_p99:.0f}ms) with high errors ({event.error_rate*100:.1f}%)",
                "investigation": "Check database connection pool, external API health"
            })

        # Pattern 2: Resource Exhaustion (CPU AND memory both critical;
        # None readings short-circuit the pattern)
        if (event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL and
            event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL):
            causes.append({
                "cause": "Resource Exhaustion",
                "confidence": 0.90,
                "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
                "investigation": "Check for memory leaks, infinite loops, insufficient resources"
            })

        # Pattern 3: Application Bug / Configuration Issue
        # (errors spike while latency stays low — points away from infrastructure)
        if event.error_rate > Constants.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
            causes.append({
                "cause": "Application Bug / Configuration Issue",
                "confidence": 0.75,
                "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
                "investigation": "Review recent deployments, configuration changes, application logs"
            })

        # Pattern 4: Gradual Performance Degradation
        # (both latency and errors in a moderate band)
        if (200 <= event.latency_p99 <= 400 and
            Constants.ERROR_RATE_WARNING <= event.error_rate <= Constants.ERROR_RATE_HIGH):
            causes.append({
                "cause": "Gradual Performance Degradation",
                "confidence": 0.65,
                "evidence": f"Moderate latency ({event.latency_p99:.0f}ms) and errors ({event.error_rate*100:.1f}%)",
                "investigation": "Check resource trends, dependency performance, capacity planning"
            })

        # Default: Unknown pattern — keeps downstream consumers from
        # having to handle an empty cause list.
        if not causes:
            causes.append({
                "cause": "Unknown - Requires Investigation",
                "confidence": 0.3,
                "evidence": "Pattern does not match known failure modes",
                "investigation": "Complete system review needed"
            })

        return causes

    def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
        """Identify evidence patterns (tag strings) in the event data."""
        evidence = []

        # Latency far larger than what the error rate alone would explain.
        if event.latency_p99 > event.error_rate * 1000:
            evidence.append("latency_disproportionate_to_errors")

        # CPU and memory elevated together.
        if (event.cpu_util and event.cpu_util > Constants.CPU_WARNING and
            event.memory_util and event.memory_util > Constants.MEMORY_WARNING):
            evidence.append("correlated_resource_exhaustion")

        # Errors elevated while latency is still below the critical line.
        if event.error_rate > Constants.ERROR_RATE_HIGH and event.latency_p99 < Constants.LATENCY_CRITICAL:
            evidence.append("errors_without_latency_impact")

        return evidence

    def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
        """Return "HIGH" when any infrastructure-level cause matched, else "MEDIUM"."""
        for cause in causes:
            if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
                return "HIGH"
        return "MEDIUM"
1208
-
1209
-
1210
class PredictiveAgent(BaseAgent):
    """Agent that feeds telemetry into a forecasting engine and reports its risk insights."""

    def __init__(self, engine: SimplePredictiveEngine):
        super().__init__(AgentSpecialization.PREDICTIVE)
        self.engine = engine
        logger.info("Initialized PredictiveAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Record the event's telemetry, then return the engine's forecasts.

        Confidence is 0.8 when the engine flags any critical risk,
        0.5 otherwise. Errors produce a zero-confidence result rather
        than propagating.
        """
        try:
            telemetry = {
                'latency_p99': event.latency_p99,
                'error_rate': event.error_rate,
                'throughput': event.throughput,
                'cpu_util': event.cpu_util,
                'memory_util': event.memory_util,
            }
            self.engine.add_telemetry(event.component, telemetry)

            insights = self.engine.get_predictive_insights(event.component)
            has_critical_risk = insights['critical_risk_count'] > 0

            return {
                'specialization': self.specialization.value,
                'confidence': 0.8 if has_critical_risk else 0.5,
                'findings': insights,
                'recommendations': insights['recommendations'],
            }
        except Exception as e:
            logger.error(f"PredictiveAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"],
            }
1246
-
1247
-
1248
- # FIXED: Add circuit breaker for agent resilience
1249
@circuit(failure_threshold=3, recovery_timeout=30, name="agent_circuit_breaker")
async def call_agent_with_protection(agent: BaseAgent, event: ReliabilityEvent) -> Dict[str, Any]:
    """
    Call agent with circuit breaker protection.

    Wraps ``agent.analyze(event)`` in a per-call timeout
    (Constants.AGENT_TIMEOUT_SECONDS) and a circuit breaker that opens
    after 3 consecutive failures and retries after 30 seconds.

    NOTE(review): all agents share the single named breaker
    "agent_circuit_breaker", so repeated failures in one agent
    specialization open the circuit for every agent — confirm that is
    the intended behavior (per-agent breakers may be desired).

    Raises:
        asyncio.TimeoutError: when the agent exceeds its timeout.
        Exception: any error raised by the agent itself (re-raised so
            the breaker can count it; the orchestrator collects these
            via ``gather(..., return_exceptions=True)``).
    """
    try:
        result = await asyncio.wait_for(
            agent.analyze(event),
            timeout=Constants.AGENT_TIMEOUT_SECONDS
        )
        return result
    except asyncio.TimeoutError:
        logger.warning(f"Agent {agent.specialization.value} timed out")
        raise
    except Exception as e:
        logger.error(f"Agent {agent.specialization.value} error: {e}", exc_info=True)
        raise
1268
-
1269
-
1270
class OrchestrationManager:
    """Orchestrates multiple specialized agents for comprehensive analysis.

    Runs the detective, diagnostician, and predictive agents in parallel
    (each behind a timeout + shared circuit breaker) and synthesizes
    their result dicts into a single incident report.
    """

    def __init__(
        self,
        detective: Optional[AnomalyDetectionAgent] = None,
        diagnostician: Optional[RootCauseAgent] = None,
        predictive: Optional[PredictiveAgent] = None
    ):
        """
        Initialize orchestration manager.

        All three agents are injectable for testability; any that is not
        supplied is constructed with its default configuration.
        """
        self.agents = {
            AgentSpecialization.DETECTIVE: detective or AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: diagnostician or RootCauseAgent(),
            AgentSpecialization.PREDICTIVE: predictive or PredictiveAgent(SimplePredictiveEngine()),
        }
        logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")

    async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Coordinate multiple agents for comprehensive analysis.

        Agents run concurrently; individual failures/timeouts are logged
        and dropped (return_exceptions=True) so one bad agent cannot sink
        the whole analysis. A global timeout one second longer than the
        per-agent timeout bounds the entire fan-out.
        """
        # Build the task list and keep specializations in lockstep so
        # results can be zipped back to their agents.
        agent_tasks = []
        agent_specs = []

        for spec, agent in self.agents.items():
            agent_tasks.append(call_agent_with_protection(agent, event))
            agent_specs.append(spec)

        agent_results = {}

        try:
            # Run all agents in parallel with global timeout.
            results = await asyncio.wait_for(
                asyncio.gather(*agent_tasks, return_exceptions=True),
                timeout=Constants.AGENT_TIMEOUT_SECONDS + 1
            )

            # Keep only successful results; exceptions are logged and skipped.
            for spec, result in zip(agent_specs, results):
                if isinstance(result, Exception):
                    logger.error(f"Agent {spec.value} failed: {result}")
                    continue

                agent_results[spec.value] = result
                logger.debug(f"Agent {spec.value} completed successfully")

        except asyncio.TimeoutError:
            logger.warning("Agent orchestration timed out")
        except Exception as e:
            logger.error(f"Agent orchestration error: {e}", exc_info=True)

        return self._synthesize_agent_findings(event, agent_results)

    def _synthesize_agent_findings(
        self,
        event: ReliabilityEvent,
        agent_results: Dict
    ) -> Dict[str, Any]:
        """Combine insights from all specialized agents into one report.

        The detective result is mandatory — without it an error dict is
        returned; diagnostician/predictive results are optional and
        default to empty findings.
        """
        detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
        diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
        predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)

        if not detective_result:
            logger.warning("No detective agent results available")
            return {'error': 'No agent results available'}

        synthesis = {
            'incident_summary': {
                'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                'anomaly_confidence': detective_result['confidence'],
                'primary_metrics_affected': [
                    metric["metric"] for metric in
                    detective_result['findings'].get('primary_metrics_affected', [])
                ]
            },
            'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
            'predictive_insights': predictive_result['findings'] if predictive_result else {},
            'recommended_actions': self._prioritize_actions(
                detective_result.get('recommendations', []),
                diagnostician_result.get('recommendations', []) if diagnostician_result else [],
                predictive_result.get('recommendations', []) if predictive_result else []
            ),
            'agent_metadata': {
                'participating_agents': list(agent_results.keys()),
                'analysis_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
            }
        }

        return synthesis

    def _prioritize_actions(
        self,
        detection_actions: List[str],
        diagnosis_actions: List[str],
        predictive_actions: List[str]
    ) -> List[str]:
        """Concatenate (detection, diagnosis, predictive), dedupe
        preserving first-seen order, and cap at five actions."""
        all_actions = detection_actions + diagnosis_actions + predictive_actions
        seen = set()
        unique_actions = []
        for action in all_actions:
            if action not in seen:
                seen.add(action)
                unique_actions.append(action)
        return unique_actions[:5]
1384
-
1385
- # === Enhanced Reliability Engine ===
1386
class EnhancedReliabilityEngine:
    """
    Main engine for processing reliability events.

    Pipeline per event: validation → multi-agent analysis → anomaly
    detection → severity assignment → policy evaluation → business
    impact → vector-store write → Claude synthesis → voice alert.
    All collaborators are injectable for testability.
    """

    def __init__(
        self,
        orchestrator: Optional[OrchestrationManager] = None,
        policy_engine: Optional[PolicyEngine] = None,
        event_store: Optional[ThreadSafeEventStore] = None,
        anomaly_detector: Optional[AdvancedAnomalyDetector] = None,
        business_calculator: Optional[BusinessImpactCalculator] = None
    ):
        """
        Initialize reliability engine with dependency injection.

        Any collaborator not supplied is constructed with its default
        configuration.
        """
        self.orchestrator = orchestrator or OrchestrationManager()
        self.policy_engine = policy_engine or PolicyEngine()
        self.event_store = event_store or ThreadSafeEventStore()
        self.anomaly_detector = anomaly_detector or AdvancedAnomalyDetector()
        self.business_calculator = business_calculator or BusinessImpactCalculator()

        # Counters are protected by _lock because Gradio may serve
        # concurrent requests.
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'anomalies_detected': 0
        }
        self._lock = threading.RLock()
        logger.info("Initialized EnhancedReliabilityEngine")

    async def process_event_enhanced(
        self,
        component: str,
        latency: float,
        error_rate: float,
        throughput: float = 1000,
        cpu_util: Optional[float] = None,
        memory_util: Optional[float] = None
    ) -> Dict[str, Any]:
        """
        Process a reliability event through the complete analysis pipeline.

        Returns a result dict with status/severity/analysis, or a dict
        with 'error' and status 'INVALID' on bad input. Claude synthesis
        and voice alerts are best-effort layers: their failures are
        logged and the base result is returned unchanged.
        """
        logger.info(
            f"Processing event for {component}: latency={latency}ms, "
            f"error_rate={error_rate*100:.1f}%"
        )

        # Validate component ID before doing any work.
        is_valid, error_msg = validate_component_id(component)
        if not is_valid:
            return {'error': error_msg, 'status': 'INVALID'}

        # Create event (Pydantic validation may reject out-of-range values).
        try:
            event = ReliabilityEvent(
                component=component,
                latency_p99=latency,
                error_rate=error_rate,
                throughput=throughput,
                cpu_util=cpu_util,
                memory_util=memory_util,
                upstream_deps=["auth-service", "database"] if component == "api-service" else []
            )
        except Exception as e:
            logger.error(f"Event creation error: {e}", exc_info=True)
            return {'error': f'Invalid event data: {str(e)}', 'status': 'INVALID'}

        # Multi-agent analysis (detective / diagnostician / predictive).
        agent_analysis = await self.orchestrator.orchestrate_analysis(event)

        # Statistical anomaly detection.
        is_anomaly = self.anomaly_detector.detect_anomaly(event)

        # Prefer the agents' confidence; fall back to a coarse value
        # derived from the anomaly detector when agents produced nothing.
        if agent_analysis and 'incident_summary' in agent_analysis:
            agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
        else:
            agent_confidence = 0.8 if is_anomaly else 0.1

        # Map confidence onto event severity.
        if agent_confidence > 0.8:
            severity = EventSeverity.CRITICAL
        elif agent_confidence > 0.6:
            severity = EventSeverity.HIGH
        elif agent_confidence > 0.4:
            severity = EventSeverity.MEDIUM
        else:
            severity = EventSeverity.LOW

        # Events are immutable models; create a copy with updated severity.
        event = event.model_copy(update={'severity': severity})

        # Evaluate healing policies for automated recovery actions.
        healing_actions = self.policy_engine.evaluate_policies(event)

        # Business impact is only computed for anomalies.
        business_impact = self.business_calculator.calculate_impact(event) if is_anomaly else None

        # Store in vector database for similarity detection (anomalies only).
        if thread_safe_index is not None and model is not None and is_anomaly:
            try:
                # FIXED: guard against 'recommended_actions' existing but
                # being empty — the old `.get(key, default)[0]` raised
                # IndexError in that case.
                actions = agent_analysis.get('recommended_actions') or ['No analysis']
                analysis_text = actions[0]
                vector_text = f"{component} {latency} {error_rate} {analysis_text}"

                # FIXED: get_running_loop() — get_event_loop() is deprecated
                # when called from within a coroutine (Python 3.10+).
                # Encoding runs in the index's process pool so it never
                # blocks the event loop.
                loop = asyncio.get_running_loop()
                vec = await loop.run_in_executor(
                    thread_safe_index._encoder_pool,
                    model.encode,
                    [vector_text]
                )

                thread_safe_index.add_async(np.array(vec, dtype=np.float32), vector_text)
            except Exception as e:
                # Vector storage is best-effort; never fail the pipeline for it.
                logger.error(f"Error storing vector: {e}", exc_info=True)

        # Build comprehensive result for the UI.
        result = {
            "timestamp": event.timestamp.isoformat(),
            "component": component,
            "latency_p99": latency,
            "error_rate": error_rate,
            "throughput": throughput,
            "status": "ANOMALY" if is_anomaly else "NORMAL",
            "multi_agent_analysis": agent_analysis,
            "healing_actions": [action.value for action in healing_actions],
            "business_impact": business_impact,
            "severity": event.severity.value,
            "similar_incidents_count": thread_safe_index.get_count() if thread_safe_index and is_anomaly else 0,
            "processing_metadata": {
                "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
                "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
            }
        }

        # Store event in history.
        self.event_store.add(event)

        # Update performance metrics under the lock.
        with self._lock:
            self.performance_metrics['total_incidents_processed'] += 1
            self.performance_metrics['multi_agent_analyses'] += 1
            if is_anomaly:
                self.performance_metrics['anomalies_detected'] += 1

        logger.info(f"Event processed: {result['status']} with {result['severity']} severity")

        # Enhance with Claude AI reasoning (optional layer).
        try:
            result = await self.enhance_with_claude(event, result)
        except Exception as e:
            logger.error(f"Failed to enhance with Claude: {e}")
            # Continue without enhancement

        # === Generate voice alert for anomalies (best-effort) ===
        if is_anomaly and voice_handler:
            try:
                # Extract diagnosis and action from results.
                diagnosis = "System anomaly detected"
                action = "INVESTIGATE"

                if agent_analysis and 'recommended_actions' in agent_analysis:
                    actions = agent_analysis['recommended_actions']
                    if actions:
                        diagnosis = actions[0][:100]  # First recommendation, truncated

                if healing_actions:
                    action = healing_actions[0].value if healing_actions else "INVESTIGATE"

                # Generate voice alert.
                logger.info(f"Generating voice alert for {component}")
                audio_file = voice_handler.speak_alert(
                    diagnosis=diagnosis,
                    action=action,
                    output_filename=f"alert_{component}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
                )

                if audio_file:
                    result['audio_file'] = audio_file
                    logger.info(f"✅ Voice alert generated: {audio_file}")
                else:
                    logger.warning("Voice generation returned no file")

            except Exception as e:
                logger.error(f"Voice generation failed (non-critical): {e}", exc_info=True)
        # === End voice integration ===

        return result

    async def enhance_with_claude(
        self,
        event: ReliabilityEvent,
        agent_results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Enhance agent results with Claude AI reasoning.

        This is a NON-INVASIVE layer - all existing logic stays intact.
        If Claude fails, original results are returned unchanged.
        """
        try:
            # Build comprehensive context for Claude.
            context_parts = []

            # Add event summary.
            context_parts.append("INCIDENT SUMMARY:")
            context_parts.append(f"Component: {event.component}")
            context_parts.append(f"Timestamp: {event.timestamp.isoformat()}")
            context_parts.append(f"Severity: {event.severity.value}")
            context_parts.append("")

            # Add metrics (resource readings only when present).
            context_parts.append("METRICS:")
            context_parts.append(f"• Latency P99: {event.latency_p99}ms")
            context_parts.append(f"• Error Rate: {event.error_rate:.1%}")
            context_parts.append(f"• Throughput: {event.throughput} req/s")
            if event.cpu_util:
                context_parts.append(f"• CPU: {event.cpu_util:.1%}")
            if event.memory_util:
                context_parts.append(f"• Memory: {event.memory_util:.1%}")
            context_parts.append("")

            # Add agent findings.
            if agent_results:
                context_parts.append("AGENT ANALYSIS:")
                if 'multi_agent_analysis' in agent_results:
                    analysis = agent_results['multi_agent_analysis']
                    context_parts.append(json.dumps(analysis, indent=2))
                elif 'incident_summary' in agent_results:
                    context_parts.append(json.dumps(agent_results['incident_summary'], indent=2))

            context = "\n".join(context_parts)

            # Create prompt for Claude.
            prompt = f"""{context}

TASK: Provide an executive summary synthesizing all agent analyses.
Include:
1. Concise incident description
2. Most likely root cause
3. Single best recovery action
4. Estimated impact and recovery time

Be specific and actionable."""

            system_prompt = """You are a senior Site Reliability Engineer synthesizing
multiple AI agent analyses into clear, actionable guidance for incident response.
Focus on clarity, accuracy, and decisive recommendations."""

            # Get Claude's synthesis.
            logger.info("Requesting Claude synthesis of agent results")
            claude_synthesis = claude_adapter.generate_completion(
                prompt=prompt,
                system_prompt=system_prompt
            )

            # Add Claude's insights to results (non-destructive).
            agent_results['claude_synthesis'] = {
                'summary': claude_synthesis,
                'timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat(),
                'source': 'claude-opus-4'
            }

            logger.info("✅ Claude synthesis added to results")
            return agent_results

        except Exception as e:
            logger.error(f"Claude enhancement failed: {e}", exc_info=True)
            # Return original results unchanged - system still works!
            return agent_results
1664
-
1665
# === Initialize Engine (with dependency injection) ===
# Module-level singleton used by all Gradio UI handlers.
enhanced_engine = EnhancedReliabilityEngine()

# === Initialize Voice Handler ===
# Module-level; process_event_enhanced checks its truthiness before use,
# so a handler that fails to initialize simply disables voice alerts.
voice_handler = VoiceHandler()
logger.info("Voice handler initialized")
1671
-
1672
- # === Rate Limiting ===
1673
class RateLimiter:
    """Sliding one-minute-window rate limiter for request throttling."""

    def __init__(self, max_per_minute: int = Constants.MAX_REQUESTS_PER_MINUTE):
        self.max_per_minute = max_per_minute
        # Timestamps (UTC, aware) of accepted requests, oldest first.
        self.requests: deque = deque(maxlen=max_per_minute)
        self._lock = threading.RLock()

    def _evict_expired(self, now: datetime.datetime) -> None:
        """Drop timestamps older than one minute from the window."""
        cutoff = now - datetime.timedelta(minutes=1)
        while self.requests and self.requests[0] < cutoff:
            self.requests.popleft()

    def is_allowed(self) -> Tuple[bool, str]:
        """Record the current request if the window has capacity.

        Returns:
            (True, "") when the request is allowed and recorded;
            (False, reason) when the per-minute limit is hit.
        """
        with self._lock:
            now = datetime.datetime.now(datetime.timezone.utc)
            self._evict_expired(now)

            if len(self.requests) < self.max_per_minute:
                self.requests.append(now)
                return True, ""

            return False, f"Rate limit exceeded: {self.max_per_minute} requests/minute"
1698
-
1699
-
1700
# Module-level limiter shared by all Gradio request handlers.
rate_limiter = RateLimiter()
1701
-
1702
-
1703
- # === Gradio UI ===
1704
- def create_enhanced_ui():
1705
- """
1706
- Create the comprehensive Gradio UI for the reliability framework
1707
-
1708
- FIXED: Uses native async handlers (no event loop creation)
1709
- FIXED: Rate limiting on all endpoints
1710
- """
1711
-
1712
- with gr.Blocks(title="🧠 Agentic Reliability Framework", theme="soft") as demo:
1713
- gr.Markdown("""
1714
- # 🧠 Agentic Reliability Framework
1715
- **Multi-Agent AI System for Production Reliability**
1716
-
1717
- *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
1718
-
1719
- """)
1720
-
1721
- with gr.Row():
1722
- with gr.Column(scale=1):
1723
- gr.Markdown("### 📊 Telemetry Input")
1724
- component = gr.Dropdown(
1725
- choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
1726
- value="api-service",
1727
- label="Component",
1728
- info="Select the service being monitored"
1729
- )
1730
- latency = gr.Slider(
1731
- minimum=10, maximum=1000, value=100, step=1,
1732
- label="Latency P99 (ms)",
1733
- info=f"Alert threshold: >{Constants.LATENCY_WARNING}ms (adaptive)"
1734
- )
1735
- error_rate = gr.Slider(
1736
- minimum=0, maximum=0.5, value=0.02, step=0.001,
1737
- label="Error Rate",
1738
- info=f"Alert threshold: >{Constants.ERROR_RATE_WARNING}"
1739
- )
1740
- throughput = gr.Number(
1741
- value=1000,
1742
- label="Throughput (req/sec)",
1743
- info="Current request rate"
1744
- )
1745
- cpu_util = gr.Slider(
1746
- minimum=0, maximum=1, value=0.4, step=0.01,
1747
- label="CPU Utilization",
1748
- info="0.0 - 1.0 scale"
1749
- )
1750
- memory_util = gr.Slider(
1751
- minimum=0, maximum=1, value=0.3, step=0.01,
1752
- label="Memory Utilization",
1753
- info="0.0 - 1.0 scale"
1754
- )
1755
- submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
1756
-
1757
- with gr.Column(scale=2):
1758
- gr.Markdown("### 🔍 Multi-Agent Analysis")
1759
- output_text = gr.Textbox(
1760
- label="Agent Synthesis",
1761
- placeholder="AI agents are analyzing...",
1762
- lines=6
1763
- )
1764
-
1765
- # === ADD AUDIO PLAYER ===
1766
- audio_output = gr.Audio(
1767
- label="🔊 Voice Alert",
1768
- type="filepath",
1769
- visible=False,
1770
- autoplay=True
1771
- )
1772
- # === END AUDIO PLAYER ===
1773
-
1774
- with gr.Accordion("🤖 Agent Specialists Analysis", open=False):
1775
- gr.Markdown("""
1776
- **Specialized AI Agents:**
1777
- - 🕵️ **Detective**: Anomaly detection & pattern recognition
1778
- - 🔍 **Diagnostician**: Root cause analysis & investigation
1779
- - 🔮 **Predictive**: Future risk forecasting & trend analysis
1780
- """)
1781
-
1782
- agent_insights = gr.JSON(
1783
- label="Detailed Agent Findings",
1784
- value={}
1785
- )
1786
-
1787
- with gr.Accordion("🔮 Predictive Analytics & Forecasting", open=False):
1788
- gr.Markdown("""
1789
- **Future Risk Forecasting:**
1790
- - 📈 Latency trends and thresholds
1791
- - 🚨 Error rate predictions
1792
- - ���� Resource utilization forecasts
1793
- - ⏰ Time-to-failure estimates
1794
- """)
1795
-
1796
- predictive_insights = gr.JSON(
1797
- label="Predictive Forecasts",
1798
- value={}
1799
- )
1800
-
1801
- with gr.Accordion("🤖 Claude AI Synthesis", open=True):
1802
- gr.Markdown("""
1803
- **Claude Opus 4.5 Executive Summary:**
1804
- - 📋 Incident synthesis from all agents
1805
- - 🎯 Root cause identification
1806
- - 💡 Recommended recovery actions
1807
- - ⏰ Impact and recovery time estimates
1808
- """)
1809
-
1810
- claude_output = gr.Markdown(
1811
- value="*Claude AI synthesis will appear here after incident analysis*",
1812
- label="AI Executive Summary"
1813
- )
1814
-
1815
- gr.Markdown("### 📈 Recent Events (Last 15)")
1816
- events_table = gr.Dataframe(
1817
- headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
1818
- label="Event History",
1819
- wrap=True,
1820
- )
1821
-
1822
- with gr.Accordion("ℹ️ Framework Capabilities", open=False):
1823
- gr.Markdown("""
1824
- - **🤖 Multi-Agent AI**: Specialized agents for detection, diagnosis, prediction, and healing
1825
- - **🔮 Predictive Analytics**: Forecast future risks and performance degradation
1826
- - **🔧 Policy-Based Healing**: Automated recovery actions based on severity and context
1827
- - **💰 Business Impact**: Revenue and user impact quantification
1828
- - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
1829
- - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
1830
- - **⚡ Production Ready**: Circuit breakers, cooldowns, thread safety, and enterprise features
1831
- - **🔒 Security Patched**: All critical CVEs fixed (Gradio 5.50.0+, Requests 2.32.5+)
1832
- """)
1833
-
1834
- with gr.Accordion("🔧 Healing Policies", open=False):
1835
- policy_info = []
1836
- for policy in enhanced_engine.policy_engine.policies:
1837
- if policy.enabled:
1838
- actions = ", ".join([action.value for action in policy.actions])
1839
- policy_info.append(
1840
- f"**{policy.name}** (Priority {policy.priority}): {actions}\n"
1841
- f" - Cooldown: {policy.cool_down_seconds}s\n"
1842
- f" - Max executions: {policy.max_executions_per_hour}/hour"
1843
- )
1844
-
1845
- gr.Markdown("\n\n".join(policy_info))
1846
-
1847
- # FIXED: Native async handler (no event loop creation needed)
1848
async def submit_event_enhanced_async(
    component, latency, error_rate, throughput, cpu_util, memory_util
):
    """
    Async event handler for the submit button - uses Gradio's native async support.

    CRITICAL FIX: No event loop creation - Gradio awaits this coroutine itself.
    FIXED: Rate limiting added.
    FIXED: Comprehensive error handling.

    Returns a 6-tuple matching the wired ``click()`` outputs:
        (status markdown, agent insights dict, predictive insights dict,
         Claude markdown, events gr.Dataframe, gr.Audio alert)
    """

    def _error_response(message, claude_note):
        # Every early-exit path must return the same 6-tuple shape
        # expected by the click() outputs wiring; build it in one place.
        return (
            message,
            {},
            {},
            claude_note,
            gr.Dataframe(value=[]),
            gr.Audio(visible=False),
        )

    try:
        # Rate limiting check
        allowed, rate_msg = rate_limiter.is_allowed()
        if not allowed:
            # FIXED: was an f-string with no placeholders
            logger.warning("Rate limit exceeded")
            return _error_response(rate_msg, "*Rate limit exceeded*")

        # Type conversion (the UI may hand us strings or None)
        try:
            latency = float(latency)
            error_rate = float(error_rate)
            throughput = float(throughput) if throughput else 1000
            cpu_util = float(cpu_util) if cpu_util else None
            memory_util = float(memory_util) if memory_util else None
        except (ValueError, TypeError) as e:
            error_msg = f"❌ Invalid input types: {str(e)}"
            logger.warning(error_msg)
            return _error_response(error_msg, "*Invalid input type*")

        # Input validation (range / semantic checks)
        is_valid, error_msg = validate_inputs(
            latency, error_rate, throughput, cpu_util, memory_util
        )
        if not is_valid:
            logger.warning("Invalid input: %s", error_msg)
            return _error_response(error_msg, "*Validation failed*")

        # FIXED: Direct async call - no event loop creation needed
        result = await enhanced_engine.process_event_enhanced(
            component, latency, error_rate, throughput, cpu_util, memory_util
        )

        # Handle engine-reported errors
        if 'error' in result:
            return _error_response(f"❌ {result['error']}", "*Error occurred*")

        # Build table data (THREAD-SAFE snapshot via the event store accessor)
        table_data = [
            [
                event.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                event.component,
                f"{event.latency_p99:.0f}ms",
                f"{event.error_rate:.3f}",
                f"{event.throughput:.0f}",
                event.severity.value.upper(),
                "Multi-agent analysis",
            ]
            for event in enhanced_engine.event_store.get_recent(15)
        ]

        # Format the headline status message
        status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
        output_msg = f"{status_emoji} **{result['status']}**\n"

        if "multi_agent_analysis" in result:
            analysis = result["multi_agent_analysis"]
            confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
            output_msg += f"🎯 **Confidence**: {confidence*100:.1f}%\n"

            predictive_data = analysis.get('predictive_insights', {})
            if predictive_data.get('critical_risk_count', 0) > 0:
                output_msg += f"🔮 **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast\n"

            if analysis.get('recommended_actions'):
                actions_preview = ', '.join(analysis['recommended_actions'][:2])
                output_msg += f"💡 **Top Insights**: {actions_preview}\n"

        if result.get("business_impact"):
            impact = result["business_impact"]
            # FIXED: "\$" is an invalid escape sequence (SyntaxWarning on
            # modern Python); "\\$" produces the identical runtime string,
            # which keeps Gradio's Markdown from treating "$" as LaTeX.
            output_msg += (
                f"💰 **Business Impact**: \\${impact['revenue_loss_estimate']:.2f} | "
                f"👥 {impact['affected_users_estimate']} users | "
                f"🚨 {impact['severity_level']}\n"
            )

        if result.get("healing_actions") and result["healing_actions"] != ["no_action"]:
            actions = ", ".join(result["healing_actions"])
            output_msg += f"🔧 **Auto-Actions**: {actions}"

        agent_insights_data = result.get("multi_agent_analysis", {})
        predictive_insights_data = agent_insights_data.get('predictive_insights', {})

        # Extract Claude synthesis for display
        claude_synthesis = result.get('claude_synthesis', {})
        claude_text = claude_synthesis.get('summary', '*No Claude synthesis available*')

        # Format Claude output beautifully
        claude_display = f"""
### 🤖 Claude Opus 4.5 Executive Analysis

{claude_text}

---
*Generated: {claude_synthesis.get('timestamp', 'N/A')}*
*Model: {claude_synthesis.get('source', 'claude-opus-4')}*
"""

        # === Extract audio file if present ===
        audio_file = result.get('audio_file', None)
        audio_visible = audio_file is not None

        if audio_file:
            logger.info("Audio alert available: %s", audio_file)
        # === End audio extraction ===

        return (
            output_msg,
            agent_insights_data,
            predictive_insights_data,
            claude_display,
            gr.Dataframe(
                headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
                value=table_data,
                wrap=True
            ),
            gr.Audio(value=audio_file, visible=audio_visible)
        )

    except Exception as e:
        error_msg = f"❌ Error processing event: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return _error_response(error_msg, "*Processing error*")
1979
-
1980
- # FIXED: Use async handler directly
1981
- submit_btn.click(
1982
- fn=submit_event_enhanced_async,
1983
- inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
1984
- outputs=[output_text, agent_insights, predictive_insights, claude_output, events_table, audio_output]
1985
- )
1986
-
1987
- return demo
1988
-
1989
-
1990
# === Main Entry Point ===
if __name__ == "__main__":
    # FIXED: the original logged `os.sys.version`, which depends on the
    # non-public, accidental `os.sys` attribute; `sys.version` is the
    # supported API. Local import keeps the change self-contained.
    import sys

    logger.info("=" * 80)
    logger.info("Starting Enterprise Agentic Reliability Framework (PATCHED VERSION)")
    logger.info("=" * 80)
    # Lazy %-style args avoid formatting cost when the level is disabled.
    logger.info("Python version: %s", sys.version)
    logger.info("Total events in history: %s", enhanced_engine.event_store.count())
    logger.info("Vector index size: %s", thread_safe_index.get_count() if thread_safe_index else 0)
    logger.info("Agents initialized: %s", len(enhanced_engine.orchestrator.agents))
    logger.info("Policies loaded: %s", len(enhanced_engine.policy_engine.policies))
    logger.info("Configuration: HF_TOKEN=%s", 'SET' if config.HF_TOKEN else 'NOT SET')
    logger.info("Rate limit: %s requests/minute", Constants.MAX_REQUESTS_PER_MINUTE)
    logger.info("=" * 80)

    try:
        demo = create_enhanced_ui()

        logger.info("Launching Gradio UI on 0.0.0.0:7860...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True
        )
    except KeyboardInterrupt:
        logger.info("Received shutdown signal...")
    except Exception as e:
        logger.error("Application error: %s", e, exc_info=True)
    finally:
        # Graceful shutdown: flush pending vector writes before the
        # process exits so no encoded events are lost.
        logger.info("Shutting down gracefully...")

        if thread_safe_index:
            logger.info("Saving pending vectors before shutdown...")
            thread_safe_index.shutdown()

        logger.info("=" * 80)
        logger.info("Application shutdown complete")
        logger.info("=" * 80)