petter2025 commited on
Commit
cc0eab2
·
verified ·
1 Parent(s): 47e1a25

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -1973
app.py DELETED
@@ -1,1973 +0,0 @@
1
- """
2
- Enterprise Agentic Reliability Framework - Main Application (FIXED VERSION)
3
- Multi-Agent AI System for Production Reliability Monitoring
4
-
5
- CRITICAL FIXES APPLIED:
6
- - Removed event loop creation (uses Gradio native async)
7
- - Fixed FAISS thread safety with single-writer pattern
8
- - ProcessPoolExecutor for CPU-intensive encoding
9
- - Atomic saves with fsync
10
- - Dependency injection
11
- - Rate limiting
12
- - Comprehensive input validation
13
- - Circuit breakers for agent resilience
14
- """
15
-
16
- import os
17
- import json
18
- import numpy as np
19
- import gradio as gr
20
- import requests
21
- import pandas as pd
22
- import datetime
23
- import threading
24
- import logging
25
- import asyncio
26
- import tempfile
27
- from typing import List, Dict, Any, Optional, Tuple
28
- from collections import deque, OrderedDict
29
- from dataclasses import dataclass, asdict
30
- from enum import Enum
31
- from concurrent.futures import ProcessPoolExecutor
32
- from queue import Queue
33
- from circuitbreaker import circuit
34
- import atomicwrites
35
-
36
- # Import our modules
37
- from models import (
38
- ReliabilityEvent, EventSeverity, AnomalyResult,
39
- HealingAction, ForecastResult, PolicyCondition
40
- )
41
- from claude_adapter import get_claude_adapter
42
- from healing_policies import PolicyEngine, DEFAULT_HEALING_POLICIES
43
-
44
# === Logging Configuration ===
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Initialize Claude adapter for AI reasoning
claude_adapter = get_claude_adapter()
52
-
53
-
54
# === CONSTANTS (FIXED: Extracted all magic numbers) ===
class Constants:
    """Single home for every tuning knob and magic number in the app."""

    # --- Alerting thresholds ---
    LATENCY_WARNING = 150.0      # ms
    LATENCY_CRITICAL = 300.0     # ms
    LATENCY_EXTREME = 500.0      # ms

    ERROR_RATE_WARNING = 0.05    # fraction of requests
    ERROR_RATE_HIGH = 0.15
    ERROR_RATE_CRITICAL = 0.3

    CPU_WARNING = 0.8            # utilization fraction (0-1)
    CPU_CRITICAL = 0.9

    MEMORY_WARNING = 0.8
    MEMORY_CRITICAL = 0.9

    # --- Forecasting ---
    SLOPE_THRESHOLD_INCREASING = 5.0
    SLOPE_THRESHOLD_DECREASING = -2.0

    FORECAST_MIN_DATA_POINTS = 5
    FORECAST_LOOKAHEAD_MINUTES = 15

    # --- Performance / storage ---
    HISTORY_WINDOW = 50
    MAX_EVENTS_STORED = 1000
    AGENT_TIMEOUT_SECONDS = 5
    CACHE_EXPIRY_MINUTES = 15

    # --- FAISS ---
    FAISS_BATCH_SIZE = 10
    FAISS_SAVE_INTERVAL_SECONDS = 30
    VECTOR_DIM = 384             # presumably the MiniLM embedding width — confirm against model

    # --- Business metrics ---
    BASE_REVENUE_PER_MINUTE = 100.0
    BASE_USERS = 1000

    # --- Rate limiting ---
    MAX_REQUESTS_PER_MINUTE = 60
    MAX_REQUESTS_PER_HOUR = 500
98
-
99
-
100
# === Configuration ===
class Config:
    """Environment-driven configuration for the reliability framework."""

    # Hugging Face inference credentials/endpoint.
    HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
    HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"

    # On-disk persistence locations (overridable via environment).
    INDEX_FILE: str = os.getenv("INDEX_FILE", "data/incident_vectors.index")
    TEXTS_FILE: str = os.getenv("TEXTS_FILE", "data/incident_texts.json")
    DATA_DIR: str = os.getenv("DATA_DIR", "data")

    # Side effect at class-definition time: guarantee the data dir exists.
    os.makedirs(DATA_DIR, exist_ok=True)
112
-
113
-
114
config = Config()

# Only attach an Authorization header when a token is actually configured.
if config.HF_TOKEN:
    HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"}
else:
    HEADERS = {}
116
-
117
-
118
# === Input Validation (FIXED: Comprehensive validation) ===
def validate_component_id(component_id: str) -> Tuple[bool, str]:
    """
    Check that a component ID is a well-formed identifier.

    Returns (True, "") when valid, otherwise (False, reason).
    """
    import re

    if not isinstance(component_id, str):
        return False, "Component ID must be a string"

    if not 1 <= len(component_id) <= 255:
        return False, "Component ID must be 1-255 characters"

    # Lowercase alphanumerics and hyphens only.
    if re.match(r"^[a-z0-9-]+$", component_id) is None:
        return False, "Component ID must contain only lowercase letters, numbers, and hyphens"

    return True, ""
132
-
133
-
134
def validate_inputs(
    latency: Any,
    error_rate: Any,
    throughput: Any,
    cpu_util: Any,
    memory_util: Any
) -> Tuple[bool, str]:
    """
    Validate telemetry inputs: convertible to float and within sane ranges.

    Returns (True, "") on success, or (False, reason) describing the first
    problem found. CPU/memory are optional; a falsy throughput defaults
    to 1000.0 before range checking.
    """
    try:
        # --- type checks: each field must convert cleanly to float ---
        try:
            latency_val = float(latency)
        except (ValueError, TypeError):
            return False, "❌ Invalid latency: must be a number"

        try:
            error_rate_val = float(error_rate)
        except (ValueError, TypeError):
            return False, "❌ Invalid error rate: must be a number"

        if throughput:
            try:
                throughput_val = float(throughput)
            except (ValueError, TypeError):
                return False, "❌ Invalid throughput: must be a number"
        else:
            throughput_val = 1000.0

        # CPU and memory are optional; skipped entirely when falsy.
        cpu_val = None
        if cpu_util:
            try:
                cpu_val = float(cpu_util)
            except (ValueError, TypeError):
                return False, "❌ Invalid CPU utilization: must be a number"

        memory_val = None
        if memory_util:
            try:
                memory_val = float(memory_util)
            except (ValueError, TypeError):
                return False, "❌ Invalid memory utilization: must be a number"

        # --- range checks ---
        if not (0 <= latency_val <= 10000):
            return False, "❌ Invalid latency: must be between 0-10000ms"

        if not (0 <= error_rate_val <= 1):
            return False, "❌ Invalid error rate: must be between 0-1"

        if throughput_val < 0:
            return False, "❌ Invalid throughput: must be positive"

        if cpu_val is not None and not (0 <= cpu_val <= 1):
            return False, "❌ Invalid CPU utilization: must be between 0-1"

        if memory_val is not None and not (0 <= memory_val <= 1):
            return False, "❌ Invalid memory utilization: must be between 0-1"

        return True, ""

    except Exception as e:
        logger.error(f"Validation error: {e}", exc_info=True)
        return False, f"❌ Validation error: {str(e)}"
199
-
200
-
201
# === Thread-Safe Data Structures ===
class ThreadSafeEventStore:
    """Bounded, lock-protected store of reliability events."""

    def __init__(self, max_size: int = Constants.MAX_EVENTS_STORED):
        # deque(maxlen=...) silently evicts the oldest entry once full.
        self._events = deque(maxlen=max_size)
        self._lock = threading.RLock()
        logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")

    def add(self, event: ReliabilityEvent) -> None:
        """Append one event (oldest is dropped when the store is full)."""
        with self._lock:
            self._events.append(event)
            logger.debug(f"Added event for {event.component}: {event.severity.value}")

    def get_recent(self, n: int = 15) -> List[ReliabilityEvent]:
        """Return up to the n most recent events, oldest first."""
        with self._lock:
            if not self._events:
                return []
            return list(self._events)[-n:]

    def get_all(self) -> List[ReliabilityEvent]:
        """Return a snapshot copy of every stored event."""
        with self._lock:
            return list(self._events)

    def count(self) -> int:
        """Return how many events are currently stored."""
        with self._lock:
            return len(self._events)
230
-
231
-
232
# === FAISS Integration (FIXED: Single-writer pattern for thread safety) ===
class ProductionFAISSIndex:
    """
    Production-safe FAISS wrapper using a single-writer pattern.

    FAISS indexes are not safe for concurrent writes, so every mutation is
    queued and applied by one dedicated writer thread; persistence uses
    atomic write-temp-then-rename saves with fsync.
    """

    def __init__(self, index, texts: List[str]):
        self.index = index      # FAISS index; mutated only by the writer thread
        self.texts = texts      # parallel list of raw incident texts
        self._lock = threading.RLock()

        # Shutdown flag must exist BEFORE the writer thread starts polling it.
        self._shutdown = threading.Event()

        # Single writer thread: serializes every FAISS mutation.
        self._write_queue: Queue = Queue()
        self._writer_thread = threading.Thread(
            target=self._writer_loop,
            daemon=True,
            name="FAISSWriter"
        )
        self._writer_thread.start()

        # ProcessPool for encoding (avoids GIL + memory leaks)
        self._encoder_pool = ProcessPoolExecutor(max_workers=2)

        logger.info(
            f"Initialized ProductionFAISSIndex with {len(texts)} vectors, "
            f"single-writer pattern"
        )

    def add_async(self, vector: np.ndarray, text: str) -> None:
        """Queue one (vector, text) pair for indexing; returns immediately."""
        self._write_queue.put((vector, text))
        logger.debug(f"Queued vector for indexing: {text[:50]}...")

    def _writer_loop(self) -> None:
        """
        Writer-thread main loop: batch queued items and flush periodically.

        Only this thread ever mutates the FAISS index, which makes the
        design safe without index-level locking.
        """
        import queue  # FIX: hoisted — was re-imported on every iteration

        batch: List[Tuple[np.ndarray, str]] = []
        last_save = datetime.datetime.now()
        save_interval = datetime.timedelta(
            seconds=Constants.FAISS_SAVE_INTERVAL_SECONDS
        )

        while not self._shutdown.is_set():
            try:
                # Pull at most one item per iteration so shutdown stays responsive.
                try:
                    batch.append(self._write_queue.get(timeout=1.0))
                except queue.Empty:
                    pass

                # Flush when the batch is full, or non-empty and stale.
                if len(batch) >= Constants.FAISS_BATCH_SIZE or \
                        (batch and datetime.datetime.now() - last_save > save_interval):
                    self._flush_batch(batch)
                    batch = []

                # Periodic durability checkpoint.
                if datetime.datetime.now() - last_save > save_interval:
                    self._save_atomic()
                    last_save = datetime.datetime.now()

            except Exception as e:
                logger.error(f"Writer loop error: {e}", exc_info=True)

        # FIX: drain and flush whatever is still pending so a graceful
        # shutdown cannot lose queued or half-batched vectors.
        while True:
            try:
                batch.append(self._write_queue.get_nowait())
            except queue.Empty:
                break
        self._flush_batch(batch)

    def _flush_batch(self, batch: List[Tuple[np.ndarray, str]]) -> None:
        """Append a batch to the index. SAFE: called only from the writer thread."""
        if not batch:
            return

        try:
            vectors = np.vstack([v for v, _ in batch])
            texts = [t for _, t in batch]

            # SAFE: single writer - no concurrent access to the FAISS index.
            self.index.add(vectors)

            with self._lock:  # Only lock for text list modification
                self.texts.extend(texts)

            logger.info(f"Flushed batch of {len(batch)} vectors to FAISS index")

        except Exception as e:
            logger.error(f"Error flushing batch: {e}", exc_info=True)

    def _save_atomic(self) -> None:
        """
        Persist index + texts via write-to-temp, fsync, atomic rename.

        Prevents a crash mid-save from corrupting the on-disk index.
        """
        try:
            import faiss

            # Reserve a temp path next to the target so os.replace stays on
            # the same filesystem (required for the rename to be atomic).
            with tempfile.NamedTemporaryFile(
                mode='wb',
                delete=False,
                dir=os.path.dirname(config.INDEX_FILE),
                prefix='index_',
                suffix='.tmp'
            ) as tmp:
                temp_path = tmp.name

            faiss.write_index(self.index, temp_path)

            # Force bytes to disk before the rename makes them "official".
            with open(temp_path, 'r+b') as f:
                f.flush()
                os.fsync(f.fileno())

            os.replace(temp_path, config.INDEX_FILE)

            # Snapshot texts under the lock, then write them atomically too.
            with self._lock:
                texts_copy = self.texts.copy()

            with atomicwrites.atomic_write(
                config.TEXTS_FILE,
                mode='w',
                overwrite=True
            ) as f:
                json.dump(texts_copy, f)

            logger.info(
                f"Atomically saved FAISS index with {len(texts_copy)} vectors"
            )

        except Exception as e:
            logger.error(f"Error saving index: {e}", exc_info=True)

    def get_count(self) -> int:
        """Total vectors: already-indexed texts plus still-queued items."""
        with self._lock:
            return len(self.texts) + self._write_queue.qsize()

    def force_save(self) -> None:
        """Wait (bounded at 10s) for the queue to drain, then save immediately."""
        import time  # FIX: hoisted — was re-imported inside the polling loop

        logger.info("Forcing FAISS index save...")

        timeout = 10.0
        start = datetime.datetime.now()

        while not self._write_queue.empty():
            if (datetime.datetime.now() - start).total_seconds() > timeout:
                logger.warning("Force save timeout - queue not empty")
                break
            time.sleep(0.1)

        self._save_atomic()

    def shutdown(self) -> None:
        """Stop the writer thread, persist remaining state, release the pool."""
        logger.info("Shutting down FAISS index...")
        self._shutdown.set()
        # FIX: join FIRST — the exiting writer drains and flushes its final
        # batch, so saving before the join could race it and miss vectors.
        self._writer_thread.join(timeout=5.0)
        self._save_atomic()
        self._encoder_pool.shutdown(wait=True)
413
-
414
-
415
# === FAISS & Embeddings Setup ===
try:
    from sentence_transformers import SentenceTransformer
    import faiss

    logger.info("Loading SentenceTransformer model...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    logger.info("SentenceTransformer model loaded successfully")

    if not os.path.exists(config.INDEX_FILE):
        # No persisted index yet: start fresh.
        logger.info("Creating new FAISS index")
        index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
        incident_texts = []
    else:
        logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
        index = faiss.read_index(config.INDEX_FILE)

        if index.d != Constants.VECTOR_DIM:
            # Dimension drift (e.g. model change): discard and rebuild.
            logger.warning(
                f"Index dimension mismatch: {index.d} != {Constants.VECTOR_DIM}. "
                f"Creating new index."
            )
            index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
            incident_texts = []
        else:
            with open(config.TEXTS_FILE, "r") as f:
                incident_texts = json.load(f)
            logger.info(f"Loaded {len(incident_texts)} incident texts")

    thread_safe_index = ProductionFAISSIndex(index, incident_texts)

except ImportError as e:
    logger.warning(f"FAISS or SentenceTransformers not available: {e}")
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
except Exception as e:
    logger.error(f"Error initializing FAISS: {e}", exc_info=True)
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
458
-
459
# === Predictive Models ===
class SimplePredictiveEngine:
    """
    Lightweight per-service forecasting engine.

    Keeps a bounded telemetry window per service and produces simple
    trend-based forecasts (linear fit, exponential smoothing, rolling
    means) with a short-lived prediction cache.
    """

    def __init__(self, history_window: int = Constants.HISTORY_WINDOW):
        self.history_window = history_window
        self.service_history: Dict[str, deque] = {}
        self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
        self.max_cache_age = datetime.timedelta(minutes=Constants.CACHE_EXPIRY_MINUTES)
        self._lock = threading.RLock()
        logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")

    def add_telemetry(self, service: str, event_data: Dict) -> None:
        """Record one telemetry sample for *service* and expire stale cache entries."""
        with self._lock:
            if service not in self.service_history:
                self.service_history[service] = deque(maxlen=self.history_window)

            self.service_history[service].append({
                'timestamp': datetime.datetime.now(datetime.timezone.utc),
                'latency': event_data.get('latency_p99', 0),
                'error_rate': event_data.get('error_rate', 0),
                'throughput': event_data.get('throughput', 0),
                'cpu_util': event_data.get('cpu_util'),
                'memory_util': event_data.get('memory_util'),
            })
            self._clean_cache()

    def _clean_cache(self) -> None:
        """Drop prediction-cache entries older than ``max_cache_age``."""
        now = datetime.datetime.now(datetime.timezone.utc)
        stale_keys = [key for key, (_, stamped) in self.prediction_cache.items()
                      if now - stamped > self.max_cache_age]
        for key in stale_keys:
            del self.prediction_cache[key]

        if stale_keys:
            logger.debug(f"Cleaned {len(stale_keys)} expired cache entries")

    def forecast_service_health(
        self,
        service: str,
        lookahead_minutes: int = Constants.FORECAST_LOOKAHEAD_MINUTES
    ) -> List[ForecastResult]:
        """Produce latency / error-rate / resource forecasts for *service*."""
        with self._lock:
            samples = self.service_history.get(service)
            if samples is None or len(samples) < Constants.FORECAST_MIN_DATA_POINTS:
                return []
            history = list(samples)

        results: List[ForecastResult] = []

        for maybe in (
            self._forecast_latency(history, lookahead_minutes),
            self._forecast_error_rate(history, lookahead_minutes),
        ):
            if maybe:
                results.append(maybe)

        results.extend(self._forecast_resources(history, lookahead_minutes))

        # Cache each forecast under "<service>_<metric>" with its timestamp.
        with self._lock:
            for item in results:
                self.prediction_cache[f"{service}_{item.metric}"] = (
                    item, datetime.datetime.now(datetime.timezone.utc)
                )

        return results

    def _forecast_latency(
        self,
        history: List,
        lookahead_minutes: int
    ) -> Optional[ForecastResult]:
        """Linear-regression latency forecast over the last 20 samples."""
        try:
            window = [sample['latency'] for sample in history[-20:]]
            if len(window) < Constants.FORECAST_MIN_DATA_POINTS:
                return None

            # Fit a first-degree trend line to the windowed latencies.
            xs = np.arange(len(window))
            slope, intercept = np.polyfit(xs, window, 1)
            projected = slope * len(window) + intercept

            # Confidence shrinks as residual spread grows relative to the mean.
            residuals = np.asarray(window) - (slope * xs + intercept)
            confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(window))))

            if slope > Constants.SLOPE_THRESHOLD_INCREASING:
                trend = "increasing"
                risk = "critical" if projected > Constants.LATENCY_EXTREME else "high"
            elif slope < Constants.SLOPE_THRESHOLD_DECREASING:
                trend, risk = "decreasing", "low"
            else:
                trend = "stable"
                risk = "low" if projected < Constants.LATENCY_WARNING else "medium"

            # Estimate minutes until the extreme-latency threshold is crossed.
            time_to_critical = None
            if slope > 0 and projected < Constants.LATENCY_EXTREME:
                step = projected - window[-1]
                if abs(step) > 0.1:
                    eta = lookahead_minutes * \
                        (Constants.LATENCY_EXTREME - projected) / step
                    if eta > 0:
                        time_to_critical = eta

            return ForecastResult(
                metric="latency",
                predicted_value=projected,
                confidence=confidence,
                trend=trend,
                time_to_threshold=time_to_critical,
                risk_level=risk
            )

        except Exception as e:
            logger.error(f"Latency forecast error: {e}", exc_info=True)
            return None

    def _forecast_error_rate(
        self,
        history: List,
        lookahead_minutes: int
    ) -> Optional[ForecastResult]:
        """Exponential-smoothing error-rate forecast over the last 15 samples."""
        try:
            rates = [sample['error_rate'] for sample in history[-15:]]
            if len(rates) < Constants.FORECAST_MIN_DATA_POINTS:
                return None

            # Exponential smoothing with a fixed alpha of 0.3.
            alpha = 0.3
            smoothed = rates[0]
            for rate in rates[1:]:
                smoothed = alpha * rate + (1 - alpha) * smoothed
            predicted_rate = smoothed

            # Compare the last 3 samples against the 3 before them.
            recent_trend = np.mean(rates[-3:]) - np.mean(rates[-6:-3])

            if recent_trend > 0.02:
                trend = "increasing"
                risk = "critical" if predicted_rate > Constants.ERROR_RATE_CRITICAL else "high"
            elif recent_trend < -0.01:
                trend, risk = "decreasing", "low"
            else:
                trend = "stable"
                risk = "low" if predicted_rate < Constants.ERROR_RATE_WARNING else "medium"

            # Volatile series get lower confidence.
            confidence = max(0, 1 - (np.std(rates) / max(0.01, np.mean(rates))))

            return ForecastResult(
                metric="error_rate",
                predicted_value=predicted_rate,
                confidence=confidence,
                trend=trend,
                risk_level=risk
            )

        except Exception as e:
            logger.error(f"Error rate forecast error: {e}", exc_info=True)
            return None

    def _forecast_resources(
        self,
        history: List,
        lookahead_minutes: int
    ) -> List[ForecastResult]:
        """Rolling-mean forecasts for CPU and memory utilization."""
        results: List[ForecastResult] = []

        specs = (
            ("cpu_util", Constants.CPU_CRITICAL, Constants.CPU_WARNING, "CPU forecast error"),
            ("memory_util", Constants.MEMORY_CRITICAL, Constants.MEMORY_WARNING, "Memory forecast error"),
        )
        for key, critical, warning, err_label in specs:
            values = [sample[key] for sample in history if sample.get(key) is not None]
            if len(values) < Constants.FORECAST_MIN_DATA_POINTS:
                continue
            try:
                predicted = np.mean(values[-5:])
                trend = "increasing" if values[-1] > np.mean(values[-10:-5]) else "stable"

                if predicted > critical:
                    risk = "critical"
                elif predicted > warning:
                    risk = "high"
                elif predicted > 0.7:
                    risk = "medium"
                else:
                    risk = "low"

                results.append(ForecastResult(
                    metric=key,
                    predicted_value=predicted,
                    confidence=0.7,
                    trend=trend,
                    risk_level=risk
                ))
            except Exception as e:
                logger.error(f"{err_label}: {e}", exc_info=True)

        return results

    def get_predictive_insights(self, service: str) -> Dict[str, Any]:
        """Turn raw forecasts for *service* into warnings and recommendations."""
        forecasts = self.forecast_service_health(service)

        critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
        warnings: List[str] = []
        recommendations: List[str] = []

        for forecast in critical_risks:
            elevated = forecast.risk_level in ["high", "critical"]
            if forecast.metric == "latency" and elevated:
                warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
                if forecast.time_to_threshold:
                    minutes = int(forecast.time_to_threshold)
                    recommendations.append(f"⏰ Critical latency (~{Constants.LATENCY_EXTREME}ms) in ~{minutes} minutes")
                recommendations.append("🔧 Consider scaling or optimizing dependencies")

            elif forecast.metric == "error_rate" and elevated:
                warnings.append(f"🚨 Errors expected to reach {forecast.predicted_value*100:.1f}%")
                recommendations.append("🐛 Investigate recent deployments or dependency issues")

            elif forecast.metric == "cpu_util" and elevated:
                warnings.append(f"🔥 CPU expected at {forecast.predicted_value*100:.1f}%")
                recommendations.append("⚡ Consider scaling compute resources")

            elif forecast.metric == "memory_util" and elevated:
                warnings.append(f"💾 Memory expected at {forecast.predicted_value*100:.1f}%")
                recommendations.append("🧹 Check for memory leaks or optimize usage")

        return {
            'service': service,
            'forecasts': [
                {
                    'metric': f.metric,
                    'predicted_value': f.predicted_value,
                    'confidence': f.confidence,
                    'trend': f.trend,
                    'risk_level': f.risk_level,
                    'time_to_threshold': f.time_to_threshold
                }
                for f in forecasts
            ],
            'warnings': warnings[:3],
            'recommendations': list(dict.fromkeys(recommendations))[:3],
            'critical_risk_count': len(critical_risks),
            'forecast_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
        }
753
-
754
-
755
class BusinessImpactCalculator:
    """Translate reliability events into estimated business impact numbers."""

    def __init__(self, revenue_per_request: float = 0.01):
        self.revenue_per_request = revenue_per_request
        logger.info(f"Initialized BusinessImpactCalculator")

    def calculate_impact(
        self,
        event: ReliabilityEvent,
        duration_minutes: int = 5
    ) -> Dict[str, Any]:
        """Estimate revenue loss, affected users, and a severity label for *event*."""
        # Start from a 1.0 multiplier and stack penalties per breached metric.
        impact_multiplier = 1.0
        if event.latency_p99 > Constants.LATENCY_CRITICAL:
            impact_multiplier += 0.5
        if event.error_rate > 0.1:
            impact_multiplier += 0.8
        if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
            impact_multiplier += 0.3

        revenue_loss = Constants.BASE_REVENUE_PER_MINUTE * impact_multiplier * (duration_minutes / 60)

        # User impact scales with error rate and excess latency beyond 100ms.
        user_impact_multiplier = (event.error_rate * 10) + \
            (max(0, event.latency_p99 - 100) / 500)
        affected_users = int(Constants.BASE_USERS * user_impact_multiplier)

        # Severity buckets keyed on either dollars lost or users affected.
        if revenue_loss > 500 or affected_users > 5000:
            severity = "CRITICAL"
        elif revenue_loss > 100 or affected_users > 1000:
            severity = "HIGH"
        elif revenue_loss > 50 or affected_users > 500:
            severity = "MEDIUM"
        else:
            severity = "LOW"

        logger.info(
            f"Business impact: ${revenue_loss:.2f} revenue loss, "
            f"{affected_users} users, {severity} severity"
        )

        return {
            'revenue_loss_estimate': round(revenue_loss, 2),
            'affected_users_estimate': affected_users,
            'severity_level': severity,
            'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
        }
808
-
809
-
810
class AdvancedAnomalyDetector:
    """Anomaly detector whose latency threshold adapts to recent history."""

    def __init__(self):
        self.historical_data = deque(maxlen=100)
        # Seeded with static defaults; latency adapts as data arrives.
        self.adaptive_thresholds = {
            'latency_p99': Constants.LATENCY_WARNING,
            'error_rate': Constants.ERROR_RATE_WARNING
        }
        self._lock = threading.RLock()
        logger.info("Initialized AdvancedAnomalyDetector")

    def detect_anomaly(self, event: ReliabilityEvent) -> bool:
        """Return True when the event breaches an adaptive or hard resource threshold."""
        with self._lock:
            latency_breach = event.latency_p99 > self.adaptive_thresholds['latency_p99']
            error_breach = event.error_rate > self.adaptive_thresholds['error_rate']

            resource_breach = bool(
                (event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL)
                or (event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL)
            )

            # Feed every event back in so thresholds track the workload.
            self._update_thresholds(event)

            is_anomaly = latency_breach or error_breach or resource_breach

            if is_anomaly:
                logger.info(
                    f"Anomaly detected for {event.component}: "
                    f"latency={latency_breach}, error={error_breach}, "
                    f"resource={resource_breach}"
                )

            return is_anomaly

    def _update_thresholds(self, event: ReliabilityEvent) -> None:
        """Re-derive the latency threshold as the 90th percentile of recent events."""
        self.historical_data.append(event)

        if len(self.historical_data) > 10:
            recent = [e.latency_p99 for e in list(self.historical_data)[-20:]]
            threshold = np.percentile(recent, 90)
            self.adaptive_thresholds['latency_p99'] = threshold
            logger.debug(f"Updated adaptive latency threshold to {threshold:.2f}ms")
856
-
857
# === Multi-Agent System ===
class AgentSpecialization(Enum):
    """Roles the specialized agents can take on."""

    DETECTIVE = "anomaly_detection"        # pattern / anomaly spotting
    DIAGNOSTICIAN = "root_cause_analysis"  # root-cause investigation
    PREDICTIVE = "predictive_analytics"    # trend forecasting
863
-
864
-
865
class BaseAgent:
    """Common state and interface shared by every specialized agent."""

    def __init__(self, specialization: AgentSpecialization):
        self.specialization = specialization
        # Rolling bookkeeping of agent quality/throughput.
        self.performance_metrics = {
            'processed_events': 0,
            'successful_analyses': 0,
            'average_confidence': 0.0
        }

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Analyze one event; concrete agents must override this."""
        raise NotImplementedError
879
-
880
-
881
- class AnomalyDetectionAgent(BaseAgent):
882
- """Specialized agent for anomaly detection and pattern recognition"""
883
-
884
- def __init__(self):
885
- super().__init__(AgentSpecialization.DETECTIVE)
886
- logger.info("Initialized AnomalyDetectionAgent")
887
-
888
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
889
- """Perform comprehensive anomaly analysis"""
890
- try:
891
- anomaly_score = self._calculate_anomaly_score(event)
892
-
893
- return {
894
- 'specialization': self.specialization.value,
895
- 'confidence': anomaly_score,
896
- 'findings': {
897
- 'anomaly_score': anomaly_score,
898
- 'severity_tier': self._classify_severity(anomaly_score),
899
- 'primary_metrics_affected': self._identify_affected_metrics(event)
900
- },
901
- 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
902
- }
903
- except Exception as e:
904
- logger.error(f"AnomalyDetectionAgent error: {e}", exc_info=True)
905
- return {
906
- 'specialization': self.specialization.value,
907
- 'confidence': 0.0,
908
- 'findings': {},
909
- 'recommendations': [f"Analysis error: {str(e)}"]
910
- }
911
-
912
- def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
913
- """Calculate comprehensive anomaly score (0-1)"""
914
- scores = []
915
-
916
- # Latency anomaly (weighted 40%)
917
- if event.latency_p99 > Constants.LATENCY_WARNING:
918
- latency_score = min(1.0, (event.latency_p99 - Constants.LATENCY_WARNING) / 500)
919
- scores.append(0.4 * latency_score)
920
-
921
- # Error rate anomaly (weighted 30%)
922
- if event.error_rate > Constants.ERROR_RATE_WARNING:
923
- error_score = min(1.0, event.error_rate / 0.3)
924
- scores.append(0.3 * error_score)
925
-
926
- # Resource anomaly (weighted 30%)
927
- resource_score = 0
928
- if event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
929
- resource_score += 0.15 * min(1.0, (event.cpu_util - Constants.CPU_WARNING) / 0.2)
930
- if event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
931
- resource_score += 0.15 * min(1.0, (event.memory_util - Constants.MEMORY_WARNING) / 0.2)
932
- scores.append(resource_score)
933
-
934
- return min(1.0, sum(scores))
935
-
936
- def _classify_severity(self, anomaly_score: float) -> str:
937
- """Classify severity tier based on anomaly score"""
938
- if anomaly_score > 0.8:
939
- return "CRITICAL"
940
- elif anomaly_score > 0.6:
941
- return "HIGH"
942
- elif anomaly_score > 0.4:
943
- return "MEDIUM"
944
- else:
945
- return "LOW"
946
-
947
- def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
948
- """Identify which metrics are outside normal ranges"""
949
- affected = []
950
-
951
- # Latency checks
952
- if event.latency_p99 > Constants.LATENCY_EXTREME:
953
- affected.append({
954
- "metric": "latency",
955
- "value": event.latency_p99,
956
- "severity": "CRITICAL",
957
- "threshold": Constants.LATENCY_WARNING
958
- })
959
- elif event.latency_p99 > Constants.LATENCY_CRITICAL:
960
- affected.append({
961
- "metric": "latency",
962
- "value": event.latency_p99,
963
- "severity": "HIGH",
964
- "threshold": Constants.LATENCY_WARNING
965
- })
966
- elif event.latency_p99 > Constants.LATENCY_WARNING:
967
- affected.append({
968
- "metric": "latency",
969
- "value": event.latency_p99,
970
- "severity": "MEDIUM",
971
- "threshold": Constants.LATENCY_WARNING
972
- })
973
-
974
- # Error rate checks
975
- if event.error_rate > Constants.ERROR_RATE_CRITICAL:
976
- affected.append({
977
- "metric": "error_rate",
978
- "value": event.error_rate,
979
- "severity": "CRITICAL",
980
- "threshold": Constants.ERROR_RATE_WARNING
981
- })
982
- elif event.error_rate > Constants.ERROR_RATE_HIGH:
983
- affected.append({
984
- "metric": "error_rate",
985
- "value": event.error_rate,
986
- "severity": "HIGH",
987
- "threshold": Constants.ERROR_RATE_WARNING
988
- })
989
- elif event.error_rate > Constants.ERROR_RATE_WARNING:
990
- affected.append({
991
- "metric": "error_rate",
992
- "value": event.error_rate,
993
- "severity": "MEDIUM",
994
- "threshold": Constants.ERROR_RATE_WARNING
995
- })
996
-
997
- # CPU checks
998
- if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
999
- affected.append({
1000
- "metric": "cpu",
1001
- "value": event.cpu_util,
1002
- "severity": "CRITICAL",
1003
- "threshold": Constants.CPU_WARNING
1004
- })
1005
- elif event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
1006
- affected.append({
1007
- "metric": "cpu",
1008
- "value": event.cpu_util,
1009
- "severity": "HIGH",
1010
- "threshold": Constants.CPU_WARNING
1011
- })
1012
-
1013
- # Memory checks
1014
- if event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL:
1015
- affected.append({
1016
- "metric": "memory",
1017
- "value": event.memory_util,
1018
- "severity": "CRITICAL",
1019
- "threshold": Constants.MEMORY_WARNING
1020
- })
1021
- elif event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
1022
- affected.append({
1023
- "metric": "memory",
1024
- "value": event.memory_util,
1025
- "severity": "HIGH",
1026
- "threshold": Constants.MEMORY_WARNING
1027
- })
1028
-
1029
- return affected
1030
-
1031
- def _generate_detection_recommendations(
1032
- self,
1033
- event: ReliabilityEvent,
1034
- anomaly_score: float
1035
- ) -> List[str]:
1036
- """Generate actionable recommendations"""
1037
- recommendations = []
1038
- affected_metrics = self._identify_affected_metrics(event)
1039
-
1040
- for metric in affected_metrics:
1041
- metric_name = metric["metric"]
1042
- severity = metric["severity"]
1043
- value = metric["value"]
1044
- threshold = metric["threshold"]
1045
-
1046
- if metric_name == "latency":
1047
- if severity == "CRITICAL":
1048
- recommendations.append(
1049
- f"🚨 CRITICAL: Latency {value:.0f}ms (>{threshold}ms) - "
1050
- f"Check database & external dependencies"
1051
- )
1052
- elif severity == "HIGH":
1053
- recommendations.append(
1054
- f"⚠️ HIGH: Latency {value:.0f}ms (>{threshold}ms) - "
1055
- f"Investigate service performance"
1056
- )
1057
- else:
1058
- recommendations.append(
1059
- f"📈 Latency elevated: {value:.0f}ms (>{threshold}ms) - Monitor trend"
1060
- )
1061
-
1062
- elif metric_name == "error_rate":
1063
- if severity == "CRITICAL":
1064
- recommendations.append(
1065
- f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
1066
- f"Check recent deployments"
1067
- )
1068
- elif severity == "HIGH":
1069
- recommendations.append(
1070
- f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
1071
- f"Review application logs"
1072
- )
1073
- else:
1074
- recommendations.append(
1075
- f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100:.1f}%)"
1076
- )
1077
-
1078
- elif metric_name == "cpu":
1079
- recommendations.append(
1080
- f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling"
1081
- )
1082
-
1083
- elif metric_name == "memory":
1084
- recommendations.append(
1085
- f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks"
1086
- )
1087
-
1088
- # Overall severity recommendations
1089
- if anomaly_score > 0.8:
1090
- recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
1091
- elif anomaly_score > 0.6:
1092
- recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
1093
- elif anomaly_score > 0.4:
1094
- recommendations.append("📊 MONITOR: Early warning signs detected")
1095
-
1096
- return recommendations[:4]
1097
-
1098
-
1099
class RootCauseAgent(BaseAgent):
    """Specialized agent for root cause analysis.

    Matches an event's metric profile against four known failure
    signatures (dependency failure, resource exhaustion, application bug,
    gradual degradation) and returns ranked candidate causes together
    with supporting evidence and investigation hints.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)
        logger.info("Initialized RootCauseAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Perform root cause analysis.

        Returns a dict with 'specialization', a fixed overall 'confidence'
        of 0.7, 'findings' (candidate causes, evidence patterns,
        investigation priority) and at most two 'recommendations'.
        Never raises: any internal failure is logged and reported as a
        zero-confidence result so the orchestrator keeps running.
        """
        try:
            causes = self._analyze_potential_causes(event)

            return {
                'specialization': self.specialization.value,
                'confidence': 0.7,
                'findings': {
                    'likely_root_causes': causes,
                    'evidence_patterns': self._identify_evidence(event),
                    'investigation_priority': self._prioritize_investigation(causes)
                },
                'recommendations': [
                    f"Check {cause['cause']} for issues" for cause in causes[:2]
                ]
            }
        except Exception as e:
            logger.error(f"RootCauseAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"]
            }

    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
        """Analyze potential root causes based on event patterns.

        Patterns are evaluated independently, so several candidates may be
        returned. If nothing matches, a single low-confidence "Unknown"
        placeholder is returned so callers never see an empty list.
        """
        causes = []

        # Pattern 1: Database/External Dependency Failure
        # (extreme latency combined with a high error rate)
        if event.latency_p99 > Constants.LATENCY_EXTREME and event.error_rate > 0.2:
            causes.append({
                "cause": "Database/External Dependency Failure",
                "confidence": 0.85,
                "evidence": f"Extreme latency ({event.latency_p99:.0f}ms) with high errors ({event.error_rate*100:.1f}%)",
                "investigation": "Check database connection pool, external API health"
            })

        # Pattern 2: Resource Exhaustion
        # (both CPU and memory past their critical thresholds; falsy
        # utilizations count as "not reported" and skip the pattern)
        if (event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL and
            event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL):
            causes.append({
                "cause": "Resource Exhaustion",
                "confidence": 0.90,
                "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
                "investigation": "Check for memory leaks, infinite loops, insufficient resources"
            })

        # Pattern 3: Application Bug / Configuration Issue
        # (errors spiking while latency stays under 200ms)
        if event.error_rate > Constants.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
            causes.append({
                "cause": "Application Bug / Configuration Issue",
                "confidence": 0.75,
                "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
                "investigation": "Review recent deployments, configuration changes, application logs"
            })

        # Pattern 4: Gradual Performance Degradation
        # (moderate 200-400ms latency band plus a warning-to-high error band)
        if (200 <= event.latency_p99 <= 400 and
            Constants.ERROR_RATE_WARNING <= event.error_rate <= Constants.ERROR_RATE_HIGH):
            causes.append({
                "cause": "Gradual Performance Degradation",
                "confidence": 0.65,
                "evidence": f"Moderate latency ({event.latency_p99:.0f}ms) and errors ({event.error_rate*100:.1f}%)",
                "investigation": "Check resource trends, dependency performance, capacity planning"
            })

        # Default: Unknown pattern — always return at least one candidate.
        if not causes:
            causes.append({
                "cause": "Unknown - Requires Investigation",
                "confidence": 0.3,
                "evidence": "Pattern does not match known failure modes",
                "investigation": "Complete system review needed"
            })

        return causes

    def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
        """Identify evidence patterns in the event data.

        Returns a list of short tags consumed by downstream synthesis.
        """
        evidence = []

        # Heuristic: latency (in ms) dwarfs the error rate scaled by 1000 —
        # presumably tags latency-dominant incidents; TODO confirm the
        # intended meaning of the x1000 scaling.
        if event.latency_p99 > event.error_rate * 1000:
            evidence.append("latency_disproportionate_to_errors")

        if (event.cpu_util and event.cpu_util > Constants.CPU_WARNING and
            event.memory_util and event.memory_util > Constants.MEMORY_WARNING):
            evidence.append("correlated_resource_exhaustion")

        if event.error_rate > Constants.ERROR_RATE_HIGH and event.latency_p99 < Constants.LATENCY_CRITICAL:
            evidence.append("errors_without_latency_impact")

        return evidence

    def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
        """Determine investigation priority.

        HIGH when any candidate involves a database dependency or resource
        exhaustion (substring match on the cause name); MEDIUM otherwise.
        """
        for cause in causes:
            if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
                return "HIGH"
        return "MEDIUM"
-
1208
-
1209
class PredictiveAgent(BaseAgent):
    """Specialized agent for predictive analytics.

    Feeds each event's telemetry into the shared predictive engine and
    surfaces the engine's forward-looking risk insights for the component.
    """

    def __init__(self, engine: SimplePredictiveEngine):
        super().__init__(AgentSpecialization.PREDICTIVE)
        self.engine = engine
        logger.info("Initialized PredictiveAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Record the event's telemetry and return predictive risk insights.

        Never raises: failures are logged and reported as a
        zero-confidence result.
        """
        try:
            telemetry = {
                'latency_p99': event.latency_p99,
                'error_rate': event.error_rate,
                'throughput': event.throughput,
                'cpu_util': event.cpu_util,
                'memory_util': event.memory_util,
            }
            self.engine.add_telemetry(event.component, telemetry)

            insights = self.engine.get_predictive_insights(event.component)

            # Confidence is boosted whenever the engine flags a critical risk.
            has_critical_risk = insights['critical_risk_count'] > 0
            return {
                'specialization': self.specialization.value,
                'confidence': 0.8 if has_critical_risk else 0.5,
                'findings': insights,
                'recommendations': insights['recommendations'],
            }
        except Exception as e:
            logger.error(f"PredictiveAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"],
            }
-
1246
-
1247
- # FIXED: Add circuit breaker for agent resilience
1248
@circuit(failure_threshold=3, recovery_timeout=30, name="agent_circuit_breaker")
async def call_agent_with_protection(agent: BaseAgent, event: ReliabilityEvent) -> Dict[str, Any]:
    """
    Call agent with circuit breaker protection

    FIXED: Prevents cascading failures from misbehaving agents

    After 3 consecutive failures the breaker opens for 30 seconds, during
    which calls fail fast instead of invoking the agent again.

    NOTE(review): the ``circuitbreaker`` package only gained coroutine
    support in later releases — confirm the pinned version wraps async
    functions correctly, otherwise failures are never counted.

    Args:
        agent: specialist agent to invoke.
        event: telemetry event under analysis.

    Returns:
        The agent's analysis dict.

    Raises:
        asyncio.TimeoutError: if the agent exceeds AGENT_TIMEOUT_SECONDS.
        Exception: any agent error is logged and re-raised so the breaker
            can record it as a failure.
    """
    try:
        result = await asyncio.wait_for(
            agent.analyze(event),
            timeout=Constants.AGENT_TIMEOUT_SECONDS
        )
        return result
    except asyncio.TimeoutError:
        # Timeouts are expected occasionally; log at warning, not error.
        logger.warning(f"Agent {agent.specialization.value} timed out")
        raise
    except Exception as e:
        logger.error(f"Agent {agent.specialization.value} error: {e}", exc_info=True)
        raise
-
1268
-
1269
class OrchestrationManager:
    """Orchestrates multiple specialized agents for comprehensive analysis"""

    def __init__(
        self,
        detective: Optional[AnomalyDetectionAgent] = None,
        diagnostician: Optional[RootCauseAgent] = None,
        predictive: Optional[PredictiveAgent] = None
    ):
        """
        Initialize orchestration manager

        FIXED: Dependency injection for testability

        Any agent left as None falls back to a default instance, so the
        manager works with zero arguments in production while tests can
        inject fakes.
        """
        self.agents = {
            AgentSpecialization.DETECTIVE: detective or AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: diagnostician or RootCauseAgent(),
            AgentSpecialization.PREDICTIVE: predictive or PredictiveAgent(SimplePredictiveEngine()),
        }
        logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")

    async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Coordinate multiple agents for comprehensive analysis

        FIXED: Improved timeout handling with circuit breakers

        Runs every agent in parallel (each call is already wrapped in its
        own per-agent timeout and circuit breaker) under one global
        deadline, then merges whatever succeeded into a synthesis dict.

        NOTE(review): if the global wait_for deadline fires, results from
        agents that already finished are discarded too — confirm this
        all-or-nothing behavior is intended.
        """
        # Create tasks for all agents
        agent_tasks = []
        agent_specs = []

        for spec, agent in self.agents.items():
            agent_tasks.append(call_agent_with_protection(agent, event))
            agent_specs.append(spec)

        # FIXED: Parallel execution with global timeout
        agent_results = {}

        try:
            # Run all agents in parallel with global timeout.
            # return_exceptions=True keeps one failing agent from
            # cancelling the others.
            results = await asyncio.wait_for(
                asyncio.gather(*agent_tasks, return_exceptions=True),
                timeout=Constants.AGENT_TIMEOUT_SECONDS + 1
            )

            # Process results (zip is safe: gather preserves task order).
            for spec, result in zip(agent_specs, results):
                if isinstance(result, Exception):
                    logger.error(f"Agent {spec.value} failed: {result}")
                    continue

                agent_results[spec.value] = result
                logger.debug(f"Agent {spec.value} completed successfully")

        except asyncio.TimeoutError:
            logger.warning("Agent orchestration timed out")
        except Exception as e:
            logger.error(f"Agent orchestration error: {e}", exc_info=True)

        return self._synthesize_agent_findings(event, agent_results)

    def _synthesize_agent_findings(
        self,
        event: ReliabilityEvent,
        agent_results: Dict
    ) -> Dict[str, Any]:
        """Combine insights from all specialized agents.

        The detective result is mandatory (severity and confidence come
        from it); diagnostician and predictive findings are merged only
        when present. Returns {'error': ...} when the detective produced
        nothing.
        """
        detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
        diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
        predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)

        if not detective_result:
            logger.warning("No detective agent results available")
            return {'error': 'No agent results available'}

        synthesis = {
            'incident_summary': {
                'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                'anomaly_confidence': detective_result['confidence'],
                'primary_metrics_affected': [
                    metric["metric"] for metric in
                    detective_result['findings'].get('primary_metrics_affected', [])
                ]
            },
            'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
            'predictive_insights': predictive_result['findings'] if predictive_result else {},
            'recommended_actions': self._prioritize_actions(
                detective_result.get('recommendations', []),
                diagnostician_result.get('recommendations', []) if diagnostician_result else [],
                predictive_result.get('recommendations', []) if predictive_result else []
            ),
            'agent_metadata': {
                'participating_agents': list(agent_results.keys()),
                'analysis_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
            }
        }

        return synthesis

    def _prioritize_actions(
        self,
        detection_actions: List[str],
        diagnosis_actions: List[str],
        predictive_actions: List[str]
    ) -> List[str]:
        """Combine and prioritize actions from multiple agents.

        Concatenation order encodes priority (detection first), duplicates
        are removed keeping first occurrence, and the result is capped at 5.
        """
        all_actions = detection_actions + diagnosis_actions + predictive_actions
        seen = set()
        unique_actions = []
        for action in all_actions:
            if action not in seen:
                seen.add(action)
                unique_actions.append(action)
        return unique_actions[:5]
-
1384
- # === Enhanced Reliability Engine ===
1385
class EnhancedReliabilityEngine:
    """
    Main engine for processing reliability events

    Runs the full pipeline for one telemetry event: multi-agent
    orchestration, anomaly detection, severity grading, policy-based
    healing, business-impact estimation, vector-memory storage, and an
    optional Claude synthesis layer.

    FIXED: Dependency injection for all components
    FIXED: asyncio.get_running_loop() instead of deprecated get_event_loop()
    FIXED: guard against empty recommended_actions list (IndexError)
    """

    def __init__(
        self,
        orchestrator: Optional[OrchestrationManager] = None,
        policy_engine: Optional[PolicyEngine] = None,
        event_store: Optional[ThreadSafeEventStore] = None,
        anomaly_detector: Optional[AdvancedAnomalyDetector] = None,
        business_calculator: Optional[BusinessImpactCalculator] = None
    ):
        """
        Initialize reliability engine with dependency injection

        Any component left as None is replaced with a default instance,
        so production code can call EnhancedReliabilityEngine() with no
        arguments while tests inject fakes.
        """
        self.orchestrator = orchestrator or OrchestrationManager()
        self.policy_engine = policy_engine or PolicyEngine()
        self.event_store = event_store or ThreadSafeEventStore()
        self.anomaly_detector = anomaly_detector or AdvancedAnomalyDetector()
        self.business_calculator = business_calculator or BusinessImpactCalculator()

        # Simple counters; guarded by _lock since Gradio may run
        # several handlers concurrently.
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'anomalies_detected': 0
        }
        self._lock = threading.RLock()
        logger.info("Initialized EnhancedReliabilityEngine")

    async def process_event_enhanced(
        self,
        component: str,
        latency: float,
        error_rate: float,
        throughput: float = 1000,
        cpu_util: Optional[float] = None,
        memory_util: Optional[float] = None
    ) -> Dict[str, Any]:
        """
        Process a reliability event through the complete analysis pipeline

        FIXED: Proper async/await throughout

        Returns a result dict with status/severity/analysis keys, or
        {'error': ..., 'status': 'INVALID'} on bad input. Should not raise
        for expected failure modes.
        """
        logger.info(
            f"Processing event for {component}: latency={latency}ms, "
            f"error_rate={error_rate*100:.1f}%"
        )

        # Validate component ID
        is_valid, error_msg = validate_component_id(component)
        if not is_valid:
            return {'error': error_msg, 'status': 'INVALID'}

        # Create event (pydantic validation may reject out-of-range values)
        try:
            event = ReliabilityEvent(
                component=component,
                latency_p99=latency,
                error_rate=error_rate,
                throughput=throughput,
                cpu_util=cpu_util,
                memory_util=memory_util,
                # NOTE(review): hard-coded dependency map for the demo
                # component — presumably replaced by real topology data.
                upstream_deps=["auth-service", "database"] if component == "api-service" else []
            )
        except Exception as e:
            logger.error(f"Event creation error: {e}", exc_info=True)
            return {'error': f'Invalid event data: {str(e)}', 'status': 'INVALID'}

        # Multi-agent analysis
        agent_analysis = await self.orchestrator.orchestrate_analysis(event)

        # Anomaly detection
        is_anomaly = self.anomaly_detector.detect_anomaly(event)

        # Determine severity based on agent confidence; when the agents
        # produced nothing, fall back to the statistical detector.
        agent_confidence = 0.0
        if agent_analysis and 'incident_summary' in agent_analysis:
            agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
        else:
            agent_confidence = 0.8 if is_anomaly else 0.1

        # Set event severity (same cutoffs as the detective's tier labels)
        if agent_confidence > 0.8:
            severity = EventSeverity.CRITICAL
        elif agent_confidence > 0.6:
            severity = EventSeverity.HIGH
        elif agent_confidence > 0.4:
            severity = EventSeverity.MEDIUM
        else:
            severity = EventSeverity.LOW

        # Create mutable copy with updated severity (event is immutable)
        event = event.model_copy(update={'severity': severity})

        # Evaluate healing policies
        healing_actions = self.policy_engine.evaluate_policies(event)

        # Calculate business impact (only meaningful for anomalies)
        business_impact = self.business_calculator.calculate_impact(event) if is_anomaly else None

        # Store in vector database for similarity detection
        if thread_safe_index is not None and model is not None and is_anomaly:
            try:
                # FIXED: Non-blocking encoding with ProcessPoolExecutor
                # FIXED: guard against an *empty* recommended_actions list,
                # which previously raised IndexError and skipped storage.
                actions = agent_analysis.get('recommended_actions') or ['No analysis']
                analysis_text = actions[0]
                vector_text = f"{component} {latency} {error_rate} {analysis_text}"

                # FIXED: get_running_loop() — get_event_loop() is deprecated
                # inside coroutines since Python 3.10.
                loop = asyncio.get_running_loop()
                vec = await loop.run_in_executor(
                    # NOTE(review): reaches into a private attribute of the
                    # index — consider exposing a public executor accessor.
                    thread_safe_index._encoder_pool,
                    model.encode,
                    [vector_text]
                )

                thread_safe_index.add_async(np.array(vec, dtype=np.float32), vector_text)
            except Exception as e:
                # Best-effort: vector storage must never fail the pipeline.
                logger.error(f"Error storing vector: {e}", exc_info=True)

        # Build comprehensive result
        result = {
            "timestamp": event.timestamp.isoformat(),
            "component": component,
            "latency_p99": latency,
            "error_rate": error_rate,
            "throughput": throughput,
            "status": "ANOMALY" if is_anomaly else "NORMAL",
            "multi_agent_analysis": agent_analysis,
            "healing_actions": [action.value for action in healing_actions],
            "business_impact": business_impact,
            "severity": event.severity.value,
            "similar_incidents_count": thread_safe_index.get_count() if thread_safe_index and is_anomaly else 0,
            "processing_metadata": {
                "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
                "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
            }
        }

        # Store event in history
        self.event_store.add(event)

        # Update performance metrics
        with self._lock:
            self.performance_metrics['total_incidents_processed'] += 1
            self.performance_metrics['multi_agent_analyses'] += 1
            if is_anomaly:
                self.performance_metrics['anomalies_detected'] += 1

        logger.info(f"Event processed: {result['status']} with {result['severity']} severity")

        # Enhance with Claude AI reasoning (optional layer)
        try:
            result = await self.enhance_with_claude(event, result)
        except Exception as e:
            logger.error(f"Failed to enhance with Claude: {e}")
            # Continue without enhancement

        return result

    async def enhance_with_claude(
        self,
        event: ReliabilityEvent,
        agent_results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Enhance agent results with Claude AI reasoning

        This is a NON-INVASIVE layer - all existing logic stays intact.
        If Claude fails, original results are returned unchanged.

        NOTE(review): claude_adapter.generate_completion appears to be a
        synchronous call made inside a coroutine — if it blocks on network
        I/O it stalls the event loop; confirm and consider run_in_executor.
        """
        try:
            # Build comprehensive context for Claude
            context_parts = []

            # Add event summary
            context_parts.append("INCIDENT SUMMARY:")
            context_parts.append(f"Component: {event.component}")
            context_parts.append(f"Timestamp: {event.timestamp.isoformat()}")
            context_parts.append(f"Severity: {event.severity.value}")
            context_parts.append("")

            # Add metrics
            context_parts.append("METRICS:")
            context_parts.append(f"• Latency P99: {event.latency_p99}ms")
            context_parts.append(f"• Error Rate: {event.error_rate:.1%}")
            context_parts.append(f"• Throughput: {event.throughput} req/s")
            if event.cpu_util:
                context_parts.append(f"• CPU: {event.cpu_util:.1%}")
            if event.memory_util:
                context_parts.append(f"• Memory: {event.memory_util:.1%}")
            context_parts.append("")

            # Add agent findings
            if agent_results:
                context_parts.append("AGENT ANALYSIS:")
                if 'multi_agent_analysis' in agent_results:
                    analysis = agent_results['multi_agent_analysis']
                    context_parts.append(json.dumps(analysis, indent=2))
                elif 'incident_summary' in agent_results:
                    context_parts.append(json.dumps(agent_results['incident_summary'], indent=2))

            context = "\n".join(context_parts)

            # Create prompt for Claude
            prompt = f"""{context}

TASK: Provide an executive summary synthesizing all agent analyses.
Include:
1. Concise incident description
2. Most likely root cause
3. Single best recovery action
4. Estimated impact and recovery time

Be specific and actionable."""

            system_prompt = """You are a senior Site Reliability Engineer synthesizing
multiple AI agent analyses into clear, actionable guidance for incident response.
Focus on clarity, accuracy, and decisive recommendations."""

            # Get Claude's synthesis
            logger.info("Requesting Claude synthesis of agent results")
            claude_synthesis = claude_adapter.generate_completion(
                prompt=prompt,
                system_prompt=system_prompt
            )

            # Add Claude's insights to results (non-destructive)
            agent_results['claude_synthesis'] = {
                'summary': claude_synthesis,
                'timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat(),
                'source': 'claude-opus-4'
            }

            logger.info("✅ Claude synthesis added to results")
            return agent_results

        except Exception as e:
            logger.error(f"Claude enhancement failed: {e}", exc_info=True)
            # Return original results unchanged - system still works!
            return agent_results
-
1631
# === Initialize Engine (with dependency injection) ===
# Module-level singleton consumed by the Gradio UI handlers below.
enhanced_engine = EnhancedReliabilityEngine()
-
1634
-
1635
- # === Rate Limiting ===
1636
class RateLimiter:
    """Sliding-window request throttle over a trailing one-minute window.

    Thread-safe: all state is guarded by an internal re-entrant lock.
    """

    def __init__(self, max_per_minute: int = Constants.MAX_REQUESTS_PER_MINUTE):
        self.max_per_minute = max_per_minute
        # Timestamps of admitted requests inside the trailing minute.
        self.requests: deque = deque(maxlen=max_per_minute)
        self._lock = threading.RLock()

    def is_allowed(self) -> Tuple[bool, str]:
        """Admit or reject one request.

        Returns (True, "") when the request is admitted (and recorded),
        or (False, reason) when the per-minute budget is exhausted.
        """
        with self._lock:
            now = datetime.datetime.now(datetime.timezone.utc)
            cutoff = now - datetime.timedelta(minutes=1)

            # Evict timestamps that have aged out of the window.
            while self.requests and self.requests[0] < cutoff:
                self.requests.popleft()

            if len(self.requests) < self.max_per_minute:
                self.requests.append(now)
                return True, ""

            return False, f"Rate limit exceeded: {self.max_per_minute} requests/minute"
-
1662
-
1663
# Process-wide limiter shared by all UI submissions (single budget).
rate_limiter = RateLimiter()
-
1665
-
1666
- # === Gradio UI ===
1667
- def create_enhanced_ui():
1668
- """
1669
- Create the comprehensive Gradio UI for the reliability framework
1670
-
1671
- FIXED: Uses native async handlers (no event loop creation)
1672
- FIXED: Rate limiting on all endpoints
1673
- """
1674
-
1675
- with gr.Blocks(title="🧠 Agentic Reliability Framework", theme="soft") as demo:
1676
- gr.Markdown("""
1677
- # 🧠 Agentic Reliability Framework
1678
- **Multi-Agent AI System for Production Reliability**
1679
-
1680
- *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
1681
-
1682
- """)
1683
-
1684
- with gr.Row():
1685
- with gr.Column(scale=1):
1686
- gr.Markdown("### 📊 Telemetry Input")
1687
- component = gr.Dropdown(
1688
- choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
1689
- value="api-service",
1690
- label="Component",
1691
- info="Select the service being monitored"
1692
- )
1693
- latency = gr.Slider(
1694
- minimum=10, maximum=1000, value=100, step=1,
1695
- label="Latency P99 (ms)",
1696
- info=f"Alert threshold: >{Constants.LATENCY_WARNING}ms (adaptive)"
1697
- )
1698
- error_rate = gr.Slider(
1699
- minimum=0, maximum=0.5, value=0.02, step=0.001,
1700
- label="Error Rate",
1701
- info=f"Alert threshold: >{Constants.ERROR_RATE_WARNING}"
1702
- )
1703
- throughput = gr.Number(
1704
- value=1000,
1705
- label="Throughput (req/sec)",
1706
- info="Current request rate"
1707
- )
1708
- cpu_util = gr.Slider(
1709
- minimum=0, maximum=1, value=0.4, step=0.01,
1710
- label="CPU Utilization",
1711
- info="0.0 - 1.0 scale"
1712
- )
1713
- memory_util = gr.Slider(
1714
- minimum=0, maximum=1, value=0.3, step=0.01,
1715
- label="Memory Utilization",
1716
- info="0.0 - 1.0 scale"
1717
- )
1718
- submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
1719
-
1720
- with gr.Column(scale=2):
1721
- gr.Markdown("### 🔍 Multi-Agent Analysis")
1722
- output_text = gr.Textbox(
1723
- label="Agent Synthesis",
1724
- placeholder="AI agents are analyzing...",
1725
- lines=6
1726
- )
1727
-
1728
- with gr.Accordion("🤖 Agent Specialists Analysis", open=False):
1729
- gr.Markdown("""
1730
- **Specialized AI Agents:**
1731
- - 🕵️ **Detective**: Anomaly detection & pattern recognition
1732
- - 🔍 **Diagnostician**: Root cause analysis & investigation
1733
- - 🔮 **Predictive**: Future risk forecasting & trend analysis
1734
- """)
1735
-
1736
- agent_insights = gr.JSON(
1737
- label="Detailed Agent Findings",
1738
- value={}
1739
- )
1740
-
1741
- with gr.Accordion("🔮 Predictive Analytics & Forecasting", open=False):
1742
- gr.Markdown("""
1743
- **Future Risk Forecasting:**
1744
- - 📈 Latency trends and thresholds
1745
- - 🚨 Error rate predictions
1746
- - 🔥 Resource utilization forecasts
1747
- - ⏰ Time-to-failure estimates
1748
- """)
1749
-
1750
- predictive_insights = gr.JSON(
1751
- label="Predictive Forecasts",
1752
- value={}
1753
- )
1754
-
1755
- with gr.Accordion("🤖 Claude AI Synthesis", open=True):
1756
- gr.Markdown("""
1757
- **Claude Opus 4.5 Executive Summary:**
1758
- - 📋 Incident synthesis from all agents
1759
- - 🎯 Root cause identification
1760
- - 💡 Recommended recovery actions
1761
- - ⏰ Impact and recovery time estimates
1762
- """)
1763
-
1764
- claude_output = gr.Markdown(
1765
- value="*Claude AI synthesis will appear here after incident analysis*",
1766
- label="AI Executive Summary"
1767
- )
1768
-
1769
- gr.Markdown("### 📈 Recent Events (Last 15)")
1770
- events_table = gr.Dataframe(
1771
- headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
1772
- label="Event History",
1773
- wrap=True,
1774
- )
1775
-
1776
- with gr.Accordion("ℹ️ Framework Capabilities", open=False):
1777
- gr.Markdown("""
1778
- - **🤖 Multi-Agent AI**: Specialized agents for detection, diagnosis, prediction, and healing
1779
- - **🔮 Predictive Analytics**: Forecast future risks and performance degradation
1780
- - **🔧 Policy-Based Healing**: Automated recovery actions based on severity and context
1781
- - **💰 Business Impact**: Revenue and user impact quantification
1782
- - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
1783
- - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
1784
- - **⚡ Production Ready**: Circuit breakers, cooldowns, thread safety, and enterprise features
1785
- - **🔒 Security Patched**: All critical CVEs fixed (Gradio 5.50.0+, Requests 2.32.5+)
1786
- """)
1787
-
1788
- with gr.Accordion("🔧 Healing Policies", open=False):
1789
- policy_info = []
1790
- for policy in enhanced_engine.policy_engine.policies:
1791
- if policy.enabled:
1792
- actions = ", ".join([action.value for action in policy.actions])
1793
- policy_info.append(
1794
- f"**{policy.name}** (Priority {policy.priority}): {actions}\n"
1795
- f" - Cooldown: {policy.cool_down_seconds}s\n"
1796
- f" - Max executions: {policy.max_executions_per_hour}/hour"
1797
- )
1798
-
1799
- gr.Markdown("\n\n".join(policy_info))
1800
-
1801
# FIXED: Native async handler (no event loop creation needed)
async def submit_event_enhanced_async(
    component, latency, error_rate, throughput, cpu_util, memory_util
):
    """
    Async Gradio event handler for the submit button.

    Uses Gradio's native async support — no manual event loop creation.
    Applies rate limiting, type coercion, and input validation before
    delegating to the multi-agent engine, then formats results for the UI.

    Returns a 5-tuple matching the handler's declared outputs:
        (status_message: str,
         agent_insights: dict,
         predictive_insights: dict,
         claude_markdown: str,
         events_table: gr.Dataframe)
    """
    try:
        # Rate limiting check (process-wide limiter)
        allowed, rate_msg = rate_limiter.is_allowed()
        if not allowed:
            logger.warning("Rate limit exceeded")
            return rate_msg, {}, {}, "*Rate limit exceeded*", gr.Dataframe(value=[])

        # Type conversion.
        # FIXED: explicit None/"" checks instead of bare truthiness so a
        # legitimate value of 0 (e.g. 0% CPU utilization or 0 throughput)
        # is not silently replaced — `0` is falsy in Python.
        try:
            latency = float(latency)
            error_rate = float(error_rate)
            throughput = float(throughput) if throughput not in (None, "") else 1000
            cpu_util = float(cpu_util) if cpu_util not in (None, "") else None
            memory_util = float(memory_util) if memory_util not in (None, "") else None
        except (ValueError, TypeError) as e:
            error_msg = f"❌ Invalid input types: {str(e)}"
            logger.warning(error_msg)
            return error_msg, {}, {}, "*Invalid input type*", gr.Dataframe(value=[])

        # Input validation (range / sanity checks)
        is_valid, error_msg = validate_inputs(
            latency, error_rate, throughput, cpu_util, memory_util
        )
        if not is_valid:
            # FIXED: lazy %-style logging args instead of an eager f-string
            logger.warning("Invalid input: %s", error_msg)
            return error_msg, {}, {}, "*Validation failed*", gr.Dataframe(value=[])

        # FIXED: Direct async call - no event loop creation needed
        result = await enhanced_engine.process_event_enhanced(
            component, latency, error_rate, throughput, cpu_util, memory_util
        )

        # Handle errors reported by the engine
        if 'error' in result:
            return f"❌ {result['error']}", {}, {}, "*Error occurred*", gr.Dataframe(value=[])

        # Build table data (THREAD-SAFE via event_store accessor)
        table_data = []
        for event in enhanced_engine.event_store.get_recent(15):
            table_data.append([
                event.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                event.component,
                f"{event.latency_p99:.0f}ms",
                f"{event.error_rate:.3f}",
                f"{event.throughput:.0f}",
                event.severity.value.upper(),
                "Multi-agent analysis"
            ])

        # Format output message
        status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
        output_msg = f"{status_emoji} **{result['status']}**\n"

        if "multi_agent_analysis" in result:
            analysis = result["multi_agent_analysis"]
            confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
            output_msg += f"🎯 **Confidence**: {confidence*100:.1f}%\n"

            predictive_data = analysis.get('predictive_insights', {})
            if predictive_data.get('critical_risk_count', 0) > 0:
                output_msg += f"🔮 **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast\n"

            if analysis.get('recommended_actions'):
                actions_preview = ', '.join(analysis['recommended_actions'][:2])
                output_msg += f"💡 **Top Insights**: {actions_preview}\n"

        if result.get("business_impact"):
            impact = result["business_impact"]
            # NOTE: "\\$" emits the literal text "\$" so Markdown does not
            # treat the dollar sign as math. The original "\$" produced the
            # same text but is an invalid escape sequence (SyntaxWarning).
            output_msg += (
                f"💰 **Business Impact**: \\${impact['revenue_loss_estimate']:.2f} | "
                f"👥 {impact['affected_users_estimate']} users | "
                f"🚨 {impact['severity_level']}\n"
            )

        if result.get("healing_actions") and result["healing_actions"] != ["no_action"]:
            actions = ", ".join(result["healing_actions"])
            output_msg += f"🔧 **Auto-Actions**: {actions}"

        agent_insights_data = result.get("multi_agent_analysis", {})
        predictive_insights_data = agent_insights_data.get('predictive_insights', {})

        # Extract Claude synthesis for display
        claude_synthesis = result.get('claude_synthesis', {})
        claude_text = claude_synthesis.get('summary', '*No Claude synthesis available*')

        # Format Claude output beautifully
        claude_display = f"""
### 🤖 Claude Opus 4.5 Executive Analysis

{claude_text}

---
*Generated: {claude_synthesis.get('timestamp', 'N/A')}*
*Model: {claude_synthesis.get('source', 'claude-opus-4')}*
"""

        return (
            output_msg,
            agent_insights_data,
            predictive_insights_data,
            claude_display,
            gr.Dataframe(
                headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
                value=table_data,
                wrap=True
            )
        )

    except Exception as e:
        # Top-level boundary: log with traceback, return a safe UI payload
        error_msg = f"❌ Error processing event: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return error_msg, {}, {}, "*Processing error*", gr.Dataframe(value=[])
1925
- # FIXED: Use async handler directly
1926
- submit_btn.click(
1927
- fn=submit_event_enhanced_async,
1928
- inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
1929
- outputs=[output_text, agent_insights, predictive_insights, claude_output, events_table]
1930
- )
1931
-
1932
- return demo
1933


# === Main Entry Point ===
if __name__ == "__main__":
    # Local import: `sys` is not in this module's top-of-file import block,
    # and the original reached it through the non-public `os.sys` alias.
    import sys

    # Startup banner: summarize runtime configuration for the log
    logger.info("=" * 80)
    logger.info("Starting Enterprise Agentic Reliability Framework (PATCHED VERSION)")
    logger.info("=" * 80)
    # FIXED: use sys.version directly instead of os.sys.version (os.sys is
    # an undocumented implementation detail of the os module)
    logger.info(f"Python version: {sys.version}")
    logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
    logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
    logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
    logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
    logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
    logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
    logger.info("=" * 80)

    try:
        demo = create_enhanced_ui()

        logger.info("Launching Gradio UI on 0.0.0.0:7860...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True
        )
    except KeyboardInterrupt:
        logger.info("Received shutdown signal...")
    except Exception as e:
        # Boundary handler: log the failure with traceback before shutdown
        logger.error(f"Application error: {e}", exc_info=True)
    finally:
        # Graceful shutdown: always flush pending vector writes, even if
        # launch failed or the user interrupted the process
        logger.info("Shutting down gracefully...")

        if thread_safe_index:
            logger.info("Saving pending vectors before shutdown...")
            thread_safe_index.shutdown()

        logger.info("=" * 80)
        logger.info("Application shutdown complete")
        logger.info("=" * 80)