petter2025 commited on
Commit
4dfd09d
·
verified ·
1 Parent(s): c7e6112

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +812 -627
app.py CHANGED
@@ -1,15 +1,16 @@
1
  """
2
- Enterprise Agentic Reliability Framework - Main Application
3
  Multi-Agent AI System for Production Reliability Monitoring
4
 
5
- This module provides the complete reliability monitoring system including:
6
- - Multi-agent anomaly detection and root cause analysis
7
- - Predictive analytics and forecasting
8
- - Policy-based auto-healing
9
- - Business impact quantification
10
- - Vector-based incident memory
11
- - Adaptive thresholds
12
- - Thread-safe concurrent operations
 
13
  """
14
 
15
  import os
@@ -21,16 +22,23 @@ import pandas as pd
21
  import datetime
22
  import threading
23
  import logging
 
 
24
  from typing import List, Dict, Any, Optional, Tuple
25
- from collections import deque
26
  from dataclasses import dataclass, asdict
27
- import hashlib
28
- import asyncio
29
  from enum import Enum
 
 
 
 
30
 
31
  # Import our modules
32
- from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
33
- from healing_policies import PolicyEngine
 
 
 
34
 
35
  # === Logging Configuration ===
36
  logging.basicConfig(
@@ -39,48 +47,159 @@ logging.basicConfig(
39
  )
40
  logger = logging.getLogger(__name__)
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # === Configuration ===
43
  class Config:
44
  """Centralized configuration for the reliability framework"""
45
  HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
46
  HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"
47
 
48
- # Vector storage
49
- VECTOR_DIM: int = 384
50
- INDEX_FILE: str = "incident_vectors.index"
51
- TEXTS_FILE: str = "incident_texts.json"
52
 
53
- # Thresholds
54
- LATENCY_WARNING: float = 150.0
55
- LATENCY_CRITICAL: float = 300.0
56
- LATENCY_EXTREME: float = 500.0
57
- ERROR_RATE_WARNING: float = 0.05
58
- ERROR_RATE_HIGH: float = 0.15
59
- ERROR_RATE_CRITICAL: float = 0.3
60
- CPU_WARNING: float = 0.8
61
- CPU_CRITICAL: float = 0.9
62
- MEMORY_WARNING: float = 0.8
63
- MEMORY_CRITICAL: float = 0.9
 
 
64
 
65
- # Performance
66
- HISTORY_WINDOW: int = 50
67
- MAX_EVENTS_STORED: int = 1000
68
- AGENT_TIMEOUT: int = 10
69
- CACHE_EXPIRY_MINUTES: int = 15
70
 
71
- # Business metrics
72
- BASE_REVENUE_PER_MINUTE: float = 100.0
73
- BASE_USERS: int = 1000
 
 
74
 
75
- config = Config()
76
 
77
- HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # === Thread-Safe Data Structures ===
80
  class ThreadSafeEventStore:
81
  """Thread-safe storage for reliability events"""
82
 
83
- def __init__(self, max_size: int = config.MAX_EVENTS_STORED):
84
  self._events = deque(maxlen=max_size)
85
  self._lock = threading.RLock()
86
  logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")
@@ -106,71 +225,188 @@ class ThreadSafeEventStore:
106
  with self._lock:
107
  return len(self._events)
108
 
109
- class ThreadSafeFAISSIndex:
110
- """Thread-safe wrapper for FAISS index operations with batching"""
 
 
 
 
 
 
 
111
 
112
  def __init__(self, index, texts: List[str]):
113
  self.index = index
114
  self.texts = texts
115
  self._lock = threading.RLock()
116
- self.last_save = datetime.datetime.now()
117
- self.save_interval = datetime.timedelta(seconds=30)
118
- self.pending_vectors = []
119
- self.pending_texts = []
120
- logger.info(f"Initialized ThreadSafeFAISSIndex with {len(texts)} existing vectors")
121
-
122
- def add(self, vector: np.ndarray, text: str) -> None:
123
- """Add vector and text with batching"""
124
- with self._lock:
125
- self.pending_vectors.append(vector)
126
- self.pending_texts.append(text)
127
-
128
- # Flush if we have enough pending
129
- if len(self.pending_vectors) >= 10:
130
- self._flush()
 
 
 
 
131
 
132
- def _flush(self) -> None:
133
- """Flush pending vectors to index"""
134
- if not self.pending_vectors:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  return
136
 
137
  try:
138
- vectors = np.vstack(self.pending_vectors)
 
 
 
139
  self.index.add(vectors)
140
- self.texts.extend(self.pending_texts)
141
 
142
- logger.info(f"Flushed {len(self.pending_vectors)} vectors to FAISS index")
 
143
 
144
- self.pending_vectors = []
145
- self.pending_texts = []
146
 
147
- # Save if enough time has passed
148
- if datetime.datetime.now() - self.last_save > self.save_interval:
149
- self._save()
150
  except Exception as e:
151
- logger.error(f"Error flushing vectors: {e}", exc_info=True)
152
 
153
- def _save(self) -> None:
154
- """Save index to disk"""
 
 
 
 
155
  try:
156
  import faiss
157
- faiss.write_index(self.index, config.INDEX_FILE)
158
- with open(config.TEXTS_FILE, "w") as f:
159
- json.dump(self.texts, f)
160
- self.last_save = datetime.datetime.now()
161
- logger.info(f"Saved FAISS index with {len(self.texts)} vectors")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  except Exception as e:
163
  logger.error(f"Error saving index: {e}", exc_info=True)
164
 
165
  def get_count(self) -> int:
166
  """Get total count of vectors"""
167
  with self._lock:
168
- return len(self.texts) + len(self.pending_texts)
169
 
170
  def force_save(self) -> None:
171
  """Force immediate save of pending vectors"""
172
- with self._lock:
173
- self._flush()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  # === FAISS & Embeddings Setup ===
176
  try:
@@ -185,10 +421,12 @@ try:
185
  logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
186
  index = faiss.read_index(config.INDEX_FILE)
187
 
188
- # Validate dimension
189
- if index.d != config.VECTOR_DIM:
190
- logger.warning(f"Index dimension mismatch: {index.d} != {config.VECTOR_DIM}. Creating new index.")
191
- index = faiss.IndexFlatL2(config.VECTOR_DIM)
 
 
192
  incident_texts = []
193
  else:
194
  with open(config.TEXTS_FILE, "r") as f:
@@ -196,10 +434,10 @@ try:
196
  logger.info(f"Loaded {len(incident_texts)} incident texts")
197
  else:
198
  logger.info("Creating new FAISS index")
199
- index = faiss.IndexFlatL2(config.VECTOR_DIM)
200
  incident_texts = []
201
 
202
- thread_safe_index = ThreadSafeFAISSIndex(index, incident_texts)
203
 
204
  except ImportError as e:
205
  logger.warning(f"FAISS or SentenceTransformers not available: {e}")
@@ -214,45 +452,30 @@ except Exception as e:
214
  model = None
215
  thread_safe_index = None
216
 
217
- # === Predictive Models ===
218
- @dataclass
219
- class ForecastResult:
220
- """Data class for forecast results"""
221
- metric: str
222
- predicted_value: float
223
- confidence: float
224
- trend: str # "increasing", "decreasing", "stable"
225
- time_to_threshold: Optional[datetime.timedelta] = None
226
- risk_level: str = "low" # low, medium, high, critical
227
-
228
  class SimplePredictiveEngine:
229
  """
230
- Lightweight forecasting engine optimized for Hugging Face Spaces.
231
- Uses statistical methods for time-series prediction.
 
232
  """
233
 
234
- def __init__(self, history_window: int = config.HISTORY_WINDOW):
235
  self.history_window = history_window
236
  self.service_history: Dict[str, deque] = {}
237
  self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
238
- self.max_cache_age = datetime.timedelta(minutes=config.CACHE_EXPIRY_MINUTES)
239
  self._lock = threading.RLock()
240
  logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
241
 
242
  def add_telemetry(self, service: str, event_data: Dict) -> None:
243
- """
244
- Add telemetry data to service history
245
-
246
- Args:
247
- service: Service name
248
- event_data: Dictionary containing metrics (latency_p99, error_rate, etc.)
249
- """
250
  with self._lock:
251
  if service not in self.service_history:
252
  self.service_history[service] = deque(maxlen=self.history_window)
253
 
254
  telemetry_point = {
255
- 'timestamp': datetime.datetime.now(),
256
  'latency': event_data.get('latency_p99', 0),
257
  'error_rate': event_data.get('error_rate', 0),
258
  'throughput': event_data.get('throughput', 0),
@@ -261,13 +484,11 @@ class SimplePredictiveEngine:
261
  }
262
 
263
  self.service_history[service].append(telemetry_point)
264
-
265
- # Clean expired cache
266
  self._clean_cache()
267
 
268
  def _clean_cache(self) -> None:
269
  """Remove expired entries from prediction cache"""
270
- now = datetime.datetime.now()
271
  expired = [k for k, (_, ts) in self.prediction_cache.items()
272
  if now - ts > self.max_cache_age]
273
  for k in expired:
@@ -276,19 +497,15 @@ class SimplePredictiveEngine:
276
  if expired:
277
  logger.debug(f"Cleaned {len(expired)} expired cache entries")
278
 
279
- def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
280
- """
281
- Forecast service health metrics
282
-
283
- Args:
284
- service: Service name to forecast
285
- lookahead_minutes: Time horizon in minutes
286
-
287
- Returns:
288
- List of forecast results for different metrics
289
- """
290
  with self._lock:
291
- if service not in self.service_history or len(self.service_history[service]) < 10:
 
292
  return []
293
 
294
  history = list(self.service_history[service])
@@ -313,28 +530,23 @@ class SimplePredictiveEngine:
313
  with self._lock:
314
  for forecast in forecasts:
315
  cache_key = f"{service}_{forecast.metric}"
316
- self.prediction_cache[cache_key] = (forecast, datetime.datetime.now())
317
 
318
  return forecasts
319
 
320
- def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
321
- """
322
- Forecast latency using linear regression and trend analysis
323
-
324
- Args:
325
- history: Historical telemetry data
326
- lookahead_minutes: Forecast horizon
327
-
328
- Returns:
329
- ForecastResult or None if insufficient data
330
- """
331
  try:
332
  latencies = [point['latency'] for point in history[-20:]]
333
 
334
- if len(latencies) < 5:
335
  return None
336
 
337
- # Simple linear trend
338
  x = np.arange(len(latencies))
339
  slope, intercept = np.polyfit(x, latencies, 1)
340
 
@@ -342,29 +554,30 @@ class SimplePredictiveEngine:
342
  next_x = len(latencies)
343
  predicted_latency = slope * next_x + intercept
344
 
345
- # Calculate confidence based on data quality
346
  residuals = latencies - (slope * x + intercept)
347
  confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
348
 
349
  # Determine trend and risk
350
- if slope > 5:
351
  trend = "increasing"
352
- risk = "critical" if predicted_latency > config.LATENCY_EXTREME else "high"
353
- elif slope < -2:
354
- trend = "decreasing"
355
  risk = "low"
356
  else:
357
  trend = "stable"
358
- risk = "low" if predicted_latency < config.LATENCY_WARNING else "medium"
359
 
360
- # Calculate time to reach critical threshold (500ms)
361
  time_to_critical = None
362
- if slope > 0 and predicted_latency < config.LATENCY_EXTREME:
363
  denominator = predicted_latency - latencies[-1]
364
- if abs(denominator) > 0.1: # Avoid division by very small numbers
365
- minutes_to_critical = lookahead_minutes * (config.LATENCY_EXTREME - predicted_latency) / denominator
 
366
  if minutes_to_critical > 0:
367
- time_to_critical = datetime.timedelta(minutes=minutes_to_critical)
368
 
369
  return ForecastResult(
370
  metric="latency",
@@ -379,21 +592,16 @@ class SimplePredictiveEngine:
379
  logger.error(f"Latency forecast error: {e}", exc_info=True)
380
  return None
381
 
382
- def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
383
- """
384
- Forecast error rate using exponential smoothing
385
-
386
- Args:
387
- history: Historical telemetry data
388
- lookahead_minutes: Forecast horizon
389
-
390
- Returns:
391
- ForecastResult or None if insufficient data
392
- """
393
  try:
394
  error_rates = [point['error_rate'] for point in history[-15:]]
395
 
396
- if len(error_rates) < 5:
397
  return None
398
 
399
  # Exponential smoothing
@@ -409,13 +617,13 @@ class SimplePredictiveEngine:
409
 
410
  if recent_trend > 0.02:
411
  trend = "increasing"
412
- risk = "critical" if predicted_rate > config.ERROR_RATE_CRITICAL else "high"
413
  elif recent_trend < -0.01:
414
  trend = "decreasing"
415
  risk = "low"
416
  else:
417
  trend = "stable"
418
- risk = "low" if predicted_rate < config.ERROR_RATE_WARNING else "medium"
419
 
420
  # Confidence based on volatility
421
  confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
@@ -432,30 +640,25 @@ class SimplePredictiveEngine:
432
  logger.error(f"Error rate forecast error: {e}", exc_info=True)
433
  return None
434
 
435
- def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
436
- """
437
- Forecast CPU and memory utilization
438
-
439
- Args:
440
- history: Historical telemetry data
441
- lookahead_minutes: Forecast horizon
442
-
443
- Returns:
444
- List of forecast results for CPU and memory
445
- """
446
  forecasts = []
447
 
448
  # CPU forecast
449
  cpu_values = [point['cpu_util'] for point in history if point.get('cpu_util') is not None]
450
- if len(cpu_values) >= 5:
451
  try:
452
  predicted_cpu = np.mean(cpu_values[-5:])
453
  trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
454
 
455
  risk = "low"
456
- if predicted_cpu > config.CPU_CRITICAL:
457
  risk = "critical"
458
- elif predicted_cpu > config.CPU_WARNING:
459
  risk = "high"
460
  elif predicted_cpu > 0.7:
461
  risk = "medium"
@@ -472,15 +675,15 @@ class SimplePredictiveEngine:
472
 
473
  # Memory forecast
474
  memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
475
- if len(memory_values) >= 5:
476
  try:
477
  predicted_memory = np.mean(memory_values[-5:])
478
  trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
479
 
480
  risk = "low"
481
- if predicted_memory > config.MEMORY_CRITICAL:
482
  risk = "critical"
483
- elif predicted_memory > config.MEMORY_WARNING:
484
  risk = "high"
485
  elif predicted_memory > 0.7:
486
  risk = "medium"
@@ -498,15 +701,7 @@ class SimplePredictiveEngine:
498
  return forecasts
499
 
500
  def get_predictive_insights(self, service: str) -> Dict[str, Any]:
501
- """
502
- Generate actionable insights from forecasts
503
-
504
- Args:
505
- service: Service name
506
-
507
- Returns:
508
- Dictionary containing warnings, recommendations, and forecast data
509
- """
510
  forecasts = self.forecast_service_health(service)
511
 
512
  critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
@@ -517,8 +712,8 @@ class SimplePredictiveEngine:
517
  if forecast.metric == "latency" and forecast.risk_level in ["high", "critical"]:
518
  warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
519
  if forecast.time_to_threshold:
520
- minutes = int(forecast.time_to_threshold.total_seconds() / 60)
521
- recommendations.append(f"⏰ Critical latency (~500ms) in ~{minutes} minutes")
522
  recommendations.append("🔧 Consider scaling or optimizing dependencies")
523
 
524
  elif forecast.metric == "error_rate" and forecast.risk_level in ["high", "critical"]:
@@ -535,55 +730,54 @@ class SimplePredictiveEngine:
535
 
536
  return {
537
  'service': service,
538
- 'forecasts': [asdict(f) for f in forecasts],
 
 
 
 
 
 
 
 
 
 
539
  'warnings': warnings[:3],
540
  'recommendations': list(dict.fromkeys(recommendations))[:3],
541
  'critical_risk_count': len(critical_risks),
542
- 'forecast_timestamp': datetime.datetime.now().isoformat()
543
  }
544
 
545
- # === Core Engine Components ===
546
- policy_engine = PolicyEngine()
547
- events_history_store = ThreadSafeEventStore()
548
- predictive_engine = SimplePredictiveEngine()
549
 
550
  class BusinessImpactCalculator:
551
- """
552
- Calculate business impact of anomalies including revenue loss
553
- and user impact estimation
554
- """
555
 
556
  def __init__(self, revenue_per_request: float = 0.01):
557
  self.revenue_per_request = revenue_per_request
558
- logger.info(f"Initialized BusinessImpactCalculator with revenue_per_request={revenue_per_request}")
559
 
560
- def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
561
- """
562
- Calculate business impact for a reliability event
563
-
564
- Args:
565
- event: The reliability event to analyze
566
- duration_minutes: Assumed duration of the incident
567
-
568
- Returns:
569
- Dictionary containing revenue loss, user impact, and severity
570
- """
571
- base_revenue_per_minute = config.BASE_REVENUE_PER_MINUTE
572
 
573
  impact_multiplier = 1.0
574
 
575
  # Impact factors
576
- if event.latency_p99 > config.LATENCY_CRITICAL:
577
  impact_multiplier += 0.5
578
  if event.error_rate > 0.1:
579
  impact_multiplier += 0.8
580
- if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
581
  impact_multiplier += 0.3
582
 
583
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
584
 
585
- base_users_affected = config.BASE_USERS
586
- user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
 
587
  affected_users = int(base_users_affected * user_impact_multiplier)
588
 
589
  # Severity classification
@@ -596,7 +790,10 @@ class BusinessImpactCalculator:
596
  else:
597
  severity = "LOW"
598
 
599
- logger.info(f"Business impact: ${revenue_loss:.2f} revenue loss, {affected_users} users, {severity} severity")
 
 
 
600
 
601
  return {
602
  'revenue_loss_estimate': round(revenue_loss, 2),
@@ -605,41 +802,29 @@ class BusinessImpactCalculator:
605
  'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
606
  }
607
 
608
- business_calculator = BusinessImpactCalculator()
609
 
610
  class AdvancedAnomalyDetector:
611
- """
612
- Enhanced anomaly detection with adaptive thresholds that learn
613
- from historical data patterns
614
- """
615
 
616
  def __init__(self):
617
  self.historical_data = deque(maxlen=100)
618
  self.adaptive_thresholds = {
619
- 'latency_p99': config.LATENCY_WARNING,
620
- 'error_rate': config.ERROR_RATE_WARNING
621
  }
622
  self._lock = threading.RLock()
623
  logger.info("Initialized AdvancedAnomalyDetector")
624
 
625
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
626
- """
627
- Detect if event is anomalous using adaptive thresholds
628
-
629
- Args:
630
- event: The reliability event to check
631
-
632
- Returns:
633
- True if anomaly detected, False otherwise
634
- """
635
  with self._lock:
636
  latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
637
  error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
638
 
639
  resource_anomaly = False
640
- if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
641
  resource_anomaly = True
642
- if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
643
  resource_anomaly = True
644
 
645
  self._update_thresholds(event)
@@ -647,7 +832,11 @@ class AdvancedAnomalyDetector:
647
  is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
648
 
649
  if is_anomaly:
650
- logger.info(f"Anomaly detected for {event.component}: latency={latency_anomaly}, error={error_anomaly}, resource={resource_anomaly}")
 
 
 
 
651
 
652
  return is_anomaly
653
 
@@ -661,15 +850,14 @@ class AdvancedAnomalyDetector:
661
  self.adaptive_thresholds['latency_p99'] = new_threshold
662
  logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
663
 
664
- anomaly_detector = AdvancedAnomalyDetector()
665
-
666
- # === Multi-Agent System ===
667
  class AgentSpecialization(Enum):
668
  """Agent specialization types"""
669
  DETECTIVE = "anomaly_detection"
670
  DIAGNOSTICIAN = "root_cause_analysis"
671
  PREDICTIVE = "predictive_analytics"
672
 
 
673
  class BaseAgent:
674
  """Base class for all specialized agents"""
675
 
@@ -685,26 +873,16 @@ class BaseAgent:
685
  """Base analysis method to be implemented by specialized agents"""
686
  raise NotImplementedError
687
 
 
688
  class AnomalyDetectionAgent(BaseAgent):
689
- """
690
- Specialized agent for anomaly detection and pattern recognition.
691
- Calculates multi-dimensional anomaly scores and identifies affected metrics.
692
- """
693
 
694
  def __init__(self):
695
  super().__init__(AgentSpecialization.DETECTIVE)
696
  logger.info("Initialized AnomalyDetectionAgent")
697
 
698
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
699
- """
700
- Perform comprehensive anomaly analysis
701
-
702
- Args:
703
- event: Reliability event to analyze
704
-
705
- Returns:
706
- Dictionary containing anomaly score, severity, affected metrics, and recommendations
707
- """
708
  try:
709
  anomaly_score = self._calculate_anomaly_score(event)
710
 
@@ -728,47 +906,31 @@ class AnomalyDetectionAgent(BaseAgent):
728
  }
729
 
730
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
731
- """
732
- Calculate comprehensive anomaly score (0-1) using weighted metrics
733
-
734
- Args:
735
- event: Reliability event
736
-
737
- Returns:
738
- Float between 0 and 1 representing anomaly severity
739
- """
740
  scores = []
741
 
742
  # Latency anomaly (weighted 40%)
743
- if event.latency_p99 > config.LATENCY_WARNING:
744
- latency_score = min(1.0, (event.latency_p99 - config.LATENCY_WARNING) / 500)
745
  scores.append(0.4 * latency_score)
746
 
747
  # Error rate anomaly (weighted 30%)
748
- if event.error_rate > config.ERROR_RATE_WARNING:
749
  error_score = min(1.0, event.error_rate / 0.3)
750
  scores.append(0.3 * error_score)
751
 
752
  # Resource anomaly (weighted 30%)
753
  resource_score = 0
754
- if event.cpu_util and event.cpu_util > config.CPU_WARNING:
755
- resource_score += 0.15 * min(1.0, (event.cpu_util - config.CPU_WARNING) / 0.2)
756
- if event.memory_util and event.memory_util > config.MEMORY_WARNING:
757
- resource_score += 0.15 * min(1.0, (event.memory_util - config.MEMORY_WARNING) / 0.2)
758
  scores.append(resource_score)
759
 
760
  return min(1.0, sum(scores))
761
 
762
  def _classify_severity(self, anomaly_score: float) -> str:
763
- """
764
- Classify severity tier based on anomaly score
765
-
766
- Args:
767
- anomaly_score: Score between 0 and 1
768
-
769
- Returns:
770
- Severity tier string (LOW, MEDIUM, HIGH, CRITICAL)
771
- """
772
  if anomaly_score > 0.8:
773
  return "CRITICAL"
774
  elif anomaly_score > 0.6:
@@ -779,108 +941,95 @@ class AnomalyDetectionAgent(BaseAgent):
779
  return "LOW"
780
 
781
  def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
782
- """
783
- Identify which metrics are outside normal ranges
784
-
785
- Args:
786
- event: Reliability event
787
-
788
- Returns:
789
- List of dictionaries describing affected metrics with severity
790
- """
791
  affected = []
792
 
793
  # Latency checks
794
- if event.latency_p99 > config.LATENCY_EXTREME:
795
  affected.append({
796
- "metric": "latency",
797
- "value": event.latency_p99,
798
- "severity": "CRITICAL",
799
- "threshold": config.LATENCY_WARNING
800
  })
801
- elif event.latency_p99 > config.LATENCY_CRITICAL:
802
  affected.append({
803
- "metric": "latency",
804
- "value": event.latency_p99,
805
- "severity": "HIGH",
806
- "threshold": config.LATENCY_WARNING
807
  })
808
- elif event.latency_p99 > config.LATENCY_WARNING:
809
  affected.append({
810
- "metric": "latency",
811
- "value": event.latency_p99,
812
- "severity": "MEDIUM",
813
- "threshold": config.LATENCY_WARNING
814
  })
815
 
816
  # Error rate checks
817
- if event.error_rate > config.ERROR_RATE_CRITICAL:
818
  affected.append({
819
- "metric": "error_rate",
820
- "value": event.error_rate,
821
- "severity": "CRITICAL",
822
- "threshold": config.ERROR_RATE_WARNING
823
  })
824
- elif event.error_rate > config.ERROR_RATE_HIGH:
825
  affected.append({
826
- "metric": "error_rate",
827
- "value": event.error_rate,
828
- "severity": "HIGH",
829
- "threshold": config.ERROR_RATE_WARNING
830
  })
831
- elif event.error_rate > config.ERROR_RATE_WARNING:
832
  affected.append({
833
- "metric": "error_rate",
834
- "value": event.error_rate,
835
- "severity": "MEDIUM",
836
- "threshold": config.ERROR_RATE_WARNING
837
  })
838
 
839
  # CPU checks
840
- if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
841
  affected.append({
842
- "metric": "cpu",
843
- "value": event.cpu_util,
844
- "severity": "CRITICAL",
845
- "threshold": config.CPU_WARNING
846
  })
847
- elif event.cpu_util and event.cpu_util > config.CPU_WARNING:
848
  affected.append({
849
- "metric": "cpu",
850
- "value": event.cpu_util,
851
- "severity": "HIGH",
852
- "threshold": config.CPU_WARNING
853
  })
854
 
855
  # Memory checks
856
- if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
857
  affected.append({
858
- "metric": "memory",
859
- "value": event.memory_util,
860
- "severity": "CRITICAL",
861
- "threshold": config.MEMORY_WARNING
862
  })
863
- elif event.memory_util and event.memory_util > config.MEMORY_WARNING:
864
  affected.append({
865
- "metric": "memory",
866
- "value": event.memory_util,
867
- "severity": "HIGH",
868
- "threshold": config.MEMORY_WARNING
869
  })
870
 
871
  return affected
872
 
873
- def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
874
- """
875
- Generate actionable recommendations based on detected anomalies
876
-
877
- Args:
878
- event: Reliability event
879
- anomaly_score: Calculated anomaly score
880
-
881
- Returns:
882
- List of recommendation strings with emojis for visibility
883
- """
884
  recommendations = []
885
  affected_metrics = self._identify_affected_metrics(event)
886
 
@@ -940,28 +1089,18 @@ class AnomalyDetectionAgent(BaseAgent):
940
  elif anomaly_score > 0.4:
941
  recommendations.append("📊 MONITOR: Early warning signs detected")
942
 
943
- return recommendations[:4] # Return top 4 recommendations
 
944
 
945
  class RootCauseAgent(BaseAgent):
946
- """
947
- Specialized agent for root cause analysis.
948
- Analyzes failure patterns and provides investigation guidance.
949
- """
950
 
951
  def __init__(self):
952
  super().__init__(AgentSpecialization.DIAGNOSTICIAN)
953
  logger.info("Initialized RootCauseAgent")
954
 
955
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
956
- """
957
- Perform root cause analysis
958
-
959
- Args:
960
- event: Reliability event to analyze
961
-
962
- Returns:
963
- Dictionary containing likely root causes and investigation guidance
964
- """
965
  try:
966
  causes = self._analyze_potential_causes(event)
967
 
@@ -987,19 +1126,11 @@ class RootCauseAgent(BaseAgent):
987
  }
988
 
989
  def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
990
- """
991
- Analyze potential root causes based on event patterns
992
-
993
- Args:
994
- event: Reliability event
995
-
996
- Returns:
997
- List of potential root causes with confidence scores
998
- """
999
  causes = []
1000
 
1001
  # Pattern 1: Database/External Dependency Failure
1002
- if event.latency_p99 > config.LATENCY_EXTREME and event.error_rate > 0.2:
1003
  causes.append({
1004
  "cause": "Database/External Dependency Failure",
1005
  "confidence": 0.85,
@@ -1008,8 +1139,8 @@ class RootCauseAgent(BaseAgent):
1008
  })
1009
 
1010
  # Pattern 2: Resource Exhaustion
1011
- if (event.cpu_util and event.cpu_util > config.CPU_CRITICAL and
1012
- event.memory_util and event.memory_util > config.MEMORY_CRITICAL):
1013
  causes.append({
1014
  "cause": "Resource Exhaustion",
1015
  "confidence": 0.90,
@@ -1018,7 +1149,7 @@ class RootCauseAgent(BaseAgent):
1018
  })
1019
 
1020
  # Pattern 3: Application Bug / Configuration Issue
1021
- if event.error_rate > config.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
1022
  causes.append({
1023
  "cause": "Application Bug / Configuration Issue",
1024
  "confidence": 0.75,
@@ -1027,8 +1158,8 @@ class RootCauseAgent(BaseAgent):
1027
  })
1028
 
1029
  # Pattern 4: Gradual Performance Degradation
1030
- if (200 <= event.latency_p99 <= 400 and
1031
- config.ERROR_RATE_WARNING <= event.error_rate <= config.ERROR_RATE_HIGH):
1032
  causes.append({
1033
  "cause": "Gradual Performance Degradation",
1034
  "confidence": 0.65,
@@ -1048,65 +1179,39 @@ class RootCauseAgent(BaseAgent):
1048
  return causes
1049
 
1050
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
1051
- """
1052
- Identify evidence patterns in the event data
1053
-
1054
- Args:
1055
- event: Reliability event
1056
-
1057
- Returns:
1058
- List of evidence pattern identifiers
1059
- """
1060
  evidence = []
1061
 
1062
  if event.latency_p99 > event.error_rate * 1000:
1063
  evidence.append("latency_disproportionate_to_errors")
1064
 
1065
- if (event.cpu_util and event.cpu_util > config.CPU_WARNING and
1066
- event.memory_util and event.memory_util > config.MEMORY_WARNING):
1067
  evidence.append("correlated_resource_exhaustion")
1068
 
1069
- if event.error_rate > config.ERROR_RATE_HIGH and event.latency_p99 < config.LATENCY_CRITICAL:
1070
  evidence.append("errors_without_latency_impact")
1071
 
1072
  return evidence
1073
 
1074
  def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
1075
- """
1076
- Determine investigation priority based on identified causes
1077
-
1078
- Args:
1079
- causes: List of potential root causes
1080
-
1081
- Returns:
1082
- Priority level (HIGH, MEDIUM, LOW)
1083
- """
1084
  for cause in causes:
1085
  if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
1086
  return "HIGH"
1087
  return "MEDIUM"
1088
 
 
1089
  class PredictiveAgent(BaseAgent):
1090
- """
1091
- Specialized agent for predictive analytics.
1092
- Forecasts future risks and trends using statistical models.
1093
- """
1094
 
1095
- def __init__(self):
1096
  super().__init__(AgentSpecialization.PREDICTIVE)
1097
- self.engine = predictive_engine
1098
  logger.info("Initialized PredictiveAgent")
1099
 
1100
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
1101
- """
1102
- Perform predictive analysis for future risks
1103
-
1104
- Args:
1105
- event: Current reliability event
1106
-
1107
- Returns:
1108
- Dictionary containing forecasts and predictive insights
1109
- """
1110
  try:
1111
  event_data = {
1112
  'latency_p99': event.latency_p99,
@@ -1134,17 +1239,47 @@ class PredictiveAgent(BaseAgent):
1134
  'recommendations': [f"Analysis error: {str(e)}"]
1135
  }
1136
 
1137
- class OrchestrationManager:
 
 
 
1138
  """
1139
- Orchestrates multiple specialized agents for comprehensive analysis.
1140
- Runs agents in parallel and synthesizes their findings.
 
1141
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1142
 
1143
- def __init__(self):
 
 
 
 
 
 
 
 
 
 
1144
  self.agents = {
1145
- AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
1146
- AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
1147
- AgentSpecialization.PREDICTIVE: PredictiveAgent(),
1148
  }
1149
  logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")
1150
 
@@ -1152,44 +1287,48 @@ class OrchestrationManager:
1152
  """
1153
  Coordinate multiple agents for comprehensive analysis
1154
 
1155
- Args:
1156
- event: Reliability event to analyze
1157
-
1158
- Returns:
1159
- Synthesized findings from all agents
1160
  """
1161
- agent_tasks = {
1162
- spec: agent.analyze(event)
1163
- for spec, agent in self.agents.items()
1164
- }
 
 
 
1165
 
1166
- # Parallel agent execution with timeout protection
1167
  agent_results = {}
1168
- for specialization, task in agent_tasks.items():
1169
- try:
1170
- result = await asyncio.wait_for(task, timeout=5.0)
1171
- agent_results[specialization.value] = result
1172
- logger.debug(f"Agent {specialization.value} completed successfully")
1173
- except asyncio.TimeoutError:
1174
- logger.warning(f"Agent {specialization.value} timed out")
1175
- continue
1176
- except Exception as e:
1177
- logger.error(f"Agent {specialization.value} error: {e}", exc_info=True)
1178
- continue
 
 
 
 
 
 
 
 
 
 
1179
 
1180
  return self._synthesize_agent_findings(event, agent_results)
1181
 
1182
- def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
1183
- """
1184
- Combine insights from all specialized agents
1185
-
1186
- Args:
1187
- event: Original reliability event
1188
- agent_results: Results from each agent
1189
-
1190
- Returns:
1191
- Synthesized analysis combining all agent findings
1192
- """
1193
  detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
1194
  diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
1195
  predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
@@ -1203,7 +1342,7 @@ class OrchestrationManager:
1203
  'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
1204
  'anomaly_confidence': detective_result['confidence'],
1205
  'primary_metrics_affected': [
1206
- metric["metric"] for metric in
1207
  detective_result['findings'].get('primary_metrics_affected', [])
1208
  ]
1209
  },
@@ -1216,26 +1355,19 @@ class OrchestrationManager:
1216
  ),
1217
  'agent_metadata': {
1218
  'participating_agents': list(agent_results.keys()),
1219
- 'analysis_timestamp': datetime.datetime.now().isoformat()
1220
  }
1221
  }
1222
 
1223
  return synthesis
1224
 
1225
- def _prioritize_actions(self, detection_actions: List[str],
1226
- diagnosis_actions: List[str],
1227
- predictive_actions: List[str]) -> List[str]:
1228
- """
1229
- Combine and prioritize actions from multiple agents
1230
-
1231
- Args:
1232
- detection_actions: Actions from detective agent
1233
- diagnosis_actions: Actions from diagnostician agent
1234
- predictive_actions: Actions from predictive agent
1235
-
1236
- Returns:
1237
- Prioritized list of unique actions
1238
- """
1239
  all_actions = detection_actions + diagnosis_actions + predictive_actions
1240
  seen = set()
1241
  unique_actions = []
@@ -1243,19 +1375,35 @@ class OrchestrationManager:
1243
  if action not in seen:
1244
  seen.add(action)
1245
  unique_actions.append(action)
1246
- return unique_actions[:5] # Return top 5 actions
1247
 
1248
- # Initialize orchestration manager
1249
- orchestration_manager = OrchestrationManager()
1250
-
1251
- # === Enhanced Reliability Engine ===
1252
  class EnhancedReliabilityEngine:
1253
  """
1254
- Main engine for processing reliability events through the multi-agent system.
1255
- Coordinates anomaly detection, agent analysis, policy evaluation, and impact calculation.
 
1256
  """
1257
 
1258
- def __init__(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1259
  self.performance_metrics = {
1260
  'total_incidents_processed': 0,
1261
  'multi_agent_analyses': 0,
@@ -1265,83 +1413,98 @@ class EnhancedReliabilityEngine:
1265
  logger.info("Initialized EnhancedReliabilityEngine")
1266
 
1267
  async def process_event_enhanced(
1268
- self,
1269
- component: str,
1270
- latency: float,
1271
  error_rate: float,
1272
- throughput: float = 1000,
1273
  cpu_util: Optional[float] = None,
1274
  memory_util: Optional[float] = None
1275
  ) -> Dict[str, Any]:
1276
  """
1277
  Process a reliability event through the complete analysis pipeline
1278
 
1279
- Args:
1280
- component: Service component name
1281
- latency: P99 latency in milliseconds
1282
- error_rate: Error rate (0-1)
1283
- throughput: Requests per second
1284
- cpu_util: CPU utilization (0-1)
1285
- memory_util: Memory utilization (0-1)
1286
-
1287
- Returns:
1288
- Comprehensive analysis results including agent findings, healing actions, and business impact
1289
  """
1290
- logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
 
 
 
 
 
 
 
 
1291
 
1292
  # Create event
1293
- event = ReliabilityEvent(
1294
- component=component,
1295
- latency_p99=latency,
1296
- error_rate=error_rate,
1297
- throughput=throughput,
1298
- cpu_util=cpu_util,
1299
- memory_util=memory_util,
1300
- upstream_deps=["auth-service", "database"] if component == "api-service" else []
1301
- )
 
 
 
 
1302
 
1303
  # Multi-agent analysis
1304
- agent_analysis = await orchestration_manager.orchestrate_analysis(event)
1305
 
1306
  # Anomaly detection
1307
- is_anomaly = anomaly_detector.detect_anomaly(event)
1308
-
1309
  # Determine severity based on agent confidence
1310
  agent_confidence = 0.0
1311
  if agent_analysis and 'incident_summary' in agent_analysis:
1312
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
1313
  else:
1314
  agent_confidence = 0.8 if is_anomaly else 0.1
1315
-
1316
  # Set event severity
1317
  if agent_confidence > 0.8:
1318
- event.severity = EventSeverity.CRITICAL
1319
  elif agent_confidence > 0.6:
1320
- event.severity = EventSeverity.HIGH
1321
  elif agent_confidence > 0.4:
1322
- event.severity = EventSeverity.MEDIUM
1323
  else:
1324
- event.severity = EventSeverity.LOW
 
 
 
1325
 
1326
  # Evaluate healing policies
1327
- healing_actions = policy_engine.evaluate_policies(event)
1328
 
1329
  # Calculate business impact
1330
- business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
1331
 
1332
  # Store in vector database for similarity detection
1333
  if thread_safe_index is not None and model is not None and is_anomaly:
1334
  try:
 
1335
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
1336
  vector_text = f"{component} {latency} {error_rate} {analysis_text}"
1337
- vec = model.encode([vector_text])
1338
- thread_safe_index.add(np.array(vec, dtype=np.float32), vector_text)
 
 
 
 
 
 
 
 
1339
  except Exception as e:
1340
  logger.error(f"Error storing vector: {e}", exc_info=True)
1341
 
1342
  # Build comprehensive result
1343
  result = {
1344
- "timestamp": event.timestamp,
1345
  "component": component,
1346
  "latency_p99": latency,
1347
  "error_rate": error_rate,
@@ -1359,7 +1522,7 @@ class EnhancedReliabilityEngine:
1359
  }
1360
 
1361
  # Store event in history
1362
- events_history_store.add(event)
1363
 
1364
  # Update performance metrics
1365
  with self._lock:
@@ -1372,49 +1535,49 @@ class EnhancedReliabilityEngine:
1372
 
1373
  return result
1374
 
1375
- # Initialize enhanced engine
 
1376
  enhanced_engine = EnhancedReliabilityEngine()
1377
 
1378
- # === Input Validation ===
1379
- def validate_inputs(
1380
- latency: float,
1381
- error_rate: float,
1382
- throughput: float,
1383
- cpu_util: Optional[float],
1384
- memory_util: Optional[float]
1385
- ) -> Tuple[bool, str]:
1386
- """
1387
- Validate user inputs for bounds and type correctness
1388
 
1389
- Args:
1390
- latency: Latency value in milliseconds
1391
- error_rate: Error rate (0-1)
1392
- throughput: Throughput in requests/sec
1393
- cpu_util: CPU utilization (0-1)
1394
- memory_util: Memory utilization (0-1)
1395
-
1396
- Returns:
1397
- Tuple of (is_valid: bool, error_message: str)
1398
- """
1399
- if not (0 <= latency <= 10000):
1400
- return False, "❌ Invalid latency: must be between 0-10000ms"
1401
- if not (0 <= error_rate <= 1):
1402
- return False, "❌ Invalid error rate: must be between 0-1"
1403
- if throughput < 0:
1404
- return False, "❌ Invalid throughput: must be positive"
1405
- if cpu_util is not None and not (0 <= cpu_util <= 1):
1406
- return False, "❌ Invalid CPU utilization: must be between 0-1"
1407
- if memory_util is not None and not (0 <= memory_util <= 1):
1408
- return False, "❌ Invalid memory utilization: must be between 0-1"
1409
 
1410
- return True, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1411
 
1412
  # === Gradio UI ===
1413
  def create_enhanced_ui():
1414
  """
1415
- Create the comprehensive Gradio UI for the reliability framework.
1416
- Includes telemetry input, multi-agent analysis display, predictive insights,
1417
- and event history visualization.
 
1418
  """
1419
 
1420
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
@@ -1423,6 +1586,8 @@ def create_enhanced_ui():
1423
  **Multi-Agent AI System for Production Reliability**
1424
 
1425
  *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
 
 
1426
  """)
1427
 
1428
  with gr.Row():
@@ -1437,12 +1602,12 @@ def create_enhanced_ui():
1437
  latency = gr.Slider(
1438
  minimum=10, maximum=1000, value=100, step=1,
1439
  label="Latency P99 (ms)",
1440
- info=f"Alert threshold: >{config.LATENCY_WARNING}ms (adaptive)"
1441
  )
1442
  error_rate = gr.Slider(
1443
  minimum=0, maximum=0.5, value=0.02, step=0.001,
1444
  label="Error Rate",
1445
- info=f"Alert threshold: >{config.ERROR_RATE_WARNING}"
1446
  )
1447
  throughput = gr.Number(
1448
  value=1000,
@@ -1456,7 +1621,7 @@ def create_enhanced_ui():
1456
  )
1457
  memory_util = gr.Slider(
1458
  minimum=0, maximum=1, value=0.3, step=0.01,
1459
- label="Memory Utilization",
1460
  info="0.0 - 1.0 scale"
1461
  )
1462
  submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
@@ -1473,7 +1638,7 @@ def create_enhanced_ui():
1473
  gr.Markdown("""
1474
  **Specialized AI Agents:**
1475
  - 🕵️ **Detective**: Anomaly detection & pattern recognition
1476
- - 🔍 **Diagnostician**: Root cause analysis & investigation
1477
  - 🔮 **Predictive**: Future risk forecasting & trend analysis
1478
  """)
1479
 
@@ -1486,7 +1651,7 @@ def create_enhanced_ui():
1486
  gr.Markdown("""
1487
  **Future Risk Forecasting:**
1488
  - 📈 Latency trends and thresholds
1489
- - 🚨 Error rate predictions
1490
  - 🔥 Resource utilization forecasts
1491
  - ⏰ Time-to-failure estimates
1492
  """)
@@ -1511,100 +1676,111 @@ def create_enhanced_ui():
1511
  - **💰 Business Impact**: Revenue and user impact quantification
1512
  - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
1513
  - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
1514
- - **⚡ Production Ready**: Circuit breakers, cooldowns, and enterprise features
 
1515
  """)
1516
-
1517
  with gr.Accordion("🔧 Healing Policies", open=False):
1518
  policy_info = []
1519
- for policy in policy_engine.policies:
1520
  if policy.enabled:
1521
  actions = ", ".join([action.value for action in policy.actions])
1522
- policy_info.append(f"**{policy.name}**: {actions} (Priority: {policy.priority})")
 
 
 
 
1523
 
1524
  gr.Markdown("\n\n".join(policy_info))
1525
 
1526
- # FIXED: Synchronous wrapper for async function (CRITICAL FIX)
1527
- def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
 
 
1528
  """
1529
- Synchronous wrapper for async event processing.
1530
- FIXES GRADIO ASYNC/SYNC COMPATIBILITY ISSUE.
1531
 
1532
- This wrapper:
1533
- 1. Validates inputs
1534
- 2. Creates new event loop for async execution
1535
- 3. Calls the async processing function
1536
- 4. Formats results for display
1537
- 5. Handles all errors gracefully
1538
  """
1539
  try:
 
 
 
 
 
 
1540
  # Type conversion
1541
- latency = float(latency)
1542
- error_rate = float(error_rate)
1543
- throughput = float(throughput) if throughput else 1000
1544
- cpu_util = float(cpu_util) if cpu_util else None
1545
- memory_util = float(memory_util) if memory_util else None
 
 
 
 
 
1546
 
1547
- # Input validation (CRITICAL FIX)
1548
- is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
 
 
1549
  if not is_valid:
1550
  logger.warning(f"Invalid input: {error_msg}")
1551
  return error_msg, {}, {}, gr.Dataframe(value=[])
1552
 
1553
- # Create new event loop for async execution (CRITICAL FIX)
1554
- loop = asyncio.new_event_loop()
1555
- asyncio.set_event_loop(loop)
 
1556
 
1557
- try:
1558
- # Call async function
1559
- result = loop.run_until_complete(
1560
- enhanced_engine.process_event_enhanced(
1561
- component, latency, error_rate, throughput, cpu_util, memory_util
1562
- )
1563
- )
1564
- finally:
1565
- loop.close()
1566
 
1567
- # Build table data (THREAD-SAFE FIX)
1568
  table_data = []
1569
- for event in events_history_store.get_recent(15):
1570
  table_data.append([
1571
- event.timestamp[:19],
1572
  event.component,
1573
- event.latency_p99,
1574
  f"{event.error_rate:.3f}",
1575
- event.throughput,
1576
  event.severity.value.upper(),
1577
  "Multi-agent analysis"
1578
  ])
1579
 
1580
  # Format output message
1581
  status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
1582
- output_msg = f"{status_emoji} **{result['status']}**"
1583
 
1584
  if "multi_agent_analysis" in result:
1585
  analysis = result["multi_agent_analysis"]
1586
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
1587
- output_msg += f"\n🎯 **Confidence**: {confidence*100:.1f}%"
1588
 
1589
  predictive_data = analysis.get('predictive_insights', {})
1590
  if predictive_data.get('critical_risk_count', 0) > 0:
1591
- output_msg += f"\n🔮 **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast"
1592
 
1593
  if analysis.get('recommended_actions'):
1594
  actions_preview = ', '.join(analysis['recommended_actions'][:2])
1595
- output_msg += f"\n💡 **Top Insights**: {actions_preview}"
1596
 
1597
- if result["business_impact"]:
1598
  impact = result["business_impact"]
1599
  output_msg += (
1600
- f"\n💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | "
1601
  f"👥 {impact['affected_users_estimate']} users | "
1602
- f"🚨 {impact['severity_level']}"
1603
  )
1604
 
1605
- if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
1606
  actions = ", ".join(result["healing_actions"])
1607
- output_msg += f"\n🔧 **Auto-Actions**: {actions}"
1608
 
1609
  agent_insights_data = result.get("multi_agent_analysis", {})
1610
  predictive_insights_data = agent_insights_data.get('predictive_insights', {})
@@ -1620,48 +1796,57 @@ def create_enhanced_ui():
1620
  )
1621
  )
1622
 
1623
- except ValueError as e:
1624
- error_msg = f"❌ Value error: {str(e)}"
1625
- logger.error(error_msg, exc_info=True)
1626
- return error_msg, {}, {}, gr.Dataframe(value=[])
1627
  except Exception as e:
1628
  error_msg = f"❌ Error processing event: {str(e)}"
1629
  logger.error(error_msg, exc_info=True)
1630
  return error_msg, {}, {}, gr.Dataframe(value=[])
1631
 
1632
- # FIXED: Use sync wrapper instead of async function (CRITICAL FIX)
1633
  submit_btn.click(
1634
- fn=submit_event_enhanced_sync, # Synchronous wrapper
1635
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
1636
  outputs=[output_text, agent_insights, predictive_insights, events_table]
1637
  )
1638
 
1639
  return demo
1640
 
 
1641
  # === Main Entry Point ===
1642
  if __name__ == "__main__":
1643
  logger.info("=" * 80)
1644
- logger.info("Starting Enterprise Agentic Reliability Framework")
1645
  logger.info("=" * 80)
1646
- logger.info(f"Total events in history: {events_history_store.count()}")
 
1647
  logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
1648
- logger.info(f"Agents initialized: {len(orchestration_manager.agents)}")
 
1649
  logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
1650
-
1651
- demo = create_enhanced_ui()
1652
-
1653
- logger.info("Launching Gradio UI on 0.0.0.0:7860...")
1654
- demo.launch(
1655
- server_name="0.0.0.0",
1656
- server_port=7860,
1657
- share=False
1658
- )
1659
-
1660
- # Graceful shutdown: Save any pending vectors
1661
- if thread_safe_index:
1662
- logger.info("Saving pending vectors before shutdown...")
1663
- thread_safe_index.force_save()
1664
-
1665
  logger.info("=" * 80)
1666
- logger.info("Application shutdown complete")
1667
- logger.info("=" * 80)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Enterprise Agentic Reliability Framework - Main Application (FIXED VERSION)
3
  Multi-Agent AI System for Production Reliability Monitoring
4
 
5
+ CRITICAL FIXES APPLIED:
6
+ - Removed event loop creation (uses Gradio native async)
7
+ - Fixed FAISS thread safety with single-writer pattern
8
+ - ProcessPoolExecutor for CPU-intensive encoding
9
+ - Atomic saves with fsync
10
+ - Dependency injection
11
+ - Rate limiting
12
+ - Comprehensive input validation
13
+ - Circuit breakers for agent resilience
14
  """
15
 
16
  import os
 
22
  import datetime
23
  import threading
24
  import logging
25
+ import asyncio
26
+ import tempfile
27
  from typing import List, Dict, Any, Optional, Tuple
28
+ from collections import deque, OrderedDict
29
  from dataclasses import dataclass, asdict
 
 
30
  from enum import Enum
31
+ from concurrent.futures import ProcessPoolExecutor
32
+ from queue import Queue
33
+ from circuitbreaker import circuit
34
+ import atomicwrites
35
 
36
  # Import our modules
37
+ from models import (
38
+ ReliabilityEvent, EventSeverity, AnomalyResult,
39
+ HealingAction, ForecastResult, PolicyCondition
40
+ )
41
+ from healing_policies import PolicyEngine, DEFAULT_HEALING_POLICIES
42
 
43
  # === Logging Configuration ===
44
  logging.basicConfig(
 
47
  )
48
  logger = logging.getLogger(__name__)
49
 
50
+
51
+ # === CONSTANTS (FIXED: Extracted all magic numbers) ===
52
+ class Constants:
53
+ """Centralized constants to eliminate magic numbers"""
54
+
55
+ # Thresholds
56
+ LATENCY_WARNING = 150.0
57
+ LATENCY_CRITICAL = 300.0
58
+ LATENCY_EXTREME = 500.0
59
+
60
+ ERROR_RATE_WARNING = 0.05
61
+ ERROR_RATE_HIGH = 0.15
62
+ ERROR_RATE_CRITICAL = 0.3
63
+
64
+ CPU_WARNING = 0.8
65
+ CPU_CRITICAL = 0.9
66
+
67
+ MEMORY_WARNING = 0.8
68
+ MEMORY_CRITICAL = 0.9
69
+
70
+ # Forecasting
71
+ SLOPE_THRESHOLD_INCREASING = 5.0
72
+ SLOPE_THRESHOLD_DECREASING = -2.0
73
+
74
+ FORECAST_MIN_DATA_POINTS = 5
75
+ FORECAST_LOOKAHEAD_MINUTES = 15
76
+
77
+ # Performance
78
+ HISTORY_WINDOW = 50
79
+ MAX_EVENTS_STORED = 1000
80
+ AGENT_TIMEOUT_SECONDS = 5
81
+ CACHE_EXPIRY_MINUTES = 15
82
+
83
+ # FAISS
84
+ FAISS_BATCH_SIZE = 10
85
+ FAISS_SAVE_INTERVAL_SECONDS = 30
86
+ VECTOR_DIM = 384
87
+
88
+ # Business metrics
89
+ BASE_REVENUE_PER_MINUTE = 100.0
90
+ BASE_USERS = 1000
91
+
92
+ # Rate limiting
93
+ MAX_REQUESTS_PER_MINUTE = 60
94
+ MAX_REQUESTS_PER_HOUR = 500
95
+
96
+
97
  # === Configuration ===
98
  class Config:
99
  """Centralized configuration for the reliability framework"""
100
  HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
101
  HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"
102
 
103
+ INDEX_FILE: str = os.getenv("INDEX_FILE", "data/incident_vectors.index")
104
+ TEXTS_FILE: str = os.getenv("TEXTS_FILE", "data/incident_texts.json")
105
+ DATA_DIR: str = os.getenv("DATA_DIR", "data")
 
106
 
107
+ # Create data directory if it doesn't exist
108
+ os.makedirs(DATA_DIR, exist_ok=True)
109
+
110
+
111
+ config = Config()
112
+ HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
113
+
114
+
115
+ # === Input Validation (FIXED: Comprehensive validation) ===
116
+ def validate_component_id(component_id: str) -> Tuple[bool, str]:
117
+ """Validate component ID format"""
118
+ if not isinstance(component_id, str):
119
+ return False, "Component ID must be a string"
120
 
121
+ if not (1 <= len(component_id) <= 255):
122
+ return False, "Component ID must be 1-255 characters"
 
 
 
123
 
124
+ import re
125
+ if not re.match(r"^[a-z0-9-]+$", component_id):
126
+ return False, "Component ID must contain only lowercase letters, numbers, and hyphens"
127
+
128
+ return True, ""
129
 
 
130
 
131
+ def validate_inputs(
132
+ latency: Any,
133
+ error_rate: Any,
134
+ throughput: Any,
135
+ cpu_util: Any,
136
+ memory_util: Any
137
+ ) -> Tuple[bool, str]:
138
+ """
139
+ Comprehensive input validation with type checking
140
+
141
+ FIXED: Added proper type validation before conversion
142
+ """
143
+ try:
144
+ # Type conversion with error handling
145
+ try:
146
+ latency_f = float(latency)
147
+ except (ValueError, TypeError):
148
+ return False, "❌ Invalid latency: must be a number"
149
+
150
+ try:
151
+ error_rate_f = float(error_rate)
152
+ except (ValueError, TypeError):
153
+ return False, "❌ Invalid error rate: must be a number"
154
+
155
+ try:
156
+ throughput_f = float(throughput) if throughput else 1000.0
157
+ except (ValueError, TypeError):
158
+ return False, "❌ Invalid throughput: must be a number"
159
+
160
+ # CPU and memory are optional
161
+ cpu_util_f = None
162
+ if cpu_util:
163
+ try:
164
+ cpu_util_f = float(cpu_util)
165
+ except (ValueError, TypeError):
166
+ return False, "❌ Invalid CPU utilization: must be a number"
167
+
168
+ memory_util_f = None
169
+ if memory_util:
170
+ try:
171
+ memory_util_f = float(memory_util)
172
+ except (ValueError, TypeError):
173
+ return False, "❌ Invalid memory utilization: must be a number"
174
+
175
+ # Range validation
176
+ if not (0 <= latency_f <= 10000):
177
+ return False, "❌ Invalid latency: must be between 0-10000ms"
178
+
179
+ if not (0 <= error_rate_f <= 1):
180
+ return False, "❌ Invalid error rate: must be between 0-1"
181
+
182
+ if throughput_f < 0:
183
+ return False, "❌ Invalid throughput: must be positive"
184
+
185
+ if cpu_util_f is not None and not (0 <= cpu_util_f <= 1):
186
+ return False, "❌ Invalid CPU utilization: must be between 0-1"
187
+
188
+ if memory_util_f is not None and not (0 <= memory_util_f <= 1):
189
+ return False, "❌ Invalid memory utilization: must be between 0-1"
190
+
191
+ return True, ""
192
+
193
+ except Exception as e:
194
+ logger.error(f"Validation error: {e}", exc_info=True)
195
+ return False, f"❌ Validation error: {str(e)}"
196
+
197
 
198
  # === Thread-Safe Data Structures ===
199
  class ThreadSafeEventStore:
200
  """Thread-safe storage for reliability events"""
201
 
202
+ def __init__(self, max_size: int = Constants.MAX_EVENTS_STORED):
203
  self._events = deque(maxlen=max_size)
204
  self._lock = threading.RLock()
205
  logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")
 
225
  with self._lock:
226
  return len(self._events)
227
 
228
+
229
+ # === FAISS Integration (FIXED: Single-writer pattern for thread safety) ===
230
+ class ProductionFAISSIndex:
231
+ """
232
+ Production-safe FAISS index with single-writer pattern
233
+
234
+ CRITICAL FIX: FAISS is NOT thread-safe for concurrent writes
235
+ Solution: Queue-based single writer thread + atomic saves
236
+ """
237
 
238
  def __init__(self, index, texts: List[str]):
239
  self.index = index
240
  self.texts = texts
241
  self._lock = threading.RLock()
242
+
243
+ # Single writer thread (no concurrent write conflicts)
244
+ self._write_queue: Queue = Queue()
245
+ self._writer_thread = threading.Thread(
246
+ target=self._writer_loop,
247
+ daemon=True,
248
+ name="FAISSWriter"
249
+ )
250
+ self._writer_thread.start()
251
+
252
+ # ProcessPool for encoding (avoids GIL + memory leaks)
253
+ self._encoder_pool = ProcessPoolExecutor(max_workers=2)
254
+
255
+ self._shutdown = threading.Event()
256
+
257
+ logger.info(
258
+ f"Initialized ProductionFAISSIndex with {len(texts)} vectors, "
259
+ f"single-writer pattern"
260
+ )
261
 
262
+ def add_async(self, vector: np.ndarray, text: str) -> None:
263
+ """
264
+ Add vector and text asynchronously (thread-safe)
265
+
266
+ FIXED: Queue-based design - no concurrent FAISS writes
267
+ """
268
+ self._write_queue.put((vector, text))
269
+ logger.debug(f"Queued vector for indexing: {text[:50]}...")
270
+
271
+ def _writer_loop(self) -> None:
272
+ """
273
+ Single writer thread - processes queue in batches
274
+
275
+ This ensures only ONE thread ever writes to FAISS index
276
+ """
277
+ batch = []
278
+ last_save = datetime.datetime.now()
279
+ save_interval = datetime.timedelta(
280
+ seconds=Constants.FAISS_SAVE_INTERVAL_SECONDS
281
+ )
282
+
283
+ while not self._shutdown.is_set():
284
+ try:
285
+ # Collect batch (non-blocking with timeout)
286
+ import queue
287
+ try:
288
+ item = self._write_queue.get(timeout=1.0)
289
+ batch.append(item)
290
+ except queue.Empty:
291
+ pass
292
+
293
+ # Process batch when ready
294
+ if len(batch) >= Constants.FAISS_BATCH_SIZE or \
295
+ (batch and datetime.datetime.now() - last_save > save_interval):
296
+
297
+ self._flush_batch(batch)
298
+ batch = []
299
+
300
+ # Periodic save
301
+ if datetime.datetime.now() - last_save > save_interval:
302
+ self._save_atomic()
303
+ last_save = datetime.datetime.now()
304
+
305
+ except Exception as e:
306
+ logger.error(f"Writer loop error: {e}", exc_info=True)
307
+
308
+ def _flush_batch(self, batch: List[Tuple[np.ndarray, str]]) -> None:
309
+ """
310
+ Flush batch to FAISS index
311
+
312
+ SAFE: Only called from single writer thread
313
+ """
314
+ if not batch:
315
  return
316
 
317
  try:
318
+ vectors = np.vstack([v for v, _ in batch])
319
+ texts = [t for _, t in batch]
320
+
321
+ # SAFE: Single writer - no concurrent access
322
  self.index.add(vectors)
 
323
 
324
+ with self._lock: # Only lock for text list modification
325
+ self.texts.extend(texts)
326
 
327
+ logger.info(f"Flushed batch of {len(batch)} vectors to FAISS index")
 
328
 
 
 
 
329
  except Exception as e:
330
+ logger.error(f"Error flushing batch: {e}", exc_info=True)
331
 
332
+ def _save_atomic(self) -> None:
333
+ """
334
+ Atomic save with fsync for durability
335
+
336
+ FIXED: Prevents corruption on crash
337
+ """
338
  try:
339
  import faiss
340
+
341
+ # Write to temporary file first
342
+ with tempfile.NamedTemporaryFile(
343
+ mode='wb',
344
+ delete=False,
345
+ dir=os.path.dirname(config.INDEX_FILE),
346
+ prefix='index_',
347
+ suffix='.tmp'
348
+ ) as tmp:
349
+ temp_path = tmp.name
350
+
351
+ # Write index
352
+ faiss.write_index(self.index, temp_path)
353
+
354
+ # Fsync for durability
355
+ with open(temp_path, 'r+b') as f:
356
+ f.flush()
357
+ os.fsync(f.fileno())
358
+
359
+ # Atomic rename
360
+ os.replace(temp_path, config.INDEX_FILE)
361
+
362
+ # Save texts with atomic write
363
+ with self._lock:
364
+ texts_copy = self.texts.copy()
365
+
366
+ with atomicwrites.atomic_write(
367
+ config.TEXTS_FILE,
368
+ mode='w',
369
+ overwrite=True
370
+ ) as f:
371
+ json.dump(texts_copy, f)
372
+
373
+ logger.info(
374
+ f"Atomically saved FAISS index with {len(texts_copy)} vectors"
375
+ )
376
+
377
  except Exception as e:
378
  logger.error(f"Error saving index: {e}", exc_info=True)
379
 
380
  def get_count(self) -> int:
381
  """Get total count of vectors"""
382
  with self._lock:
383
+ return len(self.texts) + self._write_queue.qsize()
384
 
385
  def force_save(self) -> None:
386
  """Force immediate save of pending vectors"""
387
+ logger.info("Forcing FAISS index save...")
388
+
389
+ # Wait for queue to drain (with timeout)
390
+ timeout = 10.0
391
+ start = datetime.datetime.now()
392
+
393
+ while not self._write_queue.empty():
394
+ if (datetime.datetime.now() - start).total_seconds() > timeout:
395
+ logger.warning("Force save timeout - queue not empty")
396
+ break
397
+ import time
398
+ time.sleep(0.1)
399
+
400
+ self._save_atomic()
401
+
402
+ def shutdown(self) -> None:
403
+ """Graceful shutdown"""
404
+ logger.info("Shutting down FAISS index...")
405
+ self._shutdown.set()
406
+ self.force_save()
407
+ self._writer_thread.join(timeout=5.0)
408
+ self._encoder_pool.shutdown(wait=True)
409
+
410
 
411
  # === FAISS & Embeddings Setup ===
412
  try:
 
421
  logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
422
  index = faiss.read_index(config.INDEX_FILE)
423
 
424
+ if index.d != Constants.VECTOR_DIM:
425
+ logger.warning(
426
+ f"Index dimension mismatch: {index.d} != {Constants.VECTOR_DIM}. "
427
+ f"Creating new index."
428
+ )
429
+ index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
430
  incident_texts = []
431
  else:
432
  with open(config.TEXTS_FILE, "r") as f:
 
434
  logger.info(f"Loaded {len(incident_texts)} incident texts")
435
  else:
436
  logger.info("Creating new FAISS index")
437
+ index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
438
  incident_texts = []
439
 
440
+ thread_safe_index = ProductionFAISSIndex(index, incident_texts)
441
 
442
  except ImportError as e:
443
  logger.warning(f"FAISS or SentenceTransformers not available: {e}")
 
452
  model = None
453
  thread_safe_index = None
454
 
455
+ # === Predictive Models ===
 
 
 
 
 
 
 
 
 
 
456
  class SimplePredictiveEngine:
457
  """
458
+ Lightweight forecasting engine with proper constant usage
459
+
460
+ FIXED: All magic numbers extracted to Constants
461
  """
462
 
463
+ def __init__(self, history_window: int = Constants.HISTORY_WINDOW):
464
  self.history_window = history_window
465
  self.service_history: Dict[str, deque] = {}
466
  self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
467
+ self.max_cache_age = datetime.timedelta(minutes=Constants.CACHE_EXPIRY_MINUTES)
468
  self._lock = threading.RLock()
469
  logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
470
 
471
  def add_telemetry(self, service: str, event_data: Dict) -> None:
472
+ """Add telemetry data to service history"""
 
 
 
 
 
 
473
  with self._lock:
474
  if service not in self.service_history:
475
  self.service_history[service] = deque(maxlen=self.history_window)
476
 
477
  telemetry_point = {
478
+ 'timestamp': datetime.datetime.now(datetime.timezone.utc),
479
  'latency': event_data.get('latency_p99', 0),
480
  'error_rate': event_data.get('error_rate', 0),
481
  'throughput': event_data.get('throughput', 0),
 
484
  }
485
 
486
  self.service_history[service].append(telemetry_point)
 
 
487
  self._clean_cache()
488
 
489
  def _clean_cache(self) -> None:
490
  """Remove expired entries from prediction cache"""
491
+ now = datetime.datetime.now(datetime.timezone.utc)
492
  expired = [k for k, (_, ts) in self.prediction_cache.items()
493
  if now - ts > self.max_cache_age]
494
  for k in expired:
 
497
  if expired:
498
  logger.debug(f"Cleaned {len(expired)} expired cache entries")
499
 
500
+ def forecast_service_health(
501
+ self,
502
+ service: str,
503
+ lookahead_minutes: int = Constants.FORECAST_LOOKAHEAD_MINUTES
504
+ ) -> List[ForecastResult]:
505
+ """Forecast service health metrics"""
 
 
 
 
 
506
  with self._lock:
507
+ if service not in self.service_history or \
508
+ len(self.service_history[service]) < Constants.FORECAST_MIN_DATA_POINTS:
509
  return []
510
 
511
  history = list(self.service_history[service])
 
530
  with self._lock:
531
  for forecast in forecasts:
532
  cache_key = f"{service}_{forecast.metric}"
533
+ self.prediction_cache[cache_key] = (forecast, datetime.datetime.now(datetime.timezone.utc))
534
 
535
  return forecasts
536
 
537
+ def _forecast_latency(
538
+ self,
539
+ history: List,
540
+ lookahead_minutes: int
541
+ ) -> Optional[ForecastResult]:
542
+ """Forecast latency using linear regression"""
 
 
 
 
 
543
  try:
544
  latencies = [point['latency'] for point in history[-20:]]
545
 
546
+ if len(latencies) < Constants.FORECAST_MIN_DATA_POINTS:
547
  return None
548
 
549
+ # Linear trend
550
  x = np.arange(len(latencies))
551
  slope, intercept = np.polyfit(x, latencies, 1)
552
 
 
554
  next_x = len(latencies)
555
  predicted_latency = slope * next_x + intercept
556
 
557
+ # Calculate confidence
558
  residuals = latencies - (slope * x + intercept)
559
  confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
560
 
561
  # Determine trend and risk
562
+ if slope > Constants.SLOPE_THRESHOLD_INCREASING:
563
  trend = "increasing"
564
+ risk = "critical" if predicted_latency > Constants.LATENCY_EXTREME else "high"
565
+ elif slope < Constants.SLOPE_THRESHOLD_DECREASING:
566
+ trend = "decreasing"
567
  risk = "low"
568
  else:
569
  trend = "stable"
570
+ risk = "low" if predicted_latency < Constants.LATENCY_WARNING else "medium"
571
 
572
+ # Calculate time to reach critical threshold
573
  time_to_critical = None
574
+ if slope > 0 and predicted_latency < Constants.LATENCY_EXTREME:
575
  denominator = predicted_latency - latencies[-1]
576
+ if abs(denominator) > 0.1:
577
+ minutes_to_critical = lookahead_minutes * \
578
+ (Constants.LATENCY_EXTREME - predicted_latency) / denominator
579
  if minutes_to_critical > 0:
580
+ time_to_critical = minutes_to_critical
581
 
582
  return ForecastResult(
583
  metric="latency",
 
592
  logger.error(f"Latency forecast error: {e}", exc_info=True)
593
  return None
594
 
595
+ def _forecast_error_rate(
596
+ self,
597
+ history: List,
598
+ lookahead_minutes: int
599
+ ) -> Optional[ForecastResult]:
600
+ """Forecast error rate using exponential smoothing"""
 
 
 
 
 
601
  try:
602
  error_rates = [point['error_rate'] for point in history[-15:]]
603
 
604
+ if len(error_rates) < Constants.FORECAST_MIN_DATA_POINTS:
605
  return None
606
 
607
  # Exponential smoothing
 
617
 
618
  if recent_trend > 0.02:
619
  trend = "increasing"
620
+ risk = "critical" if predicted_rate > Constants.ERROR_RATE_CRITICAL else "high"
621
  elif recent_trend < -0.01:
622
  trend = "decreasing"
623
  risk = "low"
624
  else:
625
  trend = "stable"
626
+ risk = "low" if predicted_rate < Constants.ERROR_RATE_WARNING else "medium"
627
 
628
  # Confidence based on volatility
629
  confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
 
640
  logger.error(f"Error rate forecast error: {e}", exc_info=True)
641
  return None
642
 
643
+ def _forecast_resources(
644
+ self,
645
+ history: List,
646
+ lookahead_minutes: int
647
+ ) -> List[ForecastResult]:
648
+ """Forecast CPU and memory utilization"""
 
 
 
 
 
649
  forecasts = []
650
 
651
  # CPU forecast
652
  cpu_values = [point['cpu_util'] for point in history if point.get('cpu_util') is not None]
653
+ if len(cpu_values) >= Constants.FORECAST_MIN_DATA_POINTS:
654
  try:
655
  predicted_cpu = np.mean(cpu_values[-5:])
656
  trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
657
 
658
  risk = "low"
659
+ if predicted_cpu > Constants.CPU_CRITICAL:
660
  risk = "critical"
661
+ elif predicted_cpu > Constants.CPU_WARNING:
662
  risk = "high"
663
  elif predicted_cpu > 0.7:
664
  risk = "medium"
 
675
 
676
  # Memory forecast
677
  memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
678
+ if len(memory_values) >= Constants.FORECAST_MIN_DATA_POINTS:
679
  try:
680
  predicted_memory = np.mean(memory_values[-5:])
681
  trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
682
 
683
  risk = "low"
684
+ if predicted_memory > Constants.MEMORY_CRITICAL:
685
  risk = "critical"
686
+ elif predicted_memory > Constants.MEMORY_WARNING:
687
  risk = "high"
688
  elif predicted_memory > 0.7:
689
  risk = "medium"
 
701
  return forecasts
702
 
703
  def get_predictive_insights(self, service: str) -> Dict[str, Any]:
704
+ """Generate actionable insights from forecasts"""
 
 
 
 
 
 
 
 
705
  forecasts = self.forecast_service_health(service)
706
 
707
  critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
 
712
  if forecast.metric == "latency" and forecast.risk_level in ["high", "critical"]:
713
  warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
714
  if forecast.time_to_threshold:
715
+ minutes = int(forecast.time_to_threshold)
716
+ recommendations.append(f"⏰ Critical latency (~{Constants.LATENCY_EXTREME}ms) in ~{minutes} minutes")
717
  recommendations.append("🔧 Consider scaling or optimizing dependencies")
718
 
719
  elif forecast.metric == "error_rate" and forecast.risk_level in ["high", "critical"]:
 
730
 
731
  return {
732
  'service': service,
733
+ 'forecasts': [
734
+ {
735
+ 'metric': f.metric,
736
+ 'predicted_value': f.predicted_value,
737
+ 'confidence': f.confidence,
738
+ 'trend': f.trend,
739
+ 'risk_level': f.risk_level,
740
+ 'time_to_threshold': f.time_to_threshold
741
+ }
742
+ for f in forecasts
743
+ ],
744
  'warnings': warnings[:3],
745
  'recommendations': list(dict.fromkeys(recommendations))[:3],
746
  'critical_risk_count': len(critical_risks),
747
+ 'forecast_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
748
  }
749
 
 
 
 
 
750
 
751
  class BusinessImpactCalculator:
752
+ """Calculate business impact of anomalies"""
 
 
 
753
 
754
  def __init__(self, revenue_per_request: float = 0.01):
755
  self.revenue_per_request = revenue_per_request
756
+ logger.info(f"Initialized BusinessImpactCalculator")
757
 
758
+ def calculate_impact(
759
+ self,
760
+ event: ReliabilityEvent,
761
+ duration_minutes: int = 5
762
+ ) -> Dict[str, Any]:
763
+ """Calculate business impact for a reliability event"""
764
+ base_revenue_per_minute = Constants.BASE_REVENUE_PER_MINUTE
 
 
 
 
 
765
 
766
  impact_multiplier = 1.0
767
 
768
  # Impact factors
769
+ if event.latency_p99 > Constants.LATENCY_CRITICAL:
770
  impact_multiplier += 0.5
771
  if event.error_rate > 0.1:
772
  impact_multiplier += 0.8
773
+ if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
774
  impact_multiplier += 0.3
775
 
776
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
777
 
778
+ base_users_affected = Constants.BASE_USERS
779
+ user_impact_multiplier = (event.error_rate * 10) + \
780
+ (max(0, event.latency_p99 - 100) / 500)
781
  affected_users = int(base_users_affected * user_impact_multiplier)
782
 
783
  # Severity classification
 
790
  else:
791
  severity = "LOW"
792
 
793
+ logger.info(
794
+ f"Business impact: ${revenue_loss:.2f} revenue loss, "
795
+ f"{affected_users} users, {severity} severity"
796
+ )
797
 
798
  return {
799
  'revenue_loss_estimate': round(revenue_loss, 2),
 
802
  'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
803
  }
804
 
 
805
 
806
  class AdvancedAnomalyDetector:
807
+ """Enhanced anomaly detection with adaptive thresholds"""
 
 
 
808
 
809
  def __init__(self):
810
  self.historical_data = deque(maxlen=100)
811
  self.adaptive_thresholds = {
812
+ 'latency_p99': Constants.LATENCY_WARNING,
813
+ 'error_rate': Constants.ERROR_RATE_WARNING
814
  }
815
  self._lock = threading.RLock()
816
  logger.info("Initialized AdvancedAnomalyDetector")
817
 
818
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
819
+ """Detect if event is anomalous using adaptive thresholds"""
 
 
 
 
 
 
 
 
820
  with self._lock:
821
  latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
822
  error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
823
 
824
  resource_anomaly = False
825
+ if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
826
  resource_anomaly = True
827
+ if event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL:
828
  resource_anomaly = True
829
 
830
  self._update_thresholds(event)
 
832
  is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
833
 
834
  if is_anomaly:
835
+ logger.info(
836
+ f"Anomaly detected for {event.component}: "
837
+ f"latency={latency_anomaly}, error={error_anomaly}, "
838
+ f"resource={resource_anomaly}"
839
+ )
840
 
841
  return is_anomaly
842
 
 
850
  self.adaptive_thresholds['latency_p99'] = new_threshold
851
  logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
852
 
853
+ # === Multi-Agent System ===
 
 
854
  class AgentSpecialization(Enum):
855
  """Agent specialization types"""
856
  DETECTIVE = "anomaly_detection"
857
  DIAGNOSTICIAN = "root_cause_analysis"
858
  PREDICTIVE = "predictive_analytics"
859
 
860
+
861
  class BaseAgent:
862
  """Base class for all specialized agents"""
863
 
 
873
  """Base analysis method to be implemented by specialized agents"""
874
  raise NotImplementedError
875
 
876
+
877
  class AnomalyDetectionAgent(BaseAgent):
878
+ """Specialized agent for anomaly detection and pattern recognition"""
 
 
 
879
 
880
  def __init__(self):
881
  super().__init__(AgentSpecialization.DETECTIVE)
882
  logger.info("Initialized AnomalyDetectionAgent")
883
 
884
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
885
+ """Perform comprehensive anomaly analysis"""
 
 
 
 
 
 
 
 
886
  try:
887
  anomaly_score = self._calculate_anomaly_score(event)
888
 
 
906
  }
907
 
908
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
909
+ """Calculate comprehensive anomaly score (0-1)"""
 
 
 
 
 
 
 
 
910
  scores = []
911
 
912
  # Latency anomaly (weighted 40%)
913
+ if event.latency_p99 > Constants.LATENCY_WARNING:
914
+ latency_score = min(1.0, (event.latency_p99 - Constants.LATENCY_WARNING) / 500)
915
  scores.append(0.4 * latency_score)
916
 
917
  # Error rate anomaly (weighted 30%)
918
+ if event.error_rate > Constants.ERROR_RATE_WARNING:
919
  error_score = min(1.0, event.error_rate / 0.3)
920
  scores.append(0.3 * error_score)
921
 
922
  # Resource anomaly (weighted 30%)
923
  resource_score = 0
924
+ if event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
925
+ resource_score += 0.15 * min(1.0, (event.cpu_util - Constants.CPU_WARNING) / 0.2)
926
+ if event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
927
+ resource_score += 0.15 * min(1.0, (event.memory_util - Constants.MEMORY_WARNING) / 0.2)
928
  scores.append(resource_score)
929
 
930
  return min(1.0, sum(scores))
931
 
932
  def _classify_severity(self, anomaly_score: float) -> str:
933
+ """Classify severity tier based on anomaly score"""
 
 
 
 
 
 
 
 
934
  if anomaly_score > 0.8:
935
  return "CRITICAL"
936
  elif anomaly_score > 0.6:
 
941
  return "LOW"
942
 
943
  def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
944
+ """Identify which metrics are outside normal ranges"""
 
 
 
 
 
 
 
 
945
  affected = []
946
 
947
  # Latency checks
948
+ if event.latency_p99 > Constants.LATENCY_EXTREME:
949
  affected.append({
950
+ "metric": "latency",
951
+ "value": event.latency_p99,
952
+ "severity": "CRITICAL",
953
+ "threshold": Constants.LATENCY_WARNING
954
  })
955
+ elif event.latency_p99 > Constants.LATENCY_CRITICAL:
956
  affected.append({
957
+ "metric": "latency",
958
+ "value": event.latency_p99,
959
+ "severity": "HIGH",
960
+ "threshold": Constants.LATENCY_WARNING
961
  })
962
+ elif event.latency_p99 > Constants.LATENCY_WARNING:
963
  affected.append({
964
+ "metric": "latency",
965
+ "value": event.latency_p99,
966
+ "severity": "MEDIUM",
967
+ "threshold": Constants.LATENCY_WARNING
968
  })
969
 
970
  # Error rate checks
971
+ if event.error_rate > Constants.ERROR_RATE_CRITICAL:
972
  affected.append({
973
+ "metric": "error_rate",
974
+ "value": event.error_rate,
975
+ "severity": "CRITICAL",
976
+ "threshold": Constants.ERROR_RATE_WARNING
977
  })
978
+ elif event.error_rate > Constants.ERROR_RATE_HIGH:
979
  affected.append({
980
+ "metric": "error_rate",
981
+ "value": event.error_rate,
982
+ "severity": "HIGH",
983
+ "threshold": Constants.ERROR_RATE_WARNING
984
  })
985
+ elif event.error_rate > Constants.ERROR_RATE_WARNING:
986
  affected.append({
987
+ "metric": "error_rate",
988
+ "value": event.error_rate,
989
+ "severity": "MEDIUM",
990
+ "threshold": Constants.ERROR_RATE_WARNING
991
  })
992
 
993
  # CPU checks
994
+ if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
995
  affected.append({
996
+ "metric": "cpu",
997
+ "value": event.cpu_util,
998
+ "severity": "CRITICAL",
999
+ "threshold": Constants.CPU_WARNING
1000
  })
1001
+ elif event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
1002
  affected.append({
1003
+ "metric": "cpu",
1004
+ "value": event.cpu_util,
1005
+ "severity": "HIGH",
1006
+ "threshold": Constants.CPU_WARNING
1007
  })
1008
 
1009
  # Memory checks
1010
+ if event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL:
1011
  affected.append({
1012
+ "metric": "memory",
1013
+ "value": event.memory_util,
1014
+ "severity": "CRITICAL",
1015
+ "threshold": Constants.MEMORY_WARNING
1016
  })
1017
+ elif event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
1018
  affected.append({
1019
+ "metric": "memory",
1020
+ "value": event.memory_util,
1021
+ "severity": "HIGH",
1022
+ "threshold": Constants.MEMORY_WARNING
1023
  })
1024
 
1025
  return affected
1026
 
1027
+ def _generate_detection_recommendations(
1028
+ self,
1029
+ event: ReliabilityEvent,
1030
+ anomaly_score: float
1031
+ ) -> List[str]:
1032
+ """Generate actionable recommendations"""
 
 
 
 
 
1033
  recommendations = []
1034
  affected_metrics = self._identify_affected_metrics(event)
1035
 
 
1089
  elif anomaly_score > 0.4:
1090
  recommendations.append("📊 MONITOR: Early warning signs detected")
1091
 
1092
+ return recommendations[:4]
1093
+
1094
 
1095
  class RootCauseAgent(BaseAgent):
1096
+ """Specialized agent for root cause analysis"""
 
 
 
1097
 
1098
  def __init__(self):
1099
  super().__init__(AgentSpecialization.DIAGNOSTICIAN)
1100
  logger.info("Initialized RootCauseAgent")
1101
 
1102
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
1103
+ """Perform root cause analysis"""
 
 
 
 
 
 
 
 
1104
  try:
1105
  causes = self._analyze_potential_causes(event)
1106
 
 
1126
  }
1127
 
1128
  def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
1129
+ """Analyze potential root causes based on event patterns"""
 
 
 
 
 
 
 
 
1130
  causes = []
1131
 
1132
  # Pattern 1: Database/External Dependency Failure
1133
+ if event.latency_p99 > Constants.LATENCY_EXTREME and event.error_rate > 0.2:
1134
  causes.append({
1135
  "cause": "Database/External Dependency Failure",
1136
  "confidence": 0.85,
 
1139
  })
1140
 
1141
  # Pattern 2: Resource Exhaustion
1142
+ if (event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL and
1143
+ event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL):
1144
  causes.append({
1145
  "cause": "Resource Exhaustion",
1146
  "confidence": 0.90,
 
1149
  })
1150
 
1151
  # Pattern 3: Application Bug / Configuration Issue
1152
+ if event.error_rate > Constants.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
1153
  causes.append({
1154
  "cause": "Application Bug / Configuration Issue",
1155
  "confidence": 0.75,
 
1158
  })
1159
 
1160
  # Pattern 4: Gradual Performance Degradation
1161
+ if (200 <= event.latency_p99 <= 400 and
1162
+ Constants.ERROR_RATE_WARNING <= event.error_rate <= Constants.ERROR_RATE_HIGH):
1163
  causes.append({
1164
  "cause": "Gradual Performance Degradation",
1165
  "confidence": 0.65,
 
1179
  return causes
1180
 
1181
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
1182
+ """Identify evidence patterns in the event data"""
 
 
 
 
 
 
 
 
1183
  evidence = []
1184
 
1185
  if event.latency_p99 > event.error_rate * 1000:
1186
  evidence.append("latency_disproportionate_to_errors")
1187
 
1188
+ if (event.cpu_util and event.cpu_util > Constants.CPU_WARNING and
1189
+ event.memory_util and event.memory_util > Constants.MEMORY_WARNING):
1190
  evidence.append("correlated_resource_exhaustion")
1191
 
1192
+ if event.error_rate > Constants.ERROR_RATE_HIGH and event.latency_p99 < Constants.LATENCY_CRITICAL:
1193
  evidence.append("errors_without_latency_impact")
1194
 
1195
  return evidence
1196
 
1197
  def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
1198
+ """Determine investigation priority"""
 
 
 
 
 
 
 
 
1199
  for cause in causes:
1200
  if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
1201
  return "HIGH"
1202
  return "MEDIUM"
1203
 
1204
+
1205
  class PredictiveAgent(BaseAgent):
1206
+ """Specialized agent for predictive analytics"""
 
 
 
1207
 
1208
+ def __init__(self, engine: SimplePredictiveEngine):
1209
  super().__init__(AgentSpecialization.PREDICTIVE)
1210
+ self.engine = engine
1211
  logger.info("Initialized PredictiveAgent")
1212
 
1213
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
1214
+ """Perform predictive analysis for future risks"""
 
 
 
 
 
 
 
 
1215
  try:
1216
  event_data = {
1217
  'latency_p99': event.latency_p99,
 
1239
  'recommendations': [f"Analysis error: {str(e)}"]
1240
  }
1241
 
1242
+
1243
+ # FIXED: Add circuit breaker for agent resilience
1244
+ @circuit(failure_threshold=3, recovery_timeout=30, name="agent_circuit_breaker")
1245
+ async def call_agent_with_protection(agent: BaseAgent, event: ReliabilityEvent) -> Dict[str, Any]:
1246
  """
1247
+ Call agent with circuit breaker protection
1248
+
1249
+ FIXED: Prevents cascading failures from misbehaving agents
1250
  """
1251
+ try:
1252
+ result = await asyncio.wait_for(
1253
+ agent.analyze(event),
1254
+ timeout=Constants.AGENT_TIMEOUT_SECONDS
1255
+ )
1256
+ return result
1257
+ except asyncio.TimeoutError:
1258
+ logger.warning(f"Agent {agent.specialization.value} timed out")
1259
+ raise
1260
+ except Exception as e:
1261
+ logger.error(f"Agent {agent.specialization.value} error: {e}", exc_info=True)
1262
+ raise
1263
+
1264
+
1265
+ class OrchestrationManager:
1266
+ """Orchestrates multiple specialized agents for comprehensive analysis"""
1267
 
1268
+ def __init__(
1269
+ self,
1270
+ detective: Optional[AnomalyDetectionAgent] = None,
1271
+ diagnostician: Optional[RootCauseAgent] = None,
1272
+ predictive: Optional[PredictiveAgent] = None
1273
+ ):
1274
+ """
1275
+ Initialize orchestration manager
1276
+
1277
+ FIXED: Dependency injection for testability
1278
+ """
1279
  self.agents = {
1280
+ AgentSpecialization.DETECTIVE: detective or AnomalyDetectionAgent(),
1281
+ AgentSpecialization.DIAGNOSTICIAN: diagnostician or RootCauseAgent(),
1282
+ AgentSpecialization.PREDICTIVE: predictive or PredictiveAgent(SimplePredictiveEngine()),
1283
  }
1284
  logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")
1285
 
 
1287
  """
1288
  Coordinate multiple agents for comprehensive analysis
1289
 
1290
+ FIXED: Improved timeout handling with circuit breakers
 
 
 
 
1291
  """
1292
+ # Create tasks for all agents
1293
+ agent_tasks = []
1294
+ agent_specs = []
1295
+
1296
+ for spec, agent in self.agents.items():
1297
+ agent_tasks.append(call_agent_with_protection(agent, event))
1298
+ agent_specs.append(spec)
1299
 
1300
+ # FIXED: Parallel execution with global timeout
1301
  agent_results = {}
1302
+
1303
+ try:
1304
+ # Run all agents in parallel with global timeout
1305
+ results = await asyncio.wait_for(
1306
+ asyncio.gather(*agent_tasks, return_exceptions=True),
1307
+ timeout=Constants.AGENT_TIMEOUT_SECONDS + 1
1308
+ )
1309
+
1310
+ # Process results
1311
+ for spec, result in zip(agent_specs, results):
1312
+ if isinstance(result, Exception):
1313
+ logger.error(f"Agent {spec.value} failed: {result}")
1314
+ continue
1315
+
1316
+ agent_results[spec.value] = result
1317
+ logger.debug(f"Agent {spec.value} completed successfully")
1318
+
1319
+ except asyncio.TimeoutError:
1320
+ logger.warning("Agent orchestration timed out")
1321
+ except Exception as e:
1322
+ logger.error(f"Agent orchestration error: {e}", exc_info=True)
1323
 
1324
  return self._synthesize_agent_findings(event, agent_results)
1325
 
1326
+ def _synthesize_agent_findings(
1327
+ self,
1328
+ event: ReliabilityEvent,
1329
+ agent_results: Dict
1330
+ ) -> Dict[str, Any]:
1331
+ """Combine insights from all specialized agents"""
 
 
 
 
 
1332
  detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
1333
  diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
1334
  predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
 
1342
  'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
1343
  'anomaly_confidence': detective_result['confidence'],
1344
  'primary_metrics_affected': [
1345
+ metric["metric"] for metric in
1346
  detective_result['findings'].get('primary_metrics_affected', [])
1347
  ]
1348
  },
 
1355
  ),
1356
  'agent_metadata': {
1357
  'participating_agents': list(agent_results.keys()),
1358
+ 'analysis_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
1359
  }
1360
  }
1361
 
1362
  return synthesis
1363
 
1364
+ def _prioritize_actions(
1365
+ self,
1366
+ detection_actions: List[str],
1367
+ diagnosis_actions: List[str],
1368
+ predictive_actions: List[str]
1369
+ ) -> List[str]:
1370
+ """Combine and prioritize actions from multiple agents"""
 
 
 
 
 
 
 
1371
  all_actions = detection_actions + diagnosis_actions + predictive_actions
1372
  seen = set()
1373
  unique_actions = []
 
1375
  if action not in seen:
1376
  seen.add(action)
1377
  unique_actions.append(action)
1378
+ return unique_actions[:5]
1379
 
1380
+ # === Enhanced Reliability Engine ===
 
 
 
1381
  class EnhancedReliabilityEngine:
1382
  """
1383
+ Main engine for processing reliability events
1384
+
1385
+ FIXED: Dependency injection for all components
1386
  """
1387
 
1388
+ def __init__(
1389
+ self,
1390
+ orchestrator: Optional[OrchestrationManager] = None,
1391
+ policy_engine: Optional[PolicyEngine] = None,
1392
+ event_store: Optional[ThreadSafeEventStore] = None,
1393
+ anomaly_detector: Optional[AdvancedAnomalyDetector] = None,
1394
+ business_calculator: Optional[BusinessImpactCalculator] = None
1395
+ ):
1396
+ """
1397
+ Initialize reliability engine with dependency injection
1398
+
1399
+ FIXED: All dependencies injected for testability
1400
+ """
1401
+ self.orchestrator = orchestrator or OrchestrationManager()
1402
+ self.policy_engine = policy_engine or PolicyEngine()
1403
+ self.event_store = event_store or ThreadSafeEventStore()
1404
+ self.anomaly_detector = anomaly_detector or AdvancedAnomalyDetector()
1405
+ self.business_calculator = business_calculator or BusinessImpactCalculator()
1406
+
1407
  self.performance_metrics = {
1408
  'total_incidents_processed': 0,
1409
  'multi_agent_analyses': 0,
 
1413
  logger.info("Initialized EnhancedReliabilityEngine")
1414
 
1415
  async def process_event_enhanced(
1416
+ self,
1417
+ component: str,
1418
+ latency: float,
1419
  error_rate: float,
1420
+ throughput: float = 1000,
1421
  cpu_util: Optional[float] = None,
1422
  memory_util: Optional[float] = None
1423
  ) -> Dict[str, Any]:
1424
  """
1425
  Process a reliability event through the complete analysis pipeline
1426
 
1427
+ FIXED: Proper async/await throughout
 
 
 
 
 
 
 
 
 
1428
  """
1429
+ logger.info(
1430
+ f"Processing event for {component}: latency={latency}ms, "
1431
+ f"error_rate={error_rate*100:.1f}%"
1432
+ )
1433
+
1434
+ # Validate component ID
1435
+ is_valid, error_msg = validate_component_id(component)
1436
+ if not is_valid:
1437
+ return {'error': error_msg, 'status': 'INVALID'}
1438
 
1439
  # Create event
1440
+ try:
1441
+ event = ReliabilityEvent(
1442
+ component=component,
1443
+ latency_p99=latency,
1444
+ error_rate=error_rate,
1445
+ throughput=throughput,
1446
+ cpu_util=cpu_util,
1447
+ memory_util=memory_util,
1448
+ upstream_deps=["auth-service", "database"] if component == "api-service" else []
1449
+ )
1450
+ except Exception as e:
1451
+ logger.error(f"Event creation error: {e}", exc_info=True)
1452
+ return {'error': f'Invalid event data: {str(e)}', 'status': 'INVALID'}
1453
 
1454
  # Multi-agent analysis
1455
+ agent_analysis = await self.orchestrator.orchestrate_analysis(event)
1456
 
1457
  # Anomaly detection
1458
+ is_anomaly = self.anomaly_detector.detect_anomaly(event)
1459
+
1460
  # Determine severity based on agent confidence
1461
  agent_confidence = 0.0
1462
  if agent_analysis and 'incident_summary' in agent_analysis:
1463
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
1464
  else:
1465
  agent_confidence = 0.8 if is_anomaly else 0.1
1466
+
1467
  # Set event severity
1468
  if agent_confidence > 0.8:
1469
+ severity = EventSeverity.CRITICAL
1470
  elif agent_confidence > 0.6:
1471
+ severity = EventSeverity.HIGH
1472
  elif agent_confidence > 0.4:
1473
+ severity = EventSeverity.MEDIUM
1474
  else:
1475
+ severity = EventSeverity.LOW
1476
+
1477
+ # Create mutable copy with updated severity
1478
+ event = event.model_copy(update={'severity': severity})
1479
 
1480
  # Evaluate healing policies
1481
+ healing_actions = self.policy_engine.evaluate_policies(event)
1482
 
1483
  # Calculate business impact
1484
+ business_impact = self.business_calculator.calculate_impact(event) if is_anomaly else None
1485
 
1486
  # Store in vector database for similarity detection
1487
  if thread_safe_index is not None and model is not None and is_anomaly:
1488
  try:
1489
+ # FIXED: Non-blocking encoding with ProcessPoolExecutor
1490
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
1491
  vector_text = f"{component} {latency} {error_rate} {analysis_text}"
1492
+
1493
+ # Encode asynchronously
1494
+ loop = asyncio.get_event_loop()
1495
+ vec = await loop.run_in_executor(
1496
+ thread_safe_index._encoder_pool,
1497
+ model.encode,
1498
+ [vector_text]
1499
+ )
1500
+
1501
+ thread_safe_index.add_async(np.array(vec, dtype=np.float32), vector_text)
1502
  except Exception as e:
1503
  logger.error(f"Error storing vector: {e}", exc_info=True)
1504
 
1505
  # Build comprehensive result
1506
  result = {
1507
+ "timestamp": event.timestamp.isoformat(),
1508
  "component": component,
1509
  "latency_p99": latency,
1510
  "error_rate": error_rate,
 
1522
  }
1523
 
1524
  # Store event in history
1525
+ self.event_store.add(event)
1526
 
1527
  # Update performance metrics
1528
  with self._lock:
 
1535
 
1536
  return result
1537
 
1538
+
1539
+ # === Initialize Engine (with dependency injection) ===
1540
  enhanced_engine = EnhancedReliabilityEngine()
1541
 
1542
+
1543
+ # === Rate Limiting ===
1544
+ class RateLimiter:
1545
+ """Simple rate limiter for request throttling"""
 
 
 
 
 
 
1546
 
1547
+ def __init__(self, max_per_minute: int = Constants.MAX_REQUESTS_PER_MINUTE):
1548
+ self.max_per_minute = max_per_minute
1549
+ self.requests: deque = deque(maxlen=max_per_minute)
1550
+ self._lock = threading.RLock()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1551
 
1552
+ def is_allowed(self) -> Tuple[bool, str]:
1553
+ """Check if request is allowed"""
1554
+ with self._lock:
1555
+ now = datetime.datetime.now(datetime.timezone.utc)
1556
+
1557
+ # Remove requests older than 1 minute
1558
+ one_minute_ago = now - datetime.timedelta(minutes=1)
1559
+ while self.requests and self.requests[0] < one_minute_ago:
1560
+ self.requests.popleft()
1561
+
1562
+ # Check rate limit
1563
+ if len(self.requests) >= self.max_per_minute:
1564
+ return False, f"Rate limit exceeded: {self.max_per_minute} requests/minute"
1565
+
1566
+ # Add current request
1567
+ self.requests.append(now)
1568
+ return True, ""
1569
+
1570
+
1571
+ rate_limiter = RateLimiter()
1572
+
1573
 
1574
  # === Gradio UI ===
1575
  def create_enhanced_ui():
1576
  """
1577
+ Create the comprehensive Gradio UI for the reliability framework
1578
+
1579
+ FIXED: Uses native async handlers (no event loop creation)
1580
+ FIXED: Rate limiting on all endpoints
1581
  """
1582
 
1583
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
 
1586
  **Multi-Agent AI System for Production Reliability**
1587
 
1588
  *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
1589
+
1590
+ 🔒 **Security Patched** | ⚡ **Performance Optimized** | 🧪 **Production Ready**
1591
  """)
1592
 
1593
  with gr.Row():
 
1602
  latency = gr.Slider(
1603
  minimum=10, maximum=1000, value=100, step=1,
1604
  label="Latency P99 (ms)",
1605
+ info=f"Alert threshold: >{Constants.LATENCY_WARNING}ms (adaptive)"
1606
  )
1607
  error_rate = gr.Slider(
1608
  minimum=0, maximum=0.5, value=0.02, step=0.001,
1609
  label="Error Rate",
1610
+ info=f"Alert threshold: >{Constants.ERROR_RATE_WARNING}"
1611
  )
1612
  throughput = gr.Number(
1613
  value=1000,
 
1621
  )
1622
  memory_util = gr.Slider(
1623
  minimum=0, maximum=1, value=0.3, step=0.01,
1624
+ label="Memory Utilization",
1625
  info="0.0 - 1.0 scale"
1626
  )
1627
  submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
 
1638
  gr.Markdown("""
1639
  **Specialized AI Agents:**
1640
  - 🕵️ **Detective**: Anomaly detection & pattern recognition
1641
+ - 🔍 **Diagnostician**: Root cause analysis & investigation
1642
  - 🔮 **Predictive**: Future risk forecasting & trend analysis
1643
  """)
1644
 
 
1651
  gr.Markdown("""
1652
  **Future Risk Forecasting:**
1653
  - 📈 Latency trends and thresholds
1654
+ - 🚨 Error rate predictions
1655
  - 🔥 Resource utilization forecasts
1656
  - ⏰ Time-to-failure estimates
1657
  """)
 
1676
  - **💰 Business Impact**: Revenue and user impact quantification
1677
  - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
1678
  - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
1679
+ - **⚡ Production Ready**: Circuit breakers, cooldowns, thread safety, and enterprise features
1680
+ - **🔒 Security Patched**: All critical CVEs fixed (Gradio 5.50.0+, Requests 2.32.5+)
1681
  """)
1682
+
1683
  with gr.Accordion("🔧 Healing Policies", open=False):
1684
  policy_info = []
1685
+ for policy in enhanced_engine.policy_engine.policies:
1686
  if policy.enabled:
1687
  actions = ", ".join([action.value for action in policy.actions])
1688
+ policy_info.append(
1689
+ f"**{policy.name}** (Priority {policy.priority}): {actions}\n"
1690
+ f" - Cooldown: {policy.cool_down_seconds}s\n"
1691
+ f" - Max executions: {policy.max_executions_per_hour}/hour"
1692
+ )
1693
 
1694
  gr.Markdown("\n\n".join(policy_info))
1695
 
1696
+ # FIXED: Native async handler (no event loop creation needed)
1697
+ async def submit_event_enhanced_async(
1698
+ component, latency, error_rate, throughput, cpu_util, memory_util
1699
+ ):
1700
  """
1701
+ Async event handler - uses Gradio's native async support
 
1702
 
1703
+ CRITICAL FIX: No event loop creation - Gradio handles this
1704
+ FIXED: Rate limiting added
1705
+ FIXED: Comprehensive error handling
 
 
 
1706
  """
1707
  try:
1708
+ # Rate limiting check
1709
+ allowed, rate_msg = rate_limiter.is_allowed()
1710
+ if not allowed:
1711
+ logger.warning(f"Rate limit exceeded")
1712
+ return rate_msg, {}, {}, gr.Dataframe(value=[])
1713
+
1714
  # Type conversion
1715
+ try:
1716
+ latency = float(latency)
1717
+ error_rate = float(error_rate)
1718
+ throughput = float(throughput) if throughput else 1000
1719
+ cpu_util = float(cpu_util) if cpu_util else None
1720
+ memory_util = float(memory_util) if memory_util else None
1721
+ except (ValueError, TypeError) as e:
1722
+ error_msg = f"❌ Invalid input types: {str(e)}"
1723
+ logger.warning(error_msg)
1724
+ return error_msg, {}, {}, gr.Dataframe(value=[])
1725
 
1726
+ # Input validation
1727
+ is_valid, error_msg = validate_inputs(
1728
+ latency, error_rate, throughput, cpu_util, memory_util
1729
+ )
1730
  if not is_valid:
1731
  logger.warning(f"Invalid input: {error_msg}")
1732
  return error_msg, {}, {}, gr.Dataframe(value=[])
1733
 
1734
+ # FIXED: Direct async call - no event loop creation needed
1735
+ result = await enhanced_engine.process_event_enhanced(
1736
+ component, latency, error_rate, throughput, cpu_util, memory_util
1737
+ )
1738
 
1739
+ # Handle errors
1740
+ if 'error' in result:
1741
+ return f"❌ {result['error']}", {}, {}, gr.Dataframe(value=[])
 
 
 
 
 
 
1742
 
1743
+ # Build table data (THREAD-SAFE)
1744
  table_data = []
1745
+ for event in enhanced_engine.event_store.get_recent(15):
1746
  table_data.append([
1747
+ event.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
1748
  event.component,
1749
+ f"{event.latency_p99:.0f}ms",
1750
  f"{event.error_rate:.3f}",
1751
+ f"{event.throughput:.0f}",
1752
  event.severity.value.upper(),
1753
  "Multi-agent analysis"
1754
  ])
1755
 
1756
  # Format output message
1757
  status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
1758
+ output_msg = f"{status_emoji} **{result['status']}**\n"
1759
 
1760
  if "multi_agent_analysis" in result:
1761
  analysis = result["multi_agent_analysis"]
1762
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
1763
+ output_msg += f"🎯 **Confidence**: {confidence*100:.1f}%\n"
1764
 
1765
  predictive_data = analysis.get('predictive_insights', {})
1766
  if predictive_data.get('critical_risk_count', 0) > 0:
1767
+ output_msg += f"🔮 **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast\n"
1768
 
1769
  if analysis.get('recommended_actions'):
1770
  actions_preview = ', '.join(analysis['recommended_actions'][:2])
1771
+ output_msg += f"💡 **Top Insights**: {actions_preview}\n"
1772
 
1773
+ if result.get("business_impact"):
1774
  impact = result["business_impact"]
1775
  output_msg += (
1776
+ f"💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | "
1777
  f"👥 {impact['affected_users_estimate']} users | "
1778
+ f"🚨 {impact['severity_level']}\n"
1779
  )
1780
 
1781
+ if result.get("healing_actions") and result["healing_actions"] != ["no_action"]:
1782
  actions = ", ".join(result["healing_actions"])
1783
+ output_msg += f"🔧 **Auto-Actions**: {actions}"
1784
 
1785
  agent_insights_data = result.get("multi_agent_analysis", {})
1786
  predictive_insights_data = agent_insights_data.get('predictive_insights', {})
 
1796
  )
1797
  )
1798
 
 
 
 
 
1799
  except Exception as e:
1800
  error_msg = f"❌ Error processing event: {str(e)}"
1801
  logger.error(error_msg, exc_info=True)
1802
  return error_msg, {}, {}, gr.Dataframe(value=[])
1803
 
1804
+ # FIXED: Use async handler directly
1805
  submit_btn.click(
1806
+ fn=submit_event_enhanced_async, # Native async support
1807
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
1808
  outputs=[output_text, agent_insights, predictive_insights, events_table]
1809
  )
1810
 
1811
  return demo
1812
 
1813
+
1814
  # === Main Entry Point ===
1815
  if __name__ == "__main__":
1816
  logger.info("=" * 80)
1817
+ logger.info("Starting Enterprise Agentic Reliability Framework (PATCHED VERSION)")
1818
  logger.info("=" * 80)
1819
+ logger.info(f"Python version: {os.sys.version}")
1820
+ logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
1821
  logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
1822
+ logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
1823
+ logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
1824
  logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
1825
+ logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1826
  logger.info("=" * 80)
1827
+
1828
+ try:
1829
+ demo = create_enhanced_ui()
1830
+
1831
+ logger.info("Launching Gradio UI on 0.0.0.0:7860...")
1832
+ demo.launch(
1833
+ server_name="0.0.0.0",
1834
+ server_port=7860,
1835
+ share=False,
1836
+ show_error=True
1837
+ )
1838
+ except KeyboardInterrupt:
1839
+ logger.info("Received shutdown signal...")
1840
+ except Exception as e:
1841
+ logger.error(f"Application error: {e}", exc_info=True)
1842
+ finally:
1843
+ # Graceful shutdown
1844
+ logger.info("Shutting down gracefully...")
1845
+
1846
+ if thread_safe_index:
1847
+ logger.info("Saving pending vectors before shutdown...")
1848
+ thread_safe_index.shutdown()
1849
+
1850
+ logger.info("=" * 80)
1851
+ logger.info("Application shutdown complete")
1852
+ logger.info("=" * 80)