petter2025 commited on
Commit
714bfce
·
verified ·
1 Parent(s): 1437f82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +732 -70
app.py CHANGED
@@ -2,8 +2,14 @@
2
  Enterprise Agentic Reliability Framework - Main Application
3
  Multi-Agent AI System for Production Reliability Monitoring
4
 
5
- This module provides the main Gradio UI and orchestrates the reliability
6
- monitoring system with anomaly detection, predictive analytics, and auto-healing.
 
 
 
 
 
 
7
  """
8
 
9
  import os
@@ -20,11 +26,11 @@ from collections import deque
20
  from dataclasses import dataclass, asdict
21
  import hashlib
22
  import asyncio
 
23
 
24
  # Import our modules
25
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
26
  from healing_policies import PolicyEngine
27
- from agent_orchestrator import OrchestrationManager
28
 
29
  # === Logging Configuration ===
30
  logging.basicConfig(
@@ -47,8 +53,10 @@ class Config:
47
  # Thresholds
48
  LATENCY_WARNING: float = 150.0
49
  LATENCY_CRITICAL: float = 300.0
 
50
  ERROR_RATE_WARNING: float = 0.05
51
- ERROR_RATE_CRITICAL: float = 0.15
 
52
  CPU_WARNING: float = 0.8
53
  CPU_CRITICAL: float = 0.9
54
  MEMORY_WARNING: float = 0.8
@@ -59,6 +67,10 @@ class Config:
59
  MAX_EVENTS_STORED: int = 1000
60
  AGENT_TIMEOUT: int = 10
61
  CACHE_EXPIRY_MINUTES: int = 15
 
 
 
 
62
 
63
  config = Config()
64
 
@@ -214,7 +226,10 @@ class ForecastResult:
214
  risk_level: str = "low" # low, medium, high, critical
215
 
216
  class SimplePredictiveEngine:
217
- """Lightweight forecasting engine optimized for Hugging Face Spaces"""
 
 
 
218
 
219
  def __init__(self, history_window: int = config.HISTORY_WINDOW):
220
  self.history_window = history_window
@@ -225,7 +240,13 @@ class SimplePredictiveEngine:
225
  logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
226
 
227
  def add_telemetry(self, service: str, event_data: Dict) -> None:
228
- """Add telemetry data to service history"""
 
 
 
 
 
 
229
  with self._lock:
230
  if service not in self.service_history:
231
  self.service_history[service] = deque(maxlen=self.history_window)
@@ -256,7 +277,16 @@ class SimplePredictiveEngine:
256
  logger.debug(f"Cleaned {len(expired)} expired cache entries")
257
 
258
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
259
- """Forecast service health metrics"""
 
 
 
 
 
 
 
 
 
260
  with self._lock:
261
  if service not in self.service_history or len(self.service_history[service]) < 10:
262
  return []
@@ -288,7 +318,16 @@ class SimplePredictiveEngine:
288
  return forecasts
289
 
290
  def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
291
- """Forecast latency using linear regression and trend analysis"""
 
 
 
 
 
 
 
 
 
292
  try:
293
  latencies = [point['latency'] for point in history[-20:]]
294
 
@@ -307,25 +346,25 @@ class SimplePredictiveEngine:
307
  residuals = latencies - (slope * x + intercept)
308
  confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
309
 
310
- # Determine trend
311
  if slope > 5:
312
  trend = "increasing"
313
- risk = "high" if predicted_latency > config.LATENCY_CRITICAL else "medium"
314
  elif slope < -2:
315
  trend = "decreasing"
316
  risk = "low"
317
  else:
318
  trend = "stable"
319
- risk = "low"
320
 
321
  # Calculate time to reach critical threshold (500ms)
322
  time_to_critical = None
323
- if slope > 0 and predicted_latency < 500:
324
  denominator = predicted_latency - latencies[-1]
325
  if abs(denominator) > 0.1: # Avoid division by very small numbers
326
- time_to_critical = datetime.timedelta(
327
- minutes=lookahead_minutes * (500 - predicted_latency) / denominator
328
- )
329
 
330
  return ForecastResult(
331
  metric="latency",
@@ -341,7 +380,16 @@ class SimplePredictiveEngine:
341
  return None
342
 
343
  def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
344
- """Forecast error rate using exponential smoothing"""
 
 
 
 
 
 
 
 
 
345
  try:
346
  error_rates = [point['error_rate'] for point in history[-15:]]
347
 
@@ -361,13 +409,13 @@ class SimplePredictiveEngine:
361
 
362
  if recent_trend > 0.02:
363
  trend = "increasing"
364
- risk = "high" if predicted_rate > 0.1 else "medium"
365
  elif recent_trend < -0.01:
366
  trend = "decreasing"
367
  risk = "low"
368
  else:
369
  trend = "stable"
370
- risk = "low"
371
 
372
  # Confidence based on volatility
373
  confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
@@ -385,7 +433,16 @@ class SimplePredictiveEngine:
385
  return None
386
 
387
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
388
- """Forecast CPU and memory utilization"""
 
 
 
 
 
 
 
 
 
389
  forecasts = []
390
 
391
  # CPU forecast
@@ -441,7 +498,15 @@ class SimplePredictiveEngine:
441
  return forecasts
442
 
443
  def get_predictive_insights(self, service: str) -> Dict[str, Any]:
444
- """Generate actionable insights from forecasts"""
 
 
 
 
 
 
 
 
445
  forecasts = self.forecast_service_health(service)
446
 
447
  critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
@@ -483,7 +548,10 @@ events_history_store = ThreadSafeEventStore()
483
  predictive_engine = SimplePredictiveEngine()
484
 
485
  class BusinessImpactCalculator:
486
- """Calculate business impact of anomalies"""
 
 
 
487
 
488
  def __init__(self, revenue_per_request: float = 0.01):
489
  self.revenue_per_request = revenue_per_request
@@ -498,12 +566,13 @@ class BusinessImpactCalculator:
498
  duration_minutes: Assumed duration of the incident
499
 
500
  Returns:
501
- Dictionary containing impact estimates
502
  """
503
- base_revenue_per_minute = 100
504
 
505
  impact_multiplier = 1.0
506
 
 
507
  if event.latency_p99 > config.LATENCY_CRITICAL:
508
  impact_multiplier += 0.5
509
  if event.error_rate > 0.1:
@@ -513,10 +582,11 @@ class BusinessImpactCalculator:
513
 
514
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
515
 
516
- base_users_affected = 1000
517
  user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
518
  affected_users = int(base_users_affected * user_impact_multiplier)
519
 
 
520
  if revenue_loss > 500 or affected_users > 5000:
521
  severity = "CRITICAL"
522
  elif revenue_loss > 100 or affected_users > 1000:
@@ -526,7 +596,7 @@ class BusinessImpactCalculator:
526
  else:
527
  severity = "LOW"
528
 
529
- logger.info(f"Business impact calculated: ${revenue_loss:.2f} revenue loss, {affected_users} users affected, {severity} severity")
530
 
531
  return {
532
  'revenue_loss_estimate': round(revenue_loss, 2),
@@ -538,7 +608,10 @@ class BusinessImpactCalculator:
538
  business_calculator = BusinessImpactCalculator()
539
 
540
  class AdvancedAnomalyDetector:
541
- """Enhanced anomaly detection with adaptive thresholds"""
 
 
 
542
 
543
  def __init__(self):
544
  self.historical_data = deque(maxlen=100)
@@ -551,7 +624,7 @@ class AdvancedAnomalyDetector:
551
 
552
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
553
  """
554
- Detect if event is anomalous
555
 
556
  Args:
557
  event: The reliability event to check
@@ -590,41 +663,597 @@ class AdvancedAnomalyDetector:
590
 
591
  anomaly_detector = AdvancedAnomalyDetector()
592
 
593
- # === Predictive Agent Integration ===
594
- class PredictiveAgent:
595
- """Predictive agent that uses SimplePredictiveEngine"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  def __init__(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  self.engine = predictive_engine
599
  logger.info("Initialized PredictiveAgent")
600
 
601
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
602
- """Predictive analysis for future risks"""
603
- event_data = {
604
- 'latency_p99': event.latency_p99,
605
- 'error_rate': event.error_rate,
606
- 'throughput': event.throughput,
607
- 'cpu_util': event.cpu_util,
608
- 'memory_util': event.memory_util
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
  }
610
- self.engine.add_telemetry(event.component, event_data)
 
 
 
 
611
 
612
- insights = self.engine.get_predictive_insights(event.component)
 
 
 
 
 
 
 
 
 
613
 
614
- return {
615
- 'specialization': 'predictive_analytics',
616
- 'confidence': 0.8 if insights['critical_risk_count'] > 0 else 0.5,
617
- 'findings': insights,
618
- 'recommendations': insights['recommendations']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
- # Initialize orchestration with predictive agent
622
  orchestration_manager = OrchestrationManager()
623
- orchestration_manager.agents['predictive_analytics'] = PredictiveAgent()
624
 
625
  # === Enhanced Reliability Engine ===
626
  class EnhancedReliabilityEngine:
627
- """Main engine for processing reliability events"""
 
 
 
628
 
629
  def __init__(self):
630
  self.performance_metrics = {
@@ -645,7 +1274,7 @@ class EnhancedReliabilityEngine:
645
  memory_util: Optional[float] = None
646
  ) -> Dict[str, Any]:
647
  """
648
- Process a reliability event through the multi-agent system
649
 
650
  Args:
651
  component: Service component name
@@ -656,7 +1285,7 @@ class EnhancedReliabilityEngine:
656
  memory_util: Memory utilization (0-1)
657
 
658
  Returns:
659
- Dictionary containing analysis results
660
  """
661
  logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
662
 
@@ -684,6 +1313,7 @@ class EnhancedReliabilityEngine:
684
  else:
685
  agent_confidence = 0.8 if is_anomaly else 0.1
686
 
 
687
  if agent_confidence > 0.8:
688
  event.severity = EventSeverity.CRITICAL
689
  elif agent_confidence > 0.6:
@@ -699,7 +1329,7 @@ class EnhancedReliabilityEngine:
699
  # Calculate business impact
700
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
701
 
702
- # Store in vector database
703
  if thread_safe_index is not None and model is not None and is_anomaly:
704
  try:
705
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
@@ -709,7 +1339,7 @@ class EnhancedReliabilityEngine:
709
  except Exception as e:
710
  logger.error(f"Error storing vector: {e}", exc_info=True)
711
 
712
- # Build result
713
  result = {
714
  "timestamp": event.timestamp,
715
  "component": component,
@@ -728,10 +1358,10 @@ class EnhancedReliabilityEngine:
728
  }
729
  }
730
 
731
- # Store event
732
  events_history_store.add(event)
733
 
734
- # Update metrics
735
  with self._lock:
736
  self.performance_metrics['total_incidents_processed'] += 1
737
  self.performance_metrics['multi_agent_analyses'] += 1
@@ -754,10 +1384,17 @@ def validate_inputs(
754
  memory_util: Optional[float]
755
  ) -> Tuple[bool, str]:
756
  """
757
- Validate user inputs
758
 
 
 
 
 
 
 
 
759
  Returns:
760
- Tuple of (is_valid, error_message)
761
  """
762
  if not (0 <= latency <= 10000):
763
  return False, "❌ Invalid latency: must be between 0-10000ms"
@@ -772,9 +1409,13 @@ def validate_inputs(
772
 
773
  return True, ""
774
 
775
- # === Enhanced UI with Multi-Agent Insights ===
776
  def create_enhanced_ui():
777
- """Create the Gradio UI for the reliability framework"""
 
 
 
 
778
 
779
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
780
  gr.Markdown("""
@@ -882,9 +1523,19 @@ def create_enhanced_ui():
882
 
883
  gr.Markdown("\n\n".join(policy_info))
884
 
885
- # ✅ FIXED: Synchronous wrapper for async function
886
  def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
887
- """Synchronous wrapper for async event processing - FIXES GRADIO ASYNC ISSUE"""
 
 
 
 
 
 
 
 
 
 
888
  try:
889
  # Type conversion
890
  latency = float(latency)
@@ -893,13 +1544,13 @@ def create_enhanced_ui():
893
  cpu_util = float(cpu_util) if cpu_util else None
894
  memory_util = float(memory_util) if memory_util else None
895
 
896
- # Input validation
897
  is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
898
  if not is_valid:
899
  logger.warning(f"Invalid input: {error_msg}")
900
  return error_msg, {}, {}, gr.Dataframe(value=[])
901
 
902
- # Create new event loop for async execution
903
  loop = asyncio.new_event_loop()
904
  asyncio.set_event_loop(loop)
905
 
@@ -913,7 +1564,7 @@ def create_enhanced_ui():
913
  finally:
914
  loop.close()
915
 
916
- # Build table data
917
  table_data = []
918
  for event in events_history_store.get_recent(15):
919
  table_data.append([
@@ -945,7 +1596,11 @@ def create_enhanced_ui():
945
 
946
  if result["business_impact"]:
947
  impact = result["business_impact"]
948
- output_msg += f"\n💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | 👥 {impact['affected_users_estimate']} users | 🚨 {impact['severity_level']}"
 
 
 
 
949
 
950
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
951
  actions = ", ".join(result["healing_actions"])
@@ -974,32 +1629,39 @@ def create_enhanced_ui():
974
  logger.error(error_msg, exc_info=True)
975
  return error_msg, {}, {}, gr.Dataframe(value=[])
976
 
977
- # ✅ FIXED: Use sync wrapper instead of async function
978
  submit_btn.click(
979
- fn=submit_event_enhanced_sync,
980
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
981
  outputs=[output_text, agent_insights, predictive_insights, events_table]
982
  )
983
 
984
  return demo
985
 
 
986
  if __name__ == "__main__":
987
- logger.info("Starting Enterprise Agentic Reliability Framework...")
 
 
988
  logger.info(f"Total events in history: {events_history_store.count()}")
989
  logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
 
 
990
 
991
  demo = create_enhanced_ui()
992
 
993
- logger.info("Launching Gradio UI...")
994
  demo.launch(
995
  server_name="0.0.0.0",
996
  server_port=7860,
997
  share=False
998
  )
999
 
1000
- # Save any pending vectors on shutdown
1001
  if thread_safe_index:
1002
- logger.info("Saving pending vectors...")
1003
  thread_safe_index.force_save()
1004
 
1005
- logger.info("Application shutdown complete")
 
 
 
2
  Enterprise Agentic Reliability Framework - Main Application
3
  Multi-Agent AI System for Production Reliability Monitoring
4
 
5
+ This module provides the complete reliability monitoring system including:
6
+ - Multi-agent anomaly detection and root cause analysis
7
+ - Predictive analytics and forecasting
8
+ - Policy-based auto-healing
9
+ - Business impact quantification
10
+ - Vector-based incident memory
11
+ - Adaptive thresholds
12
+ - Thread-safe concurrent operations
13
  """
14
 
15
  import os
 
26
  from dataclasses import dataclass, asdict
27
  import hashlib
28
  import asyncio
29
+ from enum import Enum
30
 
31
  # Import our modules
32
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
33
  from healing_policies import PolicyEngine
 
34
 
35
  # === Logging Configuration ===
36
  logging.basicConfig(
 
53
  # Thresholds
54
  LATENCY_WARNING: float = 150.0
55
  LATENCY_CRITICAL: float = 300.0
56
+ LATENCY_EXTREME: float = 500.0
57
  ERROR_RATE_WARNING: float = 0.05
58
+ ERROR_RATE_HIGH: float = 0.15
59
+ ERROR_RATE_CRITICAL: float = 0.3
60
  CPU_WARNING: float = 0.8
61
  CPU_CRITICAL: float = 0.9
62
  MEMORY_WARNING: float = 0.8
 
67
  MAX_EVENTS_STORED: int = 1000
68
  AGENT_TIMEOUT: int = 10
69
  CACHE_EXPIRY_MINUTES: int = 15
70
+
71
+ # Business metrics
72
+ BASE_REVENUE_PER_MINUTE: float = 100.0
73
+ BASE_USERS: int = 1000
74
 
75
  config = Config()
76
 
 
226
  risk_level: str = "low" # low, medium, high, critical
227
 
228
  class SimplePredictiveEngine:
229
+ """
230
+ Lightweight forecasting engine optimized for Hugging Face Spaces.
231
+ Uses statistical methods for time-series prediction.
232
+ """
233
 
234
  def __init__(self, history_window: int = config.HISTORY_WINDOW):
235
  self.history_window = history_window
 
240
  logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
241
 
242
  def add_telemetry(self, service: str, event_data: Dict) -> None:
243
+ """
244
+ Add telemetry data to service history
245
+
246
+ Args:
247
+ service: Service name
248
+ event_data: Dictionary containing metrics (latency_p99, error_rate, etc.)
249
+ """
250
  with self._lock:
251
  if service not in self.service_history:
252
  self.service_history[service] = deque(maxlen=self.history_window)
 
277
  logger.debug(f"Cleaned {len(expired)} expired cache entries")
278
 
279
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
280
+ """
281
+ Forecast service health metrics
282
+
283
+ Args:
284
+ service: Service name to forecast
285
+ lookahead_minutes: Time horizon in minutes
286
+
287
+ Returns:
288
+ List of forecast results for different metrics
289
+ """
290
  with self._lock:
291
  if service not in self.service_history or len(self.service_history[service]) < 10:
292
  return []
 
318
  return forecasts
319
 
320
  def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
321
+ """
322
+ Forecast latency using linear regression and trend analysis
323
+
324
+ Args:
325
+ history: Historical telemetry data
326
+ lookahead_minutes: Forecast horizon
327
+
328
+ Returns:
329
+ ForecastResult or None if insufficient data
330
+ """
331
  try:
332
  latencies = [point['latency'] for point in history[-20:]]
333
 
 
346
  residuals = latencies - (slope * x + intercept)
347
  confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
348
 
349
+ # Determine trend and risk
350
  if slope > 5:
351
  trend = "increasing"
352
+ risk = "critical" if predicted_latency > config.LATENCY_EXTREME else "high"
353
  elif slope < -2:
354
  trend = "decreasing"
355
  risk = "low"
356
  else:
357
  trend = "stable"
358
+ risk = "low" if predicted_latency < config.LATENCY_WARNING else "medium"
359
 
360
  # Calculate time to reach critical threshold (500ms)
361
  time_to_critical = None
362
+ if slope > 0 and predicted_latency < config.LATENCY_EXTREME:
363
  denominator = predicted_latency - latencies[-1]
364
  if abs(denominator) > 0.1: # Avoid division by very small numbers
365
+ minutes_to_critical = lookahead_minutes * (config.LATENCY_EXTREME - predicted_latency) / denominator
366
+ if minutes_to_critical > 0:
367
+ time_to_critical = datetime.timedelta(minutes=minutes_to_critical)
368
 
369
  return ForecastResult(
370
  metric="latency",
 
380
  return None
381
 
382
  def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
383
+ """
384
+ Forecast error rate using exponential smoothing
385
+
386
+ Args:
387
+ history: Historical telemetry data
388
+ lookahead_minutes: Forecast horizon
389
+
390
+ Returns:
391
+ ForecastResult or None if insufficient data
392
+ """
393
  try:
394
  error_rates = [point['error_rate'] for point in history[-15:]]
395
 
 
409
 
410
  if recent_trend > 0.02:
411
  trend = "increasing"
412
+ risk = "critical" if predicted_rate > config.ERROR_RATE_CRITICAL else "high"
413
  elif recent_trend < -0.01:
414
  trend = "decreasing"
415
  risk = "low"
416
  else:
417
  trend = "stable"
418
+ risk = "low" if predicted_rate < config.ERROR_RATE_WARNING else "medium"
419
 
420
  # Confidence based on volatility
421
  confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
 
433
  return None
434
 
435
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
436
+ """
437
+ Forecast CPU and memory utilization
438
+
439
+ Args:
440
+ history: Historical telemetry data
441
+ lookahead_minutes: Forecast horizon
442
+
443
+ Returns:
444
+ List of forecast results for CPU and memory
445
+ """
446
  forecasts = []
447
 
448
  # CPU forecast
 
498
  return forecasts
499
 
500
  def get_predictive_insights(self, service: str) -> Dict[str, Any]:
501
+ """
502
+ Generate actionable insights from forecasts
503
+
504
+ Args:
505
+ service: Service name
506
+
507
+ Returns:
508
+ Dictionary containing warnings, recommendations, and forecast data
509
+ """
510
  forecasts = self.forecast_service_health(service)
511
 
512
  critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
 
548
  predictive_engine = SimplePredictiveEngine()
549
 
550
  class BusinessImpactCalculator:
551
+ """
552
+ Calculate business impact of anomalies including revenue loss
553
+ and user impact estimation
554
+ """
555
 
556
  def __init__(self, revenue_per_request: float = 0.01):
557
  self.revenue_per_request = revenue_per_request
 
566
  duration_minutes: Assumed duration of the incident
567
 
568
  Returns:
569
+ Dictionary containing revenue loss, user impact, and severity
570
  """
571
+ base_revenue_per_minute = config.BASE_REVENUE_PER_MINUTE
572
 
573
  impact_multiplier = 1.0
574
 
575
+ # Impact factors
576
  if event.latency_p99 > config.LATENCY_CRITICAL:
577
  impact_multiplier += 0.5
578
  if event.error_rate > 0.1:
 
582
 
583
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
584
 
585
+ base_users_affected = config.BASE_USERS
586
  user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
587
  affected_users = int(base_users_affected * user_impact_multiplier)
588
 
589
+ # Severity classification
590
  if revenue_loss > 500 or affected_users > 5000:
591
  severity = "CRITICAL"
592
  elif revenue_loss > 100 or affected_users > 1000:
 
596
  else:
597
  severity = "LOW"
598
 
599
+ logger.info(f"Business impact: ${revenue_loss:.2f} revenue loss, {affected_users} users, {severity} severity")
600
 
601
  return {
602
  'revenue_loss_estimate': round(revenue_loss, 2),
 
608
  business_calculator = BusinessImpactCalculator()
609
 
610
  class AdvancedAnomalyDetector:
611
+ """
612
+ Enhanced anomaly detection with adaptive thresholds that learn
613
+ from historical data patterns
614
+ """
615
 
616
  def __init__(self):
617
  self.historical_data = deque(maxlen=100)
 
624
 
625
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
626
  """
627
+ Detect if event is anomalous using adaptive thresholds
628
 
629
  Args:
630
  event: The reliability event to check
 
663
 
664
  anomaly_detector = AdvancedAnomalyDetector()
665
 
666
+ # === Multi-Agent System ===
667
+ class AgentSpecialization(Enum):
668
+ """Agent specialization types"""
669
+ DETECTIVE = "anomaly_detection"
670
+ DIAGNOSTICIAN = "root_cause_analysis"
671
+ PREDICTIVE = "predictive_analytics"
672
+
673
+ class BaseAgent:
674
+ """Base class for all specialized agents"""
675
+
676
+ def __init__(self, specialization: AgentSpecialization):
677
+ self.specialization = specialization
678
+ self.performance_metrics = {
679
+ 'processed_events': 0,
680
+ 'successful_analyses': 0,
681
+ 'average_confidence': 0.0
682
+ }
683
+
684
+ async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
685
+ """Base analysis method to be implemented by specialized agents"""
686
+ raise NotImplementedError
687
+
688
+ class AnomalyDetectionAgent(BaseAgent):
689
+ """
690
+ Specialized agent for anomaly detection and pattern recognition.
691
+ Calculates multi-dimensional anomaly scores and identifies affected metrics.
692
+ """
693
 
694
  def __init__(self):
695
+ super().__init__(AgentSpecialization.DETECTIVE)
696
+ logger.info("Initialized AnomalyDetectionAgent")
697
+
698
+ async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
699
+ """
700
+ Perform comprehensive anomaly analysis
701
+
702
+ Args:
703
+ event: Reliability event to analyze
704
+
705
+ Returns:
706
+ Dictionary containing anomaly score, severity, affected metrics, and recommendations
707
+ """
708
+ try:
709
+ anomaly_score = self._calculate_anomaly_score(event)
710
+
711
+ return {
712
+ 'specialization': self.specialization.value,
713
+ 'confidence': anomaly_score,
714
+ 'findings': {
715
+ 'anomaly_score': anomaly_score,
716
+ 'severity_tier': self._classify_severity(anomaly_score),
717
+ 'primary_metrics_affected': self._identify_affected_metrics(event)
718
+ },
719
+ 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
720
+ }
721
+ except Exception as e:
722
+ logger.error(f"AnomalyDetectionAgent error: {e}", exc_info=True)
723
+ return {
724
+ 'specialization': self.specialization.value,
725
+ 'confidence': 0.0,
726
+ 'findings': {},
727
+ 'recommendations': [f"Analysis error: {str(e)}"]
728
+ }
729
+
730
+ def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
731
+ """
732
+ Calculate comprehensive anomaly score (0-1) using weighted metrics
733
+
734
+ Args:
735
+ event: Reliability event
736
+
737
+ Returns:
738
+ Float between 0 and 1 representing anomaly severity
739
+ """
740
+ scores = []
741
+
742
+ # Latency anomaly (weighted 40%)
743
+ if event.latency_p99 > config.LATENCY_WARNING:
744
+ latency_score = min(1.0, (event.latency_p99 - config.LATENCY_WARNING) / 500)
745
+ scores.append(0.4 * latency_score)
746
+
747
+ # Error rate anomaly (weighted 30%)
748
+ if event.error_rate > config.ERROR_RATE_WARNING:
749
+ error_score = min(1.0, event.error_rate / 0.3)
750
+ scores.append(0.3 * error_score)
751
+
752
+ # Resource anomaly (weighted 30%)
753
+ resource_score = 0
754
+ if event.cpu_util and event.cpu_util > config.CPU_WARNING:
755
+ resource_score += 0.15 * min(1.0, (event.cpu_util - config.CPU_WARNING) / 0.2)
756
+ if event.memory_util and event.memory_util > config.MEMORY_WARNING:
757
+ resource_score += 0.15 * min(1.0, (event.memory_util - config.MEMORY_WARNING) / 0.2)
758
+ scores.append(resource_score)
759
+
760
+ return min(1.0, sum(scores))
761
+
762
+ def _classify_severity(self, anomaly_score: float) -> str:
763
+ """
764
+ Classify severity tier based on anomaly score
765
+
766
+ Args:
767
+ anomaly_score: Score between 0 and 1
768
+
769
+ Returns:
770
+ Severity tier string (LOW, MEDIUM, HIGH, CRITICAL)
771
+ """
772
+ if anomaly_score > 0.8:
773
+ return "CRITICAL"
774
+ elif anomaly_score > 0.6:
775
+ return "HIGH"
776
+ elif anomaly_score > 0.4:
777
+ return "MEDIUM"
778
+ else:
779
+ return "LOW"
780
+
781
+ def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
782
+ """
783
+ Identify which metrics are outside normal ranges
784
+
785
+ Args:
786
+ event: Reliability event
787
+
788
+ Returns:
789
+ List of dictionaries describing affected metrics with severity
790
+ """
791
+ affected = []
792
+
793
+ # Latency checks
794
+ if event.latency_p99 > config.LATENCY_EXTREME:
795
+ affected.append({
796
+ "metric": "latency",
797
+ "value": event.latency_p99,
798
+ "severity": "CRITICAL",
799
+ "threshold": config.LATENCY_WARNING
800
+ })
801
+ elif event.latency_p99 > config.LATENCY_CRITICAL:
802
+ affected.append({
803
+ "metric": "latency",
804
+ "value": event.latency_p99,
805
+ "severity": "HIGH",
806
+ "threshold": config.LATENCY_WARNING
807
+ })
808
+ elif event.latency_p99 > config.LATENCY_WARNING:
809
+ affected.append({
810
+ "metric": "latency",
811
+ "value": event.latency_p99,
812
+ "severity": "MEDIUM",
813
+ "threshold": config.LATENCY_WARNING
814
+ })
815
+
816
+ # Error rate checks
817
+ if event.error_rate > config.ERROR_RATE_CRITICAL:
818
+ affected.append({
819
+ "metric": "error_rate",
820
+ "value": event.error_rate,
821
+ "severity": "CRITICAL",
822
+ "threshold": config.ERROR_RATE_WARNING
823
+ })
824
+ elif event.error_rate > config.ERROR_RATE_HIGH:
825
+ affected.append({
826
+ "metric": "error_rate",
827
+ "value": event.error_rate,
828
+ "severity": "HIGH",
829
+ "threshold": config.ERROR_RATE_WARNING
830
+ })
831
+ elif event.error_rate > config.ERROR_RATE_WARNING:
832
+ affected.append({
833
+ "metric": "error_rate",
834
+ "value": event.error_rate,
835
+ "severity": "MEDIUM",
836
+ "threshold": config.ERROR_RATE_WARNING
837
+ })
838
+
839
+ # CPU checks
840
+ if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
841
+ affected.append({
842
+ "metric": "cpu",
843
+ "value": event.cpu_util,
844
+ "severity": "CRITICAL",
845
+ "threshold": config.CPU_WARNING
846
+ })
847
+ elif event.cpu_util and event.cpu_util > config.CPU_WARNING:
848
+ affected.append({
849
+ "metric": "cpu",
850
+ "value": event.cpu_util,
851
+ "severity": "HIGH",
852
+ "threshold": config.CPU_WARNING
853
+ })
854
+
855
+ # Memory checks
856
+ if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
857
+ affected.append({
858
+ "metric": "memory",
859
+ "value": event.memory_util,
860
+ "severity": "CRITICAL",
861
+ "threshold": config.MEMORY_WARNING
862
+ })
863
+ elif event.memory_util and event.memory_util > config.MEMORY_WARNING:
864
+ affected.append({
865
+ "metric": "memory",
866
+ "value": event.memory_util,
867
+ "severity": "HIGH",
868
+ "threshold": config.MEMORY_WARNING
869
+ })
870
+
871
+ return affected
872
+
873
+ def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
874
+ """
875
+ Generate actionable recommendations based on detected anomalies
876
+
877
+ Args:
878
+ event: Reliability event
879
+ anomaly_score: Calculated anomaly score
880
+
881
+ Returns:
882
+ List of recommendation strings with emojis for visibility
883
+ """
884
+ recommendations = []
885
+ affected_metrics = self._identify_affected_metrics(event)
886
+
887
+ for metric in affected_metrics:
888
+ metric_name = metric["metric"]
889
+ severity = metric["severity"]
890
+ value = metric["value"]
891
+ threshold = metric["threshold"]
892
+
893
+ if metric_name == "latency":
894
+ if severity == "CRITICAL":
895
+ recommendations.append(
896
+ f"🚨 CRITICAL: Latency {value:.0f}ms (>{threshold}ms) - "
897
+ f"Check database & external dependencies"
898
+ )
899
+ elif severity == "HIGH":
900
+ recommendations.append(
901
+ f"⚠️ HIGH: Latency {value:.0f}ms (>{threshold}ms) - "
902
+ f"Investigate service performance"
903
+ )
904
+ else:
905
+ recommendations.append(
906
+ f"📈 Latency elevated: {value:.0f}ms (>{threshold}ms) - Monitor trend"
907
+ )
908
+
909
+ elif metric_name == "error_rate":
910
+ if severity == "CRITICAL":
911
+ recommendations.append(
912
+ f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
913
+ f"Check recent deployments"
914
+ )
915
+ elif severity == "HIGH":
916
+ recommendations.append(
917
+ f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
918
+ f"Review application logs"
919
+ )
920
+ else:
921
+ recommendations.append(
922
+ f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100:.1f}%)"
923
+ )
924
+
925
+ elif metric_name == "cpu":
926
+ recommendations.append(
927
+ f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling"
928
+ )
929
+
930
+ elif metric_name == "memory":
931
+ recommendations.append(
932
+ f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks"
933
+ )
934
+
935
+ # Overall severity recommendations
936
+ if anomaly_score > 0.8:
937
+ recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
938
+ elif anomaly_score > 0.6:
939
+ recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
940
+ elif anomaly_score > 0.4:
941
+ recommendations.append("📊 MONITOR: Early warning signs detected")
942
+
943
+ return recommendations[:4] # Return top 4 recommendations
944
+
945
class RootCauseAgent(BaseAgent):
    """
    Specialized agent for root cause analysis.

    Matches the event's metric profile against known failure patterns and
    returns ranked hypotheses plus guidance on where to investigate first.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)
        logger.info("Initialized RootCauseAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Perform root cause analysis for a single reliability event.

        Args:
            event: Reliability event to analyze

        Returns:
            Dictionary with likely root causes, evidence patterns,
            investigation priority, and top recommendations.
        """
        try:
            hypotheses = self._analyze_potential_causes(event)
            top_two = hypotheses[:2]
            return {
                'specialization': self.specialization.value,
                'confidence': 0.7,
                'findings': {
                    'likely_root_causes': hypotheses,
                    'evidence_patterns': self._identify_evidence(event),
                    'investigation_priority': self._prioritize_investigation(hypotheses)
                },
                'recommendations': [
                    f"Check {cause['cause']} for issues" for cause in top_two
                ]
            }
        except Exception as e:
            logger.error(f"RootCauseAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"]
            }

    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
        """
        Match the event against known failure-mode patterns.

        Args:
            event: Reliability event

        Returns:
            List of candidate causes, each with a confidence score,
            supporting evidence, and an investigation hint.
        """
        def hypothesis(cause: str, confidence: float, evidence: str, investigation: str) -> Dict[str, Any]:
            # Uniform record shape shared by every pattern below.
            return {
                "cause": cause,
                "confidence": confidence,
                "evidence": evidence,
                "investigation": investigation
            }

        candidates: List[Dict[str, Any]] = []

        # Pattern 1: extreme latency combined with heavy errors points at a dependency.
        if event.latency_p99 > config.LATENCY_EXTREME and event.error_rate > 0.2:
            candidates.append(hypothesis(
                "Database/External Dependency Failure",
                0.85,
                f"Extreme latency ({event.latency_p99:.0f}ms) with high errors ({event.error_rate*100:.1f}%)",
                "Check database connection pool, external API health"
            ))

        # Pattern 2: CPU and memory both past critical thresholds.
        cpu_critical = event.cpu_util and event.cpu_util > config.CPU_CRITICAL
        mem_critical = event.memory_util and event.memory_util > config.MEMORY_CRITICAL
        if cpu_critical and mem_critical:
            candidates.append(hypothesis(
                "Resource Exhaustion",
                0.90,
                f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
                "Check for memory leaks, infinite loops, insufficient resources"
            ))

        # Pattern 3: errors spiking while latency stays low suggests an app-level fault.
        if event.error_rate > config.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
            candidates.append(hypothesis(
                "Application Bug / Configuration Issue",
                0.75,
                f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
                "Review recent deployments, configuration changes, application logs"
            ))

        # Pattern 4: both metrics moderately elevated — slow decay rather than outage.
        moderate_latency = 200 <= event.latency_p99 <= 400
        moderate_errors = config.ERROR_RATE_WARNING <= event.error_rate <= config.ERROR_RATE_HIGH
        if moderate_latency and moderate_errors:
            candidates.append(hypothesis(
                "Gradual Performance Degradation",
                0.65,
                f"Moderate latency ({event.latency_p99:.0f}ms) and errors ({event.error_rate*100:.1f}%)",
                "Check resource trends, dependency performance, capacity planning"
            ))

        # Fallback when nothing above matched.
        if not candidates:
            candidates.append(hypothesis(
                "Unknown - Requires Investigation",
                0.3,
                "Pattern does not match known failure modes",
                "Complete system review needed"
            ))

        return candidates

    def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
        """
        Collect evidence-pattern tags present in the event data.

        Args:
            event: Reliability event

        Returns:
            List of evidence pattern identifiers (may be empty)
        """
        signals = [
            (event.latency_p99 > event.error_rate * 1000,
             "latency_disproportionate_to_errors"),
            (event.cpu_util and event.cpu_util > config.CPU_WARNING and
             event.memory_util and event.memory_util > config.MEMORY_WARNING,
             "correlated_resource_exhaustion"),
            (event.error_rate > config.ERROR_RATE_HIGH and event.latency_p99 < config.LATENCY_CRITICAL,
             "errors_without_latency_impact"),
        ]
        return [tag for present, tag in signals if present]

    def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
        """
        Derive an investigation priority from the identified causes.

        Args:
            causes: List of potential root causes

        Returns:
            "HIGH" when a dependency or resource-exhaustion cause is present,
            otherwise "MEDIUM".
        """
        urgent_markers = ("Database", "Resource Exhaustion")
        if any(marker in cause["cause"] for cause in causes for marker in urgent_markers):
            return "HIGH"
        return "MEDIUM"
1088
+
1089
class PredictiveAgent(BaseAgent):
    """
    Specialized agent for predictive analytics.

    Feeds each event's telemetry into the shared forecasting engine and
    surfaces forward-looking risk insights for the event's component.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.PREDICTIVE)
        # Shared module-level engine; accumulates per-service history across events.
        self.engine = predictive_engine
        logger.info("Initialized PredictiveAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Record the event's telemetry and produce predictive insights.

        Args:
            event: Current reliability event

        Returns:
            Dictionary containing forecasts and predictive insights
        """
        try:
            telemetry = {
                'latency_p99': event.latency_p99,
                'error_rate': event.error_rate,
                'throughput': event.throughput,
                'cpu_util': event.cpu_util,
                'memory_util': event.memory_util
            }
            self.engine.add_telemetry(event.component, telemetry)

            insights = self.engine.get_predictive_insights(event.component)
            # Report higher confidence when the engine flags at least one critical risk.
            has_critical_risk = insights['critical_risk_count'] > 0

            return {
                'specialization': self.specialization.value,
                'confidence': 0.8 if has_critical_risk else 0.5,
                'findings': insights,
                'recommendations': insights['recommendations']
            }
        except Exception as e:
            logger.error(f"PredictiveAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"]
            }
+
1137
class OrchestrationManager:
    """
    Orchestrates multiple specialized agents for comprehensive analysis.
    Runs agents concurrently (as asyncio Tasks) and synthesizes their findings.
    """

    def __init__(self):
        self.agents = {
            AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
            AgentSpecialization.PREDICTIVE: PredictiveAgent(),
        }
        logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")

    async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Coordinate multiple agents for comprehensive analysis.

        Args:
            event: Reliability event to analyze

        Returns:
            Synthesized findings from all agents
        """
        # FIX: wrap each coroutine in a Task so the agents actually run
        # concurrently. Bare coroutines are not scheduled until awaited, so the
        # previous code executed agents strictly one after another (worst case
        # 5s timeout *per* agent). ensure_future schedules them all immediately.
        agent_tasks = {
            spec: asyncio.ensure_future(agent.analyze(event))
            for spec, agent in self.agents.items()
        }

        # Collect results with per-agent timeout protection; wait_for cancels a
        # task that exceeds its timeout, so no orphaned tasks are left running.
        agent_results = {}
        for specialization, task in agent_tasks.items():
            try:
                result = await asyncio.wait_for(task, timeout=5.0)
                agent_results[specialization.value] = result
                logger.debug(f"Agent {specialization.value} completed successfully")
            except asyncio.TimeoutError:
                logger.warning(f"Agent {specialization.value} timed out")
                continue
            except Exception as e:
                logger.error(f"Agent {specialization.value} error: {e}", exc_info=True)
                continue

        return self._synthesize_agent_findings(event, agent_results)

    def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
        """
        Combine insights from all specialized agents.

        Args:
            event: Original reliability event
            agent_results: Per-specialization results (keyed by specialization value)

        Returns:
            Synthesized analysis combining all agent findings, or an error dict
            when the detective agent produced no result.
        """
        detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
        diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
        predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)

        # The detective agent anchors the synthesis; without it there is
        # nothing meaningful to report.
        if not detective_result:
            logger.warning("No detective agent results available")
            return {'error': 'No agent results available'}

        synthesis = {
            'incident_summary': {
                'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                'anomaly_confidence': detective_result['confidence'],
                'primary_metrics_affected': [
                    metric["metric"] for metric in
                    detective_result['findings'].get('primary_metrics_affected', [])
                ]
            },
            'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
            'predictive_insights': predictive_result['findings'] if predictive_result else {},
            'recommended_actions': self._prioritize_actions(
                detective_result.get('recommendations', []),
                diagnostician_result.get('recommendations', []) if diagnostician_result else [],
                predictive_result.get('recommendations', []) if predictive_result else []
            ),
            'agent_metadata': {
                'participating_agents': list(agent_results.keys()),
                'analysis_timestamp': datetime.datetime.now().isoformat()
            }
        }

        return synthesis

    def _prioritize_actions(self, detection_actions: List[str],
                            diagnosis_actions: List[str],
                            predictive_actions: List[str]) -> List[str]:
        """
        Combine and prioritize actions from multiple agents.

        Detection actions come first, then diagnosis, then predictive, with
        duplicates removed while preserving first-seen order.

        Args:
            detection_actions: Actions from detective agent
            diagnosis_actions: Actions from diagnostician agent
            predictive_actions: Actions from predictive agent

        Returns:
            Prioritized list of up to 5 unique actions
        """
        # dict.fromkeys preserves insertion order and drops duplicates in one pass.
        merged = dict.fromkeys(detection_actions + diagnosis_actions + predictive_actions)
        return list(merged)[:5]
1247
 
1248
# Module-level singleton orchestrator shared by the reliability engine and the UI layer.
orchestration_manager = OrchestrationManager()
 
1250
 
1251
  # === Enhanced Reliability Engine ===
1252
  class EnhancedReliabilityEngine:
1253
+ """
1254
+ Main engine for processing reliability events through the multi-agent system.
1255
+ Coordinates anomaly detection, agent analysis, policy evaluation, and impact calculation.
1256
+ """
1257
 
1258
  def __init__(self):
1259
  self.performance_metrics = {
 
1274
  memory_util: Optional[float] = None
1275
  ) -> Dict[str, Any]:
1276
  """
1277
+ Process a reliability event through the complete analysis pipeline
1278
 
1279
  Args:
1280
  component: Service component name
 
1285
  memory_util: Memory utilization (0-1)
1286
 
1287
  Returns:
1288
+ Comprehensive analysis results including agent findings, healing actions, and business impact
1289
  """
1290
  logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
1291
 
 
1313
  else:
1314
  agent_confidence = 0.8 if is_anomaly else 0.1
1315
 
1316
+ # Set event severity
1317
  if agent_confidence > 0.8:
1318
  event.severity = EventSeverity.CRITICAL
1319
  elif agent_confidence > 0.6:
 
1329
  # Calculate business impact
1330
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
1331
 
1332
+ # Store in vector database for similarity detection
1333
  if thread_safe_index is not None and model is not None and is_anomaly:
1334
  try:
1335
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
 
1339
  except Exception as e:
1340
  logger.error(f"Error storing vector: {e}", exc_info=True)
1341
 
1342
+ # Build comprehensive result
1343
  result = {
1344
  "timestamp": event.timestamp,
1345
  "component": component,
 
1358
  }
1359
  }
1360
 
1361
+ # Store event in history
1362
  events_history_store.add(event)
1363
 
1364
+ # Update performance metrics
1365
  with self._lock:
1366
  self.performance_metrics['total_incidents_processed'] += 1
1367
  self.performance_metrics['multi_agent_analyses'] += 1
 
1384
  memory_util: Optional[float]
1385
  ) -> Tuple[bool, str]:
1386
  """
1387
+ Validate user inputs for bounds and type correctness
1388
 
1389
+ Args:
1390
+ latency: Latency value in milliseconds
1391
+ error_rate: Error rate (0-1)
1392
+ throughput: Throughput in requests/sec
1393
+ cpu_util: CPU utilization (0-1)
1394
+ memory_util: Memory utilization (0-1)
1395
+
1396
  Returns:
1397
+ Tuple of (is_valid: bool, error_message: str)
1398
  """
1399
  if not (0 <= latency <= 10000):
1400
  return False, "❌ Invalid latency: must be between 0-10000ms"
 
1409
 
1410
  return True, ""
1411
 
1412
+ # === Gradio UI ===
1413
  def create_enhanced_ui():
1414
+ """
1415
+ Create the comprehensive Gradio UI for the reliability framework.
1416
+ Includes telemetry input, multi-agent analysis display, predictive insights,
1417
+ and event history visualization.
1418
+ """
1419
 
1420
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
1421
  gr.Markdown("""
 
1523
 
1524
  gr.Markdown("\n\n".join(policy_info))
1525
 
1526
+ # ✅ FIXED: Synchronous wrapper for async function (CRITICAL FIX)
1527
  def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
1528
+ """
1529
+ Synchronous wrapper for async event processing.
1530
+ FIXES GRADIO ASYNC/SYNC COMPATIBILITY ISSUE.
1531
+
1532
+ This wrapper:
1533
+ 1. Validates inputs
1534
+ 2. Creates new event loop for async execution
1535
+ 3. Calls the async processing function
1536
+ 4. Formats results for display
1537
+ 5. Handles all errors gracefully
1538
+ """
1539
  try:
1540
  # Type conversion
1541
  latency = float(latency)
 
1544
  cpu_util = float(cpu_util) if cpu_util else None
1545
  memory_util = float(memory_util) if memory_util else None
1546
 
1547
+ # Input validation (CRITICAL FIX)
1548
  is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
1549
  if not is_valid:
1550
  logger.warning(f"Invalid input: {error_msg}")
1551
  return error_msg, {}, {}, gr.Dataframe(value=[])
1552
 
1553
+ # Create new event loop for async execution (CRITICAL FIX)
1554
  loop = asyncio.new_event_loop()
1555
  asyncio.set_event_loop(loop)
1556
 
 
1564
  finally:
1565
  loop.close()
1566
 
1567
+ # Build table data (THREAD-SAFE FIX)
1568
  table_data = []
1569
  for event in events_history_store.get_recent(15):
1570
  table_data.append([
 
1596
 
1597
  if result["business_impact"]:
1598
  impact = result["business_impact"]
1599
+ output_msg += (
1600
+ f"\n💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | "
1601
+ f"👥 {impact['affected_users_estimate']} users | "
1602
+ f"🚨 {impact['severity_level']}"
1603
+ )
1604
 
1605
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
1606
  actions = ", ".join(result["healing_actions"])
 
1629
  logger.error(error_msg, exc_info=True)
1630
  return error_msg, {}, {}, gr.Dataframe(value=[])
1631
 
1632
+ # ✅ FIXED: Use sync wrapper instead of async function (CRITICAL FIX)
1633
  submit_btn.click(
1634
+ fn=submit_event_enhanced_sync, # Synchronous wrapper
1635
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
1636
  outputs=[output_text, agent_insights, predictive_insights, events_table]
1637
  )
1638
 
1639
  return demo
1640
 
1641
# === Main Entry Point ===
if __name__ == "__main__":
    logger.info("=" * 80)
    logger.info("Starting Enterprise Agentic Reliability Framework")
    logger.info("=" * 80)
    logger.info(f"Total events in history: {events_history_store.count()}")
    logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
    logger.info(f"Agents initialized: {len(orchestration_manager.agents)}")
    logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")

    demo = create_enhanced_ui()

    logger.info("Launching Gradio UI on 0.0.0.0:7860...")
    try:
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False
        )
    finally:
        # FIX: launch() blocks until the server stops; on Ctrl-C it raises
        # KeyboardInterrupt, which previously skipped this shutdown code
        # entirely. try/finally guarantees pending vectors are flushed.
        if thread_safe_index:
            logger.info("Saving pending vectors before shutdown...")
            thread_safe_index.force_save()

        logger.info("=" * 80)
        logger.info("Application shutdown complete")
        logger.info("=" * 80)