petter2025 commited on
Commit
d6f7e9f
·
verified ·
1 Parent(s): 905f518

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +323 -84
app.py CHANGED
@@ -8,6 +8,8 @@ import datetime
8
  from typing import List, Dict, Any
9
  import hashlib
10
  import asyncio
 
 
11
 
12
  # Import our modules
13
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
@@ -56,6 +58,257 @@ def save_index():
56
  with open(TEXTS_FILE, "w") as f:
57
  json.dump(incident_texts, f)
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  # === Core Engine Components ===
60
  policy_engine = PolicyEngine()
61
  events_history: List[ReliabilityEvent] = []
@@ -67,29 +320,23 @@ class BusinessImpactCalculator:
67
  self.revenue_per_request = revenue_per_request
68
 
69
  def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
70
- """Enhanced business impact calculation"""
71
 
72
- # More realistic impact calculation
73
- base_revenue_per_minute = 100 # Base revenue per minute for the service
74
-
75
- # Calculate impact based on severity of anomalies
76
  impact_multiplier = 1.0
77
 
78
  if event.latency_p99 > 300:
79
- impact_multiplier += 0.5 # High latency impact
80
  if event.error_rate > 0.1:
81
- impact_multiplier += 0.8 # High error rate impact
82
  if event.cpu_util and event.cpu_util > 0.9:
83
- impact_multiplier += 0.3 # Resource exhaustion impact
84
 
85
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
86
 
87
- # More realistic user impact
88
- base_users_affected = 1000 # Base user count
89
  user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
90
  affected_users = int(base_users_affected * user_impact_multiplier)
91
 
92
- # Severity classification
93
  if revenue_loss > 500 or affected_users > 5000:
94
  severity = "CRITICAL"
95
  elif revenue_loss > 100 or affected_users > 1000:
@@ -114,38 +361,30 @@ class AdvancedAnomalyDetector:
114
  def __init__(self):
115
  self.historical_data = []
116
  self.adaptive_thresholds = {
117
- 'latency_p99': 150, # Will adapt based on history
118
  'error_rate': 0.05
119
  }
120
 
121
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
122
- """Enhanced anomaly detection with adaptive thresholds"""
123
-
124
- # Basic threshold checks
125
  latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
126
  error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
127
 
128
- # Resource-based anomalies
129
  resource_anomaly = False
130
  if event.cpu_util and event.cpu_util > 0.9:
131
  resource_anomaly = True
132
  if event.memory_util and event.memory_util > 0.9:
133
  resource_anomaly = True
134
 
135
- # Update adaptive thresholds (simplified)
136
  self._update_thresholds(event)
137
 
138
  return latency_anomaly or error_anomaly or resource_anomaly
139
 
140
  def _update_thresholds(self, event: ReliabilityEvent):
141
- """Update adaptive thresholds based on historical data"""
142
  self.historical_data.append(event)
143
 
144
- # Keep only recent history
145
  if len(self.historical_data) > 100:
146
  self.historical_data.pop(0)
147
 
148
- # Update latency threshold to 90th percentile of recent data
149
  if len(self.historical_data) > 10:
150
  recent_latencies = [e.latency_p99 for e in self.historical_data[-20:]]
151
  self.adaptive_thresholds['latency_p99'] = np.percentile(recent_latencies, 90)
@@ -153,11 +392,10 @@ class AdvancedAnomalyDetector:
153
  anomaly_detector = AdvancedAnomalyDetector()
154
 
155
  # === Multi-Agent Foundation ===
156
- from enum import Enum
157
-
158
  class AgentSpecialization(Enum):
159
  DETECTIVE = "anomaly_detection"
160
  DIAGNOSTICIAN = "root_cause_analysis"
 
161
 
162
  class BaseAgent:
163
  def __init__(self, specialization: AgentSpecialization):
@@ -171,7 +409,6 @@ class AnomalyDetectionAgent(BaseAgent):
171
  super().__init__(AgentSpecialization.DETECTIVE)
172
 
173
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
174
- """Enhanced anomaly detection with confidence scoring"""
175
  anomaly_score = self._calculate_anomaly_score(event)
176
 
177
  return {
@@ -186,20 +423,16 @@ class AnomalyDetectionAgent(BaseAgent):
186
  }
187
 
188
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
189
- """Calculate comprehensive anomaly score (0-1)"""
190
  scores = []
191
 
192
- # Latency anomaly (weighted 40%)
193
  if event.latency_p99 > 150:
194
  latency_score = min(1.0, (event.latency_p99 - 150) / 500)
195
  scores.append(0.4 * latency_score)
196
 
197
- # Error rate anomaly (weighted 30%)
198
  if event.error_rate > 0.05:
199
  error_score = min(1.0, event.error_rate / 0.3)
200
  scores.append(0.3 * error_score)
201
 
202
- # Resource anomaly (weighted 30%)
203
  resource_score = 0
204
  if event.cpu_util and event.cpu_util > 0.8:
205
  resource_score += 0.15 * min(1.0, (event.cpu_util - 0.8) / 0.2)
@@ -210,7 +443,6 @@ class AnomalyDetectionAgent(BaseAgent):
210
  return min(1.0, sum(scores))
211
 
212
  def _classify_severity(self, anomaly_score: float) -> str:
213
- """Classify severity based on anomaly score"""
214
  if anomaly_score > 0.8:
215
  return "CRITICAL"
216
  elif anomaly_score > 0.6:
@@ -221,10 +453,8 @@ class AnomalyDetectionAgent(BaseAgent):
221
  return "LOW"
222
 
223
  def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
224
- """Enhanced metric analysis with severity levels"""
225
  affected = []
226
 
227
- # Latency analysis
228
  if event.latency_p99 > 500:
229
  affected.append({"metric": "latency", "value": event.latency_p99, "severity": "CRITICAL", "threshold": 150})
230
  elif event.latency_p99 > 300:
@@ -232,7 +462,6 @@ class AnomalyDetectionAgent(BaseAgent):
232
  elif event.latency_p99 > 150:
233
  affected.append({"metric": "latency", "value": event.latency_p99, "severity": "MEDIUM", "threshold": 150})
234
 
235
- # Error rate analysis
236
  if event.error_rate > 0.3:
237
  affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "CRITICAL", "threshold": 0.05})
238
  elif event.error_rate > 0.15:
@@ -240,7 +469,6 @@ class AnomalyDetectionAgent(BaseAgent):
240
  elif event.error_rate > 0.05:
241
  affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "MEDIUM", "threshold": 0.05})
242
 
243
- # Resource analysis
244
  if event.cpu_util and event.cpu_util > 0.9:
245
  affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "CRITICAL", "threshold": 0.8})
246
  elif event.cpu_util and event.cpu_util > 0.8:
@@ -254,7 +482,6 @@ class AnomalyDetectionAgent(BaseAgent):
254
  return affected
255
 
256
  def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
257
- """Generate specific, actionable recommendations"""
258
  recommendations = []
259
  affected_metrics = self._identify_affected_metrics(event)
260
 
@@ -286,7 +513,6 @@ class AnomalyDetectionAgent(BaseAgent):
286
  elif metric_name == "memory":
287
  recommendations.append(f"๐Ÿ’พ Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
288
 
289
- # Add overall recommendations based on anomaly score
290
  if anomaly_score > 0.8:
291
  recommendations.append("๐ŸŽฏ IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
292
  elif anomaly_score > 0.6:
@@ -294,19 +520,18 @@ class AnomalyDetectionAgent(BaseAgent):
294
  elif anomaly_score > 0.4:
295
  recommendations.append("๐Ÿ“Š MONITOR: Early warning signs detected")
296
 
297
- return recommendations[:4] # Return top 4 most important recommendations
298
 
299
  class RootCauseAgent(BaseAgent):
300
  def __init__(self):
301
  super().__init__(AgentSpecialization.DIAGNOSTICIAN)
302
 
303
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
304
- """Basic root cause analysis"""
305
  causes = self._analyze_potential_causes(event)
306
 
307
  return {
308
  'specialization': self.specialization.value,
309
- 'confidence': 0.7, # Base confidence
310
  'findings': {
311
  'likely_root_causes': causes,
312
  'evidence_patterns': self._identify_evidence(event),
@@ -318,10 +543,8 @@ class RootCauseAgent(BaseAgent):
318
  }
319
 
320
  def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
321
- """Enhanced root cause analysis with confidence scoring"""
322
  causes = []
323
 
324
- # High latency + high errors pattern
325
  if event.latency_p99 > 500 and event.error_rate > 0.2:
326
  causes.append({
327
  "cause": "Database/External Dependency Failure",
@@ -330,7 +553,6 @@ class RootCauseAgent(BaseAgent):
330
  "investigation": "Check database connection pool, external API health"
331
  })
332
 
333
- # Resource exhaustion pattern
334
  if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
335
  causes.append({
336
  "cause": "Resource Exhaustion",
@@ -339,7 +561,6 @@ class RootCauseAgent(BaseAgent):
339
  "investigation": "Check for memory leaks, infinite loops, insufficient resources"
340
  })
341
 
342
- # Error spike pattern
343
  if event.error_rate > 0.3 and event.latency_p99 < 200:
344
  causes.append({
345
  "cause": "Application Bug / Configuration Issue",
@@ -348,7 +569,6 @@ class RootCauseAgent(BaseAgent):
348
  "investigation": "Review recent deployments, configuration changes, application logs"
349
  })
350
 
351
- # Gradual degradation pattern
352
  if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
353
  causes.append({
354
  "cause": "Gradual Performance Degradation",
@@ -368,7 +588,6 @@ class RootCauseAgent(BaseAgent):
368
  return causes
369
 
370
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
371
- """Identify evidence patterns"""
372
  evidence = []
373
  if event.latency_p99 > event.error_rate * 1000:
374
  evidence.append("latency_disproportionate_to_errors")
@@ -377,27 +596,50 @@ class RootCauseAgent(BaseAgent):
377
  return evidence
378
 
379
  def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
380
- """Prioritize investigation based on causes"""
381
  for cause in causes:
382
  if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
383
  return "HIGH"
384
  return "MEDIUM"
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  class OrchestrationManager:
387
  def __init__(self):
388
  self.agents = {
389
  AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
390
  AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
 
391
  }
392
 
393
  async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
394
- """Coordinate multiple agents for comprehensive analysis"""
395
  agent_tasks = {
396
  spec: agent.analyze(event)
397
  for spec, agent in self.agents.items()
398
  }
399
 
400
- # Execute agents in parallel
401
  agent_results = {}
402
  for specialization, task in agent_tasks.items():
403
  try:
@@ -409,9 +651,9 @@ class OrchestrationManager:
409
  return self._synthesize_agent_findings(event, agent_results)
410
 
411
  def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
412
- """Combine insights from all specialized agents"""
413
  detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
414
  diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
 
415
 
416
  if not detective_result:
417
  return {'error': 'No agent results available'}
@@ -423,9 +665,11 @@ class OrchestrationManager:
423
  'primary_metrics_affected': [metric["metric"] for metric in detective_result['findings'].get('primary_metrics_affected', [])]
424
  },
425
  'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
 
426
  'recommended_actions': self._prioritize_actions(
427
  detective_result.get('recommendations', []),
428
- diagnostician_result.get('recommendations', []) if diagnostician_result else []
 
429
  ),
430
  'agent_metadata': {
431
  'participating_agents': list(agent_results.keys()),
@@ -435,17 +679,15 @@ class OrchestrationManager:
435
 
436
  return synthesis
437
 
438
- def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str]) -> List[str]:
439
- """Combine and prioritize actions from different agents"""
440
- all_actions = detection_actions + diagnosis_actions
441
- # Remove duplicates while preserving order
442
  seen = set()
443
  unique_actions = []
444
  for action in all_actions:
445
  if action not in seen:
446
  seen.add(action)
447
  unique_actions.append(action)
448
- return unique_actions[:4] # Return top 4 actions
449
 
450
  # Initialize enhanced components
451
  orchestration_manager = OrchestrationManager()
@@ -460,9 +702,7 @@ class EnhancedReliabilityEngine:
460
  async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
461
  throughput: float = 1000, cpu_util: float = None,
462
  memory_util: float = None) -> Dict[str, Any]:
463
- """Enhanced event processing with multi-agent orchestration"""
464
 
465
- # Create event
466
  event = ReliabilityEvent(
467
  component=component,
468
  latency_p99=latency,
@@ -473,21 +713,16 @@ class EnhancedReliabilityEngine:
473
  upstream_deps=["auth-service", "database"] if component == "api-service" else []
474
  )
475
 
476
- # Multi-agent analysis
477
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
478
 
479
- # Traditional detection (for compatibility)
480
  is_anomaly = anomaly_detector.detect_anomaly(event)
481
 
482
- # Enhanced severity classification using agent confidence
483
  agent_confidence = 0.0
484
  if agent_analysis and 'incident_summary' in agent_analysis:
485
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
486
  else:
487
- # Fallback to basic anomaly detection confidence
488
  agent_confidence = 0.8 if is_anomaly else 0.1
489
 
490
- # Set severity based on confidence
491
  if agent_confidence > 0.8:
492
  event.severity = EventSeverity.CRITICAL
493
  elif agent_confidence > 0.6:
@@ -497,13 +732,10 @@ class EnhancedReliabilityEngine:
497
  else:
498
  event.severity = EventSeverity.LOW
499
 
500
- # Policy evaluation
501
  healing_actions = policy_engine.evaluate_policies(event)
502
 
503
- # Business impact
504
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
505
 
506
- # Vector memory learning
507
  if index is not None and is_anomaly:
508
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
509
  vector_text = f"{component} {latency} {error_rate} {analysis_text}"
@@ -512,7 +744,6 @@ class EnhancedReliabilityEngine:
512
  incident_texts.append(vector_text)
513
  save_index()
514
 
515
- # Prepare comprehensive result
516
  result = {
517
  "timestamp": event.timestamp,
518
  "component": component,
@@ -541,7 +772,6 @@ class EnhancedReliabilityEngine:
541
  enhanced_engine = EnhancedReliabilityEngine()
542
 
543
  def call_huggingface_analysis(prompt: str) -> str:
544
- """Use HF Inference API or fallback simulation"""
545
  if not HF_TOKEN:
546
  fallback_insights = [
547
  "High latency detected - possible resource contention or network issues",
@@ -588,13 +818,12 @@ def call_huggingface_analysis(prompt: str) -> str:
588
 
589
  # === Enhanced UI with Multi-Agent Insights ===
590
  def create_enhanced_ui():
591
- """Create enhanced UI with multi-agent capabilities"""
592
  with gr.Blocks(title="๐Ÿง  Enterprise Agentic Reliability Framework", theme="soft") as demo:
593
  gr.Markdown("""
594
  # ๐Ÿง  Enterprise Agentic Reliability Framework
595
  **Multi-Agent AI System for Production Reliability**
596
 
597
- *Specialized AI agents working together to detect, diagnose, and heal system issues*
598
  """)
599
 
600
  with gr.Row():
@@ -638,15 +867,15 @@ def create_enhanced_ui():
638
  output_text = gr.Textbox(
639
  label="Agent Synthesis",
640
  placeholder="AI agents are analyzing...",
641
- lines=5
642
  )
643
 
644
- # New agent insights section
645
  with gr.Accordion("๐Ÿค– Agent Specialists Analysis", open=False):
646
  gr.Markdown("""
647
  **Specialized AI Agents:**
648
  - ๐Ÿ•ต๏ธ **Detective**: Anomaly detection & pattern recognition
649
- - ๐Ÿ” **Diagnostician**: Root cause analysis & investigation
 
650
  """)
651
 
652
  agent_insights = gr.JSON(
@@ -654,6 +883,20 @@ def create_enhanced_ui():
654
  value={}
655
  )
656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
657
  gr.Markdown("### ๐Ÿ“ˆ Recent Events (Last 15)")
658
  events_table = gr.Dataframe(
659
  headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
@@ -661,10 +904,10 @@ def create_enhanced_ui():
661
  wrap=True,
662
  )
663
 
664
- # Information sections
665
  with gr.Accordion("โ„น๏ธ Framework Capabilities", open=False):
666
  gr.Markdown("""
667
- - **๐Ÿค– Multi-Agent AI**: Specialized agents for detection, diagnosis, and healing
 
668
  - **๐Ÿ”ง Policy-Based Healing**: Automated recovery actions based on severity and context
669
  - **๐Ÿ’ฐ Business Impact**: Revenue and user impact quantification
670
  - **๐ŸŽฏ Adaptive Detection**: ML-powered thresholds that learn from your environment
@@ -681,23 +924,18 @@ def create_enhanced_ui():
681
 
682
  gr.Markdown("\n\n".join(policy_info))
683
 
684
- # Event handling
685
  async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
686
- """Enhanced event submission with async processing"""
687
  try:
688
- # Convert inputs
689
  latency = float(latency)
690
  error_rate = float(error_rate)
691
  throughput = float(throughput) if throughput else 1000
692
  cpu_util = float(cpu_util) if cpu_util else None
693
  memory_util = float(memory_util) if memory_util else None
694
 
695
- # Process with enhanced engine
696
  result = await enhanced_engine.process_event_enhanced(
697
  component, latency, error_rate, throughput, cpu_util, memory_util
698
  )
699
 
700
- # Prepare table data
701
  table_data = []
702
  for event in events_history[-15:]:
703
  table_data.append([
@@ -710,35 +948,36 @@ def create_enhanced_ui():
710
  "Multi-agent analysis" if 'multi_agent_analysis' in result else 'N/A'
711
  ])
712
 
713
- # Enhanced output formatting
714
  status_emoji = "๐Ÿšจ" if result["status"] == "ANOMALY" else "โœ…"
715
  output_msg = f"{status_emoji} {result['status']}"
716
 
717
- # Add multi-agent insights
718
  if "multi_agent_analysis" in result:
719
  analysis = result["multi_agent_analysis"]
720
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
721
  output_msg += f"\n๐ŸŽฏ Confidence: {confidence*100:.1f}%"
722
 
 
 
 
 
723
  if analysis.get('recommended_actions'):
724
  output_msg += f"\n๐Ÿ’ก Insights: {', '.join(analysis['recommended_actions'][:2])}"
725
 
726
- # Add business impact
727
  if result["business_impact"]:
728
  impact = result["business_impact"]
729
  output_msg += f"\n๐Ÿ’ฐ Business Impact: ${impact['revenue_loss_estimate']} | ๐Ÿ‘ฅ {impact['affected_users_estimate']} users | ๐Ÿšจ {impact['severity_level']}"
730
 
731
- # Add healing actions
732
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
733
  actions = ", ".join(result["healing_actions"])
734
  output_msg += f"\n๐Ÿ”ง Auto-Actions: {actions}"
735
 
736
- # Prepare agent insights for JSON display
737
  agent_insights_data = result.get("multi_agent_analysis", {})
 
738
 
739
  return (
740
  output_msg,
741
  agent_insights_data,
 
742
  gr.Dataframe(
743
  headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
744
  value=table_data,
@@ -747,12 +986,12 @@ def create_enhanced_ui():
747
  )
748
 
749
  except Exception as e:
750
- return f"โŒ Error processing event: {str(e)}", {}, gr.Dataframe(value=[])
751
 
752
  submit_btn.click(
753
  fn=submit_event_enhanced,
754
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
755
- outputs=[output_text, agent_insights, events_table]
756
  )
757
 
758
  return demo
 
8
  from typing import List, Dict, Any
9
  import hashlib
10
  import asyncio
11
+ from enum import Enum
12
+ from dataclasses import dataclass
13
 
14
  # Import our modules
15
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
 
58
  with open(TEXTS_FILE, "w") as f:
59
  json.dump(incident_texts, f)
60
 
61
# === Predictive Models ===
@dataclass
class ForecastResult:
    """Forecast for a single service-health metric.

    Produced by SimplePredictiveEngine and surfaced (as a dict) by
    get_predictive_insights().
    """
    metric: str                    # "latency", "error_rate", "cpu_util" or "memory_util"
    predicted_value: float         # forecast value for the metric
    confidence: float              # 0..1 heuristic, derived from recent volatility
    trend: str                     # "increasing", "decreasing", "stable"
    time_to_threshold: Any = None  # datetime.timedelta until the critical level, when estimable
    risk_level: str = "low"        # low, medium, high, critical


class SimplePredictiveEngine:
    """Lightweight forecasting engine optimized for Hugging Face Spaces.

    Keeps a bounded, in-memory telemetry history per service and produces
    short-horizon heuristic forecasts with no external ML dependencies.
    """

    def __init__(self, history_window: int = 50):
        # Maximum number of telemetry points retained per service.
        self.history_window = history_window
        # service name -> chronological list of telemetry dicts
        self.service_history: Dict[str, List] = {}
        # "<service>_<metric>" -> most recent ForecastResult
        self.prediction_cache: Dict[str, ForecastResult] = {}

    def add_telemetry(self, service: str, event_data: Dict):
        """Append one telemetry sample for *service*, trimming old history."""
        if service not in self.service_history:
            self.service_history[service] = []

        telemetry_point = {
            'timestamp': datetime.datetime.now(),
            'latency': event_data.get('latency_p99', 0),
            'error_rate': event_data.get('error_rate', 0),
            'throughput': event_data.get('throughput', 0),
            'cpu_util': event_data.get('cpu_util'),
            'memory_util': event_data.get('memory_util')
        }

        self.service_history[service].append(telemetry_point)

        # Keep only recent history (bounded memory footprint on Spaces).
        if len(self.service_history[service]) > self.history_window:
            self.service_history[service].pop(0)

    def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
        """Forecast latency, error rate and resource metrics for *service*.

        Returns an empty list until at least 10 samples have been collected.
        """
        if service not in self.service_history or len(self.service_history[service]) < 10:
            return []

        history = self.service_history[service]
        forecasts = []

        latency_forecast = self._forecast_latency(history, lookahead_minutes)
        if latency_forecast:
            forecasts.append(latency_forecast)

        error_forecast = self._forecast_error_rate(history, lookahead_minutes)
        if error_forecast:
            forecasts.append(error_forecast)

        forecasts.extend(self._forecast_resources(history, lookahead_minutes))

        # Cache the latest forecast per (service, metric) pair.
        for forecast in forecasts:
            self.prediction_cache[f"{service}_{forecast.metric}"] = forecast

        return forecasts

    def _forecast_latency(self, history: List, lookahead_minutes: int) -> Any:
        """Forecast latency via a linear fit over the last 20 samples.

        Returns a ForecastResult, or None when there is too little data or
        the fit fails.
        """
        try:
            latencies = [point['latency'] for point in history[-20:]]

            if len(latencies) < 5:
                return None

            # Simple linear trend: y = slope * x + intercept.
            x = np.arange(len(latencies))
            slope, intercept = np.polyfit(x, latencies, 1)

            # One-step-ahead prediction; cast so np.float64 never leaks
            # into results (keeps ForecastResult.__dict__ JSON-friendly).
            predicted_latency = float(slope * len(latencies) + intercept)

            # Confidence shrinks as residual spread grows relative to the mean.
            residuals = np.asarray(latencies) - (slope * x + intercept)
            confidence = float(max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies)))))

            if slope > 5:
                trend = "increasing"
                risk = "high" if predicted_latency > 300 else "medium"
            elif slope < -2:
                trend = "decreasing"
                risk = "low"
            else:
                trend = "stable"
                risk = "low"

            # Heuristic ETA to the 500ms critical threshold: assumes the
            # per-lookahead growth (predicted - last observed) stays constant.
            time_to_critical = None
            if slope > 0 and predicted_latency < 500:
                per_step_growth = max(0.1, predicted_latency - latencies[-1])
                time_to_critical = datetime.timedelta(
                    minutes=lookahead_minutes * (500 - predicted_latency) / per_step_growth
                )

            return ForecastResult(
                metric="latency",
                predicted_value=predicted_latency,
                confidence=confidence,
                trend=trend,
                time_to_threshold=time_to_critical,
                risk_level=risk
            )

        except Exception as e:
            print(f"Latency forecast error: {e}")
            return None

    def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Any:
        """Forecast error rate via exponential smoothing over the last 15 samples.

        Returns a ForecastResult, or None on insufficient data or failure.
        """
        try:
            error_rates = [point['error_rate'] for point in history[-15:]]

            if len(error_rates) < 5:
                return None

            # Exponential smoothing; alpha weights the most recent samples.
            alpha = 0.3
            forecast = error_rates[0]
            for rate in error_rates[1:]:
                forecast = alpha * rate + (1 - alpha) * forecast
            predicted_rate = float(forecast)

            # Trend: compare the last 3 samples against the 3 before them.
            recent_trend = np.mean(error_rates[-3:]) - np.mean(error_rates[-6:-3])

            if recent_trend > 0.02:
                trend = "increasing"
                risk = "high" if predicted_rate > 0.1 else "medium"
            elif recent_trend < -0.01:
                trend = "decreasing"
                risk = "low"
            else:
                trend = "stable"
                risk = "low"

            # Confidence shrinks with volatility relative to the mean rate.
            confidence = float(max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates)))))

            return ForecastResult(
                metric="error_rate",
                predicted_value=predicted_rate,
                confidence=confidence,
                trend=trend,
                risk_level=risk
            )

        except Exception as e:
            print(f"Error rate forecast error: {e}")
            return None

    def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
        """Forecast CPU and memory utilization from recent samples.

        CPU and memory share identical logic, so both go through
        _forecast_utilization (removes the previous copy-paste duplication).
        """
        forecasts = []

        for key, label in (('cpu_util', 'CPU'), ('memory_util', 'Memory')):
            values = [point[key] for point in history if point.get(key) is not None]
            forecast = self._forecast_utilization(values, key, label)
            if forecast:
                forecasts.append(forecast)

        return forecasts

    def _forecast_utilization(self, values: List, metric: str, label: str) -> Any:
        """Forecast one utilization metric as the mean of its last 5 samples.

        Returns a ForecastResult, or None when fewer than 5 samples exist.
        """
        if len(values) < 5:
            return None
        try:
            predicted = float(np.mean(values[-5:]))

            # Trend baseline: the 5 samples preceding the last 5.
            # BUGFIX: with exactly 5 samples, values[-10:-5] is the empty
            # slice and np.mean(empty) is NaN (plus a RuntimeWarning), so the
            # comparison silently forced "stable". Fall back to every sample
            # but the latest when the window is empty.
            baseline = values[-10:-5] or values[:-1]
            trend = "increasing" if values[-1] > float(np.mean(baseline)) else "stable"

            risk = "low"
            if predicted > 0.8:
                risk = "critical" if predicted > 0.9 else "high"
            elif predicted > 0.7:
                risk = "medium"

            return ForecastResult(
                metric=metric,
                predicted_value=predicted,
                confidence=0.7,  # fixed heuristic confidence for utilization forecasts
                trend=trend,
                risk_level=risk
            )
        except Exception as e:
            print(f"{label} forecast error: {e}")
            return None

    def get_predictive_insights(self, service: str) -> Dict[str, Any]:
        """Generate actionable insights (warnings + recommendations) from forecasts."""
        forecasts = self.forecast_service_health(service)

        critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
        warnings = []
        recommendations = []

        # critical_risks is already filtered to high/critical, so dispatch on
        # metric only (the previous per-branch risk re-check was redundant).
        for forecast in critical_risks:
            if forecast.metric == "latency":
                warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
                if forecast.time_to_threshold:
                    minutes = int(forecast.time_to_threshold.total_seconds() / 60)
                    recommendations.append(f"⏰ Critical latency (~500ms) in ~{minutes} minutes")
                recommendations.append("🔧 Consider scaling or optimizing dependencies")

            elif forecast.metric == "error_rate":
                warnings.append(f"🚨 Errors expected to reach {forecast.predicted_value*100:.1f}%")
                recommendations.append("🐛 Investigate recent deployments or dependency issues")

            elif forecast.metric == "cpu_util":
                warnings.append(f"🔥 CPU expected at {forecast.predicted_value*100:.1f}%")
                recommendations.append("⚡ Consider scaling compute resources")

            elif forecast.metric == "memory_util":
                warnings.append(f"💾 Memory expected at {forecast.predicted_value*100:.1f}%")
                recommendations.append("🧹 Check for memory leaks or optimize usage")

        return {
            'service': service,
            'forecasts': [f.__dict__ for f in forecasts],
            'warnings': warnings[:3],
            # dict.fromkeys de-duplicates while preserving order.
            'recommendations': list(dict.fromkeys(recommendations))[:3],
            'critical_risk_count': len(critical_risks),
            'forecast_timestamp': datetime.datetime.now().isoformat()
        }
311
+
312
  # === Core Engine Components ===
313
  policy_engine = PolicyEngine()
314
  events_history: List[ReliabilityEvent] = []
 
320
  self.revenue_per_request = revenue_per_request
321
 
322
  def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
323
+ base_revenue_per_minute = 100
324
 
 
 
 
 
325
  impact_multiplier = 1.0
326
 
327
  if event.latency_p99 > 300:
328
+ impact_multiplier += 0.5
329
  if event.error_rate > 0.1:
330
+ impact_multiplier += 0.8
331
  if event.cpu_util and event.cpu_util > 0.9:
332
+ impact_multiplier += 0.3
333
 
334
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
335
 
336
+ base_users_affected = 1000
 
337
  user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
338
  affected_users = int(base_users_affected * user_impact_multiplier)
339
 
 
340
  if revenue_loss > 500 or affected_users > 5000:
341
  severity = "CRITICAL"
342
  elif revenue_loss > 100 or affected_users > 1000:
 
361
  def __init__(self):
362
  self.historical_data = []
363
  self.adaptive_thresholds = {
364
+ 'latency_p99': 150,
365
  'error_rate': 0.05
366
  }
367
 
368
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
 
 
 
369
  latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
370
  error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
371
 
 
372
  resource_anomaly = False
373
  if event.cpu_util and event.cpu_util > 0.9:
374
  resource_anomaly = True
375
  if event.memory_util and event.memory_util > 0.9:
376
  resource_anomaly = True
377
 
 
378
  self._update_thresholds(event)
379
 
380
  return latency_anomaly or error_anomaly or resource_anomaly
381
 
382
  def _update_thresholds(self, event: ReliabilityEvent):
 
383
  self.historical_data.append(event)
384
 
 
385
  if len(self.historical_data) > 100:
386
  self.historical_data.pop(0)
387
 
 
388
  if len(self.historical_data) > 10:
389
  recent_latencies = [e.latency_p99 for e in self.historical_data[-20:]]
390
  self.adaptive_thresholds['latency_p99'] = np.percentile(recent_latencies, 90)
 
392
  anomaly_detector = AdvancedAnomalyDetector()
393
 
394
  # === Multi-Agent Foundation ===
 
 
395
class AgentSpecialization(Enum):
    """Specialized role an agent performs within the multi-agent pipeline."""
    DETECTIVE = "anomaly_detection"        # detects anomalous telemetry patterns
    DIAGNOSTICIAN = "root_cause_analysis"  # investigates likely root causes
    PREDICTIVE = "predictive_analytics"    # forecasts future reliability risks
399
 
400
  class BaseAgent:
401
  def __init__(self, specialization: AgentSpecialization):
 
409
  super().__init__(AgentSpecialization.DETECTIVE)
410
 
411
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
 
412
  anomaly_score = self._calculate_anomaly_score(event)
413
 
414
  return {
 
423
  }
424
 
425
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
 
426
  scores = []
427
 
 
428
  if event.latency_p99 > 150:
429
  latency_score = min(1.0, (event.latency_p99 - 150) / 500)
430
  scores.append(0.4 * latency_score)
431
 
 
432
  if event.error_rate > 0.05:
433
  error_score = min(1.0, event.error_rate / 0.3)
434
  scores.append(0.3 * error_score)
435
 
 
436
  resource_score = 0
437
  if event.cpu_util and event.cpu_util > 0.8:
438
  resource_score += 0.15 * min(1.0, (event.cpu_util - 0.8) / 0.2)
 
443
  return min(1.0, sum(scores))
444
 
445
  def _classify_severity(self, anomaly_score: float) -> str:
 
446
  if anomaly_score > 0.8:
447
  return "CRITICAL"
448
  elif anomaly_score > 0.6:
 
453
  return "LOW"
454
 
455
  def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
 
456
  affected = []
457
 
 
458
  if event.latency_p99 > 500:
459
  affected.append({"metric": "latency", "value": event.latency_p99, "severity": "CRITICAL", "threshold": 150})
460
  elif event.latency_p99 > 300:
 
462
  elif event.latency_p99 > 150:
463
  affected.append({"metric": "latency", "value": event.latency_p99, "severity": "MEDIUM", "threshold": 150})
464
 
 
465
  if event.error_rate > 0.3:
466
  affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "CRITICAL", "threshold": 0.05})
467
  elif event.error_rate > 0.15:
 
469
  elif event.error_rate > 0.05:
470
  affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "MEDIUM", "threshold": 0.05})
471
 
 
472
  if event.cpu_util and event.cpu_util > 0.9:
473
  affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "CRITICAL", "threshold": 0.8})
474
  elif event.cpu_util and event.cpu_util > 0.8:
 
482
  return affected
483
 
484
  def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
 
485
  recommendations = []
486
  affected_metrics = self._identify_affected_metrics(event)
487
 
 
513
  elif metric_name == "memory":
514
  recommendations.append(f"๐Ÿ’พ Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
515
 
 
516
  if anomaly_score > 0.8:
517
  recommendations.append("๐ŸŽฏ IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
518
  elif anomaly_score > 0.6:
 
520
  elif anomaly_score > 0.4:
521
  recommendations.append("๐Ÿ“Š MONITOR: Early warning signs detected")
522
 
523
+ return recommendations[:4]
524
 
525
  class RootCauseAgent(BaseAgent):
526
  def __init__(self):
527
  super().__init__(AgentSpecialization.DIAGNOSTICIAN)
528
 
529
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
 
530
  causes = self._analyze_potential_causes(event)
531
 
532
  return {
533
  'specialization': self.specialization.value,
534
+ 'confidence': 0.7,
535
  'findings': {
536
  'likely_root_causes': causes,
537
  'evidence_patterns': self._identify_evidence(event),
 
543
  }
544
 
545
  def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
 
546
  causes = []
547
 
 
548
  if event.latency_p99 > 500 and event.error_rate > 0.2:
549
  causes.append({
550
  "cause": "Database/External Dependency Failure",
 
553
  "investigation": "Check database connection pool, external API health"
554
  })
555
 
 
556
  if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
557
  causes.append({
558
  "cause": "Resource Exhaustion",
 
561
  "investigation": "Check for memory leaks, infinite loops, insufficient resources"
562
  })
563
 
 
564
  if event.error_rate > 0.3 and event.latency_p99 < 200:
565
  causes.append({
566
  "cause": "Application Bug / Configuration Issue",
 
569
  "investigation": "Review recent deployments, configuration changes, application logs"
570
  })
571
 
 
572
  if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
573
  causes.append({
574
  "cause": "Gradual Performance Degradation",
 
588
  return causes
589
 
590
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
 
591
  evidence = []
592
  if event.latency_p99 > event.error_rate * 1000:
593
  evidence.append("latency_disproportionate_to_errors")
 
596
  return evidence
597
 
598
  def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
 
599
  for cause in causes:
600
  if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
601
  return "HIGH"
602
  return "MEDIUM"
603
 
604
class PredictiveAgent(BaseAgent):
    """Agent that feeds telemetry into the predictive engine and reports
    forward-looking risk forecasts for a component.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.PREDICTIVE)
        self.engine = SimplePredictiveEngine()

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Predictive analysis for future risks"""
        # Snapshot of the metrics the predictive engine tracks.
        telemetry = {
            name: getattr(event, name)
            for name in ('latency_p99', 'error_rate', 'throughput', 'cpu_util', 'memory_util')
        }
        self.engine.add_telemetry(event.component, telemetry)

        insights = self.engine.get_predictive_insights(event.component)

        # Report higher confidence when the engine forecasts critical risks.
        has_critical_risk = insights['critical_risk_count'] > 0
        return {
            'specialization': self.specialization.value,
            'confidence': 0.8 if has_critical_risk else 0.5,
            'findings': insights,
            'recommendations': insights['recommendations'],
        }
628
+
629
  class OrchestrationManager:
630
  def __init__(self):
631
  self.agents = {
632
  AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
633
  AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
634
+ AgentSpecialization.PREDICTIVE: PredictiveAgent(),
635
  }
636
 
637
  async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
 
638
  agent_tasks = {
639
  spec: agent.analyze(event)
640
  for spec, agent in self.agents.items()
641
  }
642
 
 
643
  agent_results = {}
644
  for specialization, task in agent_tasks.items():
645
  try:
 
651
  return self._synthesize_agent_findings(event, agent_results)
652
 
653
  def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
 
654
  detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
655
  diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
656
+ predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
657
 
658
  if not detective_result:
659
  return {'error': 'No agent results available'}
 
665
  'primary_metrics_affected': [metric["metric"] for metric in detective_result['findings'].get('primary_metrics_affected', [])]
666
  },
667
  'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
668
+ 'predictive_insights': predictive_result['findings'] if predictive_result else {},
669
  'recommended_actions': self._prioritize_actions(
670
  detective_result.get('recommendations', []),
671
+ diagnostician_result.get('recommendations', []) if diagnostician_result else [],
672
+ predictive_result.get('recommendations', []) if predictive_result else []
673
  ),
674
  'agent_metadata': {
675
  'participating_agents': list(agent_results.keys()),
 
679
 
680
  return synthesis
681
 
682
+ def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str], predictive_actions: List[str]) -> List[str]:
683
+ all_actions = detection_actions + diagnosis_actions + predictive_actions
 
 
684
  seen = set()
685
  unique_actions = []
686
  for action in all_actions:
687
  if action not in seen:
688
  seen.add(action)
689
  unique_actions.append(action)
690
+ return unique_actions[:5]
691
 
692
  # Initialize enhanced components
693
  orchestration_manager = OrchestrationManager()
 
702
  async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
703
  throughput: float = 1000, cpu_util: float = None,
704
  memory_util: float = None) -> Dict[str, Any]:
 
705
 
 
706
  event = ReliabilityEvent(
707
  component=component,
708
  latency_p99=latency,
 
713
  upstream_deps=["auth-service", "database"] if component == "api-service" else []
714
  )
715
 
 
716
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
717
 
 
718
  is_anomaly = anomaly_detector.detect_anomaly(event)
719
 
 
720
  agent_confidence = 0.0
721
  if agent_analysis and 'incident_summary' in agent_analysis:
722
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
723
  else:
 
724
  agent_confidence = 0.8 if is_anomaly else 0.1
725
 
 
726
  if agent_confidence > 0.8:
727
  event.severity = EventSeverity.CRITICAL
728
  elif agent_confidence > 0.6:
 
732
  else:
733
  event.severity = EventSeverity.LOW
734
 
 
735
  healing_actions = policy_engine.evaluate_policies(event)
736
 
 
737
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
738
 
 
739
  if index is not None and is_anomaly:
740
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
741
  vector_text = f"{component} {latency} {error_rate} {analysis_text}"
 
744
  incident_texts.append(vector_text)
745
  save_index()
746
 
 
747
  result = {
748
  "timestamp": event.timestamp,
749
  "component": component,
 
772
  enhanced_engine = EnhancedReliabilityEngine()
773
 
774
  def call_huggingface_analysis(prompt: str) -> str:
 
775
  if not HF_TOKEN:
776
  fallback_insights = [
777
  "High latency detected - possible resource contention or network issues",
 
818
 
819
  # === Enhanced UI with Multi-Agent Insights ===
820
  def create_enhanced_ui():
 
821
  with gr.Blocks(title="๐Ÿง  Enterprise Agentic Reliability Framework", theme="soft") as demo:
822
  gr.Markdown("""
823
  # ๐Ÿง  Enterprise Agentic Reliability Framework
824
  **Multi-Agent AI System for Production Reliability**
825
 
826
+ *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
827
  """)
828
 
829
  with gr.Row():
 
867
  output_text = gr.Textbox(
868
  label="Agent Synthesis",
869
  placeholder="AI agents are analyzing...",
870
+ lines=6
871
  )
872
 
 
873
  with gr.Accordion("๐Ÿค– Agent Specialists Analysis", open=False):
874
  gr.Markdown("""
875
  **Specialized AI Agents:**
876
  - ๐Ÿ•ต๏ธ **Detective**: Anomaly detection & pattern recognition
877
+ - ๐Ÿ” **Diagnostician**: Root cause analysis & investigation
878
+ - ๐Ÿ”ฎ **Predictive**: Future risk forecasting & trend analysis
879
  """)
880
 
881
  agent_insights = gr.JSON(
 
883
  value={}
884
  )
885
 
886
+ with gr.Accordion("๐Ÿ”ฎ Predictive Analytics & Forecasting", open=False):
887
+ gr.Markdown("""
888
+ **Future Risk Forecasting:**
889
+ - ๐Ÿ“ˆ Latency trends and thresholds
890
+ - ๐Ÿšจ Error rate predictions
891
+ - ๐Ÿ”ฅ Resource utilization forecasts
892
+ - โฐ Time-to-failure estimates
893
+ """)
894
+
895
+ predictive_insights = gr.JSON(
896
+ label="Predictive Forecasts",
897
+ value={}
898
+ )
899
+
900
  gr.Markdown("### ๐Ÿ“ˆ Recent Events (Last 15)")
901
  events_table = gr.Dataframe(
902
  headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
 
904
  wrap=True,
905
  )
906
 
 
907
  with gr.Accordion("โ„น๏ธ Framework Capabilities", open=False):
908
  gr.Markdown("""
909
+ - **๐Ÿค– Multi-Agent AI**: Specialized agents for detection, diagnosis, prediction, and healing
910
+ - **๐Ÿ”ฎ Predictive Analytics**: Forecast future risks and performance degradation
911
  - **๐Ÿ”ง Policy-Based Healing**: Automated recovery actions based on severity and context
912
  - **๐Ÿ’ฐ Business Impact**: Revenue and user impact quantification
913
  - **๐ŸŽฏ Adaptive Detection**: ML-powered thresholds that learn from your environment
 
924
 
925
  gr.Markdown("\n\n".join(policy_info))
926
 
 
927
  async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
 
928
  try:
 
929
  latency = float(latency)
930
  error_rate = float(error_rate)
931
  throughput = float(throughput) if throughput else 1000
932
  cpu_util = float(cpu_util) if cpu_util else None
933
  memory_util = float(memory_util) if memory_util else None
934
 
 
935
  result = await enhanced_engine.process_event_enhanced(
936
  component, latency, error_rate, throughput, cpu_util, memory_util
937
  )
938
 
 
939
  table_data = []
940
  for event in events_history[-15:]:
941
  table_data.append([
 
948
  "Multi-agent analysis" if 'multi_agent_analysis' in result else 'N/A'
949
  ])
950
 
 
951
  status_emoji = "๐Ÿšจ" if result["status"] == "ANOMALY" else "โœ…"
952
  output_msg = f"{status_emoji} {result['status']}"
953
 
 
954
  if "multi_agent_analysis" in result:
955
  analysis = result["multi_agent_analysis"]
956
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
957
  output_msg += f"\n๐ŸŽฏ Confidence: {confidence*100:.1f}%"
958
 
959
+ predictive_data = analysis.get('predictive_insights', {})
960
+ if predictive_data.get('critical_risk_count', 0) > 0:
961
+ output_msg += f"\n๐Ÿ”ฎ PREDICTIVE: {predictive_data['critical_risk_count']} critical risks forecast"
962
+
963
  if analysis.get('recommended_actions'):
964
  output_msg += f"\n๐Ÿ’ก Insights: {', '.join(analysis['recommended_actions'][:2])}"
965
 
 
966
  if result["business_impact"]:
967
  impact = result["business_impact"]
968
  output_msg += f"\n๐Ÿ’ฐ Business Impact: ${impact['revenue_loss_estimate']} | ๐Ÿ‘ฅ {impact['affected_users_estimate']} users | ๐Ÿšจ {impact['severity_level']}"
969
 
 
970
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
971
  actions = ", ".join(result["healing_actions"])
972
  output_msg += f"\n๐Ÿ”ง Auto-Actions: {actions}"
973
 
 
974
  agent_insights_data = result.get("multi_agent_analysis", {})
975
+ predictive_insights_data = agent_insights_data.get('predictive_insights', {})
976
 
977
  return (
978
  output_msg,
979
  agent_insights_data,
980
+ predictive_insights_data,
981
  gr.Dataframe(
982
  headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
983
  value=table_data,
 
986
  )
987
 
988
  except Exception as e:
989
+ return f"โŒ Error processing event: {str(e)}", {}, {}, gr.Dataframe(value=[])
990
 
991
  submit_btn.click(
992
  fn=submit_event_enhanced,
993
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
994
+ outputs=[output_text, agent_insights, predictive_insights, events_table]
995
  )
996
 
997
  return demo