petter2025 committed on
Commit
1437f82
·
verified ·
1 Parent(s): fc7752d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +452 -452
app.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import numpy as np
@@ -5,103 +13,256 @@ import gradio as gr
5
  import requests
6
  import pandas as pd
7
  import datetime
8
- from typing import List, Dict, Any
 
 
 
 
9
  import hashlib
10
  import asyncio
11
- from enum import Enum
12
- from dataclasses import dataclass
13
 
14
  # Import our modules
15
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
16
  from healing_policies import PolicyEngine
 
 
 
 
 
 
 
 
17
 
18
  # === Configuration ===
19
- HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
20
- HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
21
- HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # === FAISS & Embeddings Setup ===
24
  try:
25
  from sentence_transformers import SentenceTransformer
26
  import faiss
27
 
28
- VECTOR_DIM = 384
29
- INDEX_FILE = "incident_vectors.index"
30
- TEXTS_FILE = "incident_texts.json"
31
-
32
- # Try to load model with error handling
33
- try:
34
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
35
- except Exception as e:
36
- print(f"Model loading warning: {e}")
37
- from sentence_transformers import SentenceTransformer as ST
38
- model = ST("sentence-transformers/all-MiniLM-L6-v2")
39
-
40
- if os.path.exists(INDEX_FILE):
41
- index = faiss.read_index(INDEX_FILE)
42
- with open(TEXTS_FILE, "r") as f:
43
- incident_texts = json.load(f)
 
44
  else:
45
- index = faiss.IndexFlatL2(VECTOR_DIM)
 
46
  incident_texts = []
47
-
 
 
48
  except ImportError as e:
49
- print(f"Warning: FAISS or SentenceTransformers not available: {e}")
50
  index = None
51
  incident_texts = []
52
  model = None
53
-
54
- def save_index():
55
- """Save FAISS index and incident texts"""
56
- if index is not None:
57
- faiss.write_index(index, INDEX_FILE)
58
- with open(TEXTS_FILE, "w") as f:
59
- json.dump(incident_texts, f)
60
 
61
  # === Predictive Models ===
62
  @dataclass
63
  class ForecastResult:
 
64
  metric: str
65
  predicted_value: float
66
  confidence: float
67
  trend: str # "increasing", "decreasing", "stable"
68
- time_to_threshold: Any = None
69
  risk_level: str = "low" # low, medium, high, critical
70
 
71
  class SimplePredictiveEngine:
72
  """Lightweight forecasting engine optimized for Hugging Face Spaces"""
73
 
74
- def __init__(self, history_window: int = 50):
75
  self.history_window = history_window
76
- self.service_history: Dict[str, List] = {}
77
- self.prediction_cache: Dict[str, ForecastResult] = {}
78
-
79
- def add_telemetry(self, service: str, event_data: Dict):
 
 
 
80
  """Add telemetry data to service history"""
81
- if service not in self.service_history:
82
- self.service_history[service] = []
83
-
84
- telemetry_point = {
85
- 'timestamp': datetime.datetime.now(),
86
- 'latency': event_data.get('latency_p99', 0),
87
- 'error_rate': event_data.get('error_rate', 0),
88
- 'throughput': event_data.get('throughput', 0),
89
- 'cpu_util': event_data.get('cpu_util'),
90
- 'memory_util': event_data.get('memory_util')
91
- }
92
-
93
- self.service_history[service].append(telemetry_point)
94
-
95
- # Keep only recent history
96
- if len(self.service_history[service]) > self.history_window:
97
- self.service_history[service].pop(0)
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
100
  """Forecast service health metrics"""
101
- if service not in self.service_history or len(self.service_history[service]) < 10:
102
- return []
 
 
 
103
 
104
- history = self.service_history[service]
105
  forecasts = []
106
 
107
  # Forecast latency
@@ -119,13 +280,14 @@ class SimplePredictiveEngine:
119
  forecasts.extend(resource_forecasts)
120
 
121
  # Cache results
122
- for forecast in forecasts:
123
- cache_key = f"{service}_{forecast.metric}"
124
- self.prediction_cache[cache_key] = forecast
 
125
 
126
  return forecasts
127
 
128
- def _forecast_latency(self, history: List, lookahead_minutes: int) -> Any:
129
  """Forecast latency using linear regression and trend analysis"""
130
  try:
131
  latencies = [point['latency'] for point in history[-20:]]
@@ -148,7 +310,7 @@ class SimplePredictiveEngine:
148
  # Determine trend
149
  if slope > 5:
150
  trend = "increasing"
151
- risk = "high" if predicted_latency > 300 else "medium"
152
  elif slope < -2:
153
  trend = "decreasing"
154
  risk = "low"
@@ -159,9 +321,11 @@ class SimplePredictiveEngine:
159
  # Calculate time to reach critical threshold (500ms)
160
  time_to_critical = None
161
  if slope > 0 and predicted_latency < 500:
162
- time_to_critical = datetime.timedelta(
163
- minutes=lookahead_minutes * (500 - predicted_latency) / max(0.1, (predicted_latency - latencies[-1]))
164
- )
 
 
165
 
166
  return ForecastResult(
167
  metric="latency",
@@ -173,10 +337,10 @@ class SimplePredictiveEngine:
173
  )
174
 
175
  except Exception as e:
176
- print(f"Latency forecast error: {e}")
177
  return None
178
 
179
- def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Any:
180
  """Forecast error rate using exponential smoothing"""
181
  try:
182
  error_rates = [point['error_rate'] for point in history[-15:]]
@@ -217,7 +381,7 @@ class SimplePredictiveEngine:
217
  )
218
 
219
  except Exception as e:
220
- print(f"Error rate forecast error: {e}")
221
  return None
222
 
223
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
@@ -232,8 +396,10 @@ class SimplePredictiveEngine:
232
  trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
233
 
234
  risk = "low"
235
- if predicted_cpu > 0.8:
236
- risk = "critical" if predicted_cpu > 0.9 else "high"
 
 
237
  elif predicted_cpu > 0.7:
238
  risk = "medium"
239
 
@@ -245,7 +411,7 @@ class SimplePredictiveEngine:
245
  risk_level=risk
246
  ))
247
  except Exception as e:
248
- print(f"CPU forecast error: {e}")
249
 
250
  # Memory forecast
251
  memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
@@ -255,8 +421,10 @@ class SimplePredictiveEngine:
255
  trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
256
 
257
  risk = "low"
258
- if predicted_memory > 0.8:
259
- risk = "critical" if predicted_memory > 0.9 else "high"
 
 
260
  elif predicted_memory > 0.7:
261
  risk = "medium"
262
 
@@ -268,7 +436,7 @@ class SimplePredictiveEngine:
268
  risk_level=risk
269
  ))
270
  except Exception as e:
271
- print(f"Memory forecast error: {e}")
272
 
273
  return forecasts
274
 
@@ -302,7 +470,7 @@ class SimplePredictiveEngine:
302
 
303
  return {
304
  'service': service,
305
- 'forecasts': [f.__dict__ for f in forecasts],
306
  'warnings': warnings[:3],
307
  'recommendations': list(dict.fromkeys(recommendations))[:3],
308
  'critical_risk_count': len(critical_risks),
@@ -311,24 +479,36 @@ class SimplePredictiveEngine:
311
 
312
  # === Core Engine Components ===
313
  policy_engine = PolicyEngine()
314
- events_history: List[ReliabilityEvent] = []
 
315
 
316
  class BusinessImpactCalculator:
317
  """Calculate business impact of anomalies"""
318
 
319
  def __init__(self, revenue_per_request: float = 0.01):
320
  self.revenue_per_request = revenue_per_request
 
321
 
322
  def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
323
  base_revenue_per_minute = 100
324
 
325
  impact_multiplier = 1.0
326
 
327
- if event.latency_p99 > 300:
328
  impact_multiplier += 0.5
329
  if event.error_rate > 0.1:
330
  impact_multiplier += 0.8
331
- if event.cpu_util and event.cpu_util > 0.9:
332
  impact_multiplier += 0.3
333
 
334
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
@@ -346,6 +526,8 @@ class BusinessImpactCalculator:
346
  else:
347
  severity = "LOW"
348
 
 
 
349
  return {
350
  'revenue_loss_estimate': round(revenue_loss, 2),
351
  'affected_users_estimate': affected_users,
@@ -359,252 +541,62 @@ class AdvancedAnomalyDetector:
359
  """Enhanced anomaly detection with adaptive thresholds"""
360
 
361
  def __init__(self):
362
- self.historical_data = []
363
  self.adaptive_thresholds = {
364
- 'latency_p99': 150,
365
- 'error_rate': 0.05
366
  }
 
 
367
 
368
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
369
- latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
370
- error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
371
-
372
- resource_anomaly = False
373
- if event.cpu_util and event.cpu_util > 0.9:
374
- resource_anomaly = True
375
- if event.memory_util and event.memory_util > 0.9:
376
- resource_anomaly = True
377
-
378
- self._update_thresholds(event)
379
 
380
- return latency_anomaly or error_anomaly or resource_anomaly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- def _update_thresholds(self, event: ReliabilityEvent):
 
383
  self.historical_data.append(event)
384
 
385
- if len(self.historical_data) > 100:
386
- self.historical_data.pop(0)
387
-
388
  if len(self.historical_data) > 10:
389
- recent_latencies = [e.latency_p99 for e in self.historical_data[-20:]]
390
- self.adaptive_thresholds['latency_p99'] = np.percentile(recent_latencies, 90)
 
 
391
 
392
  anomaly_detector = AdvancedAnomalyDetector()
393
 
394
- # === Multi-Agent Foundation ===
395
- class AgentSpecialization(Enum):
396
- DETECTIVE = "anomaly_detection"
397
- DIAGNOSTICIAN = "root_cause_analysis"
398
- PREDICTIVE = "predictive_analytics"
399
-
400
- class BaseAgent:
401
- def __init__(self, specialization: AgentSpecialization):
402
- self.specialization = specialization
403
 
404
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
405
- raise NotImplementedError
406
-
407
- class AnomalyDetectionAgent(BaseAgent):
408
  def __init__(self):
409
- super().__init__(AgentSpecialization.DETECTIVE)
410
-
411
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
412
- anomaly_score = self._calculate_anomaly_score(event)
413
-
414
- return {
415
- 'specialization': self.specialization.value,
416
- 'confidence': anomaly_score,
417
- 'findings': {
418
- 'anomaly_score': anomaly_score,
419
- 'severity_tier': self._classify_severity(anomaly_score),
420
- 'primary_metrics_affected': self._identify_affected_metrics(event)
421
- },
422
- 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
423
- }
424
-
425
- def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
426
- scores = []
427
-
428
- if event.latency_p99 > 150:
429
- latency_score = min(1.0, (event.latency_p99 - 150) / 500)
430
- scores.append(0.4 * latency_score)
431
-
432
- if event.error_rate > 0.05:
433
- error_score = min(1.0, event.error_rate / 0.3)
434
- scores.append(0.3 * error_score)
435
-
436
- resource_score = 0
437
- if event.cpu_util and event.cpu_util > 0.8:
438
- resource_score += 0.15 * min(1.0, (event.cpu_util - 0.8) / 0.2)
439
- if event.memory_util and event.memory_util > 0.8:
440
- resource_score += 0.15 * min(1.0, (event.memory_util - 0.8) / 0.2)
441
- scores.append(resource_score)
442
-
443
- return min(1.0, sum(scores))
444
-
445
- def _classify_severity(self, anomaly_score: float) -> str:
446
- if anomaly_score > 0.8:
447
- return "CRITICAL"
448
- elif anomaly_score > 0.6:
449
- return "HIGH"
450
- elif anomaly_score > 0.4:
451
- return "MEDIUM"
452
- else:
453
- return "LOW"
454
-
455
- def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
456
- affected = []
457
-
458
- if event.latency_p99 > 500:
459
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "CRITICAL", "threshold": 150})
460
- elif event.latency_p99 > 300:
461
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "HIGH", "threshold": 150})
462
- elif event.latency_p99 > 150:
463
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "MEDIUM", "threshold": 150})
464
-
465
- if event.error_rate > 0.3:
466
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "CRITICAL", "threshold": 0.05})
467
- elif event.error_rate > 0.15:
468
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "HIGH", "threshold": 0.05})
469
- elif event.error_rate > 0.05:
470
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "MEDIUM", "threshold": 0.05})
471
-
472
- if event.cpu_util and event.cpu_util > 0.9:
473
- affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "CRITICAL", "threshold": 0.8})
474
- elif event.cpu_util and event.cpu_util > 0.8:
475
- affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "HIGH", "threshold": 0.8})
476
-
477
- if event.memory_util and event.memory_util > 0.9:
478
- affected.append({"metric": "memory", "value": event.memory_util, "severity": "CRITICAL", "threshold": 0.8})
479
- elif event.memory_util and event.memory_util > 0.8:
480
- affected.append({"metric": "memory", "value": event.memory_util, "severity": "HIGH", "threshold": 0.8})
481
-
482
- return affected
483
-
484
- def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
485
- recommendations = []
486
- affected_metrics = self._identify_affected_metrics(event)
487
-
488
- for metric in affected_metrics:
489
- metric_name = metric["metric"]
490
- severity = metric["severity"]
491
- value = metric["value"]
492
- threshold = metric["threshold"]
493
-
494
- if metric_name == "latency":
495
- if severity == "CRITICAL":
496
- recommendations.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
497
- elif severity == "HIGH":
498
- recommendations.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
499
- else:
500
- recommendations.append(f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")
501
-
502
- elif metric_name == "error_rate":
503
- if severity == "CRITICAL":
504
- recommendations.append(f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100}%) - Check recent deployments")
505
- elif severity == "HIGH":
506
- recommendations.append(f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100}%) - Review application logs")
507
- else:
508
- recommendations.append(f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100}%)")
509
-
510
- elif metric_name == "cpu":
511
- recommendations.append(f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling")
512
-
513
- elif metric_name == "memory":
514
- recommendations.append(f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
515
-
516
- if anomaly_score > 0.8:
517
- recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
518
- elif anomaly_score > 0.6:
519
- recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
520
- elif anomaly_score > 0.4:
521
- recommendations.append("📊 MONITOR: Early warning signs detected")
522
-
523
- return recommendations[:4]
524
-
525
- class RootCauseAgent(BaseAgent):
526
- def __init__(self):
527
- super().__init__(AgentSpecialization.DIAGNOSTICIAN)
528
-
529
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
530
- causes = self._analyze_potential_causes(event)
531
-
532
- return {
533
- 'specialization': self.specialization.value,
534
- 'confidence': 0.7,
535
- 'findings': {
536
- 'likely_root_causes': causes,
537
- 'evidence_patterns': self._identify_evidence(event),
538
- 'investigation_priority': self._prioritize_investigation(causes)
539
- },
540
- 'recommendations': [
541
- f"Check {cause['cause']} for issues" for cause in causes[:2]
542
- ]
543
- }
544
-
545
- def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
546
- causes = []
547
-
548
- if event.latency_p99 > 500 and event.error_rate > 0.2:
549
- causes.append({
550
- "cause": "Database/External Dependency Failure",
551
- "confidence": 0.85,
552
- "evidence": f"Extreme latency ({event.latency_p99}ms) with high errors ({event.error_rate*100:.1f}%)",
553
- "investigation": "Check database connection pool, external API health"
554
- })
555
-
556
- if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
557
- causes.append({
558
- "cause": "Resource Exhaustion",
559
- "confidence": 0.90,
560
- "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
561
- "investigation": "Check for memory leaks, infinite loops, insufficient resources"
562
- })
563
-
564
- if event.error_rate > 0.3 and event.latency_p99 < 200:
565
- causes.append({
566
- "cause": "Application Bug / Configuration Issue",
567
- "confidence": 0.75,
568
- "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
569
- "investigation": "Review recent deployments, configuration changes, application logs"
570
- })
571
-
572
- if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
573
- causes.append({
574
- "cause": "Gradual Performance Degradation",
575
- "confidence": 0.65,
576
- "evidence": f"Moderate latency ({event.latency_p99}ms) and errors ({event.error_rate*100:.1f}%)",
577
- "investigation": "Check resource trends, dependency performance, capacity planning"
578
- })
579
-
580
- if not causes:
581
- causes.append({
582
- "cause": "Unknown - Requires Investigation",
583
- "confidence": 0.3,
584
- "evidence": "Pattern does not match known failure modes",
585
- "investigation": "Complete system review needed"
586
- })
587
-
588
- return causes
589
-
590
- def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
591
- evidence = []
592
- if event.latency_p99 > event.error_rate * 1000:
593
- evidence.append("latency_disproportionate_to_errors")
594
- if event.cpu_util and event.cpu_util > 0.8 and event.memory_util and event.memory_util > 0.8:
595
- evidence.append("correlated_resource_exhaustion")
596
- return evidence
597
-
598
- def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
599
- for cause in causes:
600
- if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
601
- return "HIGH"
602
- return "MEDIUM"
603
-
604
- class PredictiveAgent(BaseAgent):
605
- def __init__(self):
606
- super().__init__(AgentSpecialization.PREDICTIVE)
607
- self.engine = SimplePredictiveEngine()
608
 
609
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
610
  """Predictive analysis for future risks"""
@@ -620,89 +612,55 @@ class PredictiveAgent(BaseAgent):
620
  insights = self.engine.get_predictive_insights(event.component)
621
 
622
  return {
623
- 'specialization': self.specialization.value,
624
  'confidence': 0.8 if insights['critical_risk_count'] > 0 else 0.5,
625
  'findings': insights,
626
  'recommendations': insights['recommendations']
627
  }
628
 
629
- class OrchestrationManager:
630
- def __init__(self):
631
- self.agents = {
632
- AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
633
- AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
634
- AgentSpecialization.PREDICTIVE: PredictiveAgent(),
635
- }
636
-
637
- async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
638
- agent_tasks = {
639
- spec: agent.analyze(event)
640
- for spec, agent in self.agents.items()
641
- }
642
-
643
- agent_results = {}
644
- for specialization, task in agent_tasks.items():
645
- try:
646
- result = await asyncio.wait_for(task, timeout=5.0)
647
- agent_results[specialization.value] = result
648
- except asyncio.TimeoutError:
649
- continue
650
-
651
- return self._synthesize_agent_findings(event, agent_results)
652
-
653
- def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
654
- detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
655
- diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
656
- predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
657
-
658
- if not detective_result:
659
- return {'error': 'No agent results available'}
660
-
661
- synthesis = {
662
- 'incident_summary': {
663
- 'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
664
- 'anomaly_confidence': detective_result['confidence'],
665
- 'primary_metrics_affected': [metric["metric"] for metric in detective_result['findings'].get('primary_metrics_affected', [])]
666
- },
667
- 'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
668
- 'predictive_insights': predictive_result['findings'] if predictive_result else {},
669
- 'recommended_actions': self._prioritize_actions(
670
- detective_result.get('recommendations', []),
671
- diagnostician_result.get('recommendations', []) if diagnostician_result else [],
672
- predictive_result.get('recommendations', []) if predictive_result else []
673
- ),
674
- 'agent_metadata': {
675
- 'participating_agents': list(agent_results.keys()),
676
- 'analysis_timestamp': datetime.datetime.now().isoformat()
677
- }
678
- }
679
-
680
- return synthesis
681
-
682
- def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str], predictive_actions: List[str]) -> List[str]:
683
- all_actions = detection_actions + diagnosis_actions + predictive_actions
684
- seen = set()
685
- unique_actions = []
686
- for action in all_actions:
687
- if action not in seen:
688
- seen.add(action)
689
- unique_actions.append(action)
690
- return unique_actions[:5]
691
-
692
- # Initialize enhanced components
693
  orchestration_manager = OrchestrationManager()
 
694
 
 
695
  class EnhancedReliabilityEngine:
 
 
696
  def __init__(self):
697
  self.performance_metrics = {
698
  'total_incidents_processed': 0,
699
- 'multi_agent_analyses': 0
 
700
  }
 
 
701
 
702
- async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
703
- throughput: float = 1000, cpu_util: float = None,
704
- memory_util: float = None) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
 
706
  event = ReliabilityEvent(
707
  component=component,
708
  latency_p99=latency,
@@ -713,10 +671,13 @@ class EnhancedReliabilityEngine:
713
  upstream_deps=["auth-service", "database"] if component == "api-service" else []
714
  )
715
 
 
716
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
717
 
 
718
  is_anomaly = anomaly_detector.detect_anomaly(event)
719
 
 
720
  agent_confidence = 0.0
721
  if agent_analysis and 'incident_summary' in agent_analysis:
722
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
@@ -732,18 +693,23 @@ class EnhancedReliabilityEngine:
732
  else:
733
  event.severity = EventSeverity.LOW
734
 
 
735
  healing_actions = policy_engine.evaluate_policies(event)
736
 
 
737
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
738
 
739
- if index is not None and is_anomaly:
740
- analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
741
- vector_text = f"{component} {latency} {error_rate} {analysis_text}"
742
- vec = model.encode([vector_text])
743
- index.add(np.array(vec, dtype=np.float32))
744
- incident_texts.append(vector_text)
745
- save_index()
 
 
746
 
 
747
  result = {
748
  "timestamp": event.timestamp,
749
  "component": component,
@@ -755,69 +721,61 @@ class EnhancedReliabilityEngine:
755
  "healing_actions": [action.value for action in healing_actions],
756
  "business_impact": business_impact,
757
  "severity": event.severity.value,
758
- "similar_incidents_count": len(incident_texts) if is_anomaly else 0,
759
  "processing_metadata": {
760
  "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
761
  "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
762
  }
763
  }
764
 
765
- events_history.append(event)
766
- self.performance_metrics['total_incidents_processed'] += 1
767
- self.performance_metrics['multi_agent_analyses'] += 1
 
 
 
 
 
 
 
 
768
 
769
  return result
770
 
771
  # Initialize enhanced engine
772
  enhanced_engine = EnhancedReliabilityEngine()
773
 
774
- def call_huggingface_analysis(prompt: str) -> str:
775
- if not HF_TOKEN:
776
- fallback_insights = [
777
- "High latency detected - possible resource contention or network issues",
778
- "Error rate increase suggests recent deployment instability",
779
- "Latency spike correlates with increased user traffic patterns",
780
- "Intermittent failures indicate potential dependency service degradation",
781
- "Performance degradation detected - consider scaling compute resources"
782
- ]
783
- import random
784
- return random.choice(fallback_insights)
785
-
786
- try:
787
- enhanced_prompt = f"""
788
- As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:
789
-
790
- {prompt}
791
-
792
- Focus on:
793
- - Potential infrastructure or application issues
794
- - Correlation between metrics
795
- - Business impact assessment
796
- - Recommended investigation areas
797
-
798
- Provide 1-2 sentences maximum with actionable insights.
799
- """
800
-
801
- payload = {
802
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
803
- "prompt": enhanced_prompt,
804
- "max_tokens": 150,
805
- "temperature": 0.4,
806
- }
807
- response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=15)
808
- if response.status_code == 200:
809
- result = response.json()
810
- analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
811
- if analysis_text and len(analysis_text) > 10:
812
- return analysis_text.split('\n')[0]
813
- return analysis_text
814
- else:
815
- return f"API Error {response.status_code}: Service temporarily unavailable"
816
- except Exception as e:
817
- return f"Analysis service error: {str(e)}"
818
 
819
  # === Enhanced UI with Multi-Agent Insights ===
820
  def create_enhanced_ui():
 
 
821
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
822
  gr.Markdown("""
823
  # 🧠 Enterprise Agentic Reliability Framework
@@ -838,12 +796,12 @@ def create_enhanced_ui():
838
  latency = gr.Slider(
839
  minimum=10, maximum=1000, value=100, step=1,
840
  label="Latency P99 (ms)",
841
- info="Alert threshold: >150ms (adaptive)"
842
  )
843
  error_rate = gr.Slider(
844
  minimum=0, maximum=0.5, value=0.02, step=0.001,
845
  label="Error Rate",
846
- info="Alert threshold: >0.05"
847
  )
848
  throughput = gr.Number(
849
  value=1000,
@@ -924,20 +882,40 @@ def create_enhanced_ui():
924
 
925
  gr.Markdown("\n\n".join(policy_info))
926
 
927
- async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
 
 
928
  try:
 
929
  latency = float(latency)
930
  error_rate = float(error_rate)
931
  throughput = float(throughput) if throughput else 1000
932
  cpu_util = float(cpu_util) if cpu_util else None
933
  memory_util = float(memory_util) if memory_util else None
934
 
935
- result = await enhanced_engine.process_event_enhanced(
936
- component, latency, error_rate, throughput, cpu_util, memory_util
937
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
 
 
939
  table_data = []
940
- for event in events_history[-15:]:
941
  table_data.append([
942
  event.timestamp[:19],
943
  event.component,
@@ -945,31 +923,33 @@ def create_enhanced_ui():
945
  f"{event.error_rate:.3f}",
946
  event.throughput,
947
  event.severity.value.upper(),
948
- "Multi-agent analysis" if 'multi_agent_analysis' in result else 'N/A'
949
  ])
950
 
 
951
  status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
952
- output_msg = f"{status_emoji} {result['status']}"
953
 
954
  if "multi_agent_analysis" in result:
955
  analysis = result["multi_agent_analysis"]
956
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
957
- output_msg += f"\n🎯 Confidence: {confidence*100:.1f}%"
958
 
959
  predictive_data = analysis.get('predictive_insights', {})
960
  if predictive_data.get('critical_risk_count', 0) > 0:
961
- output_msg += f"\n🔮 PREDICTIVE: {predictive_data['critical_risk_count']} critical risks forecast"
962
 
963
  if analysis.get('recommended_actions'):
964
- output_msg += f"\n💡 Insights: {', '.join(analysis['recommended_actions'][:2])}"
 
965
 
966
  if result["business_impact"]:
967
  impact = result["business_impact"]
968
- output_msg += f"\n💰 Business Impact: ${impact['revenue_loss_estimate']} | 👥 {impact['affected_users_estimate']} users | 🚨 {impact['severity_level']}"
969
 
970
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
971
  actions = ", ".join(result["healing_actions"])
972
- output_msg += f"\n🔧 Auto-Actions: {actions}"
973
 
974
  agent_insights_data = result.get("multi_agent_analysis", {})
975
  predictive_insights_data = agent_insights_data.get('predictive_insights', {})
@@ -985,11 +965,18 @@ def create_enhanced_ui():
985
  )
986
  )
987
 
 
 
 
 
988
  except Exception as e:
989
- return f"โŒ Error processing event: {str(e)}", {}, {}, gr.Dataframe(value=[])
 
 
990
 
 
991
  submit_btn.click(
992
- fn=submit_event_enhanced,
993
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
994
  outputs=[output_text, agent_insights, predictive_insights, events_table]
995
  )
@@ -997,9 +984,22 @@ def create_enhanced_ui():
997
  return demo
998
 
999
  if __name__ == "__main__":
 
 
 
 
1000
  demo = create_enhanced_ui()
 
 
1001
  demo.launch(
1002
  server_name="0.0.0.0",
1003
  server_port=7860,
1004
  share=False
1005
- )
 
 
 
 
 
 
 
 
1
+ """
2
+ Enterprise Agentic Reliability Framework - Main Application
3
+ Multi-Agent AI System for Production Reliability Monitoring
4
+
5
+ This module provides the main Gradio UI and orchestrates the reliability
6
+ monitoring system with anomaly detection, predictive analytics, and auto-healing.
7
+ """
8
+
9
  import os
10
  import json
11
  import numpy as np
 
13
  import requests
14
  import pandas as pd
15
  import datetime
16
+ import threading
17
+ import logging
18
+ from typing import List, Dict, Any, Optional, Tuple
19
+ from collections import deque
20
+ from dataclasses import dataclass, asdict
21
  import hashlib
22
  import asyncio
 
 
23
 
24
  # Import our modules
25
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
26
  from healing_policies import PolicyEngine
27
+ from agent_orchestrator import OrchestrationManager
28
+
29
+ # === Logging Configuration ===
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33
+ )
34
+ logger = logging.getLogger(__name__)
35
 
36
  # === Configuration ===
37
class Config:
    """Centralized configuration for the reliability framework.

    Values are class attributes read through the module-level ``config``
    instance; nothing here is mutated at runtime by this class itself.
    """
    # Hugging Face API credentials; empty string when the env var is unset.
    HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
    HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"

    # Vector storage: FAISS index file plus a parallel JSON list of incident texts.
    VECTOR_DIM: int = 384  # embedding dimension expected from the sentence-transformer model
    INDEX_FILE: str = "incident_vectors.index"
    TEXTS_FILE: str = "incident_texts.json"

    # Thresholds -- latency in milliseconds; rates and utilizations are fractions in [0, 1].
    LATENCY_WARNING: float = 150.0
    LATENCY_CRITICAL: float = 300.0
    ERROR_RATE_WARNING: float = 0.05
    ERROR_RATE_CRITICAL: float = 0.15
    CPU_WARNING: float = 0.8
    CPU_CRITICAL: float = 0.9
    MEMORY_WARNING: float = 0.8
    MEMORY_CRITICAL: float = 0.9

    # Performance tuning
    HISTORY_WINDOW: int = 50        # telemetry points retained per service (deque maxlen)
    MAX_EVENTS_STORED: int = 1000   # cap for the global ThreadSafeEventStore
    AGENT_TIMEOUT: int = 10         # presumably seconds -- not consumed in this module's visible code; TODO confirm
    CACHE_EXPIRY_MINUTES: int = 15  # TTL for the prediction cache
62
+
63
+ config = Config()
64
+
65
+ HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
66
+
67
+ # === Thread-Safe Data Structures ===
68
class ThreadSafeEventStore:
    """Bounded, thread-safe collection of reliability events.

    Backed by a ``deque`` with a maximum length, so the oldest events are
    discarded automatically once the cap is reached. All access goes
    through an RLock.
    """

    def __init__(self, max_size: int = config.MAX_EVENTS_STORED):
        self._buffer = deque(maxlen=max_size)
        self._guard = threading.RLock()
        logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")

    def add(self, event: ReliabilityEvent) -> None:
        """Append one event under the lock."""
        with self._guard:
            self._buffer.append(event)
            logger.debug(f"Added event for {event.component}: {event.severity.value}")

    def get_recent(self, n: int = 15) -> List[ReliabilityEvent]:
        """Return up to the last *n* events (oldest first)."""
        with self._guard:
            if not self._buffer:
                return []
            return list(self._buffer)[-n:]

    def get_all(self) -> List[ReliabilityEvent]:
        """Return a snapshot copy of every stored event."""
        with self._guard:
            return list(self._buffer)

    def count(self) -> int:
        """Return how many events are currently stored."""
        with self._guard:
            return len(self._buffer)
96
+
97
class ThreadSafeFAISSIndex:
    """Thread-safe wrapper for FAISS index operations with batching.

    Vector/text pairs are buffered and flushed into the FAISS index in
    batches of 10. Disk writes are throttled to one per ``save_interval``,
    except through ``force_save()``, which always persists.
    """

    def __init__(self, index, texts: List[str]):
        self.index = index          # FAISS index (e.g. IndexFlatL2)
        self.texts = texts          # text payloads, kept parallel to index rows
        self._lock = threading.RLock()
        self.last_save = datetime.datetime.now()
        self.save_interval = datetime.timedelta(seconds=30)
        self.pending_vectors = []   # vectors queued but not yet in the index
        self.pending_texts = []     # texts parallel to pending_vectors
        logger.info(f"Initialized ThreadSafeFAISSIndex with {len(texts)} existing vectors")

    def add(self, vector: np.ndarray, text: str) -> None:
        """Queue a vector/text pair; flush to the index once 10 are pending."""
        with self._lock:
            self.pending_vectors.append(vector)
            self.pending_texts.append(text)

            # Flush if we have enough pending
            if len(self.pending_vectors) >= 10:
                self._flush()

    def _flush(self) -> None:
        """Move pending vectors into the FAISS index (caller must hold the lock).

        On failure the pending buffers are left intact so a later flush can retry.
        """
        if not self.pending_vectors:
            return

        try:
            vectors = np.vstack(self.pending_vectors)
            self.index.add(vectors)
            self.texts.extend(self.pending_texts)

            logger.info(f"Flushed {len(self.pending_vectors)} vectors to FAISS index")

            self.pending_vectors = []
            self.pending_texts = []

            # Throttled persistence: only hit the disk once per save_interval.
            if datetime.datetime.now() - self.last_save > self.save_interval:
                self._save()
        except Exception as e:
            logger.error(f"Error flushing vectors: {e}", exc_info=True)

    def _save(self) -> None:
        """Write the index and its texts to disk (best effort, errors logged)."""
        try:
            import faiss
            faiss.write_index(self.index, config.INDEX_FILE)
            with open(config.TEXTS_FILE, "w") as f:
                json.dump(self.texts, f)
            self.last_save = datetime.datetime.now()
            logger.info(f"Saved FAISS index with {len(self.texts)} vectors")
        except Exception as e:
            logger.error(f"Error saving index: {e}", exc_info=True)

    def get_count(self) -> int:
        """Total vectors, including those still waiting to be flushed."""
        with self._lock:
            return len(self.texts) + len(self.pending_texts)

    def force_save(self) -> None:
        """Flush pending vectors and persist to disk immediately.

        BUGFIX: previously this only called _flush(), whose disk write is
        throttled by save_interval -- a shutdown within 30s of the last save
        silently lost the most recent vectors. _save() is now unconditional.
        """
        with self._lock:
            self._flush()
            self._save()  # bypass the save_interval throttle on explicit request
162
 
163
# === FAISS & Embeddings Setup ===
# Builds the embedding model, the FAISS index and its thread-safe wrapper.
# On any failure, everything degrades to None and vector storage is disabled.
try:
    from sentence_transformers import SentenceTransformer
    import faiss

    logger.info("Loading SentenceTransformer model...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    logger.info("SentenceTransformer model loaded successfully")

    # Default to a fresh, empty index; only adopt the on-disk one when it
    # passes every consistency check below.
    index = faiss.IndexFlatL2(config.VECTOR_DIM)
    incident_texts = []

    if os.path.exists(config.INDEX_FILE):
        logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
        loaded_index = faiss.read_index(config.INDEX_FILE)

        if loaded_index.d != config.VECTOR_DIM:
            logger.warning(f"Index dimension mismatch: {loaded_index.d} != {config.VECTOR_DIM}. Creating new index.")
        elif not os.path.exists(config.TEXTS_FILE):
            # BUGFIX: previously a missing texts file raised FileNotFoundError
            # here and the generic except below disabled vector storage entirely.
            logger.warning(f"{config.TEXTS_FILE} missing for existing index. Creating new index.")
        else:
            with open(config.TEXTS_FILE, "r") as f:
                loaded_texts = json.load(f)
            if loaded_index.ntotal != len(loaded_texts):
                # Index rows and text payloads must stay parallel to be usable.
                logger.warning(f"Index/text count mismatch: {loaded_index.ntotal} != {len(loaded_texts)}. Creating new index.")
            else:
                index = loaded_index
                incident_texts = loaded_texts
                logger.info(f"Loaded {len(incident_texts)} incident texts")
    else:
        logger.info("Creating new FAISS index")

    thread_safe_index = ThreadSafeFAISSIndex(index, incident_texts)

except ImportError as e:
    logger.warning(f"FAISS or SentenceTransformers not available: {e}")
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
except Exception as e:
    logger.error(f"Error initializing FAISS: {e}", exc_info=True)
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
204
 
205
  # === Predictive Models ===
206
@dataclass
class ForecastResult:
    """Single-metric forecast produced by SimplePredictiveEngine."""
    metric: str                  # metric name, e.g. "latency" -- set by the individual _forecast_* helpers
    predicted_value: float       # forecast value at the lookahead horizon
    confidence: float
    trend: str  # "increasing", "decreasing", "stable"
    time_to_threshold: Optional[datetime.timedelta] = None  # ETA to the critical threshold, when computable
    risk_level: str = "low"  # low, medium, high, critical
215
 
216
  class SimplePredictiveEngine:
217
  """Lightweight forecasting engine optimized for Hugging Face Spaces"""
218
 
219
    def __init__(self, history_window: int = config.HISTORY_WINDOW):
        """Set up per-service telemetry buffers and the forecast cache.

        Args:
            history_window: maximum telemetry points retained per service.
        """
        self.history_window = history_window
        # service name -> deque of telemetry dicts, bounded by history_window
        self.service_history: Dict[str, deque] = {}
        # "<service>_<metric>" -> (ForecastResult, time it was cached)
        self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
        self.max_cache_age = datetime.timedelta(minutes=config.CACHE_EXPIRY_MINUTES)
        self._lock = threading.RLock()
        logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
226
+
227
    def add_telemetry(self, service: str, event_data: Dict) -> None:
        """Add telemetry data to service history.

        Args:
            service: component/service name the sample belongs to.
            event_data: raw metrics dict; missing numeric fields default to 0,
                while cpu_util/memory_util default to None when absent.
        """
        with self._lock:
            # Lazily create the bounded per-service buffer on first sample.
            if service not in self.service_history:
                self.service_history[service] = deque(maxlen=self.history_window)

            telemetry_point = {
                'timestamp': datetime.datetime.now(),
                'latency': event_data.get('latency_p99', 0),
                'error_rate': event_data.get('error_rate', 0),
                'throughput': event_data.get('throughput', 0),
                'cpu_util': event_data.get('cpu_util'),
                'memory_util': event_data.get('memory_util')
            }

            self.service_history[service].append(telemetry_point)

            # Clean expired cache entries while we already hold the lock.
            self._clean_cache()
246
+
247
+ def _clean_cache(self) -> None:
248
+ """Remove expired entries from prediction cache"""
249
+ now = datetime.datetime.now()
250
+ expired = [k for k, (_, ts) in self.prediction_cache.items()
251
+ if now - ts > self.max_cache_age]
252
+ for k in expired:
253
+ del self.prediction_cache[k]
254
+
255
+ if expired:
256
+ logger.debug(f"Cleaned {len(expired)} expired cache entries")
257
 
258
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
259
  """Forecast service health metrics"""
260
+ with self._lock:
261
+ if service not in self.service_history or len(self.service_history[service]) < 10:
262
+ return []
263
+
264
+ history = list(self.service_history[service])
265
 
 
266
  forecasts = []
267
 
268
  # Forecast latency
 
280
  forecasts.extend(resource_forecasts)
281
 
282
  # Cache results
283
+ with self._lock:
284
+ for forecast in forecasts:
285
+ cache_key = f"{service}_{forecast.metric}"
286
+ self.prediction_cache[cache_key] = (forecast, datetime.datetime.now())
287
 
288
  return forecasts
289
 
290
+ def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
291
  """Forecast latency using linear regression and trend analysis"""
292
  try:
293
  latencies = [point['latency'] for point in history[-20:]]
 
310
  # Determine trend
311
  if slope > 5:
312
  trend = "increasing"
313
+ risk = "high" if predicted_latency > config.LATENCY_CRITICAL else "medium"
314
  elif slope < -2:
315
  trend = "decreasing"
316
  risk = "low"
 
321
  # Calculate time to reach critical threshold (500ms)
322
  time_to_critical = None
323
  if slope > 0 and predicted_latency < 500:
324
+ denominator = predicted_latency - latencies[-1]
325
+ if abs(denominator) > 0.1: # Avoid division by very small numbers
326
+ time_to_critical = datetime.timedelta(
327
+ minutes=lookahead_minutes * (500 - predicted_latency) / denominator
328
+ )
329
 
330
  return ForecastResult(
331
  metric="latency",
 
337
  )
338
 
339
  except Exception as e:
340
+ logger.error(f"Latency forecast error: {e}", exc_info=True)
341
  return None
342
 
343
+ def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
344
  """Forecast error rate using exponential smoothing"""
345
  try:
346
  error_rates = [point['error_rate'] for point in history[-15:]]
 
381
  )
382
 
383
  except Exception as e:
384
+ logger.error(f"Error rate forecast error: {e}", exc_info=True)
385
  return None
386
 
387
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
 
396
  trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
397
 
398
  risk = "low"
399
+ if predicted_cpu > config.CPU_CRITICAL:
400
+ risk = "critical"
401
+ elif predicted_cpu > config.CPU_WARNING:
402
+ risk = "high"
403
  elif predicted_cpu > 0.7:
404
  risk = "medium"
405
 
 
411
  risk_level=risk
412
  ))
413
  except Exception as e:
414
+ logger.error(f"CPU forecast error: {e}", exc_info=True)
415
 
416
  # Memory forecast
417
  memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
 
421
  trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
422
 
423
  risk = "low"
424
+ if predicted_memory > config.MEMORY_CRITICAL:
425
+ risk = "critical"
426
+ elif predicted_memory > config.MEMORY_WARNING:
427
+ risk = "high"
428
  elif predicted_memory > 0.7:
429
  risk = "medium"
430
 
 
436
  risk_level=risk
437
  ))
438
  except Exception as e:
439
+ logger.error(f"Memory forecast error: {e}", exc_info=True)
440
 
441
  return forecasts
442
 
 
470
 
471
  return {
472
  'service': service,
473
+ 'forecasts': [asdict(f) for f in forecasts],
474
  'warnings': warnings[:3],
475
  'recommendations': list(dict.fromkeys(recommendations))[:3],
476
  'critical_risk_count': len(critical_risks),
 
479
 
480
  # === Core Engine Components ===
481
  policy_engine = PolicyEngine()
482
+ events_history_store = ThreadSafeEventStore()
483
+ predictive_engine = SimplePredictiveEngine()
484
 
485
  class BusinessImpactCalculator:
486
  """Calculate business impact of anomalies"""
487
 
488
  def __init__(self, revenue_per_request: float = 0.01):
489
  self.revenue_per_request = revenue_per_request
490
+ logger.info(f"Initialized BusinessImpactCalculator with revenue_per_request={revenue_per_request}")
491
 
492
  def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
493
+ """
494
+ Calculate business impact for a reliability event
495
+
496
+ Args:
497
+ event: The reliability event to analyze
498
+ duration_minutes: Assumed duration of the incident
499
+
500
+ Returns:
501
+ Dictionary containing impact estimates
502
+ """
503
  base_revenue_per_minute = 100
504
 
505
  impact_multiplier = 1.0
506
 
507
+ if event.latency_p99 > config.LATENCY_CRITICAL:
508
  impact_multiplier += 0.5
509
  if event.error_rate > 0.1:
510
  impact_multiplier += 0.8
511
+ if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
512
  impact_multiplier += 0.3
513
 
514
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
 
526
  else:
527
  severity = "LOW"
528
 
529
+ logger.info(f"Business impact calculated: ${revenue_loss:.2f} revenue loss, {affected_users} users affected, {severity} severity")
530
+
531
  return {
532
  'revenue_loss_estimate': round(revenue_loss, 2),
533
  'affected_users_estimate': affected_users,
 
541
  """Enhanced anomaly detection with adaptive thresholds"""
542
 
543
  def __init__(self):
544
+ self.historical_data = deque(maxlen=100)
545
  self.adaptive_thresholds = {
546
+ 'latency_p99': config.LATENCY_WARNING,
547
+ 'error_rate': config.ERROR_RATE_WARNING
548
  }
549
+ self._lock = threading.RLock()
550
+ logger.info("Initialized AdvancedAnomalyDetector")
551
 
552
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
553
+ """
554
+ Detect if event is anomalous
 
 
 
 
 
 
 
 
555
 
556
+ Args:
557
+ event: The reliability event to check
558
+
559
+ Returns:
560
+ True if anomaly detected, False otherwise
561
+ """
562
+ with self._lock:
563
+ latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
564
+ error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
565
+
566
+ resource_anomaly = False
567
+ if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
568
+ resource_anomaly = True
569
+ if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
570
+ resource_anomaly = True
571
+
572
+ self._update_thresholds(event)
573
+
574
+ is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
575
+
576
+ if is_anomaly:
577
+ logger.info(f"Anomaly detected for {event.component}: latency={latency_anomaly}, error={error_anomaly}, resource={resource_anomaly}")
578
+
579
+ return is_anomaly
580
 
581
+ def _update_thresholds(self, event: ReliabilityEvent) -> None:
582
+ """Update adaptive thresholds based on historical data"""
583
  self.historical_data.append(event)
584
 
 
 
 
585
  if len(self.historical_data) > 10:
586
+ recent_latencies = [e.latency_p99 for e in list(self.historical_data)[-20:]]
587
+ new_threshold = np.percentile(recent_latencies, 90)
588
+ self.adaptive_thresholds['latency_p99'] = new_threshold
589
+ logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
590
 
591
  anomaly_detector = AdvancedAnomalyDetector()
592
 
593
+ # === Predictive Agent Integration ===
594
+ class PredictiveAgent:
595
+ """Predictive agent that uses SimplePredictiveEngine"""
 
 
 
 
 
 
596
 
 
 
 
 
597
    def __init__(self):
        # Shares the module-level SimplePredictiveEngine instance rather
        # than owning a private one, so telemetry added elsewhere is visible.
        self.engine = predictive_engine
        logger.info("Initialized PredictiveAgent")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
 
601
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
602
  """Predictive analysis for future risks"""
 
612
  insights = self.engine.get_predictive_insights(event.component)
613
 
614
  return {
615
+ 'specialization': 'predictive_analytics',
616
  'confidence': 0.8 if insights['critical_risk_count'] > 0 else 0.5,
617
  'findings': insights,
618
  'recommendations': insights['recommendations']
619
  }
620
 
621
+ # Initialize orchestration with predictive agent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
622
  orchestration_manager = OrchestrationManager()
623
+ orchestration_manager.agents['predictive_analytics'] = PredictiveAgent()
624
 
625
+ # === Enhanced Reliability Engine ===
626
  class EnhancedReliabilityEngine:
627
+ """Main engine for processing reliability events"""
628
+
629
    def __init__(self):
        """Initialize processing counters and their guarding lock."""
        # Counters are incremented under self._lock in process_event_enhanced.
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'anomalies_detected': 0
        }
        self._lock = threading.RLock()
        logger.info("Initialized EnhancedReliabilityEngine")
637
 
638
+ async def process_event_enhanced(
639
+ self,
640
+ component: str,
641
+ latency: float,
642
+ error_rate: float,
643
+ throughput: float = 1000,
644
+ cpu_util: Optional[float] = None,
645
+ memory_util: Optional[float] = None
646
+ ) -> Dict[str, Any]:
647
+ """
648
+ Process a reliability event through the multi-agent system
649
+
650
+ Args:
651
+ component: Service component name
652
+ latency: P99 latency in milliseconds
653
+ error_rate: Error rate (0-1)
654
+ throughput: Requests per second
655
+ cpu_util: CPU utilization (0-1)
656
+ memory_util: Memory utilization (0-1)
657
+
658
+ Returns:
659
+ Dictionary containing analysis results
660
+ """
661
+ logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
662
 
663
+ # Create event
664
  event = ReliabilityEvent(
665
  component=component,
666
  latency_p99=latency,
 
671
  upstream_deps=["auth-service", "database"] if component == "api-service" else []
672
  )
673
 
674
+ # Multi-agent analysis
675
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
676
 
677
+ # Anomaly detection
678
  is_anomaly = anomaly_detector.detect_anomaly(event)
679
 
680
+ # Determine severity based on agent confidence
681
  agent_confidence = 0.0
682
  if agent_analysis and 'incident_summary' in agent_analysis:
683
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
 
693
  else:
694
  event.severity = EventSeverity.LOW
695
 
696
+ # Evaluate healing policies
697
  healing_actions = policy_engine.evaluate_policies(event)
698
 
699
+ # Calculate business impact
700
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
701
 
702
+ # Store in vector database
703
+ if thread_safe_index is not None and model is not None and is_anomaly:
704
+ try:
705
+ analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
706
+ vector_text = f"{component} {latency} {error_rate} {analysis_text}"
707
+ vec = model.encode([vector_text])
708
+ thread_safe_index.add(np.array(vec, dtype=np.float32), vector_text)
709
+ except Exception as e:
710
+ logger.error(f"Error storing vector: {e}", exc_info=True)
711
 
712
+ # Build result
713
  result = {
714
  "timestamp": event.timestamp,
715
  "component": component,
 
721
  "healing_actions": [action.value for action in healing_actions],
722
  "business_impact": business_impact,
723
  "severity": event.severity.value,
724
+ "similar_incidents_count": thread_safe_index.get_count() if thread_safe_index and is_anomaly else 0,
725
  "processing_metadata": {
726
  "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
727
  "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
728
  }
729
  }
730
 
731
+ # Store event
732
+ events_history_store.add(event)
733
+
734
+ # Update metrics
735
+ with self._lock:
736
+ self.performance_metrics['total_incidents_processed'] += 1
737
+ self.performance_metrics['multi_agent_analyses'] += 1
738
+ if is_anomaly:
739
+ self.performance_metrics['anomalies_detected'] += 1
740
+
741
+ logger.info(f"Event processed: {result['status']} with {result['severity']} severity")
742
 
743
  return result
744
 
745
  # Initialize enhanced engine
746
  enhanced_engine = EnhancedReliabilityEngine()
747
 
748
+ # === Input Validation ===
749
def validate_inputs(
    latency: float,
    error_rate: float,
    throughput: float,
    cpu_util: Optional[float],
    memory_util: Optional[float]
) -> Tuple[bool, str]:
    """Validate user-supplied telemetry values.

    Args:
        latency: P99 latency in ms, allowed 0-10000.
        error_rate: fraction in [0, 1].
        throughput: requests/second, must be non-negative.
        cpu_util: optional fraction in [0, 1].
        memory_util: optional fraction in [0, 1].

    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid.
    """
    # Checks run in declaration order; the first failure wins.
    checks = [
        (0 <= latency <= 10000,
         "โŒ Invalid latency: must be between 0-10000ms"),
        (0 <= error_rate <= 1,
         "โŒ Invalid error rate: must be between 0-1"),
        (throughput >= 0,
         "โŒ Invalid throughput: must be positive"),
        (cpu_util is None or 0 <= cpu_util <= 1,
         "โŒ Invalid CPU utilization: must be between 0-1"),
        (memory_util is None or 0 <= memory_util <= 1,
         "โŒ Invalid memory utilization: must be between 0-1"),
    ]
    for passed, message in checks:
        if not passed:
            return False, message

    return True, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
 
775
  # === Enhanced UI with Multi-Agent Insights ===
776
  def create_enhanced_ui():
777
+ """Create the Gradio UI for the reliability framework"""
778
+
779
  with gr.Blocks(title="๐Ÿง  Enterprise Agentic Reliability Framework", theme="soft") as demo:
780
  gr.Markdown("""
781
  # ๐Ÿง  Enterprise Agentic Reliability Framework
 
796
  latency = gr.Slider(
797
  minimum=10, maximum=1000, value=100, step=1,
798
  label="Latency P99 (ms)",
799
+ info=f"Alert threshold: >{config.LATENCY_WARNING}ms (adaptive)"
800
  )
801
  error_rate = gr.Slider(
802
  minimum=0, maximum=0.5, value=0.02, step=0.001,
803
  label="Error Rate",
804
+ info=f"Alert threshold: >{config.ERROR_RATE_WARNING}"
805
  )
806
  throughput = gr.Number(
807
  value=1000,
 
882
 
883
  gr.Markdown("\n\n".join(policy_info))
884
 
885
+ # โœ… FIXED: Synchronous wrapper for async function
886
+ def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
887
+ """Synchronous wrapper for async event processing - FIXES GRADIO ASYNC ISSUE"""
888
  try:
889
+ # Type conversion
890
  latency = float(latency)
891
  error_rate = float(error_rate)
892
  throughput = float(throughput) if throughput else 1000
893
  cpu_util = float(cpu_util) if cpu_util else None
894
  memory_util = float(memory_util) if memory_util else None
895
 
896
+ # Input validation
897
+ is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
898
+ if not is_valid:
899
+ logger.warning(f"Invalid input: {error_msg}")
900
+ return error_msg, {}, {}, gr.Dataframe(value=[])
901
+
902
+ # Create new event loop for async execution
903
+ loop = asyncio.new_event_loop()
904
+ asyncio.set_event_loop(loop)
905
+
906
+ try:
907
+ # Call async function
908
+ result = loop.run_until_complete(
909
+ enhanced_engine.process_event_enhanced(
910
+ component, latency, error_rate, throughput, cpu_util, memory_util
911
+ )
912
+ )
913
+ finally:
914
+ loop.close()
915
 
916
+ # Build table data
917
  table_data = []
918
+ for event in events_history_store.get_recent(15):
919
  table_data.append([
920
  event.timestamp[:19],
921
  event.component,
 
923
  f"{event.error_rate:.3f}",
924
  event.throughput,
925
  event.severity.value.upper(),
926
+ "Multi-agent analysis"
927
  ])
928
 
929
+ # Format output message
930
  status_emoji = "๐Ÿšจ" if result["status"] == "ANOMALY" else "โœ…"
931
+ output_msg = f"{status_emoji} **{result['status']}**"
932
 
933
  if "multi_agent_analysis" in result:
934
  analysis = result["multi_agent_analysis"]
935
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
936
+ output_msg += f"\n๐ŸŽฏ **Confidence**: {confidence*100:.1f}%"
937
 
938
  predictive_data = analysis.get('predictive_insights', {})
939
  if predictive_data.get('critical_risk_count', 0) > 0:
940
+ output_msg += f"\n๐Ÿ”ฎ **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast"
941
 
942
  if analysis.get('recommended_actions'):
943
+ actions_preview = ', '.join(analysis['recommended_actions'][:2])
944
+ output_msg += f"\n๐Ÿ’ก **Top Insights**: {actions_preview}"
945
 
946
  if result["business_impact"]:
947
  impact = result["business_impact"]
948
+ output_msg += f"\n๐Ÿ’ฐ **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | ๐Ÿ‘ฅ {impact['affected_users_estimate']} users | ๐Ÿšจ {impact['severity_level']}"
949
 
950
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
951
  actions = ", ".join(result["healing_actions"])
952
+ output_msg += f"\n๐Ÿ”ง **Auto-Actions**: {actions}"
953
 
954
  agent_insights_data = result.get("multi_agent_analysis", {})
955
  predictive_insights_data = agent_insights_data.get('predictive_insights', {})
 
965
  )
966
  )
967
 
968
+ except ValueError as e:
969
+ error_msg = f"โŒ Value error: {str(e)}"
970
+ logger.error(error_msg, exc_info=True)
971
+ return error_msg, {}, {}, gr.Dataframe(value=[])
972
  except Exception as e:
973
+ error_msg = f"โŒ Error processing event: {str(e)}"
974
+ logger.error(error_msg, exc_info=True)
975
+ return error_msg, {}, {}, gr.Dataframe(value=[])
976
 
977
+ # โœ… FIXED: Use sync wrapper instead of async function
978
  submit_btn.click(
979
+ fn=submit_event_enhanced_sync,
980
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
981
  outputs=[output_text, agent_insights, predictive_insights, events_table]
982
  )
 
984
  return demo
985
 
986
if __name__ == "__main__":
    logger.info("Starting Enterprise Agentic Reliability Framework...")
    logger.info(f"Total events in history: {events_history_store.count()}")
    logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")

    demo = create_enhanced_ui()

    logger.info("Launching Gradio UI...")
    try:
        # launch() blocks until the server is stopped (typically Ctrl+C).
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False
        )
    finally:
        # BUGFIX: run the shutdown save even when launch() exits via
        # KeyboardInterrupt or an exception -- previously these statements
        # came after launch() with no try/finally, so they were skipped
        # and pending vectors were lost.
        if thread_safe_index:
            logger.info("Saving pending vectors...")
            thread_safe_index.force_save()

        logger.info("Application shutdown complete")