petter2025 committed on
Commit
1f0be8f
Β·
verified Β·
1 Parent(s): 775c77d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +570 -43
app.py CHANGED
@@ -9,36 +9,363 @@ from typing import List, Dict, Any
9
  import hashlib
10
  import asyncio
11
 
12
- # Import enhanced modules
13
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
14
  from healing_policies import PolicyEngine
15
- from agent_orchestrator import OrchestrationManager, AgentSpecialization
16
 
17
  # === Configuration ===
18
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
19
  HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
20
  HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
21
 
22
- # === Initialize Enhanced Components ===
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  policy_engine = PolicyEngine()
24
- orchestration_manager = OrchestrationManager()
25
  events_history: List[ReliabilityEvent] = []
26
 
27
- # [Keep existing FAISS setup and BusinessImpactCalculator...]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  class EnhancedReliabilityEngine:
30
  def __init__(self):
31
  self.performance_metrics = {
32
  'total_incidents_processed': 0,
33
- 'multi_agent_analyses': 0,
34
- 'average_processing_time': 0.0
35
  }
36
 
37
  async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
38
  throughput: float = 1000, cpu_util: float = None,
39
  memory_util: float = None) -> Dict[str, Any]:
40
  """Enhanced event processing with multi-agent orchestration"""
41
- start_time = asyncio.get_event_loop().time()
42
 
43
  # Create event
44
  event = ReliabilityEvent(
@@ -54,15 +381,23 @@ class EnhancedReliabilityEngine:
54
  # Multi-agent analysis
55
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
56
 
 
 
 
57
  # Policy evaluation
58
  healing_actions = policy_engine.evaluate_policies(event)
59
 
60
  # Business impact
61
- business_impact = business_calculator.calculate_impact(event)
62
 
63
- # Update metrics
64
- processing_time = asyncio.get_event_loop().time() - start_time
65
- self._update_performance_metrics(processing_time)
 
 
 
 
 
66
 
67
  # Prepare comprehensive result
68
  result = {
@@ -71,57 +406,249 @@ class EnhancedReliabilityEngine:
71
  "latency_p99": latency,
72
  "error_rate": error_rate,
73
  "throughput": throughput,
74
- "status": "ANOMALY" if agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0) > 0.5 else "NORMAL",
75
  "multi_agent_analysis": agent_analysis,
76
  "healing_actions": [action.value for action in healing_actions],
77
  "business_impact": business_impact,
 
 
78
  "processing_metadata": {
79
- "processing_time_seconds": round(processing_time, 3),
80
  "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
81
  "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
82
  }
83
  }
84
 
85
  events_history.append(event)
 
 
 
86
  return result
87
 
88
  # Initialize enhanced engine
89
  enhanced_engine = EnhancedReliabilityEngine()
90
 
91
- # [Keep existing UI setup, but enhance the submission function...]
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
94
- """Enhanced event submission with async processing"""
95
  try:
96
- # Convert inputs
97
- latency = float(latency)
98
- error_rate = float(error_rate)
99
- throughput = float(throughput) if throughput else 1000
100
- cpu_util = float(cpu_util) if cpu_util else None
101
- memory_util = float(memory_util) if memory_util else None
102
-
103
- # Process with enhanced engine
104
- result = await enhanced_engine.process_event_enhanced(
105
- component, latency, error_rate, throughput, cpu_util, memory_util
106
- )
107
 
108
- # [Keep existing table formatting...]
109
 
110
- # Enhanced output formatting
111
- status_emoji = "🚨" if result["status"] == "ANOMALY" else "βœ…"
112
- output_msg = f"{status_emoji} {result['status']}"
 
 
113
 
114
- # Add multi-agent insights
115
- if "multi_agent_analysis" in result:
116
- analysis = result["multi_agent_analysis"]
117
- output_msg += f"\n🎯 Confidence: {analysis.get('incident_summary', {}).get('anomaly_confidence', 0)*100:.1f}%"
118
-
119
- if analysis.get('recommended_actions'):
120
- output_msg += f"\nπŸ’‘ Insights: {', '.join(analysis['recommended_actions'][:2])}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # [Keep existing business impact and healing actions display...]
 
123
 
124
- return (output_msg, table_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- except Exception as e:
127
- return f"❌ Error processing event: {str(e)}", gr.Dataframe(value=[])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import hashlib
10
  import asyncio
11
 
12
+ # Import our modules
13
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
14
  from healing_policies import PolicyEngine
 
15
 
16
  # === Configuration ===
17
  HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
18
  HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
19
  HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
20
 
21
# === FAISS & Embeddings Setup ===
# Vector memory is optional: when the heavy dependencies are missing or the
# embedding model cannot be loaded, the app degrades gracefully by leaving
# ``model``/``index`` as None (checked before every vector operation).
try:
    from sentence_transformers import SentenceTransformer
    import faiss

    VECTOR_DIM = 384  # embedding size of all-MiniLM-L6-v2
    INDEX_FILE = "incident_vectors.index"
    TEXTS_FILE = "incident_texts.json"

    try:
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    except Exception as e:
        # Previously this retried the *identical* constructor (aliased as ST),
        # which cannot succeed where the first call failed, and a second
        # failure raised a non-ImportError that escaped the outer handler and
        # crashed module import. Disable vector features instead.
        print(f"Model loading warning: {e}")
        model = None

    if model is None:
        index = None
        incident_texts = []
    elif os.path.exists(INDEX_FILE) and os.path.exists(TEXTS_FILE):
        # Resume from the persisted index and its parallel text list.
        # Both files must exist: loading the index without its texts would
        # desynchronize vector ids from incident descriptions.
        index = faiss.read_index(INDEX_FILE)
        with open(TEXTS_FILE, "r") as f:
            incident_texts = json.load(f)
    else:
        index = faiss.IndexFlatL2(VECTOR_DIM)
        incident_texts = []

except ImportError as e:
    print(f"Warning: FAISS or SentenceTransformers not available: {e}")
    index = None
    incident_texts = []
    model = None

def save_index():
    """Persist the FAISS index and its parallel incident-text list to disk."""
    if index is not None:
        faiss.write_index(index, INDEX_FILE)
        with open(TEXTS_FILE, "w") as f:
            json.dump(incident_texts, f)
58
+
59
+ # === Core Engine Components ===
60
  policy_engine = PolicyEngine()
 
61
  events_history: List[ReliabilityEvent] = []
62
 
63
class BusinessImpactCalculator:
    """Calculate business impact (revenue, users, severity) of anomalies."""

    def __init__(self, revenue_per_request: float = 0.01):
        # NOTE(review): currently unused by calculate_impact, which works
        # from a flat per-minute base rate instead — confirm before removing.
        self.revenue_per_request = revenue_per_request

    def calculate_impact(self, event: "ReliabilityEvent", duration_minutes: int = 5) -> Dict[str, Any]:
        """Estimate revenue loss, affected users and a severity label.

        Bug fix: the loss formula previously multiplied the per-minute base
        rate by ``duration_minutes / 60`` (i.e. hours), under-reporting the
        loss 60x and making the $100/$500 severity cut-offs unreachable for
        realistic durations. It now scales by minutes, matching the rate.
        """
        base_revenue_per_minute = 100  # Base revenue per minute for the service

        # Weight the loss by how badly each signal is degraded.
        impact_multiplier = 1.0
        if event.latency_p99 > 300:
            impact_multiplier += 0.5  # High latency impact
        if event.error_rate > 0.1:
            impact_multiplier += 0.8  # High error rate impact
        if event.cpu_util and event.cpu_util > 0.9:
            impact_multiplier += 0.3  # Resource exhaustion impact

        revenue_loss = base_revenue_per_minute * impact_multiplier * duration_minutes

        # Estimate affected users from error rate plus excess latency
        # (latency above the 100ms baseline, saturating at +1.0 per 500ms).
        base_users_affected = 1000  # Base user count
        user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
        affected_users = int(base_users_affected * user_impact_multiplier)

        # Severity classification: whichever of revenue or user impact is
        # worse drives the tier.
        if revenue_loss > 500 or affected_users > 5000:
            severity = "CRITICAL"
        elif revenue_loss > 100 or affected_users > 1000:
            severity = "HIGH"
        elif revenue_loss > 50 or affected_users > 500:
            severity = "MEDIUM"
        else:
            severity = "LOW"

        return {
            'revenue_loss_estimate': round(revenue_loss, 2),
            'affected_users_estimate': affected_users,
            'severity_level': severity,
            'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
        }

business_calculator = BusinessImpactCalculator()
110
+
111
class AdvancedAnomalyDetector:
    """Enhanced anomaly detection with adaptive thresholds."""

    def __init__(self):
        from collections import deque
        # Bounded history: deque(maxlen=100) evicts the oldest event in O(1)
        # instead of the previous O(n) list.pop(0) trimming.
        self.historical_data = deque(maxlen=100)
        # Starting thresholds; latency adapts to recent traffic, the error
        # rate is currently static despite the "adaptive" name (TODO: adapt).
        self.adaptive_thresholds = {
            'latency_p99': 150,  # Will adapt based on history
            'error_rate': 0.05
        }

    def detect_anomaly(self, event: "ReliabilityEvent") -> bool:
        """Return True when the event breaches any threshold.

        Checks latency/error rate against the adaptive thresholds plus a
        fixed 0.9 resource-saturation limit, then folds the event into the
        rolling history used for threshold adaptation.
        """
        latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
        error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']

        # Resource-based anomalies (None/0.0 utilization treated as absent).
        resource_anomaly = False
        if event.cpu_util and event.cpu_util > 0.9:
            resource_anomaly = True
        if event.memory_util and event.memory_util > 0.9:
            resource_anomaly = True

        # Update adaptive thresholds (simplified)
        self._update_thresholds(event)

        return latency_anomaly or error_anomaly or resource_anomaly

    def _update_thresholds(self, event: "ReliabilityEvent"):
        """Update adaptive thresholds based on historical data."""
        self.historical_data.append(event)  # maxlen auto-trims to 100

        # Update latency threshold to the 90th percentile of the last 20
        # events once enough history has accumulated.
        if len(self.historical_data) > 10:
            recent_latencies = [e.latency_p99 for e in list(self.historical_data)[-20:]]
            self.adaptive_thresholds['latency_p99'] = np.percentile(recent_latencies, 90)

anomaly_detector = AdvancedAnomalyDetector()
154
+
155
# === Multi-Agent Foundation ===
from enum import Enum

class AgentSpecialization(Enum):
    """Roles an analysis agent can take in the orchestration layer."""

    # Detects anomalies in incoming telemetry.
    DETECTIVE = "anomaly_detection"
    # Proposes likely root causes for detected anomalies.
    DIAGNOSTICIAN = "root_cause_analysis"
161
+
162
class BaseAgent:
    """Abstract interface shared by all specialist analysis agents."""

    def __init__(self, specialization: AgentSpecialization):
        # Role this agent plays in the orchestration layer.
        self.specialization = specialization

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Produce an analysis dict for *event*; subclasses must override."""
        raise NotImplementedError
168
+
169
class AnomalyDetectionAgent(BaseAgent):
    """Detective agent: scores anomalies and names the metrics driving them."""

    def __init__(self):
        super().__init__(AgentSpecialization.DETECTIVE)

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Enhanced anomaly detection with confidence scoring."""
        anomaly_score = self._calculate_anomaly_score(event)
        # Hoisted: previously computed twice (findings + recommendations).
        affected_metrics = self._identify_affected_metrics(event)

        return {
            'specialization': self.specialization.value,
            'confidence': anomaly_score,
            'findings': {
                'anomaly_score': anomaly_score,
                'severity_tier': self._classify_severity(anomaly_score),
                'primary_metrics_affected': affected_metrics
            },
            'recommendations': [
                f"Investigate {metric} anomalies" for metric in affected_metrics
            ]
        }

    def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
        """Calculate comprehensive anomaly score (0-1)."""
        scores = []

        # Latency anomaly (weighted 40%): saturates at 650ms p99.
        if event.latency_p99 > 150:
            latency_score = min(1.0, (event.latency_p99 - 150) / 500)
            scores.append(0.4 * latency_score)

        # Error rate anomaly (weighted 30%): saturates at a 30% error rate.
        if event.error_rate > 0.05:
            error_score = min(1.0, event.error_rate / 0.3)
            scores.append(0.3 * error_score)

        # Resource anomaly (weighted 30%, split CPU/memory): ramps over 0.8-1.0.
        resource_score = 0
        if event.cpu_util and event.cpu_util > 0.8:
            resource_score += 0.15 * min(1.0, (event.cpu_util - 0.8) / 0.2)
        if event.memory_util and event.memory_util > 0.8:
            resource_score += 0.15 * min(1.0, (event.memory_util - 0.8) / 0.2)
        scores.append(resource_score)

        return min(1.0, sum(scores))

    def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[str]:
        """Identify which metrics are contributing to anomalies."""
        affected = []
        if event.latency_p99 > 150:
            affected.append("latency")
        if event.error_rate > 0.05:
            affected.append("error_rate")
        if event.cpu_util and event.cpu_util > 0.8:
            affected.append("cpu_utilization")
        if event.memory_util and event.memory_util > 0.8:
            affected.append("memory_utilization")
        return affected

    def _classify_severity(self, anomaly_score: float) -> str:
        """Map a 0-1 anomaly score onto a coarse severity tier."""
        if anomaly_score > 0.8:
            return "CRITICAL"
        elif anomaly_score > 0.6:
            return "HIGH"
        elif anomaly_score > 0.4:
            return "MEDIUM"
        else:
            return "LOW"
236
+
237
class RootCauseAgent(BaseAgent):
    """Diagnostician agent: maps metric patterns to likely root causes."""

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Basic root cause analysis."""
        candidate_causes = self._analyze_potential_causes(event)
        findings = {
            'likely_root_causes': candidate_causes,
            'evidence_patterns': self._identify_evidence(event),
            'investigation_priority': self._prioritize_investigation(candidate_causes)
        }
        return {
            'specialization': self.specialization.value,
            'confidence': 0.7,  # Base confidence
            'findings': findings,
            'recommendations': [f"Check {cause} for issues" for cause in candidate_causes[:2]]
        }

    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[str]:
        """Analyze potential root causes based on metrics."""
        # Patterns are matched in priority order; only the first hit
        # contributes its cause pair (if/elif cascade).
        if event.latency_p99 > 300 and event.error_rate > 0.1:
            candidates = ["database_connection_pool", "external_dependency_timeout"]
        elif event.cpu_util and event.cpu_util > 0.9:
            candidates = ["resource_exhaustion", "memory_leak"]
        elif event.error_rate > 0.2:
            candidates = ["recent_deployment", "configuration_change"]
        else:
            candidates = []
        return candidates or ["unknown_cause_requires_investigation"]

    def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
        """Identify evidence patterns."""
        evidence = []
        if event.latency_p99 > event.error_rate * 1000:
            evidence.append("latency_disproportionate_to_errors")
        cpu_hot = event.cpu_util and event.cpu_util > 0.8
        mem_hot = event.memory_util and event.memory_util > 0.8
        if cpu_hot and mem_hot:
            evidence.append("correlated_resource_exhaustion")
        return evidence

    def _prioritize_investigation(self, causes: List[str]) -> str:
        """HIGH when a known-critical cause is suspected, else MEDIUM."""
        critical = {"database_connection_pool", "resource_exhaustion"}
        return "HIGH" if critical.intersection(causes) else "MEDIUM"
290
+
291
class OrchestrationManager:
    """Runs the specialist agents over an event and merges their findings."""

    def __init__(self):
        # One agent instance per specialization; extend here to add agents.
        self.agents = {
            AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
        }

    async def orchestrate_analysis(self, event: "ReliabilityEvent") -> Dict[str, Any]:
        """Coordinate multiple agents for comprehensive analysis.

        Fix: wrap each coroutine in a Task so all agents start immediately
        and run concurrently. Awaiting bare coroutines one at a time (as
        before) executed them sequentially despite the "parallel" comment.
        """
        agent_tasks = {
            spec: asyncio.create_task(agent.analyze(event))
            for spec, agent in self.agents.items()
        }

        # Collect results, dropping any agent that exceeds its 5s budget
        # (wait_for cancels the timed-out task).
        agent_results = {}
        for specialization, task in agent_tasks.items():
            try:
                result = await asyncio.wait_for(task, timeout=5.0)
                agent_results[specialization.value] = result
            except asyncio.TimeoutError:
                continue

        return self._synthesize_agent_findings(event, agent_results)

    def _synthesize_agent_findings(self, event: "ReliabilityEvent", agent_results: Dict) -> Dict[str, Any]:
        """Combine insights from all specialized agents."""
        detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
        diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)

        # The detective is mandatory: without an anomaly assessment there is
        # nothing to synthesize.
        if not detective_result:
            return {'error': 'No agent results available'}

        synthesis = {
            'incident_summary': {
                'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                'anomaly_confidence': detective_result['confidence'],
                'primary_metrics_affected': detective_result['findings'].get('primary_metrics_affected', [])
            },
            # Diagnostician findings are optional (it may have timed out).
            'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
            'recommended_actions': self._prioritize_actions(
                detective_result.get('recommendations', []),
                diagnostician_result.get('recommendations', []) if diagnostician_result else []
            ),
            'agent_metadata': {
                'participating_agents': list(agent_results.keys()),
                'analysis_timestamp': datetime.datetime.now().isoformat()
            }
        }

        return synthesis

    def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str]) -> List[str]:
        """Combine and prioritize actions from different agents.

        Detection actions come first, then diagnosis; duplicates are dropped
        while preserving order, and the list is capped at four items.
        """
        all_actions = detection_actions + diagnosis_actions
        seen = set()
        unique_actions = []
        for action in all_actions:
            if action not in seen:
                seen.add(action)
                unique_actions.append(action)
        return unique_actions[:4]
354
+
355
+ # Initialize enhanced components
356
+ orchestration_manager = OrchestrationManager()
357
 
358
  class EnhancedReliabilityEngine:
359
  def __init__(self):
360
  self.performance_metrics = {
361
  'total_incidents_processed': 0,
362
+ 'multi_agent_analyses': 0
 
363
  }
364
 
365
  async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
366
  throughput: float = 1000, cpu_util: float = None,
367
  memory_util: float = None) -> Dict[str, Any]:
368
  """Enhanced event processing with multi-agent orchestration"""
 
369
 
370
  # Create event
371
  event = ReliabilityEvent(
 
381
  # Multi-agent analysis
382
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
383
 
384
+ # Traditional detection (for compatibility)
385
+ is_anomaly = anomaly_detector.detect_anomaly(event)
386
+
387
  # Policy evaluation
388
  healing_actions = policy_engine.evaluate_policies(event)
389
 
390
  # Business impact
391
+ business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
392
 
393
+ # Vector memory learning
394
+ if index is not None and is_anomaly:
395
+ analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
396
+ vector_text = f"{component} {latency} {error_rate} {analysis_text}"
397
+ vec = model.encode([vector_text])
398
+ index.add(np.array(vec, dtype=np.float32))
399
+ incident_texts.append(vector_text)
400
+ save_index()
401
 
402
  # Prepare comprehensive result
403
  result = {
 
406
  "latency_p99": latency,
407
  "error_rate": error_rate,
408
  "throughput": throughput,
409
+ "status": "ANOMALY" if is_anomaly else "NORMAL",
410
  "multi_agent_analysis": agent_analysis,
411
  "healing_actions": [action.value for action in healing_actions],
412
  "business_impact": business_impact,
413
+ "severity": event.severity.value,
414
+ "similar_incidents_count": len(incident_texts) if is_anomaly else 0,
415
  "processing_metadata": {
 
416
  "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
417
  "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
418
  }
419
  }
420
 
421
  events_history.append(event)
422
+ self.performance_metrics['total_incidents_processed'] += 1
423
+ self.performance_metrics['multi_agent_analyses'] += 1
424
+
425
  return result
426
 
427
  # Initialize enhanced engine
428
  enhanced_engine = EnhancedReliabilityEngine()
429
 
430
def call_huggingface_analysis(prompt: str) -> str:
    """Use HF Inference API or fallback simulation"""
    # Without a token we cannot reach the Inference API; return a canned
    # insight so the UI still shows something plausible (non-deterministic:
    # random.choice picks a different line per call).
    if not HF_TOKEN:
        fallback_insights = [
            "High latency detected - possible resource contention or network issues",
            "Error rate increase suggests recent deployment instability",
            "Latency spike correlates with increased user traffic patterns",
            "Intermittent failures indicate potential dependency service degradation",
            "Performance degradation detected - consider scaling compute resources"
        ]
        import random
        return random.choice(fallback_insights)

    try:
        # Wrap the raw telemetry prompt with reviewer-style instructions.
        enhanced_prompt = f"""
        As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:

        {prompt}

        Focus on:
        - Potential infrastructure or application issues
        - Correlation between metrics
        - Business impact assessment
        - Recommended investigation areas

        Provide 1-2 sentences maximum with actionable insights.
        """

        # Low temperature for mostly deterministic, short completions.
        payload = {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "prompt": enhanced_prompt,
            "max_tokens": 150,
            "temperature": 0.4,
        }
        response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=15)
        if response.status_code == 200:
            result = response.json()
            analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
            # Keep only the first line of a sufficiently long completion.
            if analysis_text and len(analysis_text) > 10:
                return analysis_text.split('\n')[0]
            # NOTE(review): a short/empty completion is returned as-is,
            # possibly "" — confirm callers tolerate an empty insight.
            return analysis_text
        else:
            return f"API Error {response.status_code}: Service temporarily unavailable"
    except Exception as e:
        # Network errors, timeouts and bad JSON all degrade to a message.
        return f"Analysis service error: {str(e)}"
475
+
476
# === Enhanced UI with Multi-Agent Insights ===
def create_enhanced_ui():
    """Create enhanced UI with multi-agent capabilities"""
    # Single-page Gradio Blocks layout: telemetry inputs on the left,
    # multi-agent analysis output on the right, event history below.
    with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
        gr.Markdown("""
        # 🧠 Enterprise Agentic Reliability Framework
        **Multi-Agent AI System for Production Reliability**

        *Specialized AI agents working together to detect, diagnose, and heal system issues*
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📊 Telemetry Input")
                component = gr.Dropdown(
                    choices=["api-service", "auth-service", "payment-service", "database", "cache-service"],
                    value="api-service",
                    label="Component",
                    info="Select the service being monitored"
                )
                latency = gr.Slider(
                    minimum=10, maximum=1000, value=100, step=1,
                    label="Latency P99 (ms)",
                    info="Alert threshold: >150ms (adaptive)"
                )
                error_rate = gr.Slider(
                    minimum=0, maximum=0.5, value=0.02, step=0.001,
                    label="Error Rate",
                    info="Alert threshold: >0.05"
                )
                throughput = gr.Number(
                    value=1000,
                    label="Throughput (req/sec)",
                    info="Current request rate"
                )
                cpu_util = gr.Slider(
                    minimum=0, maximum=1, value=0.4, step=0.01,
                    label="CPU Utilization",
                    info="0.0 - 1.0 scale"
                )
                memory_util = gr.Slider(
                    minimum=0, maximum=1, value=0.3, step=0.01,
                    label="Memory Utilization",
                    info="0.0 - 1.0 scale"
                )
                submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")

            with gr.Column(scale=2):
                gr.Markdown("### 🔍 Multi-Agent Analysis")
                output_text = gr.Textbox(
                    label="Agent Synthesis",
                    placeholder="AI agents are analyzing...",
                    lines=5
                )

                # New agent insights section
                with gr.Accordion("🤖 Agent Specialists Analysis", open=False):
                    gr.Markdown("""
                    **Specialized AI Agents:**
                    - 🕵️ **Detective**: Anomaly detection & pattern recognition
                    - 🔍 **Diagnostician**: Root cause analysis & investigation
                    """)

                    agent_insights = gr.JSON(
                        label="Detailed Agent Findings",
                        value={}
                    )

        gr.Markdown("### 📈 Recent Events (Last 15)")
        events_table = gr.Dataframe(
            headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
            label="Event History",
            wrap=True,
            max_height="400px"
        )

        # Information sections
        with gr.Accordion("ℹ️ Framework Capabilities", open=False):
            gr.Markdown("""
            - **🤖 Multi-Agent AI**: Specialized agents for detection, diagnosis, and healing
            - **🔧 Policy-Based Healing**: Automated recovery actions based on severity and context
            - **💰 Business Impact**: Revenue and user impact quantification
            - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
            - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
            - **⚡ Production Ready**: Circuit breakers, cooldowns, and enterprise features
            """)

        with gr.Accordion("🔧 Healing Policies", open=False):
            # Render each enabled policy as "name: actions (priority)".
            policy_info = []
            for policy in policy_engine.policies:
                if policy.enabled:
                    actions = ", ".join([action.value for action in policy.actions])
                    policy_info.append(f"**{policy.name}**: {actions} (Priority: {policy.priority})")

            gr.Markdown("\n\n".join(policy_info))

        # Event handling
        async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
            """Enhanced event submission with async processing"""
            try:
                # Convert inputs. NOTE(review): falsy values (0, 0.0) for
                # throughput/cpu/memory are coerced to the default/None —
                # confirm a genuine 0.0 utilization should be discarded.
                latency = float(latency)
                error_rate = float(error_rate)
                throughput = float(throughput) if throughput else 1000
                cpu_util = float(cpu_util) if cpu_util else None
                memory_util = float(memory_util) if memory_util else None

                # Process with enhanced engine
                result = await enhanced_engine.process_event_enhanced(
                    component, latency, error_rate, throughput, cpu_util, memory_util
                )

                # Prepare table data from the 15 most recent events.
                table_data = []
                for event in events_history[-15:]:
                    table_data.append([
                        event.timestamp[:19],
                        event.component,
                        event.latency_p99,
                        f"{event.error_rate:.3f}",
                        event.throughput,
                        event.severity.value.upper(),
                        "Multi-agent analysis" if 'multi_agent_analysis' in result else 'N/A'
                    ])

                # Enhanced output formatting
                status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
                output_msg = f"{status_emoji} {result['status']}"

                # Add multi-agent insights
                if "multi_agent_analysis" in result:
                    analysis = result["multi_agent_analysis"]
                    confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
                    output_msg += f"\n🎯 Confidence: {confidence*100:.1f}%"

                    if analysis.get('recommended_actions'):
                        output_msg += f"\n💡 Insights: {', '.join(analysis['recommended_actions'][:2])}"

                # Add business impact (None when the event was not anomalous)
                if result["business_impact"]:
                    impact = result["business_impact"]
                    output_msg += f"\n💰 Business Impact: ${impact['revenue_loss_estimate']} | 👥 {impact['affected_users_estimate']} users | 🚨 {impact['severity_level']}"

                # Add healing actions
                if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
                    actions = ", ".join(result["healing_actions"])
                    output_msg += f"\n🔧 Auto-Actions: {actions}"

                # Prepare agent insights for JSON display
                agent_insights_data = result.get("multi_agent_analysis", {})

                return (
                    output_msg,
                    agent_insights_data,
                    gr.Dataframe(
                        headers=["Timestamp", "Component", "Latency", "Error Rate", "Throughput", "Severity", "Analysis"],
                        value=table_data,
                        wrap=True
                    )
                )

            except Exception as e:
                # Surface the failure in the textbox; clear the other outputs.
                return f"❌ Error processing event: {str(e)}", {}, gr.Dataframe(value=[])

        submit_btn.click(
            fn=submit_event_enhanced,
            inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
            outputs=[output_text, agent_insights, events_table]
        )

    return demo
647
+
648
if __name__ == "__main__":
    # Build and launch the Gradio app; 0.0.0.0:7860 is the conventional
    # binding for containerized (e.g. HF Spaces) hosting.
    demo = create_enhanced_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )