petter2025 committed on
Commit
fc7752d
·
verified ·
1 Parent(s): d6f7e9f

Update agent_orchestrator.py

Browse files
Files changed (1) hide show
  1. agent_orchestrator.py +332 -29
agent_orchestrator.py CHANGED
@@ -20,7 +20,7 @@ class BaseAgent:
20
  'successful_analyses': 0,
21
  'average_confidence': 0.0
22
  }
23
-
24
  async def analyze(self, event: ReliabilityEvent) -> AgentResult:
25
  """Base analysis method to be implemented by specialized agents"""
26
  raise NotImplementedError
@@ -28,16 +28,21 @@ class BaseAgent:
28
  class AnomalyDetectionAgent(BaseAgent):
29
  def __init__(self):
30
  super().__init__(AgentSpecialization.DETECTIVE)
31
- self.adaptive_thresholds = {}
32
-
 
 
 
 
 
33
  async def analyze(self, event: ReliabilityEvent) -> AgentResult:
34
  """Enhanced anomaly detection with pattern recognition"""
35
  start_time = asyncio.get_event_loop().time()
36
-
37
  # Multi-dimensional anomaly scoring
38
  anomaly_score = self._calculate_anomaly_score(event)
39
  pattern_match = self._detect_known_patterns(event)
40
-
41
  return AgentResult(
42
  specialization=self.specialization,
43
  confidence=anomaly_score,
@@ -50,42 +55,140 @@ class AnomalyDetectionAgent(BaseAgent):
50
  recommendations=self._generate_detection_recommendations(event, anomaly_score),
51
  processing_time=asyncio.get_event_loop().time() - start_time
52
  )
53
-
54
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
55
  """Calculate comprehensive anomaly score (0-1)"""
56
  scores = []
57
-
58
  # Latency anomaly (weighted 40%)
59
- if event.latency_p99 > 150:
60
- latency_score = min(1.0, (event.latency_p99 - 150) / 500)
61
  scores.append(0.4 * latency_score)
62
-
63
  # Error rate anomaly (weighted 30%)
64
- if event.error_rate > 0.05:
65
  error_score = min(1.0, event.error_rate / 0.3)
66
  scores.append(0.3 * error_score)
67
-
68
  # Resource anomaly (weighted 30%)
69
  resource_score = 0
70
- if event.cpu_util and event.cpu_util > 0.8:
71
- resource_score += 0.15 * min(1.0, (event.cpu_util - 0.8) / 0.2)
72
- if event.memory_util and event.memory_util > 0.8:
73
- resource_score += 0.15 * min(1.0, (event.memory_util - 0.8) / 0.2)
74
  scores.append(resource_score)
75
-
76
  return min(1.0, sum(scores))
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  class RootCauseAgent(BaseAgent):
79
  def __init__(self):
80
  super().__init__(AgentSpecialization.DIAGNOSTICIAN)
81
  self.causal_patterns = self._load_causal_patterns()
82
-
83
  async def analyze(self, event: ReliabilityEvent) -> AgentResult:
84
  """AI-powered root cause analysis"""
85
  start_time = asyncio.get_event_loop().time()
86
-
87
  root_cause_analysis = self._perform_causal_analysis(event)
88
-
89
  return AgentResult(
90
  specialization=self.specialization,
91
  confidence=root_cause_analysis['confidence'],
@@ -99,23 +202,174 @@ class RootCauseAgent(BaseAgent):
99
  processing_time=asyncio.get_event_loop().time() - start_time
100
  )
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  class OrchestrationManager:
103
  def __init__(self):
104
  self.agents = {
105
  AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
106
  AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
107
- # Add more agents as we build them
108
  }
109
  self.incident_history = []
110
-
111
  async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
112
  """Coordinate multiple agents for comprehensive analysis"""
113
  agent_tasks = {
114
  spec: agent.analyze(event)
115
  for spec, agent in self.agents.items()
116
  }
117
-
118
- # Parallel agent execution
119
  agent_results = {}
120
  for specialization, task in agent_tasks.items():
121
  try:
@@ -123,19 +377,24 @@ class OrchestrationManager:
123
  agent_results[specialization.value] = result
124
  except asyncio.TimeoutError:
125
  # Agent timeout - continue with others
 
126
  continue
127
-
 
 
 
 
128
  # Synthesize results
129
  return self._synthesize_agent_findings(event, agent_results)
130
-
131
  def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
132
  """Combine insights from all specialized agents"""
133
  detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
134
  diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
135
-
136
  if not detective_result:
137
  return {'error': 'No agent results available'}
138
-
139
  # Build comprehensive analysis
140
  synthesis = {
141
  'incident_summary': {
@@ -154,5 +413,49 @@ class OrchestrationManager:
154
  'processing_times': {k: v.processing_time for k, v in agent_results.items()}
155
  }
156
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- return synthesis
 
 
 
 
 
20
  'successful_analyses': 0,
21
  'average_confidence': 0.0
22
  }
23
+
24
  async def analyze(self, event: ReliabilityEvent) -> AgentResult:
25
  """Base analysis method to be implemented by specialized agents"""
26
  raise NotImplementedError
 
28
class AnomalyDetectionAgent(BaseAgent):
    """Detective agent: scores reliability events for anomalies and flags known failure patterns."""

    def __init__(self):
        super().__init__(AgentSpecialization.DETECTIVE)
        # Baseline alerting thresholds. Named "adaptive" because they are meant
        # to be tuned at runtime — no adaptation logic is visible in this block.
        self.adaptive_thresholds = {
            'latency_p99': 150,   # ms — presumably p99 latency; TODO confirm units with ReliabilityEvent
            'error_rate': 0.05,   # fraction (5%)
            'cpu_util': 0.8,      # fraction of capacity
            'memory_util': 0.8,   # fraction of capacity
        }
37
+
38
  async def analyze(self, event: ReliabilityEvent) -> AgentResult:
39
  """Enhanced anomaly detection with pattern recognition"""
40
  start_time = asyncio.get_event_loop().time()
41
+
42
  # Multi-dimensional anomaly scoring
43
  anomaly_score = self._calculate_anomaly_score(event)
44
  pattern_match = self._detect_known_patterns(event)
45
+
46
  return AgentResult(
47
  specialization=self.specialization,
48
  confidence=anomaly_score,
 
55
  recommendations=self._generate_detection_recommendations(event, anomaly_score),
56
  processing_time=asyncio.get_event_loop().time() - start_time
57
  )
58
+
59
def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
    """Combine latency, error-rate and resource signals into one anomaly score in [0, 1].

    Weighting: latency 40%, error rate 30%, resources 30% (CPU and memory
    15% each). Each component ramps linearly above its threshold, is capped
    at its weight, and the total is capped at 1.0.
    """
    thresholds = self.adaptive_thresholds
    components = []

    # Latency (40%): linear ramp over the 500 ms past the threshold.
    latency_limit = thresholds['latency_p99']
    if event.latency_p99 > latency_limit:
        components.append(0.4 * min(1.0, (event.latency_p99 - latency_limit) / 500))

    # Error rate (30%): saturates at a 30% error rate.
    if event.error_rate > thresholds['error_rate']:
        components.append(0.3 * min(1.0, event.error_rate / 0.3))

    # Resources (30%): CPU and memory contribute up to 0.15 each, ramping
    # across the 0.8–1.0 utilisation band. Falsy (None/0) readings are
    # treated as "no data" and skipped.
    resource_component = 0
    if event.cpu_util and event.cpu_util > thresholds['cpu_util']:
        resource_component += 0.15 * min(1.0, (event.cpu_util - thresholds['cpu_util']) / 0.2)
    if event.memory_util and event.memory_util > thresholds['memory_util']:
        resource_component += 0.15 * min(1.0, (event.memory_util - thresholds['memory_util']) / 0.2)
    components.append(resource_component)

    return min(1.0, sum(components))
82
 
83
def _detect_known_patterns(self, event: ReliabilityEvent) -> List[str]:
    """Match the event against a small library of known failure signatures.

    Returns every matching pattern name, in a fixed precedence order;
    ["unknown_pattern"] when nothing matches. Thresholds here are fixed and
    independent of self.adaptive_thresholds.
    """
    signatures = (
        # Database timeout: extreme latency together with high error rate.
        ("database_timeout", event.latency_p99 > 500 and event.error_rate > 0.2),
        # Resource exhaustion: CPU and memory both critically high (falsy
        # utilisation readings mean "no data" and never match).
        ("resource_exhaustion",
         bool(event.cpu_util and event.cpu_util > 0.9
              and event.memory_util and event.memory_util > 0.9)),
        # Cascading failure: elevated errors plus elevated latency.
        ("cascading_failure", event.error_rate > 0.15 and event.latency_p99 > 300),
        # Traffic spike: latency rising while throughput is unusually high.
        ("traffic_spike", event.latency_p99 > 200 and event.throughput > 2000),
        # Gradual degradation: moderate latency and moderate errors.
        ("gradual_degradation",
         150 < event.latency_p99 < 300 and 0.05 < event.error_rate < 0.15),
    )
    matched = [name for name, hit in signatures if hit]
    return matched if matched else ["unknown_pattern"]
108
+
109
def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[str]:
    """List the metrics that are outside their configured normal range.

    Latency, error rate, CPU and memory are judged against
    self.adaptive_thresholds; throughput uses a fixed low-water mark of 500.
    Returns ["none"] when every metric is in range.
    """
    limits = self.adaptive_thresholds
    candidates = (
        ("latency", event.latency_p99 > limits['latency_p99']),
        ("error_rate", event.error_rate > limits['error_rate']),
        # Falsy utilisation readings count as "no data", never as affected.
        ("cpu", bool(event.cpu_util and event.cpu_util > limits['cpu_util'])),
        ("memory", bool(event.memory_util and event.memory_util > limits['memory_util'])),
        # Fixed low-throughput threshold, not part of adaptive_thresholds.
        ("throughput", event.throughput < 500),
    )
    affected = [name for name, out_of_range in candidates if out_of_range]
    return affected if affected else ["none"]
129
+
130
def _classify_severity(self, anomaly_score: float) -> str:
    """Map an anomaly score in [0, 1] onto a four-level severity label.

    Bands are exclusive at the lower edge: a score of exactly 0.8 is HIGH,
    exactly 0.4 is LOW.
    """
    bands = ((0.8, "CRITICAL"), (0.6, "HIGH"), (0.4, "MEDIUM"))
    for floor, label in bands:
        if anomaly_score > floor:
            return label
    return "LOW"
139
+
140
def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
    """Produce actionable recommendations for the detected anomalies.

    Each metric contributes at most one recommendation (the most severe tier
    it crosses); at most five recommendations are returned, in metric order:
    latency, error rate, CPU, memory, overall score.
    """
    recommendations = []

    def add_first_match(value, tiers):
        # tiers: (threshold, message) pairs ordered most-severe first;
        # append only the first message whose threshold is exceeded.
        for threshold, message in tiers:
            if value > threshold:
                recommendations.append(message)
                return

    add_first_match(event.latency_p99, (
        (500, "🚨 CRITICAL: Latency >500ms - Check database connections and external APIs immediately"),
        (300, "⚠️ HIGH: Latency >300ms - Investigate slow queries and service dependencies"),
        (150, "📈 Latency elevated - Monitor trends and consider optimization"),
    ))
    add_first_match(event.error_rate, (
        (0.3, "🚨 CRITICAL: Error rate >30% - Rollback recent deployments or enable circuit breaker"),
        (0.15, "⚠️ HIGH: Error rate >15% - Review application logs for exceptions"),
        (0.05, "📈 Errors increasing - Check for configuration issues"),
    ))
    # Falsy utilisation readings mean "no data": skip the resource tiers.
    if event.cpu_util:
        add_first_match(event.cpu_util, (
            (0.9, "🔥 CPU CRITICAL: >90% utilization - Scale horizontally or optimize hot paths"),
            (0.8, "⚡ CPU HIGH: >80% utilization - Consider adding capacity"),
        ))
    if event.memory_util:
        add_first_match(event.memory_util, (
            (0.9, "💾 MEMORY CRITICAL: >90% utilization - Check for memory leaks"),
            (0.8, "💾 MEMORY HIGH: >80% utilization - Monitor for leaks"),
        ))
    add_first_match(anomaly_score, (
        (0.8, "🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected"),
        (0.6, "🎯 INVESTIGATE: Significant performance degradation detected"),
        (0.4, "📊 MONITOR: Early warning signs detected"),
    ))

    return recommendations[:5]  # Return top 5 recommendations
180
+
181
class RootCauseAgent(BaseAgent):
    """Diagnostician agent: maps anomalous events to likely root causes."""

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)
        # Static catalogue of symptom-pattern -> cause mappings, loaded once.
        self.causal_patterns = self._load_causal_patterns()
185
+
186
  async def analyze(self, event: ReliabilityEvent) -> AgentResult:
187
  """AI-powered root cause analysis"""
188
  start_time = asyncio.get_event_loop().time()
189
+
190
  root_cause_analysis = self._perform_causal_analysis(event)
191
+
192
  return AgentResult(
193
  specialization=self.specialization,
194
  confidence=root_cause_analysis['confidence'],
 
202
  processing_time=asyncio.get_event_loop().time() - start_time
203
  )
204
 
205
def _load_causal_patterns(self) -> Dict[str, Any]:
    """Return the static catalogue of causal signatures.

    Each entry maps a symptom description ('pattern', human-readable
    threshold strings) to a likely 'cause' and a prior 'confidence'.
    NOTE(review): the 'pattern' strings are descriptive only — matching is
    hard-coded in _perform_causal_analysis, not parsed from these strings.
    """
    def entry(pattern, cause, confidence):
        return {'pattern': pattern, 'cause': cause, 'confidence': confidence}

    return {
        'high_latency_high_errors': entry(
            ['latency > 500', 'error_rate > 0.2'],
            'Database or external dependency failure', 0.85),
        'high_cpu_high_memory': entry(
            ['cpu > 0.9', 'memory > 0.9'],
            'Resource exhaustion or memory leak', 0.90),
        'high_errors_normal_latency': entry(
            ['error_rate > 0.3', 'latency < 200'],
            'Application bug or configuration issue', 0.75),
        'gradual_degradation': entry(
            ['200 < latency < 400', '0.05 < error_rate < 0.15'],
            'Resource saturation or dependency degradation', 0.65),
    }
229
+
230
def _perform_causal_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
    """Match the event against known failure signatures and rank likely root causes.

    Returns:
        dict with:
        - 'confidence': highest prior among matched patterns (0.3 when none match),
        - 'causes': one dict per match (cause, confidence, evidence, investigation),
        - 'evidence': short machine-readable tags, one per match,
        - 'investigation_steps': investigation text of the top three causes.
    """
    causes = []
    evidence = []
    confidence = 0.5  # neutral prior; raised by each matched pattern below

    # Pattern 1: Database/External Dependency Failure
    if event.latency_p99 > 500 and event.error_rate > 0.2:
        causes.append({
            "cause": "Database/External Dependency Failure",
            "confidence": 0.85,
            "evidence": f"Extreme latency ({event.latency_p99:.0f}ms) with high errors ({event.error_rate*100:.1f}%)",
            "investigation": "Check database connection pool, external API health, network connectivity"
        })
        evidence.append("extreme_latency_with_errors")
        # Fix: was a plain `confidence = 0.85`, unlike every other branch;
        # max() keeps the result independent of branch order.
        confidence = max(confidence, 0.85)

    # Pattern 2: Resource Exhaustion (CPU and memory both critical)
    if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
        causes.append({
            "cause": "Resource Exhaustion",
            "confidence": 0.90,
            "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
            "investigation": "Check for memory leaks, infinite loops, insufficient resource allocation"
        })
        evidence.append("correlated_resource_exhaustion")
        confidence = max(confidence, 0.90)

    # Pattern 3: high errors without latency impact -> likely app bug / bad config
    if event.error_rate > 0.3 and event.latency_p99 < 200:
        causes.append({
            "cause": "Application Bug / Configuration Issue",
            "confidence": 0.75,
            "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
            "investigation": "Review recent deployments, configuration changes, application logs, and error traces"
        })
        evidence.append("errors_without_latency")
        confidence = max(confidence, 0.75)

    # Pattern 4: moderate latency + moderate errors -> gradual degradation
    if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
        causes.append({
            "cause": "Gradual Performance Degradation",
            "confidence": 0.65,
            "evidence": f"Moderate latency ({event.latency_p99:.0f}ms) and errors ({event.error_rate*100:.1f}%)",
            "investigation": "Check resource trends, dependency performance, capacity planning, and scaling policies"
        })
        evidence.append("gradual_degradation")
        confidence = max(confidence, 0.65)

    # Pattern 5: elevated latency under heavy load -> capacity problem
    if event.latency_p99 > 200 and event.throughput > 2000:
        causes.append({
            "cause": "Traffic Spike / Capacity Issue",
            "confidence": 0.70,
            "evidence": f"Elevated latency ({event.latency_p99:.0f}ms) with high throughput ({event.throughput:.0f} req/s)",
            "investigation": "Check autoscaling configuration, rate limiting, and load balancer health"
        })
        evidence.append("traffic_spike")
        confidence = max(confidence, 0.70)

    # Fallback: nothing matched -> single low-confidence placeholder
    if not causes:
        causes.append({
            "cause": "Unknown - Requires Investigation",
            "confidence": 0.3,
            "evidence": "Pattern does not match known failure modes",
            "investigation": "Complete system review needed - check logs, metrics, and recent changes"
        })
        evidence.append("unknown_pattern")
        confidence = 0.3

    # Surface the investigation text of the (up to) three strongest causes.
    investigation_steps = [cause['investigation'] for cause in causes[:3]]

    return {
        'confidence': confidence,
        'causes': causes,
        'evidence': evidence,
        'investigation_steps': investigation_steps
    }
311
+
312
def _analyze_dependencies(self, event: ReliabilityEvent) -> Dict[str, Any]:
    """Estimate how likely this event is to cascade through dependent services.

    A cascade is flagged when the error rate exceeds 20% or p99 latency
    exceeds 500 ms; the risk score is the larger of the two contributions,
    each capped at 1.0.
    """
    risk = 0.0
    if event.error_rate > 0.2:
        risk = min(1.0, event.error_rate * 2)
    if event.latency_p99 > 500:
        risk = max(risk, min(1.0, event.latency_p99 / 1000))

    return {
        'has_upstream_deps': len(event.upstream_deps) > 0,
        'upstream_services': event.upstream_deps,
        'potential_cascade': event.error_rate > 0.2 or event.latency_p99 > 500,
        'cascade_risk_score': risk,
    }
334
+
335
+ def _check_temporal_patterns(self, event: ReliabilityEvent) -> Dict[str, Any]:
336
+ """Check for time-based correlations"""
337
+ import datetime
338
+
339
+ current_time = datetime.datetime.now()
340
+ hour = current_time.hour
341
+
342
+ # Check for typical patterns
343
+ patterns = {
344
+ 'time_of_day_correlation': False,
345
+ 'is_peak_hours': 9 <= hour <= 17, # Business hours
346
+ 'is_off_hours': hour < 6 or hour > 22,
347
+ 'deployment_window': 14 <= hour <= 16, # Typical deployment window
348
+ 'weekend': current_time.weekday() >= 5
349
+ }
350
+
351
+ # Flag potential correlations
352
+ if patterns['is_peak_hours'] and event.latency_p99 > 200:
353
+ patterns['time_of_day_correlation'] = True
354
+
355
+ return patterns
356
+
357
class OrchestrationManager:
    """Fans a reliability event out to the specialized agents and merges their findings."""

    def __init__(self):
        # One agent instance per specialization; extend this map as new
        # agent types are added.
        self.agents = {
            AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
        }
        # Past incidents; no code in this block appends to it — presumably
        # populated elsewhere. TODO confirm.
        self.incident_history = []
364
+
365
  async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
366
  """Coordinate multiple agents for comprehensive analysis"""
367
  agent_tasks = {
368
  spec: agent.analyze(event)
369
  for spec, agent in self.agents.items()
370
  }
371
+
372
+ # Parallel agent execution with error handling
373
  agent_results = {}
374
  for specialization, task in agent_tasks.items():
375
  try:
 
377
  agent_results[specialization.value] = result
378
  except asyncio.TimeoutError:
379
  # Agent timeout - continue with others
380
+ print(f"Agent {specialization.value} timed out")
381
  continue
382
+ except Exception as e:
383
+ # Agent error - log and continue
384
+ print(f"Agent {specialization.value} error: {e}")
385
+ continue
386
+
387
  # Synthesize results
388
  return self._synthesize_agent_findings(event, agent_results)
389
+
390
  def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
391
  """Combine insights from all specialized agents"""
392
  detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
393
  diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
394
+
395
  if not detective_result:
396
  return {'error': 'No agent results available'}
397
+
398
  # Build comprehensive analysis
399
  synthesis = {
400
  'incident_summary': {
 
413
  'processing_times': {k: v.processing_time for k, v in agent_results.items()}
414
  }
415
  }
416
+
417
+ return synthesis
418
+
419
def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str]) -> List[str]:
    """Merge agent recommendations: critical (🚨) first, then high (⚠️), then the rest.

    Within each priority bucket the original order (detection before
    diagnosis) is preserved; duplicates are dropped (first occurrence wins)
    and at most five actions are returned.
    """
    combined = detection_actions + diagnosis_actions

    # Bucket by priority marker, preserving each bucket's relative order.
    critical = [a for a in combined if '🚨' in a]
    high = [a for a in combined if '⚠️' in a and '🚨' not in a]
    rest = [a for a in combined if '🚨' not in a and '⚠️' not in a]

    # Single de-duplication pass. (The original de-duplicated twice, using
    # O(n^2) `a not in all_actions` list-membership tests along the way;
    # one set-backed pass produces the same output.)
    seen = set()
    ordered = []
    for action in critical + high + rest:
        if action not in seen:
            seen.add(action)
            ordered.append(action)

    return ordered[:5]  # Return top 5 actions
444
+
445
def _add_business_context(self, event: ReliabilityEvent, confidence: float) -> Dict[str, Any]:
    """Translate analysis confidence into a business-facing impact summary.

    NOTE(review): `event` is currently unused — kept for interface
    stability; presumably future impact estimation will consume it.
    """
    # Confidence bands map directly onto business severity levels.
    severity = "LOW"
    for floor, label in ((0.8, "CRITICAL"), (0.6, "HIGH"), (0.4, "MEDIUM")):
        if confidence > floor:
            severity = label
            break

    return {
        'business_severity': severity,
        'estimated_impact': f"{confidence * 100:.0f}% confidence of incident",
        'recommended_escalation': confidence > 0.7,
    }