petter2025 committed on
Commit
36fa36c
·
verified ·
1 Parent(s): 5447ba5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -137
app.py CHANGED
@@ -182,9 +182,7 @@ class AnomalyDetectionAgent(BaseAgent):
182
  'severity_tier': self._classify_severity(anomaly_score),
183
  'primary_metrics_affected': self._identify_affected_metrics(event)
184
  },
185
- 'recommendations': [
186
- f"Investigate {metric} anomalies" for metric in self._identify_affected_metrics(event)
187
- ]
188
  }
189
 
190
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
@@ -211,83 +209,8 @@ class AnomalyDetectionAgent(BaseAgent):
211
 
212
  return min(1.0, sum(scores))
213
 
214
- def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
215
- """Enhanced metric analysis with severity levels"""
216
- affected = []
217
-
218
- # Latency analysis
219
- if event.latency_p99 > 500:
220
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "CRITICAL", "threshold": 150})
221
- elif event.latency_p99 > 300:
222
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "HIGH", "threshold": 150})
223
- elif event.latency_p99 > 150:
224
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "MEDIUM", "threshold": 150})
225
-
226
- # Error rate analysis
227
- if event.error_rate > 0.3:
228
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "CRITICAL", "threshold": 0.05})
229
- elif event.error_rate > 0.15:
230
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "HIGH", "threshold": 0.05})
231
- elif event.error_rate > 0.05:
232
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "MEDIUM", "threshold": 0.05})
233
-
234
- # Resource analysis
235
- if event.cpu_util and event.cpu_util > 0.9:
236
- affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "CRITICAL", "threshold": 0.8})
237
- elif event.cpu_util and event.cpu_util > 0.8:
238
- affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "HIGH", "threshold": 0.8})
239
-
240
- if event.memory_util and event.memory_util > 0.9:
241
- affected.append({"metric": "memory", "value": event.memory_util, "severity": "CRITICAL", "threshold": 0.8})
242
- elif event.memory_util and event.memory_util > 0.8:
243
- affected.append({"metric": "memory", "value": event.memory_util, "severity": "HIGH", "threshold": 0.8})
244
-
245
- return affected
246
-
247
- def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
248
- """Generate specific, actionable recommendations"""
249
- recommendations = []
250
- affected_metrics = self._identify_affected_metrics(event)
251
-
252
- for metric in affected_metrics:
253
- metric_name = metric["metric"]
254
- severity = metric["severity"]
255
- value = metric["value"]
256
- threshold = metric["threshold"]
257
-
258
- if metric_name == "latency":
259
- if severity == "CRITICAL":
260
- recommendations.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
261
- elif severity == "HIGH":
262
- recommendations.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
263
- else:
264
- recommendations.append(f"πŸ“ˆ Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")
265
-
266
- elif metric_name == "error_rate":
267
- if severity == "CRITICAL":
268
- recommendations.append(f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100}%) - Check recent deployments")
269
- elif severity == "HIGH":
270
- recommendations.append(f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100}%) - Review application logs")
271
- else:
272
- recommendations.append(f"πŸ“ˆ Errors increasing: {value*100:.1f}% (>{threshold*100}%)")
273
-
274
- elif metric_name == "cpu":
275
- recommendations.append(f"πŸ”₯ CPU {severity}: {value*100:.1f}% utilization - Consider scaling")
276
-
277
- elif metric_name == "memory":
278
- recommendations.append(f"πŸ’Ύ Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
279
-
280
- # Add overall recommendations based on anomaly score
281
- if anomaly_score > 0.8:
282
- recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
283
- elif anomaly_score > 0.6:
284
- recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
285
- elif anomaly_score > 0.4:
286
- recommendations.append("πŸ“Š MONITOR: Early warning signs detected")
287
-
288
- return recommendations[:4] # Return top 4 most important recommendations
289
-
290
  def _classify_severity(self, anomaly_score: float) -> str:
 
291
  if anomaly_score > 0.8:
292
  return "CRITICAL"
293
  elif anomaly_score > 0.6:
@@ -296,6 +219,82 @@ def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_s
296
  return "MEDIUM"
297
  else:
298
  return "LOW"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
  class RootCauseAgent(BaseAgent):
301
  def __init__(self):
@@ -314,59 +313,59 @@ class RootCauseAgent(BaseAgent):
314
  'investigation_priority': self._prioritize_investigation(causes)
315
  },
316
  'recommendations': [
317
- f"Check {cause} for issues" for cause in causes[:2]
318
  ]
319
  }
320
 
321
- def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
322
- """Enhanced root cause analysis with confidence scoring"""
323
- causes = []
324
-
325
- # High latency + high errors pattern
326
- if event.latency_p99 > 500 and event.error_rate > 0.2:
327
- causes.append({
328
- "cause": "Database/External Dependency Failure",
329
- "confidence": 0.85,
330
- "evidence": f"Extreme latency ({event.latency_p99}ms) with high errors ({event.error_rate*100:.1f}%)",
331
- "investigation": "Check database connection pool, external API health"
332
- })
333
-
334
- # Resource exhaustion pattern
335
- if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
336
- causes.append({
337
- "cause": "Resource Exhaustion",
338
- "confidence": 0.90,
339
- "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
340
- "investigation": "Check for memory leaks, infinite loops, insufficient resources"
341
- })
342
-
343
- # Error spike pattern
344
- if event.error_rate > 0.3 and event.latency_p99 < 200:
345
- causes.append({
346
- "cause": "Application Bug / Configuration Issue",
347
- "confidence": 0.75,
348
- "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
349
- "investigation": "Review recent deployments, configuration changes, application logs"
350
- })
351
-
352
- # Gradual degradation pattern
353
- if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
354
- causes.append({
355
- "cause": "Gradual Performance Degradation",
356
- "confidence": 0.65,
357
- "evidence": f"Moderate latency ({event.latency_p99}ms) and errors ({event.error_rate*100:.1f}%)",
358
- "investigation": "Check resource trends, dependency performance, capacity planning"
359
- })
360
-
361
- if not causes:
362
- causes.append({
363
- "cause": "Unknown - Requires Investigation",
364
- "confidence": 0.3,
365
- "evidence": "Pattern does not match known failure modes",
366
- "investigation": "Complete system review needed"
367
- })
368
-
369
- return causes
370
 
371
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
372
  """Identify evidence patterns"""
@@ -377,13 +376,12 @@ def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, A
377
  evidence.append("correlated_resource_exhaustion")
378
  return evidence
379
 
380
- def _prioritize_investigation(self, causes: List[str]) -> str:
381
- if "database_connection_pool" in causes:
382
- return "HIGH"
383
- elif "resource_exhaustion" in causes:
384
- return "HIGH"
385
- else:
386
- return "MEDIUM"
387
 
388
  class OrchestrationManager:
389
  def __init__(self):
@@ -422,7 +420,7 @@ class OrchestrationManager:
422
  'incident_summary': {
423
  'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
424
  'anomaly_confidence': detective_result['confidence'],
425
- 'primary_metrics_affected': detective_result['findings'].get('primary_metrics_affected', [])
426
  },
427
  'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
428
  'recommended_actions': self._prioritize_actions(
 
182
  'severity_tier': self._classify_severity(anomaly_score),
183
  'primary_metrics_affected': self._identify_affected_metrics(event)
184
  },
185
+ 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
 
 
186
  }
187
 
188
  def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
 
209
 
210
  return min(1.0, sum(scores))
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  def _classify_severity(self, anomaly_score: float) -> str:
213
+ """Classify severity based on anomaly score"""
214
  if anomaly_score > 0.8:
215
  return "CRITICAL"
216
  elif anomaly_score > 0.6:
 
219
  return "MEDIUM"
220
  else:
221
  return "LOW"
222
+
223
+ def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
224
+ """Enhanced metric analysis with severity levels"""
225
+ affected = []
226
+
227
+ # Latency analysis
228
+ if event.latency_p99 > 500:
229
+ affected.append({"metric": "latency", "value": event.latency_p99, "severity": "CRITICAL", "threshold": 150})
230
+ elif event.latency_p99 > 300:
231
+ affected.append({"metric": "latency", "value": event.latency_p99, "severity": "HIGH", "threshold": 150})
232
+ elif event.latency_p99 > 150:
233
+ affected.append({"metric": "latency", "value": event.latency_p99, "severity": "MEDIUM", "threshold": 150})
234
+
235
+ # Error rate analysis
236
+ if event.error_rate > 0.3:
237
+ affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "CRITICAL", "threshold": 0.05})
238
+ elif event.error_rate > 0.15:
239
+ affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "HIGH", "threshold": 0.05})
240
+ elif event.error_rate > 0.05:
241
+ affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "MEDIUM", "threshold": 0.05})
242
+
243
+ # Resource analysis
244
+ if event.cpu_util and event.cpu_util > 0.9:
245
+ affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "CRITICAL", "threshold": 0.8})
246
+ elif event.cpu_util and event.cpu_util > 0.8:
247
+ affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "HIGH", "threshold": 0.8})
248
+
249
+ if event.memory_util and event.memory_util > 0.9:
250
+ affected.append({"metric": "memory", "value": event.memory_util, "severity": "CRITICAL", "threshold": 0.8})
251
+ elif event.memory_util and event.memory_util > 0.8:
252
+ affected.append({"metric": "memory", "value": event.memory_util, "severity": "HIGH", "threshold": 0.8})
253
+
254
+ return affected
255
+
256
+ def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
257
+ """Generate specific, actionable recommendations"""
258
+ recommendations = []
259
+ affected_metrics = self._identify_affected_metrics(event)
260
+
261
+ for metric in affected_metrics:
262
+ metric_name = metric["metric"]
263
+ severity = metric["severity"]
264
+ value = metric["value"]
265
+ threshold = metric["threshold"]
266
+
267
+ if metric_name == "latency":
268
+ if severity == "CRITICAL":
269
+ recommendations.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
270
+ elif severity == "HIGH":
271
+ recommendations.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
272
+ else:
273
+ recommendations.append(f"πŸ“ˆ Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")
274
+
275
+ elif metric_name == "error_rate":
276
+ if severity == "CRITICAL":
277
+ recommendations.append(f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100}%) - Check recent deployments")
278
+ elif severity == "HIGH":
279
+ recommendations.append(f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100}%) - Review application logs")
280
+ else:
281
+ recommendations.append(f"πŸ“ˆ Errors increasing: {value*100:.1f}% (>{threshold*100}%)")
282
+
283
+ elif metric_name == "cpu":
284
+ recommendations.append(f"πŸ”₯ CPU {severity}: {value*100:.1f}% utilization - Consider scaling")
285
+
286
+ elif metric_name == "memory":
287
+ recommendations.append(f"πŸ’Ύ Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
288
+
289
+ # Add overall recommendations based on anomaly score
290
+ if anomaly_score > 0.8:
291
+ recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
292
+ elif anomaly_score > 0.6:
293
+ recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
294
+ elif anomaly_score > 0.4:
295
+ recommendations.append("πŸ“Š MONITOR: Early warning signs detected")
296
+
297
+ return recommendations[:4] # Return top 4 most important recommendations
298
 
299
  class RootCauseAgent(BaseAgent):
300
  def __init__(self):
 
313
  'investigation_priority': self._prioritize_investigation(causes)
314
  },
315
  'recommendations': [
316
+ f"Check {cause['cause']} for issues" for cause in causes[:2]
317
  ]
318
  }
319
 
320
+ def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
321
+ """Enhanced root cause analysis with confidence scoring"""
322
+ causes = []
323
+
324
+ # High latency + high errors pattern
325
+ if event.latency_p99 > 500 and event.error_rate > 0.2:
326
+ causes.append({
327
+ "cause": "Database/External Dependency Failure",
328
+ "confidence": 0.85,
329
+ "evidence": f"Extreme latency ({event.latency_p99}ms) with high errors ({event.error_rate*100:.1f}%)",
330
+ "investigation": "Check database connection pool, external API health"
331
+ })
332
+
333
+ # Resource exhaustion pattern
334
+ if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
335
+ causes.append({
336
+ "cause": "Resource Exhaustion",
337
+ "confidence": 0.90,
338
+ "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
339
+ "investigation": "Check for memory leaks, infinite loops, insufficient resources"
340
+ })
341
+
342
+ # Error spike pattern
343
+ if event.error_rate > 0.3 and event.latency_p99 < 200:
344
+ causes.append({
345
+ "cause": "Application Bug / Configuration Issue",
346
+ "confidence": 0.75,
347
+ "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
348
+ "investigation": "Review recent deployments, configuration changes, application logs"
349
+ })
350
+
351
+ # Gradual degradation pattern
352
+ if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
353
+ causes.append({
354
+ "cause": "Gradual Performance Degradation",
355
+ "confidence": 0.65,
356
+ "evidence": f"Moderate latency ({event.latency_p99}ms) and errors ({event.error_rate*100:.1f}%)",
357
+ "investigation": "Check resource trends, dependency performance, capacity planning"
358
+ })
359
+
360
+ if not causes:
361
+ causes.append({
362
+ "cause": "Unknown - Requires Investigation",
363
+ "confidence": 0.3,
364
+ "evidence": "Pattern does not match known failure modes",
365
+ "investigation": "Complete system review needed"
366
+ })
367
+
368
+ return causes
369
 
370
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
371
  """Identify evidence patterns"""
 
376
  evidence.append("correlated_resource_exhaustion")
377
  return evidence
378
 
379
+ def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
380
+ """Prioritize investigation based on causes"""
381
+ for cause in causes:
382
+ if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
383
+ return "HIGH"
384
+ return "MEDIUM"
 
385
 
386
  class OrchestrationManager:
387
  def __init__(self):
 
420
  'incident_summary': {
421
  'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
422
  'anomaly_confidence': detective_result['confidence'],
423
+ 'primary_metrics_affected': [metric["metric"] for metric in detective_result['findings'].get('primary_metrics_affected', [])]
424
  },
425
  'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
426
  'recommended_actions': self._prioritize_actions(