petter2025 committed on
Commit
be77688
·
verified ·
1 Parent(s): 1c2c217

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -27
app.py CHANGED
@@ -211,18 +211,81 @@ class AnomalyDetectionAgent(BaseAgent):
211
 
212
  return min(1.0, sum(scores))
213
 
214
def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[str]:
    """Name every metric on ``event`` that is above its anomaly limit.

    Limits: ``latency_p99`` > 150, ``error_rate`` > 0.05, and ``cpu_util`` /
    ``memory_util`` > 0.8. The two utilisation readings are skipped when they
    are falsy (for example ``None``).

    Returns:
        Metric names in a fixed order: latency, error_rate, cpu_utilization,
        memory_utilization. Empty when nothing is above its limit.
    """
    # Each entry pairs the reported name with whether its limit was breached.
    # ``bool(x) and ...`` keeps the short-circuit for missing utilisation data.
    breaches = (
        ("latency", event.latency_p99 > 150),
        ("error_rate", event.error_rate > 0.05),
        ("cpu_utilization", bool(event.cpu_util) and event.cpu_util > 0.8),
        ("memory_utilization", bool(event.memory_util) and event.memory_util > 0.8),
    )
    return [name for name, breached in breaches if breached]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  def _classify_severity(self, anomaly_score: float) -> str:
228
  if anomaly_score > 0.8:
@@ -255,21 +318,55 @@ class RootCauseAgent(BaseAgent):
255
  ]
256
  }
257
 
258
def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[str]:
    """Map the metrics on ``event`` to a list of candidate root causes.

    The patterns are checked in priority order and only the first match is
    reported:

    1. ``latency_p99`` > 300 together with ``error_rate`` > 0.1
    2. ``cpu_util`` present and > 0.9
    3. ``error_rate`` > 0.2

    Returns:
        Two cause identifiers for the matching pattern, or a single
        ``"unknown_cause_requires_investigation"`` entry when none match.
    """
    if event.latency_p99 > 300 and event.error_rate > 0.1:
        return ["database_connection_pool", "external_dependency_timeout"]

    # A falsy cpu_util (e.g. None) means no reading, so the pattern is skipped.
    if event.cpu_util and event.cpu_util > 0.9:
        return ["resource_exhaustion", "memory_leak"]

    if event.error_rate > 0.2:
        return ["recent_deployment", "configuration_change"]

    return ["unknown_cause_requires_investigation"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
275
  """Identify evidence patterns"""
 
211
 
212
  return min(1.0, sum(scores))
213
 
214
def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """Grade each metric on ``event`` and report the ones above their limit.

    Every metric is compared against its severity tiers from the highest
    down, and the first tier it exceeds determines the severity. The
    ``"threshold"`` reported is always the metric's lowest (baseline) limit,
    not the limit of the tier that matched.

    Returns:
        One dict per affected metric with the keys ``"metric"``, ``"value"``,
        ``"severity"`` and ``"threshold"``, in the order latency, error_rate,
        cpu, memory. Empty when no metric is above its baseline.
    """
    resource_tiers = ((0.9, "CRITICAL"), (0.8, "HIGH"))

    # (label, observed value, baseline limit, tiers highest-first, optional?)
    # Optional readings are skipped when falsy (e.g. None); the others are
    # always compared.
    checks = (
        ("latency", event.latency_p99, 150,
         ((500, "CRITICAL"), (300, "HIGH"), (150, "MEDIUM")), False),
        ("error_rate", event.error_rate, 0.05,
         ((0.3, "CRITICAL"), (0.15, "HIGH"), (0.05, "MEDIUM")), False),
        ("cpu", event.cpu_util, 0.8, resource_tiers, True),
        ("memory", event.memory_util, 0.8, resource_tiers, True),
    )

    findings = []
    for label, observed, baseline, tiers, optional in checks:
        if optional and not observed:
            continue
        # First tier exceeded wins; None means the metric is within limits.
        level = next((name for limit, name in tiers if observed > limit), None)
        if level is None:
            continue
        findings.append(
            {"metric": label, "value": observed, "severity": level, "threshold": baseline}
        )
    return findings
246
+
247
def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
    """Build at most four actionable recommendation strings for ``event``.

    One message is produced for each entry returned by
    ``self._identify_affected_metrics(event)``, plus one overall summary when
    ``anomaly_score`` is above 0.4 (a stronger wording above 0.6 and 0.8).

    The result is capped at four entries. Per-metric messages are ordered
    CRITICAL, then HIGH, then anything else (ties keep the order in which the
    metrics were reported), and a slot is reserved for the summary, so the
    cap drops the least severe per-metric message rather than the summary.

    Args:
        event: Event whose metrics are graded by ``_identify_affected_metrics``.
        anomaly_score: Overall score used to pick the summary message.

    Returns:
        Up to four strings; the summary, when present, is always last.
    """
    max_items = 4
    severity_rank = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2}

    # sorted() is stable, so metrics of equal severity keep their given order.
    ranked = sorted(
        self._identify_affected_metrics(event),
        key=lambda item: severity_rank.get(item["severity"], len(severity_rank)),
    )

    per_metric = []
    for metric in ranked:
        metric_name = metric["metric"]
        severity = metric["severity"]
        value = metric["value"]
        threshold = metric["threshold"]

        if metric_name == "latency":
            if severity == "CRITICAL":
                per_metric.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
            elif severity == "HIGH":
                per_metric.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
            else:
                per_metric.append(f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")

        elif metric_name == "error_rate":
            # value/threshold are fractions here; scale by 100 for display.
            if severity == "CRITICAL":
                per_metric.append(f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100}%) - Check recent deployments")
            elif severity == "HIGH":
                per_metric.append(f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100}%) - Review application logs")
            else:
                per_metric.append(f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100}%)")

        elif metric_name == "cpu":
            per_metric.append(f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling")

        elif metric_name == "memory":
            per_metric.append(f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")

    # Overall summary chosen from the anomaly score alone.
    if anomaly_score > 0.8:
        overall = "🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected"
    elif anomaly_score > 0.6:
        overall = "🎯 INVESTIGATE: Significant performance degradation detected"
    elif anomaly_score > 0.4:
        overall = "📊 MONITOR: Early warning signs detected"
    else:
        overall = None

    if overall is None:
        return per_metric[:max_items]
    # Reserve the final slot so truncation never discards the summary.
    return per_metric[: max_items - 1] + [overall]
289
 
290
  def _classify_severity(self, anomaly_score: float) -> str:
291
  if anomaly_score > 0.8:
 
318
  ]
319
  }
320
 
321
def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """Match ``event`` against known failure patterns and score each match.

    The four patterns are checked independently, so several can be reported
    for one event:

    * latency > 500 with error rate > 0.2: dependency failure (0.85)
    * CPU and memory both present and > 0.9: resource exhaustion (0.90)
    * error rate > 0.3 with latency < 200: application bug / config (0.75)
    * latency in [200, 400] with error rate in [0.05, 0.15]: gradual
      degradation (0.65)

    Returns:
        One dict per matched pattern, in the order above, with the keys
        ``"cause"``, ``"confidence"``, ``"evidence"`` and ``"investigation"``.
        When nothing matches, a single low-confidence (0.3) "unknown" entry.
    """

    def hypothesis(cause, confidence, evidence, investigation):
        # Single place that fixes the shape of a reported cause.
        return {
            "cause": cause,
            "confidence": confidence,
            "evidence": evidence,
            "investigation": investigation,
        }

    latency = event.latency_p99
    errors = event.error_rate
    found = []

    # Pattern: very slow and failing at the same time.
    if latency > 500 and errors > 0.2:
        found.append(hypothesis(
            "Database/External Dependency Failure",
            0.85,
            f"Extreme latency ({latency}ms) with high errors ({errors*100:.1f}%)",
            "Check database connection pool, external API health",
        ))

    # Pattern: both resource readings present and critically high.
    if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
        found.append(hypothesis(
            "Resource Exhaustion",
            0.90,
            f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
            "Check for memory leaks, infinite loops, insufficient resources",
        ))

    # Pattern: many errors while latency stays low.
    if errors > 0.3 and latency < 200:
        found.append(hypothesis(
            "Application Bug / Configuration Issue",
            0.75,
            f"High error rate ({errors*100:.1f}%) without latency impact",
            "Review recent deployments, configuration changes, application logs",
        ))

    # Pattern: moderate latency and moderate errors together.
    if 200 <= latency <= 400 and 0.05 <= errors <= 0.15:
        found.append(hypothesis(
            "Gradual Performance Degradation",
            0.65,
            f"Moderate latency ({latency}ms) and errors ({errors*100:.1f}%)",
            "Check resource trends, dependency performance, capacity planning",
        ))

    if found:
        return found

    return [hypothesis(
        "Unknown - Requires Investigation",
        0.3,
        "Pattern does not match known failure modes",
        "Complete system review needed",
    )]
370
 
371
  def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
372
  """Identify evidence patterns"""