Update app.py
Browse files
app.py
CHANGED
|
@@ -211,18 +211,81 @@ class AnomalyDetectionAgent(BaseAgent):
|
|
| 211 |
|
| 212 |
return min(1.0, sum(scores))
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
def _classify_severity(self, anomaly_score: float) -> str:
|
| 228 |
if anomaly_score > 0.8:
|
|
@@ -255,21 +318,55 @@ class RootCauseAgent(BaseAgent):
|
|
| 255 |
]
|
| 256 |
}
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
|
| 275 |
"""Identify evidence patterns"""
|
|
|
|
| 211 |
|
| 212 |
return min(1.0, sum(scores))
|
| 213 |
|
| 214 |
+
def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """Return the metrics on ``event`` that exceed their alert limits.

    ``latency_p99``, ``error_rate``, ``cpu_util`` and ``memory_util`` are each
    compared against a descending ladder of limits. A metric is reported at
    most once, at the first (most severe) limit it strictly exceeds.

    Args:
        event: Object exposing the four attributes named above. A reading of
            ``None`` is treated as "no data" and skipped.

    Returns:
        A list of dicts with the keys ``metric``, ``value``, ``severity`` and
        ``threshold``, in the order latency, error_rate, cpu, memory.
        ``threshold`` is always the lowest limit of that metric's ladder,
        regardless of which severity matched.
    """
    # (reported name, reading, ladder of (limit, severity) from most to least
    # severe). Keeping the limits in one table avoids repeating the dict
    # literal for every tier.
    ladders = (
        ("latency", event.latency_p99,
         ((500, "CRITICAL"), (300, "HIGH"), (150, "MEDIUM"))),
        ("error_rate", event.error_rate,
         ((0.3, "CRITICAL"), (0.15, "HIGH"), (0.05, "MEDIUM"))),
        ("cpu", event.cpu_util,
         ((0.9, "CRITICAL"), (0.8, "HIGH"))),
        ("memory", event.memory_util,
         ((0.9, "CRITICAL"), (0.8, "HIGH"))),
    )

    affected: List[Dict[str, Any]] = []
    for metric_name, reading, ladder in ladders:
        if reading is None:
            continue

        # The last rung is the entry-level limit; it is what gets reported.
        base_threshold = ladder[-1][0]
        for limit, severity in ladder:
            if reading > limit:
                affected.append({
                    "metric": metric_name,
                    "value": reading,
                    "severity": severity,
                    "threshold": base_threshold,
                })
                break  # only the most severe matching tier is reported

    return affected
|
| 246 |
+
|
| 247 |
+
def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
    """Build at most four human-readable recommendations for ``event``.

    One line is produced for each entry returned by
    ``self._identify_affected_metrics(event)``, ordered most severe first
    (CRITICAL, HIGH, MEDIUM; ties keep the order they were reported in).
    When ``anomaly_score`` is above 0.4 a summary line is appended and always
    kept: the per-metric lines are trimmed to make room for it rather than
    the summary being cut off.

    Args:
        event: Passed through unchanged to ``_identify_affected_metrics``.
        anomaly_score: Overall score; values above 0.8, 0.6 and 0.4 select
            the "immediate action", "investigate" and "monitor" summaries.

    Returns:
        A list of at most four strings, per-metric lines first and the
        summary line (if any) last.
    """
    # NOTE(review): the emoji prefixes were mis-encoded in the reviewed copy
    # of this file and have been restored. The glyphs on the "elevated",
    # "increasing" and "MONITOR" lines could not be recovered unambiguously;
    # the ones used here are best guesses - confirm against the intended text.
    max_recommendations = 4
    severity_order = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2}

    # sorted() is stable, so metrics of equal severity keep their original
    # relative order. Unknown severities sort last.
    prioritized = sorted(
        self._identify_affected_metrics(event),
        key=lambda item: severity_order.get(item["severity"], len(severity_order)),
    )

    metric_lines: List[str] = []
    for item in prioritized:
        metric_name = item["metric"]
        severity = item["severity"]
        value = item["value"]
        threshold = item["threshold"]

        if metric_name == "latency":
            if severity == "CRITICAL":
                metric_lines.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
            elif severity == "HIGH":
                metric_lines.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
            else:
                metric_lines.append(f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")

        elif metric_name == "error_rate":
            # Fix the precision of the threshold too, so a float product such
            # as 0.07 * 100 cannot leak rounding noise into the message.
            observed = f"{value * 100:.1f}%"
            limit = f"{threshold * 100:.1f}%"
            if severity == "CRITICAL":
                metric_lines.append(f"🚨 CRITICAL: Error rate {observed} (>{limit}) - Check recent deployments")
            elif severity == "HIGH":
                metric_lines.append(f"⚠️ HIGH: Error rate {observed} (>{limit}) - Review application logs")
            else:
                metric_lines.append(f"📈 Errors increasing: {observed} (>{limit})")

        elif metric_name == "cpu":
            metric_lines.append(f"🔥 CPU {severity}: {value * 100:.1f}% utilization - Consider scaling")

        elif metric_name == "memory":
            metric_lines.append(f"💾 Memory {severity}: {value * 100:.1f}% utilization - Check for memory leaks")

    # Overall summary derived from the anomaly score alone.
    if anomaly_score > 0.8:
        summary = "🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected"
    elif anomaly_score > 0.6:
        summary = "🎯 INVESTIGATE: Significant performance degradation detected"
    elif anomaly_score > 0.4:
        summary = "👀 MONITOR: Early warning signs detected"
    else:
        summary = None

    if summary is None:
        return metric_lines[:max_recommendations]

    # Reserve the final slot for the summary so it survives truncation.
    return metric_lines[:max_recommendations - 1] + [summary]
|
| 289 |
|
| 290 |
def _classify_severity(self, anomaly_score: float) -> str:
|
| 291 |
if anomaly_score > 0.8:
|
|
|
|
| 318 |
]
|
| 319 |
}
|
| 320 |
|
| 321 |
+
def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """Match ``event`` against known failure patterns and rank the results.

    Four patterns are checked independently, so more than one can match:
    extreme latency with high errors, simultaneous CPU and memory
    saturation, an error spike without latency impact, and moderate latency
    with moderate errors. If none match, a single low-confidence "Unknown"
    entry is returned instead - this includes events whose readings are all
    below every limit.

    Args:
        event: Object exposing ``latency_p99``, ``error_rate``, ``cpu_util``
            and ``memory_util``. The two utilisation fields may be ``None``,
            which disables the resource-exhaustion pattern.

    Returns:
        A non-empty list of dicts with the keys ``cause``, ``confidence``,
        ``evidence`` and ``investigation``, sorted by ``confidence`` from
        highest to lowest so the first entry is the most likely cause.
    """
    latency = event.latency_p99
    error_rate = event.error_rate
    cpu = event.cpu_util
    memory = event.memory_util
    error_pct = f"{error_rate * 100:.1f}%"

    causes: List[Dict[str, Any]] = []

    # High latency + high errors pattern
    if latency > 500 and error_rate > 0.2:
        causes.append({
            "cause": "Database/External Dependency Failure",
            "confidence": 0.85,
            "evidence": f"Extreme latency ({latency}ms) with high errors ({error_pct})",
            "investigation": "Check database connection pool, external API health",
        })

    # Resource exhaustion pattern (needs both readings to be present)
    if cpu is not None and memory is not None and cpu > 0.9 and memory > 0.9:
        causes.append({
            "cause": "Resource Exhaustion",
            "confidence": 0.90,
            "evidence": f"CPU ({cpu * 100:.1f}%) and Memory ({memory * 100:.1f}%) critically high",
            "investigation": "Check for memory leaks, infinite loops, insufficient resources",
        })

    # Error spike pattern
    if error_rate > 0.3 and latency < 200:
        causes.append({
            "cause": "Application Bug / Configuration Issue",
            "confidence": 0.75,
            "evidence": f"High error rate ({error_pct}) without latency impact",
            "investigation": "Review recent deployments, configuration changes, application logs",
        })

    # Gradual degradation pattern
    if 200 <= latency <= 400 and 0.05 <= error_rate <= 0.15:
        causes.append({
            "cause": "Gradual Performance Degradation",
            "confidence": 0.65,
            "evidence": f"Moderate latency ({latency}ms) and errors ({error_pct})",
            "investigation": "Check resource trends, dependency performance, capacity planning",
        })

    if not causes:
        causes.append({
            "cause": "Unknown - Requires Investigation",
            "confidence": 0.3,
            "evidence": "Pattern does not match known failure modes",
            "investigation": "Complete system review needed",
        })

    # Rank by confidence so callers that read the first entry get the most
    # likely hypothesis; list.sort is stable, so ties keep pattern order.
    causes.sort(key=lambda cause: cause["confidence"], reverse=True)
    return causes
|
| 370 |
|
| 371 |
def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
|
| 372 |
"""Identify evidence patterns"""
|