Update app.py
Browse files
app.py
CHANGED
|
@@ -182,9 +182,7 @@ class AnomalyDetectionAgent(BaseAgent):
|
|
| 182 |
'severity_tier': self._classify_severity(anomaly_score),
|
| 183 |
'primary_metrics_affected': self._identify_affected_metrics(event)
|
| 184 |
},
|
| 185 |
-
'recommendations':
|
| 186 |
-
f"Investigate {metric} anomalies" for metric in self._identify_affected_metrics(event)
|
| 187 |
-
]
|
| 188 |
}
|
| 189 |
|
| 190 |
def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
|
|
@@ -211,83 +209,8 @@ class AnomalyDetectionAgent(BaseAgent):
|
|
| 211 |
|
| 212 |
return min(1.0, sum(scores))
|
| 213 |
|
| 214 |
-
def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """List the metrics on ``event`` that exceed their alert cut-offs.

    Metrics are checked in a fixed order (latency, error_rate, cpu,
    memory) and each one contributes at most one entry, graded by the
    highest cut-off it strictly exceeds.

    Args:
        event: Object exposing ``latency_p99``, ``error_rate``,
            ``cpu_util`` and ``memory_util``.

    Returns:
        A list of dicts with keys ``metric``, ``value``, ``severity`` and
        ``threshold``; ``threshold`` is the metric's lowest alerting
        cut-off, not the tier that matched.
    """
    # (name, observed value, reported threshold, tiers high -> low, skip-if-falsy)
    checks = (
        ("latency", event.latency_p99, 150,
         ((500, "CRITICAL"), (300, "HIGH"), (150, "MEDIUM")), False),
        ("error_rate", event.error_rate, 0.05,
         ((0.3, "CRITICAL"), (0.15, "HIGH"), (0.05, "MEDIUM")), False),
        ("cpu", event.cpu_util, 0.8,
         ((0.9, "CRITICAL"), (0.8, "HIGH")), True),
        ("memory", event.memory_util, 0.8,
         ((0.9, "CRITICAL"), (0.8, "HIGH")), True),
    )

    findings = []
    for name, observed, base_threshold, tiers, skip_if_falsy in checks:
        # Resource readings may be absent (None/0); those are never flagged.
        if skip_if_falsy and not observed:
            continue
        # Tiers are ordered strictest first, so the first hit is the grade.
        grade = next((label for floor, label in tiers if observed > floor), None)
        if grade is None:
            continue
        findings.append({
            "metric": name,
            "value": observed,
            "severity": grade,
            "threshold": base_threshold,
        })
    return findings
|
| 246 |
-
|
| 247 |
-
def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
    """Build at most four human-readable recommendations for an event.

    One message is produced for each entry returned by
    ``self._identify_affected_metrics(event)`` whose metric is latency,
    error_rate, cpu or memory, followed by one optional overall message
    selected from ``anomaly_score`` (> 0.8, > 0.6 or > 0.4).

    The overall message is always kept. If the four-item cap would be
    exceeded, the least severe metric messages are dropped and the
    remaining ones are ordered most severe first; otherwise metric
    messages keep the order in which the metrics were reported.

    Args:
        event: Event forwarded to ``self._identify_affected_metrics``.
        anomaly_score: Score used only to pick the overall message.

    Returns:
        A list of up to four recommendation strings.
    """
    max_items = 4
    severity_rank = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2}

    # NOTE(review): the emoji prefixes were mis-decoded in the text this was
    # restored from; the chart and eyes emoji are best guesses - confirm
    # against the repository copy.
    ranked = []  # (severity rank, message) pairs, in metric order
    for metric in self._identify_affected_metrics(event):
        name = metric["metric"]
        severity = metric["severity"]
        value = metric["value"]
        threshold = metric["threshold"]

        if name == "latency":
            if severity == "CRITICAL":
                message = f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies"
            elif severity == "HIGH":
                message = f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance"
            else:
                message = f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend"
        elif name == "error_rate":
            # ':g' keeps float noise (e.g. 7.000000000000001) out of the text.
            limit_pct = f"{threshold * 100:g}"
            if severity == "CRITICAL":
                message = f"🚨 CRITICAL: Error rate {value * 100:.1f}% (>{limit_pct}%) - Check recent deployments"
            elif severity == "HIGH":
                message = f"⚠️ HIGH: Error rate {value * 100:.1f}% (>{limit_pct}%) - Review application logs"
            else:
                message = f"📈 Errors increasing: {value * 100:.1f}% (>{limit_pct}%)"
        elif name == "cpu":
            message = f"🔥 CPU {severity}: {value * 100:.1f}% utilization - Consider scaling"
        elif name == "memory":
            message = f"💾 Memory {severity}: {value * 100:.1f}% utilization - Check for memory leaks"
        else:
            # Unrecognised metrics produce no recommendation.
            continue

        ranked.append((severity_rank.get(severity, len(severity_rank)), message))

    # Overall guidance derived from the anomaly score alone.
    if anomaly_score > 0.8:
        overall = ["🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected"]
    elif anomaly_score > 0.6:
        overall = ["🎯 INVESTIGATE: Significant performance degradation detected"]
    elif anomaly_score > 0.4:
        overall = ["👀 MONITOR: Early warning signs detected"]
    else:
        overall = []

    # Reserve a slot for the overall message so the cap never removes it;
    # sorted() is stable, so ties keep their metric order.
    room = max_items - len(overall)
    if len(ranked) > room:
        ranked = sorted(ranked, key=lambda pair: pair[0])[:room]

    return [message for _, message in ranked] + overall
|
| 289 |
-
|
| 290 |
def _classify_severity(self, anomaly_score: float) -> str:
|
|
|
|
| 291 |
if anomaly_score > 0.8:
|
| 292 |
return "CRITICAL"
|
| 293 |
elif anomaly_score > 0.6:
|
|
@@ -296,6 +219,82 @@ def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_s
|
|
| 296 |
return "MEDIUM"
|
| 297 |
else:
|
| 298 |
return "LOW"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
class RootCauseAgent(BaseAgent):
|
| 301 |
def __init__(self):
|
|
@@ -314,59 +313,59 @@ class RootCauseAgent(BaseAgent):
|
|
| 314 |
'investigation_priority': self._prioritize_investigation(causes)
|
| 315 |
},
|
| 316 |
'recommendations': [
|
| 317 |
-
f"Check {cause} for issues" for cause in causes[:2]
|
| 318 |
]
|
| 319 |
}
|
| 320 |
|
| 321 |
-
def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
|
| 371 |
def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
|
| 372 |
"""Identify evidence patterns"""
|
|
@@ -377,13 +376,12 @@ def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, A
|
|
| 377 |
evidence.append("correlated_resource_exhaustion")
|
| 378 |
return evidence
|
| 379 |
|
| 380 |
-
def _prioritize_investigation(self, causes: List[str]) -> str:
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
return "MEDIUM"
|
| 387 |
|
| 388 |
class OrchestrationManager:
|
| 389 |
def __init__(self):
|
|
@@ -422,7 +420,7 @@ class OrchestrationManager:
|
|
| 422 |
'incident_summary': {
|
| 423 |
'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
|
| 424 |
'anomaly_confidence': detective_result['confidence'],
|
| 425 |
-
'primary_metrics_affected': detective_result['findings'].get('primary_metrics_affected', [])
|
| 426 |
},
|
| 427 |
'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
|
| 428 |
'recommended_actions': self._prioritize_actions(
|
|
|
|
| 182 |
'severity_tier': self._classify_severity(anomaly_score),
|
| 183 |
'primary_metrics_affected': self._identify_affected_metrics(event)
|
| 184 |
},
|
| 185 |
+
'recommendations': self._generate_detection_recommendations(event, anomaly_score)
|
|
|
|
|
|
|
| 186 |
}
|
| 187 |
|
| 188 |
def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
|
|
|
|
| 209 |
|
| 210 |
return min(1.0, sum(scores))
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
def _classify_severity(self, anomaly_score: float) -> str:
|
| 213 |
+
"""Classify severity based on anomaly score"""
|
| 214 |
if anomaly_score > 0.8:
|
| 215 |
return "CRITICAL"
|
| 216 |
elif anomaly_score > 0.6:
|
|
|
|
| 219 |
return "MEDIUM"
|
| 220 |
else:
|
| 221 |
return "LOW"
|
| 222 |
+
|
| 223 |
+
def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """List the metrics on ``event`` that exceed their alert cut-offs.

    Metrics are checked in a fixed order (latency, error_rate, cpu,
    memory) and each one contributes at most one entry, graded by the
    highest cut-off it strictly exceeds.

    Args:
        event: Object exposing ``latency_p99``, ``error_rate``,
            ``cpu_util`` and ``memory_util``.

    Returns:
        A list of dicts with keys ``metric``, ``value``, ``severity`` and
        ``threshold``; ``threshold`` is the metric's lowest alerting
        cut-off, not the tier that matched.
    """
    # (name, observed value, reported threshold, tiers high -> low, skip-if-falsy)
    checks = (
        ("latency", event.latency_p99, 150,
         ((500, "CRITICAL"), (300, "HIGH"), (150, "MEDIUM")), False),
        ("error_rate", event.error_rate, 0.05,
         ((0.3, "CRITICAL"), (0.15, "HIGH"), (0.05, "MEDIUM")), False),
        ("cpu", event.cpu_util, 0.8,
         ((0.9, "CRITICAL"), (0.8, "HIGH")), True),
        ("memory", event.memory_util, 0.8,
         ((0.9, "CRITICAL"), (0.8, "HIGH")), True),
    )

    findings = []
    for name, observed, base_threshold, tiers, skip_if_falsy in checks:
        # Resource readings may be absent (None/0); those are never flagged.
        if skip_if_falsy and not observed:
            continue
        # Tiers are ordered strictest first, so the first hit is the grade.
        grade = next((label for floor, label in tiers if observed > floor), None)
        if grade is None:
            continue
        findings.append({
            "metric": name,
            "value": observed,
            "severity": grade,
            "threshold": base_threshold,
        })
    return findings
|
| 255 |
+
|
| 256 |
+
def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
    """Build at most four human-readable recommendations for an event.

    One message is produced for each entry returned by
    ``self._identify_affected_metrics(event)`` whose metric is latency,
    error_rate, cpu or memory, followed by one optional overall message
    selected from ``anomaly_score`` (> 0.8, > 0.6 or > 0.4).

    The overall message is always kept. If the four-item cap would be
    exceeded, the least severe metric messages are dropped and the
    remaining ones are ordered most severe first; otherwise metric
    messages keep the order in which the metrics were reported.

    Args:
        event: Event forwarded to ``self._identify_affected_metrics``.
        anomaly_score: Score used only to pick the overall message.

    Returns:
        A list of up to four recommendation strings.
    """
    max_items = 4
    severity_rank = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2}

    # NOTE(review): the emoji prefixes were mis-decoded in the text this was
    # restored from; the chart and eyes emoji are best guesses - confirm
    # against the repository copy.
    ranked = []  # (severity rank, message) pairs, in metric order
    for metric in self._identify_affected_metrics(event):
        name = metric["metric"]
        severity = metric["severity"]
        value = metric["value"]
        threshold = metric["threshold"]

        if name == "latency":
            if severity == "CRITICAL":
                message = f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies"
            elif severity == "HIGH":
                message = f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance"
            else:
                message = f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend"
        elif name == "error_rate":
            # ':g' keeps float noise (e.g. 7.000000000000001) out of the text.
            limit_pct = f"{threshold * 100:g}"
            if severity == "CRITICAL":
                message = f"🚨 CRITICAL: Error rate {value * 100:.1f}% (>{limit_pct}%) - Check recent deployments"
            elif severity == "HIGH":
                message = f"⚠️ HIGH: Error rate {value * 100:.1f}% (>{limit_pct}%) - Review application logs"
            else:
                message = f"📈 Errors increasing: {value * 100:.1f}% (>{limit_pct}%)"
        elif name == "cpu":
            message = f"🔥 CPU {severity}: {value * 100:.1f}% utilization - Consider scaling"
        elif name == "memory":
            message = f"💾 Memory {severity}: {value * 100:.1f}% utilization - Check for memory leaks"
        else:
            # Unrecognised metrics produce no recommendation.
            continue

        ranked.append((severity_rank.get(severity, len(severity_rank)), message))

    # Overall guidance derived from the anomaly score alone.
    if anomaly_score > 0.8:
        overall = ["🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected"]
    elif anomaly_score > 0.6:
        overall = ["🎯 INVESTIGATE: Significant performance degradation detected"]
    elif anomaly_score > 0.4:
        overall = ["👀 MONITOR: Early warning signs detected"]
    else:
        overall = []

    # Reserve a slot for the overall message so the cap never removes it;
    # sorted() is stable, so ties keep their metric order.
    room = max_items - len(overall)
    if len(ranked) > room:
        ranked = sorted(ranked, key=lambda pair: pair[0])[:room]

    return [message for _, message in ranked] + overall
|
| 298 |
|
| 299 |
class RootCauseAgent(BaseAgent):
|
| 300 |
def __init__(self):
|
|
|
|
| 313 |
'investigation_priority': self._prioritize_investigation(causes)
|
| 314 |
},
|
| 315 |
'recommendations': [
|
| 316 |
+
f"Check {cause['cause']} for issues" for cause in causes[:2]
|
| 317 |
]
|
| 318 |
}
|
| 319 |
|
| 320 |
+
def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
    """Match ``event`` against known failure patterns and score each hit.

    Every pattern that matches contributes one cause; if none match, a
    single low-confidence "unknown" cause is returned instead. Causes are
    ordered by descending confidence so that consumers taking the first
    few entries see the most likely explanation first.

    Args:
        event: Object exposing ``latency_p99``, ``error_rate``,
            ``cpu_util`` and ``memory_util``.

    Returns:
        A non-empty list of dicts with keys ``cause``, ``confidence``,
        ``evidence`` and ``investigation``.
    """
    latency = event.latency_p99
    error_rate = event.error_rate
    cpu = event.cpu_util
    memory = event.memory_util

    causes = []

    # High latency + high errors pattern
    if latency > 500 and error_rate > 0.2:
        causes.append({
            "cause": "Database/External Dependency Failure",
            "confidence": 0.85,
            "evidence": f"Extreme latency ({latency}ms) with high errors ({error_rate*100:.1f}%)",
            "investigation": "Check database connection pool, external API health",
        })

    # Resource exhaustion pattern; missing (None/0) readings never match.
    if cpu and cpu > 0.9 and memory and memory > 0.9:
        causes.append({
            "cause": "Resource Exhaustion",
            "confidence": 0.90,
            "evidence": f"CPU ({cpu*100:.1f}%) and Memory ({memory*100:.1f}%) critically high",
            "investigation": "Check for memory leaks, infinite loops, insufficient resources",
        })

    # Error spike pattern
    if error_rate > 0.3 and latency < 200:
        causes.append({
            "cause": "Application Bug / Configuration Issue",
            "confidence": 0.75,
            "evidence": f"High error rate ({error_rate*100:.1f}%) without latency impact",
            "investigation": "Review recent deployments, configuration changes, application logs",
        })

    # Gradual degradation pattern
    if 200 <= latency <= 400 and 0.05 <= error_rate <= 0.15:
        causes.append({
            "cause": "Gradual Performance Degradation",
            "confidence": 0.65,
            "evidence": f"Moderate latency ({latency}ms) and errors ({error_rate*100:.1f}%)",
            "investigation": "Check resource trends, dependency performance, capacity planning",
        })

    if not causes:
        causes.append({
            "cause": "Unknown - Requires Investigation",
            "confidence": 0.3,
            "evidence": "Pattern does not match known failure modes",
            "investigation": "Complete system review needed",
        })

    # Most confident first; list.sort is stable, so equal scores keep
    # the rule order above.
    causes.sort(key=lambda entry: entry["confidence"], reverse=True)
    return causes
|
| 369 |
|
| 370 |
def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
|
| 371 |
"""Identify evidence patterns"""
|
|
|
|
| 376 |
evidence.append("correlated_resource_exhaustion")
|
| 377 |
return evidence
|
| 378 |
|
| 379 |
+
def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
    """Return the investigation priority implied by ``causes``.

    Args:
        causes: Cause dicts, each carrying a ``"cause"`` string.

    Returns:
        ``"HIGH"`` if any cause text contains ``"Database"`` or
        ``"Resource Exhaustion"``; otherwise ``"MEDIUM"`` (including for
        an empty list).
    """
    urgent_markers = ("Database", "Resource Exhaustion")
    # any() stops at the first matching cause, like an early return.
    is_urgent = any(
        marker in entry["cause"]
        for entry in causes
        for marker in urgent_markers
    )
    return "HIGH" if is_urgent else "MEDIUM"
|
|
|
|
| 385 |
|
| 386 |
class OrchestrationManager:
|
| 387 |
def __init__(self):
|
|
|
|
| 420 |
'incident_summary': {
|
| 421 |
'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
|
| 422 |
'anomaly_confidence': detective_result['confidence'],
|
| 423 |
+
'primary_metrics_affected': [metric["metric"] for metric in detective_result['findings'].get('primary_metrics_affected', [])]
|
| 424 |
},
|
| 425 |
'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
|
| 426 |
'recommended_actions': self._prioritize_actions(
|