Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

petter2025 committed on Nov 24, 2025

Commit

be77688

verified ·

1 Parent(s): 1c2c217

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -27

app.py CHANGED Viewed

@@ -211,18 +211,81 @@ class AnomalyDetectionAgent(BaseAgent):
         return min(1.0, sum(scores))
-    def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[str]:
-        """Identify which metrics are contributing to anomalies"""
-        affected = []
-        if event.latency_p99 > 150:
-            affected.append("latency")
-        if event.error_rate > 0.05:
-            affected.append("error_rate")
-        if event.cpu_util and event.cpu_util > 0.8:
-            affected.append("cpu_utilization")
-        if event.memory_util and event.memory_util > 0.8:
-            affected.append("memory_utilization")
-        return affected
     def _classify_severity(self, anomaly_score: float) -> str:
         if anomaly_score > 0.8:
@@ -255,21 +318,55 @@ class RootCauseAgent(BaseAgent):
             ]
         }
-    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[str]:
-        """Analyze potential root causes based on metrics"""
-        causes = []
-        if event.latency_p99 > 300 and event.error_rate > 0.1:
-            causes.append("database_connection_pool")
-            causes.append("external_dependency_timeout")
-        elif event.cpu_util and event.cpu_util > 0.9:
-            causes.append("resource_exhaustion")
-            causes.append("memory_leak")
-        elif event.error_rate > 0.2:
-            causes.append("recent_deployment")
-            causes.append("configuration_change")
-        return causes if causes else ["unknown_cause_requires_investigation"]
     def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
         """Identify evidence patterns"""

         return min(1.0, sum(scores))
+    def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
+        """Return the metrics that breach their thresholds, tagged with a severity.
+
+        Each check walks its tiers from most to least severe and reports the
+        first cutoff the observed value exceeds, so a metric appears at most
+        once in the result. A metric whose value is None is skipped.
+
+        Args:
+            event: Event exposing ``latency_p99``, ``error_rate``, ``cpu_util``
+                and ``memory_util``.
+
+        Returns:
+            One dict per affected metric with the keys ``metric``, ``value``,
+            ``severity`` and ``threshold``, in the order latency, error_rate,
+            cpu, memory. ``threshold`` is always the metric's lowest cutoff,
+            not the cutoff of the tier that matched.
+        """
+        # (metric name, observed value, reported threshold, (cutoff, severity) tiers
+        # ordered from most to least severe)
+        checks = [
+            ("latency", event.latency_p99, 150,
+             [(500, "CRITICAL"), (300, "HIGH"), (150, "MEDIUM")]),
+            ("error_rate", event.error_rate, 0.05,
+             [(0.3, "CRITICAL"), (0.15, "HIGH"), (0.05, "MEDIUM")]),
+            ("cpu", event.cpu_util, 0.8,
+             [(0.9, "CRITICAL"), (0.8, "HIGH")]),
+            ("memory", event.memory_util, 0.8,
+             [(0.9, "CRITICAL"), (0.8, "HIGH")]),
+        ]
+        affected = []
+        for metric, value, threshold, tiers in checks:
+            if value is None:
+                # Metric not reported for this event; nothing to compare.
+                continue
+            for cutoff, severity in tiers:
+                if value > cutoff:
+                    affected.append({
+                        "metric": metric,
+                        "value": value,
+                        "severity": severity,
+                        "threshold": threshold,
+                    })
+                    break  # only the most severe matching tier is reported
+        return affected
+    def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
+        """Build human-readable recommendations for the metrics an event breaches.
+
+        One message is produced per entry returned by
+        ``self._identify_affected_metrics(event)``, followed by at most one
+        overall message chosen from ``anomaly_score``.
+
+        Args:
+            event: Event passed through to ``_identify_affected_metrics``.
+            anomaly_score: Overall score; values above 0.8, 0.6 and 0.4 select
+                progressively milder overall messages, lower values add none.
+
+        Returns:
+            At most four recommendation strings.
+        """
+        recommendations = []
+        for metric in self._identify_affected_metrics(event):
+            metric_name = metric["metric"]
+            severity = metric["severity"]
+            value = metric["value"]
+            threshold = metric["threshold"]
+            if metric_name == "latency":
+                if severity == "CRITICAL":
+                    recommendations.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
+                elif severity == "HIGH":
+                    recommendations.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
+                else:
+                    recommendations.append(f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")
+            elif metric_name == "error_rate":
+                # Format the threshold with ":g": 0.05 * 100 is 5.000000000000001
+                # in binary floating point and would otherwise be printed in full.
+                if severity == "CRITICAL":
+                    recommendations.append(f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100:g}%) - Check recent deployments")
+                elif severity == "HIGH":
+                    recommendations.append(f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100:g}%) - Review application logs")
+                else:
+                    recommendations.append(f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100:g}%)")
+            elif metric_name == "cpu":
+                recommendations.append(f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling")
+            elif metric_name == "memory":
+                recommendations.append(f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
+        # Add an overall recommendation based on the anomaly score
+        if anomaly_score > 0.8:
+            recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
+        elif anomaly_score > 0.6:
+            recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
+        elif anomaly_score > 0.4:
+            recommendations.append("📊 MONITOR: Early warning signs detected")
+        # Cap the list at four entries. Entries are in metric order with the
+        # overall message last; they are not ranked by severity.
+        # NOTE(review): when four metrics are affected the overall message is
+        # the entry that gets dropped - confirm this is intended.
+        return recommendations[:4]
     def _classify_severity(self, anomaly_score: float) -> str:
         if anomaly_score > 0.8:
             ]
         }
+    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
+        """Match an event against known failure patterns and list likely causes.
+
+        The four patterns are checked independently, so several causes can be
+        returned for one event. When none matches, a single low-confidence
+        "unknown" entry is returned instead.
+
+        Args:
+            event: Event exposing ``latency_p99``, ``error_rate``, ``cpu_util``
+                and ``memory_util``.
+
+        Returns:
+            A non-empty list of dicts with the keys ``cause``, ``confidence``,
+            ``evidence`` and ``investigation``.
+        """
+        latency = event.latency_p99
+        error_rate = event.error_rate
+        cpu = event.cpu_util
+        memory = event.memory_util
+        causes = []
+
+        # High latency + high errors pattern
+        if latency > 500 and error_rate > 0.2:
+            causes.append({
+                "cause": "Database/External Dependency Failure",
+                "confidence": 0.85,
+                "evidence": f"Extreme latency ({latency}ms) with high errors ({error_rate*100:.1f}%)",
+                "investigation": "Check database connection pool, external API health"
+            })
+
+        # Resource exhaustion pattern: both CPU and memory must be reported
+        # (truthy) and above 0.9.
+        if cpu and cpu > 0.9 and memory and memory > 0.9:
+            causes.append({
+                "cause": "Resource Exhaustion",
+                "confidence": 0.90,
+                "evidence": f"CPU ({cpu*100:.1f}%) and Memory ({memory*100:.1f}%) critically high",
+                "investigation": "Check for memory leaks, infinite loops, insufficient resources"
+            })
+
+        # Error spike pattern: many errors while latency stays low
+        if error_rate > 0.3 and latency < 200:
+            causes.append({
+                "cause": "Application Bug / Configuration Issue",
+                "confidence": 0.75,
+                "evidence": f"High error rate ({error_rate*100:.1f}%) without latency impact",
+                "investigation": "Review recent deployments, configuration changes, application logs"
+            })
+
+        # Gradual degradation pattern: both metrics moderately elevated
+        if 200 <= latency <= 400 and 0.05 <= error_rate <= 0.15:
+            causes.append({
+                "cause": "Gradual Performance Degradation",
+                "confidence": 0.65,
+                "evidence": f"Moderate latency ({latency}ms) and errors ({error_rate*100:.1f}%)",
+                "investigation": "Check resource trends, dependency performance, capacity planning"
+            })
+
+        # Fallback so callers always receive at least one entry
+        if not causes:
+            causes.append({
+                "cause": "Unknown - Requires Investigation",
+                "confidence": 0.3,
+                "evidence": "Pattern does not match known failure modes",
+                "investigation": "Complete system review needed"
+            })
+        return causes
     def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
         """Identify evidence patterns"""