petter2025 commited on
Commit
714bfce
·
verified ·
1 Parent(s): 1437f82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +732 -70
app.py CHANGED
@@ -2,8 +2,14 @@
2
  Enterprise Agentic Reliability Framework - Main Application
3
  Multi-Agent AI System for Production Reliability Monitoring
4
 
5
- This module provides the main Gradio UI and orchestrates the reliability
6
- monitoring system with anomaly detection, predictive analytics, and auto-healing.
 
 
 
 
 
 
7
  """
8
 
9
  import os
@@ -20,11 +26,11 @@ from collections import deque
20
  from dataclasses import dataclass, asdict
21
  import hashlib
22
  import asyncio
 
23
 
24
  # Import our modules
25
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
26
  from healing_policies import PolicyEngine
27
- from agent_orchestrator import OrchestrationManager
28
 
29
  # === Logging Configuration ===
30
  logging.basicConfig(
@@ -47,8 +53,10 @@ class Config:
47
  # Thresholds
48
  LATENCY_WARNING: float = 150.0
49
  LATENCY_CRITICAL: float = 300.0
 
50
  ERROR_RATE_WARNING: float = 0.05
51
- ERROR_RATE_CRITICAL: float = 0.15
 
52
  CPU_WARNING: float = 0.8
53
  CPU_CRITICAL: float = 0.9
54
  MEMORY_WARNING: float = 0.8
@@ -59,6 +67,10 @@ class Config:
59
  MAX_EVENTS_STORED: int = 1000
60
  AGENT_TIMEOUT: int = 10
61
  CACHE_EXPIRY_MINUTES: int = 15
 
 
 
 
62
 
63
  config = Config()
64
 
@@ -214,7 +226,10 @@ class ForecastResult:
214
  risk_level: str = "low" # low, medium, high, critical
215
 
216
  class SimplePredictiveEngine:
217
- """Lightweight forecasting engine optimized for Hugging Face Spaces"""
 
 
 
218
 
219
  def __init__(self, history_window: int = config.HISTORY_WINDOW):
220
  self.history_window = history_window
@@ -225,7 +240,13 @@ class SimplePredictiveEngine:
225
  logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
226
 
227
  def add_telemetry(self, service: str, event_data: Dict) -> None:
228
- """Add telemetry data to service history"""
 
 
 
 
 
 
229
  with self._lock:
230
  if service not in self.service_history:
231
  self.service_history[service] = deque(maxlen=self.history_window)
@@ -256,7 +277,16 @@ class SimplePredictiveEngine:
256
  logger.debug(f"Cleaned {len(expired)} expired cache entries")
257
 
258
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
259
- """Forecast service health metrics"""
 
 
 
 
 
 
 
 
 
260
  with self._lock:
261
  if service not in self.service_history or len(self.service_history[service]) < 10:
262
  return []
@@ -288,7 +318,16 @@ class SimplePredictiveEngine:
288
  return forecasts
289
 
290
  def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
291
- """Forecast latency using linear regression and trend analysis"""
 
 
 
 
 
 
 
 
 
292
  try:
293
  latencies = [point['latency'] for point in history[-20:]]
294
 
@@ -307,25 +346,25 @@ class SimplePredictiveEngine:
307
  residuals = latencies - (slope * x + intercept)
308
  confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
309
 
310
- # Determine trend
311
  if slope > 5:
312
  trend = "increasing"
313
- risk = "high" if predicted_latency > config.LATENCY_CRITICAL else "medium"
314
  elif slope < -2:
315
  trend = "decreasing"
316
  risk = "low"
317
  else:
318
  trend = "stable"
319
- risk = "low"
320
 
321
  # Calculate time to reach critical threshold (500ms)
322
  time_to_critical = None
323
- if slope > 0 and predicted_latency < 500:
324
  denominator = predicted_latency - latencies[-1]
325
  if abs(denominator) > 0.1: # Avoid division by very small numbers
326
- time_to_critical = datetime.timedelta(
327
- minutes=lookahead_minutes * (500 - predicted_latency) / denominator
328
- )
329
 
330
  return ForecastResult(
331
  metric="latency",
@@ -341,7 +380,16 @@ class SimplePredictiveEngine:
341
  return None
342
 
343
  def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
344
- """Forecast error rate using exponential smoothing"""
 
 
 
 
 
 
 
 
 
345
  try:
346
  error_rates = [point['error_rate'] for point in history[-15:]]
347
 
@@ -361,13 +409,13 @@ class SimplePredictiveEngine:
361
 
362
  if recent_trend > 0.02:
363
  trend = "increasing"
364
- risk = "high" if predicted_rate > 0.1 else "medium"
365
  elif recent_trend < -0.01:
366
  trend = "decreasing"
367
  risk = "low"
368
  else:
369
  trend = "stable"
370
- risk = "low"
371
 
372
  # Confidence based on volatility
373
  confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
@@ -385,7 +433,16 @@ class SimplePredictiveEngine:
385
  return None
386
 
387
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
388
- """Forecast CPU and memory utilization"""
 
 
 
 
 
 
 
 
 
389
  forecasts = []
390
 
391
  # CPU forecast
@@ -441,7 +498,15 @@ class SimplePredictiveEngine:
441
  return forecasts
442
 
443
  def get_predictive_insights(self, service: str) -> Dict[str, Any]:
444
- """Generate actionable insights from forecasts"""
 
 
 
 
 
 
 
 
445
  forecasts = self.forecast_service_health(service)
446
 
447
  critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
@@ -483,7 +548,10 @@ events_history_store = ThreadSafeEventStore()
483
  predictive_engine = SimplePredictiveEngine()
484
 
485
  class BusinessImpactCalculator:
486
- """Calculate business impact of anomalies"""
 
 
 
487
 
488
  def __init__(self, revenue_per_request: float = 0.01):
489
  self.revenue_per_request = revenue_per_request
@@ -498,12 +566,13 @@ class BusinessImpactCalculator:
498
  duration_minutes: Assumed duration of the incident
499
 
500
  Returns:
501
- Dictionary containing impact estimates
502
  """
503
- base_revenue_per_minute = 100
504
 
505
  impact_multiplier = 1.0
506
 
 
507
  if event.latency_p99 > config.LATENCY_CRITICAL:
508
  impact_multiplier += 0.5
509
  if event.error_rate > 0.1:
@@ -513,10 +582,11 @@ class BusinessImpactCalculator:
513
 
514
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
515
 
516
- base_users_affected = 1000
517
  user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
518
  affected_users = int(base_users_affected * user_impact_multiplier)
519
 
 
520
  if revenue_loss > 500 or affected_users > 5000:
521
  severity = "CRITICAL"
522
  elif revenue_loss > 100 or affected_users > 1000:
@@ -526,7 +596,7 @@ class BusinessImpactCalculator:
526
  else:
527
  severity = "LOW"
528
 
529
- logger.info(f"Business impact calculated: ${revenue_loss:.2f} revenue loss, {affected_users} users affected, {severity} severity")
530
 
531
  return {
532
  'revenue_loss_estimate': round(revenue_loss, 2),
@@ -538,7 +608,10 @@ class BusinessImpactCalculator:
538
  business_calculator = BusinessImpactCalculator()
539
 
540
  class AdvancedAnomalyDetector:
541
- """Enhanced anomaly detection with adaptive thresholds"""
 
 
 
542
 
543
  def __init__(self):
544
  self.historical_data = deque(maxlen=100)
@@ -551,7 +624,7 @@ class AdvancedAnomalyDetector:
551
 
552
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
553
  """
554
- Detect if event is anomalous
555
 
556
  Args:
557
  event: The reliability event to check
@@ -590,41 +663,597 @@ class AdvancedAnomalyDetector:
590
 
591
  anomaly_detector = AdvancedAnomalyDetector()
592
 
593
- # === Predictive Agent Integration ===
594
- class PredictiveAgent:
595
- """Predictive agent that uses SimplePredictiveEngine"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  def __init__(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
  self.engine = predictive_engine
599
  logger.info("Initialized PredictiveAgent")
600
 
601
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
602
- """Predictive analysis for future risks"""
603
- event_data = {
604
- 'latency_p99': event.latency_p99,
605
- 'error_rate': event.error_rate,
606
- 'throughput': event.throughput,
607
- 'cpu_util': event.cpu_util,
608
- 'memory_util': event.memory_util
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609
  }
610
- self.engine.add_telemetry(event.component, event_data)
 
 
 
 
611
 
612
- insights = self.engine.get_predictive_insights(event.component)
 
 
 
 
 
 
 
 
 
613
 
614
- return {
615
- 'specialization': 'predictive_analytics',
616
- 'confidence': 0.8 if insights['critical_risk_count'] > 0 else 0.5,
617
- 'findings': insights,
618
- 'recommendations': insights['recommendations']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
- # Initialize orchestration with predictive agent
622
  orchestration_manager = OrchestrationManager()
623
- orchestration_manager.agents['predictive_analytics'] = PredictiveAgent()
624
 
625
  # === Enhanced Reliability Engine ===
626
  class EnhancedReliabilityEngine:
627
- """Main engine for processing reliability events"""
 
 
 
628
 
629
  def __init__(self):
630
  self.performance_metrics = {
@@ -645,7 +1274,7 @@ class EnhancedReliabilityEngine:
645
  memory_util: Optional[float] = None
646
  ) -> Dict[str, Any]:
647
  """
648
- Process a reliability event through the multi-agent system
649
 
650
  Args:
651
  component: Service component name
@@ -656,7 +1285,7 @@ class EnhancedReliabilityEngine:
656
  memory_util: Memory utilization (0-1)
657
 
658
  Returns:
659
- Dictionary containing analysis results
660
  """
661
  logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
662
 
@@ -684,6 +1313,7 @@ class EnhancedReliabilityEngine:
684
  else:
685
  agent_confidence = 0.8 if is_anomaly else 0.1
686
 
 
687
  if agent_confidence > 0.8:
688
  event.severity = EventSeverity.CRITICAL
689
  elif agent_confidence > 0.6:
@@ -699,7 +1329,7 @@ class EnhancedReliabilityEngine:
699
  # Calculate business impact
700
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
701
 
702
- # Store in vector database
703
  if thread_safe_index is not None and model is not None and is_anomaly:
704
  try:
705
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
@@ -709,7 +1339,7 @@ class EnhancedReliabilityEngine:
709
  except Exception as e:
710
  logger.error(f"Error storing vector: {e}", exc_info=True)
711
 
712
- # Build result
713
  result = {
714
  "timestamp": event.timestamp,
715
  "component": component,
@@ -728,10 +1358,10 @@ class EnhancedReliabilityEngine:
728
  }
729
  }
730
 
731
- # Store event
732
  events_history_store.add(event)
733
 
734
- # Update metrics
735
  with self._lock:
736
  self.performance_metrics['total_incidents_processed'] += 1
737
  self.performance_metrics['multi_agent_analyses'] += 1
@@ -754,10 +1384,17 @@ def validate_inputs(
754
  memory_util: Optional[float]
755
  ) -> Tuple[bool, str]:
756
  """
757
- Validate user inputs
758
 
 
 
 
 
 
 
 
759
  Returns:
760
- Tuple of (is_valid, error_message)
761
  """
762
  if not (0 <= latency <= 10000):
763
  return False, "❌ Invalid latency: must be between 0-10000ms"
@@ -772,9 +1409,13 @@ def validate_inputs(
772
 
773
  return True, ""
774
 
775
- # === Enhanced UI with Multi-Agent Insights ===
776
  def create_enhanced_ui():
777
- """Create the Gradio UI for the reliability framework"""
 
 
 
 
778
 
779
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
780
  gr.Markdown("""
@@ -882,9 +1523,19 @@ def create_enhanced_ui():
882
 
883
  gr.Markdown("\n\n".join(policy_info))
884
 
885
- # ✅ FIXED: Synchronous wrapper for async function
886
  def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
887
- """Synchronous wrapper for async event processing - FIXES GRADIO ASYNC ISSUE"""
 
 
 
 
 
 
 
 
 
 
888
  try:
889
  # Type conversion
890
  latency = float(latency)
@@ -893,13 +1544,13 @@ def create_enhanced_ui():
893
  cpu_util = float(cpu_util) if cpu_util else None
894
  memory_util = float(memory_util) if memory_util else None
895
 
896
- # Input validation
897
  is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
898
  if not is_valid:
899
  logger.warning(f"Invalid input: {error_msg}")
900
  return error_msg, {}, {}, gr.Dataframe(value=[])
901
 
902
- # Create new event loop for async execution
903
  loop = asyncio.new_event_loop()
904
  asyncio.set_event_loop(loop)
905
 
@@ -913,7 +1564,7 @@ def create_enhanced_ui():
913
  finally:
914
  loop.close()
915
 
916
- # Build table data
917
  table_data = []
918
  for event in events_history_store.get_recent(15):
919
  table_data.append([
@@ -945,7 +1596,11 @@ def create_enhanced_ui():
945
 
946
  if result["business_impact"]:
947
  impact = result["business_impact"]
948
- output_msg += f"\n💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | 👥 {impact['affected_users_estimate']} users | 🚨 {impact['severity_level']}"
 
 
 
 
949
 
950
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
951
  actions = ", ".join(result["healing_actions"])
@@ -974,32 +1629,39 @@ def create_enhanced_ui():
974
  logger.error(error_msg, exc_info=True)
975
  return error_msg, {}, {}, gr.Dataframe(value=[])
976
 
977
- # ✅ FIXED: Use sync wrapper instead of async function
978
  submit_btn.click(
979
- fn=submit_event_enhanced_sync,
980
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
981
  outputs=[output_text, agent_insights, predictive_insights, events_table]
982
  )
983
 
984
  return demo
985
 
 
986
  if __name__ == "__main__":
987
- logger.info("Starting Enterprise Agentic Reliability Framework...")
 
 
988
  logger.info(f"Total events in history: {events_history_store.count()}")
989
  logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
 
 
990
 
991
  demo = create_enhanced_ui()
992
 
993
- logger.info("Launching Gradio UI...")
994
  demo.launch(
995
  server_name="0.0.0.0",
996
  server_port=7860,
997
  share=False
998
  )
999
 
1000
- # Save any pending vectors on shutdown
1001
  if thread_safe_index:
1002
- logger.info("Saving pending vectors...")
1003
  thread_safe_index.force_save()
1004
 
1005
- logger.info("Application shutdown complete")
 
 
 
2
  Enterprise Agentic Reliability Framework - Main Application
3
  Multi-Agent AI System for Production Reliability Monitoring
4
 
5
+ This module provides the complete reliability monitoring system including:
6
+ - Multi-agent anomaly detection and root cause analysis
7
+ - Predictive analytics and forecasting
8
+ - Policy-based auto-healing
9
+ - Business impact quantification
10
+ - Vector-based incident memory
11
+ - Adaptive thresholds
12
+ - Thread-safe concurrent operations
13
  """
14
 
15
  import os
 
26
  from dataclasses import dataclass, asdict
27
  import hashlib
28
  import asyncio
29
+ from enum import Enum
30
 
31
  # Import our modules
32
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
33
  from healing_policies import PolicyEngine
 
34
 
35
  # === Logging Configuration ===
36
  logging.basicConfig(
 
53
  # Thresholds
54
  LATENCY_WARNING: float = 150.0
55
  LATENCY_CRITICAL: float = 300.0
56
+ LATENCY_EXTREME: float = 500.0
57
  ERROR_RATE_WARNING: float = 0.05
58
+ ERROR_RATE_HIGH: float = 0.15
59
+ ERROR_RATE_CRITICAL: float = 0.3
60
  CPU_WARNING: float = 0.8
61
  CPU_CRITICAL: float = 0.9
62
  MEMORY_WARNING: float = 0.8
 
67
  MAX_EVENTS_STORED: int = 1000
68
  AGENT_TIMEOUT: int = 10
69
  CACHE_EXPIRY_MINUTES: int = 15
70
+
71
+ # Business metrics
72
+ BASE_REVENUE_PER_MINUTE: float = 100.0
73
+ BASE_USERS: int = 1000
74
 
75
  config = Config()
76
 
 
226
  risk_level: str = "low" # low, medium, high, critical
227
 
228
  class SimplePredictiveEngine:
229
+ """
230
+ Lightweight forecasting engine optimized for Hugging Face Spaces.
231
+ Uses statistical methods for time-series prediction.
232
+ """
233
 
234
  def __init__(self, history_window: int = config.HISTORY_WINDOW):
235
  self.history_window = history_window
 
240
  logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
241
 
242
  def add_telemetry(self, service: str, event_data: Dict) -> None:
243
+ """
244
+ Add telemetry data to service history
245
+
246
+ Args:
247
+ service: Service name
248
+ event_data: Dictionary containing metrics (latency_p99, error_rate, etc.)
249
+ """
250
  with self._lock:
251
  if service not in self.service_history:
252
  self.service_history[service] = deque(maxlen=self.history_window)
 
277
  logger.debug(f"Cleaned {len(expired)} expired cache entries")
278
 
279
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
280
+ """
281
+ Forecast service health metrics
282
+
283
+ Args:
284
+ service: Service name to forecast
285
+ lookahead_minutes: Time horizon in minutes
286
+
287
+ Returns:
288
+ List of forecast results for different metrics
289
+ """
290
  with self._lock:
291
  if service not in self.service_history or len(self.service_history[service]) < 10:
292
  return []
 
318
  return forecasts
319
 
320
  def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
321
+ """
322
+ Forecast latency using linear regression and trend analysis
323
+
324
+ Args:
325
+ history: Historical telemetry data
326
+ lookahead_minutes: Forecast horizon
327
+
328
+ Returns:
329
+ ForecastResult or None if insufficient data
330
+ """
331
  try:
332
  latencies = [point['latency'] for point in history[-20:]]
333
 
 
346
  residuals = latencies - (slope * x + intercept)
347
  confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
348
 
349
+ # Determine trend and risk
350
  if slope > 5:
351
  trend = "increasing"
352
+ risk = "critical" if predicted_latency > config.LATENCY_EXTREME else "high"
353
  elif slope < -2:
354
  trend = "decreasing"
355
  risk = "low"
356
  else:
357
  trend = "stable"
358
+ risk = "low" if predicted_latency < config.LATENCY_WARNING else "medium"
359
 
360
  # Calculate time to reach critical threshold (500ms)
361
  time_to_critical = None
362
+ if slope > 0 and predicted_latency < config.LATENCY_EXTREME:
363
  denominator = predicted_latency - latencies[-1]
364
  if abs(denominator) > 0.1: # Avoid division by very small numbers
365
+ minutes_to_critical = lookahead_minutes * (config.LATENCY_EXTREME - predicted_latency) / denominator
366
+ if minutes_to_critical > 0:
367
+ time_to_critical = datetime.timedelta(minutes=minutes_to_critical)
368
 
369
  return ForecastResult(
370
  metric="latency",
 
380
  return None
381
 
382
  def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
383
+ """
384
+ Forecast error rate using exponential smoothing
385
+
386
+ Args:
387
+ history: Historical telemetry data
388
+ lookahead_minutes: Forecast horizon
389
+
390
+ Returns:
391
+ ForecastResult or None if insufficient data
392
+ """
393
  try:
394
  error_rates = [point['error_rate'] for point in history[-15:]]
395
 
 
409
 
410
  if recent_trend > 0.02:
411
  trend = "increasing"
412
+ risk = "critical" if predicted_rate > config.ERROR_RATE_CRITICAL else "high"
413
  elif recent_trend < -0.01:
414
  trend = "decreasing"
415
  risk = "low"
416
  else:
417
  trend = "stable"
418
+ risk = "low" if predicted_rate < config.ERROR_RATE_WARNING else "medium"
419
 
420
  # Confidence based on volatility
421
  confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
 
433
  return None
434
 
435
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
436
+ """
437
+ Forecast CPU and memory utilization
438
+
439
+ Args:
440
+ history: Historical telemetry data
441
+ lookahead_minutes: Forecast horizon
442
+
443
+ Returns:
444
+ List of forecast results for CPU and memory
445
+ """
446
  forecasts = []
447
 
448
  # CPU forecast
 
498
  return forecasts
499
 
500
  def get_predictive_insights(self, service: str) -> Dict[str, Any]:
501
+ """
502
+ Generate actionable insights from forecasts
503
+
504
+ Args:
505
+ service: Service name
506
+
507
+ Returns:
508
+ Dictionary containing warnings, recommendations, and forecast data
509
+ """
510
  forecasts = self.forecast_service_health(service)
511
 
512
  critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
 
548
  predictive_engine = SimplePredictiveEngine()
549
 
550
  class BusinessImpactCalculator:
551
+ """
552
+ Calculate business impact of anomalies including revenue loss
553
+ and user impact estimation
554
+ """
555
 
556
  def __init__(self, revenue_per_request: float = 0.01):
557
  self.revenue_per_request = revenue_per_request
 
566
  duration_minutes: Assumed duration of the incident
567
 
568
  Returns:
569
+ Dictionary containing revenue loss, user impact, and severity
570
  """
571
+ base_revenue_per_minute = config.BASE_REVENUE_PER_MINUTE
572
 
573
  impact_multiplier = 1.0
574
 
575
+ # Impact factors
576
  if event.latency_p99 > config.LATENCY_CRITICAL:
577
  impact_multiplier += 0.5
578
  if event.error_rate > 0.1:
 
582
 
583
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
584
 
585
+ base_users_affected = config.BASE_USERS
586
  user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
587
  affected_users = int(base_users_affected * user_impact_multiplier)
588
 
589
+ # Severity classification
590
  if revenue_loss > 500 or affected_users > 5000:
591
  severity = "CRITICAL"
592
  elif revenue_loss > 100 or affected_users > 1000:
 
596
  else:
597
  severity = "LOW"
598
 
599
+ logger.info(f"Business impact: ${revenue_loss:.2f} revenue loss, {affected_users} users, {severity} severity")
600
 
601
  return {
602
  'revenue_loss_estimate': round(revenue_loss, 2),
 
608
  business_calculator = BusinessImpactCalculator()
609
 
610
  class AdvancedAnomalyDetector:
611
+ """
612
+ Enhanced anomaly detection with adaptive thresholds that learn
613
+ from historical data patterns
614
+ """
615
 
616
  def __init__(self):
617
  self.historical_data = deque(maxlen=100)
 
624
 
625
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
626
  """
627
+ Detect if event is anomalous using adaptive thresholds
628
 
629
  Args:
630
  event: The reliability event to check
 
663
 
664
  anomaly_detector = AdvancedAnomalyDetector()
665
 
666
+ # === Multi-Agent System ===
667
+ class AgentSpecialization(Enum):
668
+ """Agent specialization types"""
669
+ DETECTIVE = "anomaly_detection"
670
+ DIAGNOSTICIAN = "root_cause_analysis"
671
+ PREDICTIVE = "predictive_analytics"
672
+
673
+ class BaseAgent:
674
+ """Base class for all specialized agents"""
675
+
676
+ def __init__(self, specialization: AgentSpecialization):
677
+ self.specialization = specialization
678
+ self.performance_metrics = {
679
+ 'processed_events': 0,
680
+ 'successful_analyses': 0,
681
+ 'average_confidence': 0.0
682
+ }
683
+
684
+ async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
685
+ """Base analysis method to be implemented by specialized agents"""
686
+ raise NotImplementedError
687
+
688
+ class AnomalyDetectionAgent(BaseAgent):
689
+ """
690
+ Specialized agent for anomaly detection and pattern recognition.
691
+ Calculates multi-dimensional anomaly scores and identifies affected metrics.
692
+ """
693
 
694
  def __init__(self):
695
+ super().__init__(AgentSpecialization.DETECTIVE)
696
+ logger.info("Initialized AnomalyDetectionAgent")
697
+
698
+ async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
699
+ """
700
+ Perform comprehensive anomaly analysis
701
+
702
+ Args:
703
+ event: Reliability event to analyze
704
+
705
+ Returns:
706
+ Dictionary containing anomaly score, severity, affected metrics, and recommendations
707
+ """
708
+ try:
709
+ anomaly_score = self._calculate_anomaly_score(event)
710
+
711
+ return {
712
+ 'specialization': self.specialization.value,
713
+ 'confidence': anomaly_score,
714
+ 'findings': {
715
+ 'anomaly_score': anomaly_score,
716
+ 'severity_tier': self._classify_severity(anomaly_score),
717
+ 'primary_metrics_affected': self._identify_affected_metrics(event)
718
+ },
719
+ 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
720
+ }
721
+ except Exception as e:
722
+ logger.error(f"AnomalyDetectionAgent error: {e}", exc_info=True)
723
+ return {
724
+ 'specialization': self.specialization.value,
725
+ 'confidence': 0.0,
726
+ 'findings': {},
727
+ 'recommendations': [f"Analysis error: {str(e)}"]
728
+ }
729
+
730
+ def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
731
+ """
732
+ Calculate comprehensive anomaly score (0-1) using weighted metrics
733
+
734
+ Args:
735
+ event: Reliability event
736
+
737
+ Returns:
738
+ Float between 0 and 1 representing anomaly severity
739
+ """
740
+ scores = []
741
+
742
+ # Latency anomaly (weighted 40%)
743
+ if event.latency_p99 > config.LATENCY_WARNING:
744
+ latency_score = min(1.0, (event.latency_p99 - config.LATENCY_WARNING) / 500)
745
+ scores.append(0.4 * latency_score)
746
+
747
+ # Error rate anomaly (weighted 30%)
748
+ if event.error_rate > config.ERROR_RATE_WARNING:
749
+ error_score = min(1.0, event.error_rate / 0.3)
750
+ scores.append(0.3 * error_score)
751
+
752
+ # Resource anomaly (weighted 30%)
753
+ resource_score = 0
754
+ if event.cpu_util and event.cpu_util > config.CPU_WARNING:
755
+ resource_score += 0.15 * min(1.0, (event.cpu_util - config.CPU_WARNING) / 0.2)
756
+ if event.memory_util and event.memory_util > config.MEMORY_WARNING:
757
+ resource_score += 0.15 * min(1.0, (event.memory_util - config.MEMORY_WARNING) / 0.2)
758
+ scores.append(resource_score)
759
+
760
+ return min(1.0, sum(scores))
761
+
762
+ def _classify_severity(self, anomaly_score: float) -> str:
763
+ """
764
+ Classify severity tier based on anomaly score
765
+
766
+ Args:
767
+ anomaly_score: Score between 0 and 1
768
+
769
+ Returns:
770
+ Severity tier string (LOW, MEDIUM, HIGH, CRITICAL)
771
+ """
772
+ if anomaly_score > 0.8:
773
+ return "CRITICAL"
774
+ elif anomaly_score > 0.6:
775
+ return "HIGH"
776
+ elif anomaly_score > 0.4:
777
+ return "MEDIUM"
778
+ else:
779
+ return "LOW"
780
+
781
+ def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
782
+ """
783
+ Identify which metrics are outside normal ranges
784
+
785
+ Args:
786
+ event: Reliability event
787
+
788
+ Returns:
789
+ List of dictionaries describing affected metrics with severity
790
+ """
791
+ affected = []
792
+
793
+ # Latency checks
794
+ if event.latency_p99 > config.LATENCY_EXTREME:
795
+ affected.append({
796
+ "metric": "latency",
797
+ "value": event.latency_p99,
798
+ "severity": "CRITICAL",
799
+ "threshold": config.LATENCY_WARNING
800
+ })
801
+ elif event.latency_p99 > config.LATENCY_CRITICAL:
802
+ affected.append({
803
+ "metric": "latency",
804
+ "value": event.latency_p99,
805
+ "severity": "HIGH",
806
+ "threshold": config.LATENCY_WARNING
807
+ })
808
+ elif event.latency_p99 > config.LATENCY_WARNING:
809
+ affected.append({
810
+ "metric": "latency",
811
+ "value": event.latency_p99,
812
+ "severity": "MEDIUM",
813
+ "threshold": config.LATENCY_WARNING
814
+ })
815
+
816
+ # Error rate checks
817
+ if event.error_rate > config.ERROR_RATE_CRITICAL:
818
+ affected.append({
819
+ "metric": "error_rate",
820
+ "value": event.error_rate,
821
+ "severity": "CRITICAL",
822
+ "threshold": config.ERROR_RATE_WARNING
823
+ })
824
+ elif event.error_rate > config.ERROR_RATE_HIGH:
825
+ affected.append({
826
+ "metric": "error_rate",
827
+ "value": event.error_rate,
828
+ "severity": "HIGH",
829
+ "threshold": config.ERROR_RATE_WARNING
830
+ })
831
+ elif event.error_rate > config.ERROR_RATE_WARNING:
832
+ affected.append({
833
+ "metric": "error_rate",
834
+ "value": event.error_rate,
835
+ "severity": "MEDIUM",
836
+ "threshold": config.ERROR_RATE_WARNING
837
+ })
838
+
839
+ # CPU checks
840
+ if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
841
+ affected.append({
842
+ "metric": "cpu",
843
+ "value": event.cpu_util,
844
+ "severity": "CRITICAL",
845
+ "threshold": config.CPU_WARNING
846
+ })
847
+ elif event.cpu_util and event.cpu_util > config.CPU_WARNING:
848
+ affected.append({
849
+ "metric": "cpu",
850
+ "value": event.cpu_util,
851
+ "severity": "HIGH",
852
+ "threshold": config.CPU_WARNING
853
+ })
854
+
855
+ # Memory checks
856
+ if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
857
+ affected.append({
858
+ "metric": "memory",
859
+ "value": event.memory_util,
860
+ "severity": "CRITICAL",
861
+ "threshold": config.MEMORY_WARNING
862
+ })
863
+ elif event.memory_util and event.memory_util > config.MEMORY_WARNING:
864
+ affected.append({
865
+ "metric": "memory",
866
+ "value": event.memory_util,
867
+ "severity": "HIGH",
868
+ "threshold": config.MEMORY_WARNING
869
+ })
870
+
871
+ return affected
872
+
873
+ def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
874
+ """
875
+ Generate actionable recommendations based on detected anomalies
876
+
877
+ Args:
878
+ event: Reliability event
879
+ anomaly_score: Calculated anomaly score
880
+
881
+ Returns:
882
+ List of recommendation strings with emojis for visibility
883
+ """
884
+ recommendations = []
885
+ affected_metrics = self._identify_affected_metrics(event)
886
+
887
+ for metric in affected_metrics:
888
+ metric_name = metric["metric"]
889
+ severity = metric["severity"]
890
+ value = metric["value"]
891
+ threshold = metric["threshold"]
892
+
893
+ if metric_name == "latency":
894
+ if severity == "CRITICAL":
895
+ recommendations.append(
896
+ f"🚨 CRITICAL: Latency {value:.0f}ms (>{threshold}ms) - "
897
+ f"Check database & external dependencies"
898
+ )
899
+ elif severity == "HIGH":
900
+ recommendations.append(
901
+ f"⚠️ HIGH: Latency {value:.0f}ms (>{threshold}ms) - "
902
+ f"Investigate service performance"
903
+ )
904
+ else:
905
+ recommendations.append(
906
+ f"📈 Latency elevated: {value:.0f}ms (>{threshold}ms) - Monitor trend"
907
+ )
908
+
909
+ elif metric_name == "error_rate":
910
+ if severity == "CRITICAL":
911
+ recommendations.append(
912
+ f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
913
+ f"Check recent deployments"
914
+ )
915
+ elif severity == "HIGH":
916
+ recommendations.append(
917
+ f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100:.1f}%) - "
918
+ f"Review application logs"
919
+ )
920
+ else:
921
+ recommendations.append(
922
+ f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100:.1f}%)"
923
+ )
924
+
925
+ elif metric_name == "cpu":
926
+ recommendations.append(
927
+ f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling"
928
+ )
929
+
930
+ elif metric_name == "memory":
931
+ recommendations.append(
932
+ f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks"
933
+ )
934
+
935
+ # Overall severity recommendations
936
+ if anomaly_score > 0.8:
937
+ recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
938
+ elif anomaly_score > 0.6:
939
+ recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
940
+ elif anomaly_score > 0.4:
941
+ recommendations.append("📊 MONITOR: Early warning signs detected")
942
+
943
+ return recommendations[:4] # Return top 4 recommendations
944
+
945
class RootCauseAgent(BaseAgent):
    """
    Specialized agent for root cause analysis.

    Matches the event's metric profile against known failure patterns and
    returns ranked hypotheses plus guidance on where to investigate first.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)
        logger.info("Initialized RootCauseAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Perform root cause analysis for a single reliability event.

        Args:
            event: Reliability event to analyze

        Returns:
            Dictionary with likely root causes, evidence patterns,
            investigation priority, and top recommendations.
        """
        try:
            hypotheses = self._analyze_potential_causes(event)
            top_two = hypotheses[:2]
            return {
                'specialization': self.specialization.value,
                'confidence': 0.7,
                'findings': {
                    'likely_root_causes': hypotheses,
                    'evidence_patterns': self._identify_evidence(event),
                    'investigation_priority': self._prioritize_investigation(hypotheses)
                },
                'recommendations': [
                    f"Check {cause['cause']} for issues" for cause in top_two
                ]
            }
        except Exception as e:
            logger.error(f"RootCauseAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"]
            }

    def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
        """
        Match the event against known failure-mode patterns.

        Args:
            event: Reliability event

        Returns:
            List of candidate causes, each with a confidence score,
            supporting evidence, and an investigation hint.
        """
        def hypothesis(cause: str, confidence: float, evidence: str, investigation: str) -> Dict[str, Any]:
            # Uniform record shape shared by every pattern below.
            return {
                "cause": cause,
                "confidence": confidence,
                "evidence": evidence,
                "investigation": investigation
            }

        candidates: List[Dict[str, Any]] = []

        # Pattern 1: extreme latency combined with heavy errors points at a dependency.
        if event.latency_p99 > config.LATENCY_EXTREME and event.error_rate > 0.2:
            candidates.append(hypothesis(
                "Database/External Dependency Failure",
                0.85,
                f"Extreme latency ({event.latency_p99:.0f}ms) with high errors ({event.error_rate*100:.1f}%)",
                "Check database connection pool, external API health"
            ))

        # Pattern 2: CPU and memory both past critical thresholds.
        cpu_critical = event.cpu_util and event.cpu_util > config.CPU_CRITICAL
        mem_critical = event.memory_util and event.memory_util > config.MEMORY_CRITICAL
        if cpu_critical and mem_critical:
            candidates.append(hypothesis(
                "Resource Exhaustion",
                0.90,
                f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
                "Check for memory leaks, infinite loops, insufficient resources"
            ))

        # Pattern 3: errors spiking while latency stays low suggests an app-level fault.
        if event.error_rate > config.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
            candidates.append(hypothesis(
                "Application Bug / Configuration Issue",
                0.75,
                f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
                "Review recent deployments, configuration changes, application logs"
            ))

        # Pattern 4: both metrics moderately elevated — slow decay rather than outage.
        moderate_latency = 200 <= event.latency_p99 <= 400
        moderate_errors = config.ERROR_RATE_WARNING <= event.error_rate <= config.ERROR_RATE_HIGH
        if moderate_latency and moderate_errors:
            candidates.append(hypothesis(
                "Gradual Performance Degradation",
                0.65,
                f"Moderate latency ({event.latency_p99:.0f}ms) and errors ({event.error_rate*100:.1f}%)",
                "Check resource trends, dependency performance, capacity planning"
            ))

        # Fallback when nothing above matched.
        if not candidates:
            candidates.append(hypothesis(
                "Unknown - Requires Investigation",
                0.3,
                "Pattern does not match known failure modes",
                "Complete system review needed"
            ))

        return candidates

    def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
        """
        Collect evidence-pattern tags present in the event data.

        Args:
            event: Reliability event

        Returns:
            List of evidence pattern identifiers (may be empty)
        """
        signals = [
            (event.latency_p99 > event.error_rate * 1000,
             "latency_disproportionate_to_errors"),
            (event.cpu_util and event.cpu_util > config.CPU_WARNING and
             event.memory_util and event.memory_util > config.MEMORY_WARNING,
             "correlated_resource_exhaustion"),
            (event.error_rate > config.ERROR_RATE_HIGH and event.latency_p99 < config.LATENCY_CRITICAL,
             "errors_without_latency_impact"),
        ]
        return [tag for present, tag in signals if present]

    def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
        """
        Derive an investigation priority from the identified causes.

        Args:
            causes: List of potential root causes

        Returns:
            "HIGH" when a dependency or resource-exhaustion cause is present,
            otherwise "MEDIUM".
        """
        urgent_markers = ("Database", "Resource Exhaustion")
        if any(marker in cause["cause"] for cause in causes for marker in urgent_markers):
            return "HIGH"
        return "MEDIUM"
1088
+
1089
class PredictiveAgent(BaseAgent):
    """
    Specialized agent for predictive analytics.

    Feeds each event's telemetry into the shared forecasting engine and
    surfaces forward-looking risk insights for the event's component.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.PREDICTIVE)
        # Shared module-level engine; accumulates per-service history across events.
        self.engine = predictive_engine
        logger.info("Initialized PredictiveAgent")

    async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Record the event's telemetry and produce predictive insights.

        Args:
            event: Current reliability event

        Returns:
            Dictionary containing forecasts and predictive insights
        """
        try:
            telemetry = {
                'latency_p99': event.latency_p99,
                'error_rate': event.error_rate,
                'throughput': event.throughput,
                'cpu_util': event.cpu_util,
                'memory_util': event.memory_util
            }
            self.engine.add_telemetry(event.component, telemetry)

            insights = self.engine.get_predictive_insights(event.component)
            # Report higher confidence when the engine flags at least one critical risk.
            has_critical_risk = insights['critical_risk_count'] > 0

            return {
                'specialization': self.specialization.value,
                'confidence': 0.8 if has_critical_risk else 0.5,
                'findings': insights,
                'recommendations': insights['recommendations']
            }
        except Exception as e:
            logger.error(f"PredictiveAgent error: {e}", exc_info=True)
            return {
                'specialization': self.specialization.value,
                'confidence': 0.0,
                'findings': {},
                'recommendations': [f"Analysis error: {str(e)}"]
            }
+
1137
class OrchestrationManager:
    """
    Orchestrates multiple specialized agents for comprehensive analysis.
    Runs agents concurrently (as asyncio Tasks) and synthesizes their findings.
    """

    def __init__(self):
        self.agents = {
            AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
            AgentSpecialization.PREDICTIVE: PredictiveAgent(),
        }
        logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")

    async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """
        Coordinate multiple agents for comprehensive analysis.

        Args:
            event: Reliability event to analyze

        Returns:
            Synthesized findings from all agents
        """
        # FIX: wrap each coroutine in a Task so the agents actually run
        # concurrently. Bare coroutines are not scheduled until awaited, so the
        # previous code executed agents strictly one after another (worst case
        # 5s timeout *per* agent). ensure_future schedules them all immediately.
        agent_tasks = {
            spec: asyncio.ensure_future(agent.analyze(event))
            for spec, agent in self.agents.items()
        }

        # Collect results with per-agent timeout protection; wait_for cancels a
        # task that exceeds its timeout, so no orphaned tasks are left running.
        agent_results = {}
        for specialization, task in agent_tasks.items():
            try:
                result = await asyncio.wait_for(task, timeout=5.0)
                agent_results[specialization.value] = result
                logger.debug(f"Agent {specialization.value} completed successfully")
            except asyncio.TimeoutError:
                logger.warning(f"Agent {specialization.value} timed out")
                continue
            except Exception as e:
                logger.error(f"Agent {specialization.value} error: {e}", exc_info=True)
                continue

        return self._synthesize_agent_findings(event, agent_results)

    def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
        """
        Combine insights from all specialized agents.

        Args:
            event: Original reliability event
            agent_results: Per-specialization results (keyed by specialization value)

        Returns:
            Synthesized analysis combining all agent findings, or an error dict
            when the detective agent produced no result.
        """
        detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
        diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
        predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)

        # The detective agent anchors the synthesis; without it there is
        # nothing meaningful to report.
        if not detective_result:
            logger.warning("No detective agent results available")
            return {'error': 'No agent results available'}

        synthesis = {
            'incident_summary': {
                'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                'anomaly_confidence': detective_result['confidence'],
                'primary_metrics_affected': [
                    metric["metric"] for metric in
                    detective_result['findings'].get('primary_metrics_affected', [])
                ]
            },
            'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
            'predictive_insights': predictive_result['findings'] if predictive_result else {},
            'recommended_actions': self._prioritize_actions(
                detective_result.get('recommendations', []),
                diagnostician_result.get('recommendations', []) if diagnostician_result else [],
                predictive_result.get('recommendations', []) if predictive_result else []
            ),
            'agent_metadata': {
                'participating_agents': list(agent_results.keys()),
                'analysis_timestamp': datetime.datetime.now().isoformat()
            }
        }

        return synthesis

    def _prioritize_actions(self, detection_actions: List[str],
                            diagnosis_actions: List[str],
                            predictive_actions: List[str]) -> List[str]:
        """
        Combine and prioritize actions from multiple agents.

        Detection actions come first, then diagnosis, then predictive, with
        duplicates removed while preserving first-seen order.

        Args:
            detection_actions: Actions from detective agent
            diagnosis_actions: Actions from diagnostician agent
            predictive_actions: Actions from predictive agent

        Returns:
            Prioritized list of up to 5 unique actions
        """
        # dict.fromkeys preserves insertion order and drops duplicates in one pass.
        merged = dict.fromkeys(detection_actions + diagnosis_actions + predictive_actions)
        return list(merged)[:5]
1247
 
1248
# Module-level singleton orchestrator shared by the reliability engine and the UI layer.
orchestration_manager = OrchestrationManager()
 
1250
 
1251
  # === Enhanced Reliability Engine ===
1252
  class EnhancedReliabilityEngine:
1253
+ """
1254
+ Main engine for processing reliability events through the multi-agent system.
1255
+ Coordinates anomaly detection, agent analysis, policy evaluation, and impact calculation.
1256
+ """
1257
 
1258
  def __init__(self):
1259
  self.performance_metrics = {
 
1274
  memory_util: Optional[float] = None
1275
  ) -> Dict[str, Any]:
1276
  """
1277
+ Process a reliability event through the complete analysis pipeline
1278
 
1279
  Args:
1280
  component: Service component name
 
1285
  memory_util: Memory utilization (0-1)
1286
 
1287
  Returns:
1288
+ Comprehensive analysis results including agent findings, healing actions, and business impact
1289
  """
1290
  logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
1291
 
 
1313
  else:
1314
  agent_confidence = 0.8 if is_anomaly else 0.1
1315
 
1316
+ # Set event severity
1317
  if agent_confidence > 0.8:
1318
  event.severity = EventSeverity.CRITICAL
1319
  elif agent_confidence > 0.6:
 
1329
  # Calculate business impact
1330
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
1331
 
1332
+ # Store in vector database for similarity detection
1333
  if thread_safe_index is not None and model is not None and is_anomaly:
1334
  try:
1335
  analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
 
1339
  except Exception as e:
1340
  logger.error(f"Error storing vector: {e}", exc_info=True)
1341
 
1342
+ # Build comprehensive result
1343
  result = {
1344
  "timestamp": event.timestamp,
1345
  "component": component,
 
1358
  }
1359
  }
1360
 
1361
+ # Store event in history
1362
  events_history_store.add(event)
1363
 
1364
+ # Update performance metrics
1365
  with self._lock:
1366
  self.performance_metrics['total_incidents_processed'] += 1
1367
  self.performance_metrics['multi_agent_analyses'] += 1
 
1384
  memory_util: Optional[float]
1385
  ) -> Tuple[bool, str]:
1386
  """
1387
+ Validate user inputs for bounds and type correctness
1388
 
1389
+ Args:
1390
+ latency: Latency value in milliseconds
1391
+ error_rate: Error rate (0-1)
1392
+ throughput: Throughput in requests/sec
1393
+ cpu_util: CPU utilization (0-1)
1394
+ memory_util: Memory utilization (0-1)
1395
+
1396
  Returns:
1397
+ Tuple of (is_valid: bool, error_message: str)
1398
  """
1399
  if not (0 <= latency <= 10000):
1400
  return False, "❌ Invalid latency: must be between 0-10000ms"
 
1409
 
1410
  return True, ""
1411
 
1412
+ # === Gradio UI ===
1413
  def create_enhanced_ui():
1414
+ """
1415
+ Create the comprehensive Gradio UI for the reliability framework.
1416
+ Includes telemetry input, multi-agent analysis display, predictive insights,
1417
+ and event history visualization.
1418
+ """
1419
 
1420
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
1421
  gr.Markdown("""
 
1523
 
1524
  gr.Markdown("\n\n".join(policy_info))
1525
 
1526
+ # ✅ FIXED: Synchronous wrapper for async function (CRITICAL FIX)
1527
  def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
1528
+ """
1529
+ Synchronous wrapper for async event processing.
1530
+ FIXES GRADIO ASYNC/SYNC COMPATIBILITY ISSUE.
1531
+
1532
+ This wrapper:
1533
+ 1. Validates inputs
1534
+ 2. Creates new event loop for async execution
1535
+ 3. Calls the async processing function
1536
+ 4. Formats results for display
1537
+ 5. Handles all errors gracefully
1538
+ """
1539
  try:
1540
  # Type conversion
1541
  latency = float(latency)
 
1544
  cpu_util = float(cpu_util) if cpu_util else None
1545
  memory_util = float(memory_util) if memory_util else None
1546
 
1547
+ # Input validation (CRITICAL FIX)
1548
  is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
1549
  if not is_valid:
1550
  logger.warning(f"Invalid input: {error_msg}")
1551
  return error_msg, {}, {}, gr.Dataframe(value=[])
1552
 
1553
+ # Create new event loop for async execution (CRITICAL FIX)
1554
  loop = asyncio.new_event_loop()
1555
  asyncio.set_event_loop(loop)
1556
 
 
1564
  finally:
1565
  loop.close()
1566
 
1567
+ # Build table data (THREAD-SAFE FIX)
1568
  table_data = []
1569
  for event in events_history_store.get_recent(15):
1570
  table_data.append([
 
1596
 
1597
  if result["business_impact"]:
1598
  impact = result["business_impact"]
1599
+ output_msg += (
1600
+ f"\n💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | "
1601
+ f"👥 {impact['affected_users_estimate']} users | "
1602
+ f"🚨 {impact['severity_level']}"
1603
+ )
1604
 
1605
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
1606
  actions = ", ".join(result["healing_actions"])
 
1629
  logger.error(error_msg, exc_info=True)
1630
  return error_msg, {}, {}, gr.Dataframe(value=[])
1631
 
1632
+ # ✅ FIXED: Use sync wrapper instead of async function (CRITICAL FIX)
1633
  submit_btn.click(
1634
+ fn=submit_event_enhanced_sync, # Synchronous wrapper
1635
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
1636
  outputs=[output_text, agent_insights, predictive_insights, events_table]
1637
  )
1638
 
1639
  return demo
1640
 
1641
# === Main Entry Point ===
if __name__ == "__main__":
    logger.info("=" * 80)
    logger.info("Starting Enterprise Agentic Reliability Framework")
    logger.info("=" * 80)
    logger.info(f"Total events in history: {events_history_store.count()}")
    logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
    logger.info(f"Agents initialized: {len(orchestration_manager.agents)}")
    logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")

    demo = create_enhanced_ui()

    logger.info("Launching Gradio UI on 0.0.0.0:7860...")
    try:
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False
        )
    finally:
        # FIX: launch() blocks until the server stops; on Ctrl-C it raises
        # KeyboardInterrupt, which previously skipped this shutdown code
        # entirely. try/finally guarantees pending vectors are flushed.
        if thread_safe_index:
            logger.info("Saving pending vectors before shutdown...")
            thread_safe_index.force_save()

        logger.info("=" * 80)
        logger.info("Application shutdown complete")
        logger.info("=" * 80)