petter2025 committed on
Commit
1437f82
·
verified ·
1 Parent(s): fc7752d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +452 -452
app.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import numpy as np
@@ -5,103 +13,256 @@ import gradio as gr
5
  import requests
6
  import pandas as pd
7
  import datetime
8
- from typing import List, Dict, Any
 
 
 
 
9
  import hashlib
10
  import asyncio
11
- from enum import Enum
12
- from dataclasses import dataclass
13
 
14
  # Import our modules
15
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
16
  from healing_policies import PolicyEngine
 
 
 
 
 
 
 
 
17
 
18
  # === Configuration ===
19
- HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
20
- HF_API_URL = "https://router.huggingface.co/hf-inference/v1/completions"
21
- HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # === FAISS & Embeddings Setup ===
24
  try:
25
  from sentence_transformers import SentenceTransformer
26
  import faiss
27
 
28
- VECTOR_DIM = 384
29
- INDEX_FILE = "incident_vectors.index"
30
- TEXTS_FILE = "incident_texts.json"
31
-
32
- # Try to load model with error handling
33
- try:
34
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
35
- except Exception as e:
36
- print(f"Model loading warning: {e}")
37
- from sentence_transformers import SentenceTransformer as ST
38
- model = ST("sentence-transformers/all-MiniLM-L6-v2")
39
-
40
- if os.path.exists(INDEX_FILE):
41
- index = faiss.read_index(INDEX_FILE)
42
- with open(TEXTS_FILE, "r") as f:
43
- incident_texts = json.load(f)
 
44
  else:
45
- index = faiss.IndexFlatL2(VECTOR_DIM)
 
46
  incident_texts = []
47
-
 
 
48
  except ImportError as e:
49
- print(f"Warning: FAISS or SentenceTransformers not available: {e}")
50
  index = None
51
  incident_texts = []
52
  model = None
53
-
54
- def save_index():
55
- """Save FAISS index and incident texts"""
56
- if index is not None:
57
- faiss.write_index(index, INDEX_FILE)
58
- with open(TEXTS_FILE, "w") as f:
59
- json.dump(incident_texts, f)
60
 
61
  # === Predictive Models ===
62
  @dataclass
63
  class ForecastResult:
 
64
  metric: str
65
  predicted_value: float
66
  confidence: float
67
  trend: str # "increasing", "decreasing", "stable"
68
- time_to_threshold: Any = None
69
  risk_level: str = "low" # low, medium, high, critical
70
 
71
  class SimplePredictiveEngine:
72
  """Lightweight forecasting engine optimized for Hugging Face Spaces"""
73
 
74
- def __init__(self, history_window: int = 50):
75
  self.history_window = history_window
76
- self.service_history: Dict[str, List] = {}
77
- self.prediction_cache: Dict[str, ForecastResult] = {}
78
-
79
- def add_telemetry(self, service: str, event_data: Dict):
 
 
 
80
  """Add telemetry data to service history"""
81
- if service not in self.service_history:
82
- self.service_history[service] = []
83
-
84
- telemetry_point = {
85
- 'timestamp': datetime.datetime.now(),
86
- 'latency': event_data.get('latency_p99', 0),
87
- 'error_rate': event_data.get('error_rate', 0),
88
- 'throughput': event_data.get('throughput', 0),
89
- 'cpu_util': event_data.get('cpu_util'),
90
- 'memory_util': event_data.get('memory_util')
91
- }
92
-
93
- self.service_history[service].append(telemetry_point)
94
-
95
- # Keep only recent history
96
- if len(self.service_history[service]) > self.history_window:
97
- self.service_history[service].pop(0)
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
100
  """Forecast service health metrics"""
101
- if service not in self.service_history or len(self.service_history[service]) < 10:
102
- return []
 
 
 
103
 
104
- history = self.service_history[service]
105
  forecasts = []
106
 
107
  # Forecast latency
@@ -119,13 +280,14 @@ class SimplePredictiveEngine:
119
  forecasts.extend(resource_forecasts)
120
 
121
  # Cache results
122
- for forecast in forecasts:
123
- cache_key = f"{service}_{forecast.metric}"
124
- self.prediction_cache[cache_key] = forecast
 
125
 
126
  return forecasts
127
 
128
- def _forecast_latency(self, history: List, lookahead_minutes: int) -> Any:
129
  """Forecast latency using linear regression and trend analysis"""
130
  try:
131
  latencies = [point['latency'] for point in history[-20:]]
@@ -148,7 +310,7 @@ class SimplePredictiveEngine:
148
  # Determine trend
149
  if slope > 5:
150
  trend = "increasing"
151
- risk = "high" if predicted_latency > 300 else "medium"
152
  elif slope < -2:
153
  trend = "decreasing"
154
  risk = "low"
@@ -159,9 +321,11 @@ class SimplePredictiveEngine:
159
  # Calculate time to reach critical threshold (500ms)
160
  time_to_critical = None
161
  if slope > 0 and predicted_latency < 500:
162
- time_to_critical = datetime.timedelta(
163
- minutes=lookahead_minutes * (500 - predicted_latency) / max(0.1, (predicted_latency - latencies[-1]))
164
- )
 
 
165
 
166
  return ForecastResult(
167
  metric="latency",
@@ -173,10 +337,10 @@ class SimplePredictiveEngine:
173
  )
174
 
175
  except Exception as e:
176
- print(f"Latency forecast error: {e}")
177
  return None
178
 
179
- def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Any:
180
  """Forecast error rate using exponential smoothing"""
181
  try:
182
  error_rates = [point['error_rate'] for point in history[-15:]]
@@ -217,7 +381,7 @@ class SimplePredictiveEngine:
217
  )
218
 
219
  except Exception as e:
220
- print(f"Error rate forecast error: {e}")
221
  return None
222
 
223
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
@@ -232,8 +396,10 @@ class SimplePredictiveEngine:
232
  trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
233
 
234
  risk = "low"
235
- if predicted_cpu > 0.8:
236
- risk = "critical" if predicted_cpu > 0.9 else "high"
 
 
237
  elif predicted_cpu > 0.7:
238
  risk = "medium"
239
 
@@ -245,7 +411,7 @@ class SimplePredictiveEngine:
245
  risk_level=risk
246
  ))
247
  except Exception as e:
248
- print(f"CPU forecast error: {e}")
249
 
250
  # Memory forecast
251
  memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
@@ -255,8 +421,10 @@ class SimplePredictiveEngine:
255
  trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
256
 
257
  risk = "low"
258
- if predicted_memory > 0.8:
259
- risk = "critical" if predicted_memory > 0.9 else "high"
 
 
260
  elif predicted_memory > 0.7:
261
  risk = "medium"
262
 
@@ -268,7 +436,7 @@ class SimplePredictiveEngine:
268
  risk_level=risk
269
  ))
270
  except Exception as e:
271
- print(f"Memory forecast error: {e}")
272
 
273
  return forecasts
274
 
@@ -302,7 +470,7 @@ class SimplePredictiveEngine:
302
 
303
  return {
304
  'service': service,
305
- 'forecasts': [f.__dict__ for f in forecasts],
306
  'warnings': warnings[:3],
307
  'recommendations': list(dict.fromkeys(recommendations))[:3],
308
  'critical_risk_count': len(critical_risks),
@@ -311,24 +479,36 @@ class SimplePredictiveEngine:
311
 
312
  # === Core Engine Components ===
313
  policy_engine = PolicyEngine()
314
- events_history: List[ReliabilityEvent] = []
 
315
 
316
  class BusinessImpactCalculator:
317
  """Calculate business impact of anomalies"""
318
 
319
  def __init__(self, revenue_per_request: float = 0.01):
320
  self.revenue_per_request = revenue_per_request
 
321
 
322
  def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
323
  base_revenue_per_minute = 100
324
 
325
  impact_multiplier = 1.0
326
 
327
- if event.latency_p99 > 300:
328
  impact_multiplier += 0.5
329
  if event.error_rate > 0.1:
330
  impact_multiplier += 0.8
331
- if event.cpu_util and event.cpu_util > 0.9:
332
  impact_multiplier += 0.3
333
 
334
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
@@ -346,6 +526,8 @@ class BusinessImpactCalculator:
346
  else:
347
  severity = "LOW"
348
 
 
 
349
  return {
350
  'revenue_loss_estimate': round(revenue_loss, 2),
351
  'affected_users_estimate': affected_users,
@@ -359,252 +541,62 @@ class AdvancedAnomalyDetector:
359
  """Enhanced anomaly detection with adaptive thresholds"""
360
 
361
  def __init__(self):
362
- self.historical_data = []
363
  self.adaptive_thresholds = {
364
- 'latency_p99': 150,
365
- 'error_rate': 0.05
366
  }
 
 
367
 
368
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
369
- latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
370
- error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
371
-
372
- resource_anomaly = False
373
- if event.cpu_util and event.cpu_util > 0.9:
374
- resource_anomaly = True
375
- if event.memory_util and event.memory_util > 0.9:
376
- resource_anomaly = True
377
-
378
- self._update_thresholds(event)
379
 
380
- return latency_anomaly or error_anomaly or resource_anomaly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
 
382
- def _update_thresholds(self, event: ReliabilityEvent):
 
383
  self.historical_data.append(event)
384
 
385
- if len(self.historical_data) > 100:
386
- self.historical_data.pop(0)
387
-
388
  if len(self.historical_data) > 10:
389
- recent_latencies = [e.latency_p99 for e in self.historical_data[-20:]]
390
- self.adaptive_thresholds['latency_p99'] = np.percentile(recent_latencies, 90)
 
 
391
 
392
  anomaly_detector = AdvancedAnomalyDetector()
393
 
394
- # === Multi-Agent Foundation ===
395
- class AgentSpecialization(Enum):
396
- DETECTIVE = "anomaly_detection"
397
- DIAGNOSTICIAN = "root_cause_analysis"
398
- PREDICTIVE = "predictive_analytics"
399
-
400
- class BaseAgent:
401
- def __init__(self, specialization: AgentSpecialization):
402
- self.specialization = specialization
403
 
404
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
405
- raise NotImplementedError
406
-
407
- class AnomalyDetectionAgent(BaseAgent):
408
  def __init__(self):
409
- super().__init__(AgentSpecialization.DETECTIVE)
410
-
411
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
412
- anomaly_score = self._calculate_anomaly_score(event)
413
-
414
- return {
415
- 'specialization': self.specialization.value,
416
- 'confidence': anomaly_score,
417
- 'findings': {
418
- 'anomaly_score': anomaly_score,
419
- 'severity_tier': self._classify_severity(anomaly_score),
420
- 'primary_metrics_affected': self._identify_affected_metrics(event)
421
- },
422
- 'recommendations': self._generate_detection_recommendations(event, anomaly_score)
423
- }
424
-
425
- def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
426
- scores = []
427
-
428
- if event.latency_p99 > 150:
429
- latency_score = min(1.0, (event.latency_p99 - 150) / 500)
430
- scores.append(0.4 * latency_score)
431
-
432
- if event.error_rate > 0.05:
433
- error_score = min(1.0, event.error_rate / 0.3)
434
- scores.append(0.3 * error_score)
435
-
436
- resource_score = 0
437
- if event.cpu_util and event.cpu_util > 0.8:
438
- resource_score += 0.15 * min(1.0, (event.cpu_util - 0.8) / 0.2)
439
- if event.memory_util and event.memory_util > 0.8:
440
- resource_score += 0.15 * min(1.0, (event.memory_util - 0.8) / 0.2)
441
- scores.append(resource_score)
442
-
443
- return min(1.0, sum(scores))
444
-
445
- def _classify_severity(self, anomaly_score: float) -> str:
446
- if anomaly_score > 0.8:
447
- return "CRITICAL"
448
- elif anomaly_score > 0.6:
449
- return "HIGH"
450
- elif anomaly_score > 0.4:
451
- return "MEDIUM"
452
- else:
453
- return "LOW"
454
-
455
- def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
456
- affected = []
457
-
458
- if event.latency_p99 > 500:
459
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "CRITICAL", "threshold": 150})
460
- elif event.latency_p99 > 300:
461
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "HIGH", "threshold": 150})
462
- elif event.latency_p99 > 150:
463
- affected.append({"metric": "latency", "value": event.latency_p99, "severity": "MEDIUM", "threshold": 150})
464
-
465
- if event.error_rate > 0.3:
466
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "CRITICAL", "threshold": 0.05})
467
- elif event.error_rate > 0.15:
468
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "HIGH", "threshold": 0.05})
469
- elif event.error_rate > 0.05:
470
- affected.append({"metric": "error_rate", "value": event.error_rate, "severity": "MEDIUM", "threshold": 0.05})
471
-
472
- if event.cpu_util and event.cpu_util > 0.9:
473
- affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "CRITICAL", "threshold": 0.8})
474
- elif event.cpu_util and event.cpu_util > 0.8:
475
- affected.append({"metric": "cpu", "value": event.cpu_util, "severity": "HIGH", "threshold": 0.8})
476
-
477
- if event.memory_util and event.memory_util > 0.9:
478
- affected.append({"metric": "memory", "value": event.memory_util, "severity": "CRITICAL", "threshold": 0.8})
479
- elif event.memory_util and event.memory_util > 0.8:
480
- affected.append({"metric": "memory", "value": event.memory_util, "severity": "HIGH", "threshold": 0.8})
481
-
482
- return affected
483
-
484
- def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
485
- recommendations = []
486
- affected_metrics = self._identify_affected_metrics(event)
487
-
488
- for metric in affected_metrics:
489
- metric_name = metric["metric"]
490
- severity = metric["severity"]
491
- value = metric["value"]
492
- threshold = metric["threshold"]
493
-
494
- if metric_name == "latency":
495
- if severity == "CRITICAL":
496
- recommendations.append(f"🚨 CRITICAL: Latency {value}ms (>{threshold}ms) - Check database & external dependencies")
497
- elif severity == "HIGH":
498
- recommendations.append(f"⚠️ HIGH: Latency {value}ms (>{threshold}ms) - Investigate service performance")
499
- else:
500
- recommendations.append(f"📈 Latency elevated: {value}ms (>{threshold}ms) - Monitor trend")
501
-
502
- elif metric_name == "error_rate":
503
- if severity == "CRITICAL":
504
- recommendations.append(f"🚨 CRITICAL: Error rate {value*100:.1f}% (>{threshold*100}%) - Check recent deployments")
505
- elif severity == "HIGH":
506
- recommendations.append(f"⚠️ HIGH: Error rate {value*100:.1f}% (>{threshold*100}%) - Review application logs")
507
- else:
508
- recommendations.append(f"📈 Errors increasing: {value*100:.1f}% (>{threshold*100}%)")
509
-
510
- elif metric_name == "cpu":
511
- recommendations.append(f"🔥 CPU {severity}: {value*100:.1f}% utilization - Consider scaling")
512
-
513
- elif metric_name == "memory":
514
- recommendations.append(f"💾 Memory {severity}: {value*100:.1f}% utilization - Check for memory leaks")
515
-
516
- if anomaly_score > 0.8:
517
- recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
518
- elif anomaly_score > 0.6:
519
- recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
520
- elif anomaly_score > 0.4:
521
- recommendations.append("📊 MONITOR: Early warning signs detected")
522
-
523
- return recommendations[:4]
524
-
525
- class RootCauseAgent(BaseAgent):
526
- def __init__(self):
527
- super().__init__(AgentSpecialization.DIAGNOSTICIAN)
528
-
529
- async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
530
- causes = self._analyze_potential_causes(event)
531
-
532
- return {
533
- 'specialization': self.specialization.value,
534
- 'confidence': 0.7,
535
- 'findings': {
536
- 'likely_root_causes': causes,
537
- 'evidence_patterns': self._identify_evidence(event),
538
- 'investigation_priority': self._prioritize_investigation(causes)
539
- },
540
- 'recommendations': [
541
- f"Check {cause['cause']} for issues" for cause in causes[:2]
542
- ]
543
- }
544
-
545
- def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
546
- causes = []
547
-
548
- if event.latency_p99 > 500 and event.error_rate > 0.2:
549
- causes.append({
550
- "cause": "Database/External Dependency Failure",
551
- "confidence": 0.85,
552
- "evidence": f"Extreme latency ({event.latency_p99}ms) with high errors ({event.error_rate*100:.1f}%)",
553
- "investigation": "Check database connection pool, external API health"
554
- })
555
-
556
- if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
557
- causes.append({
558
- "cause": "Resource Exhaustion",
559
- "confidence": 0.90,
560
- "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
561
- "investigation": "Check for memory leaks, infinite loops, insufficient resources"
562
- })
563
-
564
- if event.error_rate > 0.3 and event.latency_p99 < 200:
565
- causes.append({
566
- "cause": "Application Bug / Configuration Issue",
567
- "confidence": 0.75,
568
- "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
569
- "investigation": "Review recent deployments, configuration changes, application logs"
570
- })
571
-
572
- if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
573
- causes.append({
574
- "cause": "Gradual Performance Degradation",
575
- "confidence": 0.65,
576
- "evidence": f"Moderate latency ({event.latency_p99}ms) and errors ({event.error_rate*100:.1f}%)",
577
- "investigation": "Check resource trends, dependency performance, capacity planning"
578
- })
579
-
580
- if not causes:
581
- causes.append({
582
- "cause": "Unknown - Requires Investigation",
583
- "confidence": 0.3,
584
- "evidence": "Pattern does not match known failure modes",
585
- "investigation": "Complete system review needed"
586
- })
587
-
588
- return causes
589
-
590
- def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
591
- evidence = []
592
- if event.latency_p99 > event.error_rate * 1000:
593
- evidence.append("latency_disproportionate_to_errors")
594
- if event.cpu_util and event.cpu_util > 0.8 and event.memory_util and event.memory_util > 0.8:
595
- evidence.append("correlated_resource_exhaustion")
596
- return evidence
597
-
598
- def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
599
- for cause in causes:
600
- if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
601
- return "HIGH"
602
- return "MEDIUM"
603
-
604
- class PredictiveAgent(BaseAgent):
605
- def __init__(self):
606
- super().__init__(AgentSpecialization.PREDICTIVE)
607
- self.engine = SimplePredictiveEngine()
608
 
609
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
610
  """Predictive analysis for future risks"""
@@ -620,89 +612,55 @@ class PredictiveAgent(BaseAgent):
620
  insights = self.engine.get_predictive_insights(event.component)
621
 
622
  return {
623
- 'specialization': self.specialization.value,
624
  'confidence': 0.8 if insights['critical_risk_count'] > 0 else 0.5,
625
  'findings': insights,
626
  'recommendations': insights['recommendations']
627
  }
628
 
629
- class OrchestrationManager:
630
- def __init__(self):
631
- self.agents = {
632
- AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
633
- AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
634
- AgentSpecialization.PREDICTIVE: PredictiveAgent(),
635
- }
636
-
637
- async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
638
- agent_tasks = {
639
- spec: agent.analyze(event)
640
- for spec, agent in self.agents.items()
641
- }
642
-
643
- agent_results = {}
644
- for specialization, task in agent_tasks.items():
645
- try:
646
- result = await asyncio.wait_for(task, timeout=5.0)
647
- agent_results[specialization.value] = result
648
- except asyncio.TimeoutError:
649
- continue
650
-
651
- return self._synthesize_agent_findings(event, agent_results)
652
-
653
- def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
654
- detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
655
- diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
656
- predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
657
-
658
- if not detective_result:
659
- return {'error': 'No agent results available'}
660
-
661
- synthesis = {
662
- 'incident_summary': {
663
- 'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
664
- 'anomaly_confidence': detective_result['confidence'],
665
- 'primary_metrics_affected': [metric["metric"] for metric in detective_result['findings'].get('primary_metrics_affected', [])]
666
- },
667
- 'root_cause_insights': diagnostician_result['findings'] if diagnostician_result else {},
668
- 'predictive_insights': predictive_result['findings'] if predictive_result else {},
669
- 'recommended_actions': self._prioritize_actions(
670
- detective_result.get('recommendations', []),
671
- diagnostician_result.get('recommendations', []) if diagnostician_result else [],
672
- predictive_result.get('recommendations', []) if predictive_result else []
673
- ),
674
- 'agent_metadata': {
675
- 'participating_agents': list(agent_results.keys()),
676
- 'analysis_timestamp': datetime.datetime.now().isoformat()
677
- }
678
- }
679
-
680
- return synthesis
681
-
682
- def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str], predictive_actions: List[str]) -> List[str]:
683
- all_actions = detection_actions + diagnosis_actions + predictive_actions
684
- seen = set()
685
- unique_actions = []
686
- for action in all_actions:
687
- if action not in seen:
688
- seen.add(action)
689
- unique_actions.append(action)
690
- return unique_actions[:5]
691
-
692
- # Initialize enhanced components
693
  orchestration_manager = OrchestrationManager()
 
694
 
 
695
  class EnhancedReliabilityEngine:
 
 
696
  def __init__(self):
697
  self.performance_metrics = {
698
  'total_incidents_processed': 0,
699
- 'multi_agent_analyses': 0
 
700
  }
 
 
701
 
702
- async def process_event_enhanced(self, component: str, latency: float, error_rate: float,
703
- throughput: float = 1000, cpu_util: float = None,
704
- memory_util: float = None) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
 
706
  event = ReliabilityEvent(
707
  component=component,
708
  latency_p99=latency,
@@ -713,10 +671,13 @@ class EnhancedReliabilityEngine:
713
  upstream_deps=["auth-service", "database"] if component == "api-service" else []
714
  )
715
 
 
716
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
717
 
 
718
  is_anomaly = anomaly_detector.detect_anomaly(event)
719
 
 
720
  agent_confidence = 0.0
721
  if agent_analysis and 'incident_summary' in agent_analysis:
722
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
@@ -732,18 +693,23 @@ class EnhancedReliabilityEngine:
732
  else:
733
  event.severity = EventSeverity.LOW
734
 
 
735
  healing_actions = policy_engine.evaluate_policies(event)
736
 
 
737
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
738
 
739
- if index is not None and is_anomaly:
740
- analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
741
- vector_text = f"{component} {latency} {error_rate} {analysis_text}"
742
- vec = model.encode([vector_text])
743
- index.add(np.array(vec, dtype=np.float32))
744
- incident_texts.append(vector_text)
745
- save_index()
 
 
746
 
 
747
  result = {
748
  "timestamp": event.timestamp,
749
  "component": component,
@@ -755,69 +721,61 @@ class EnhancedReliabilityEngine:
755
  "healing_actions": [action.value for action in healing_actions],
756
  "business_impact": business_impact,
757
  "severity": event.severity.value,
758
- "similar_incidents_count": len(incident_texts) if is_anomaly else 0,
759
  "processing_metadata": {
760
  "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
761
  "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
762
  }
763
  }
764
 
765
- events_history.append(event)
766
- self.performance_metrics['total_incidents_processed'] += 1
767
- self.performance_metrics['multi_agent_analyses'] += 1
 
 
 
 
 
 
 
 
768
 
769
  return result
770
 
771
  # Initialize enhanced engine
772
  enhanced_engine = EnhancedReliabilityEngine()
773
 
774
- def call_huggingface_analysis(prompt: str) -> str:
775
- if not HF_TOKEN:
776
- fallback_insights = [
777
- "High latency detected - possible resource contention or network issues",
778
- "Error rate increase suggests recent deployment instability",
779
- "Latency spike correlates with increased user traffic patterns",
780
- "Intermittent failures indicate potential dependency service degradation",
781
- "Performance degradation detected - consider scaling compute resources"
782
- ]
783
- import random
784
- return random.choice(fallback_insights)
785
-
786
- try:
787
- enhanced_prompt = f"""
788
- As a senior reliability engineer, analyze this telemetry event and provide a concise root cause analysis:
789
-
790
- {prompt}
791
-
792
- Focus on:
793
- - Potential infrastructure or application issues
794
- - Correlation between metrics
795
- - Business impact assessment
796
- - Recommended investigation areas
797
-
798
- Provide 1-2 sentences maximum with actionable insights.
799
- """
800
-
801
- payload = {
802
- "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
803
- "prompt": enhanced_prompt,
804
- "max_tokens": 150,
805
- "temperature": 0.4,
806
- }
807
- response = requests.post(HF_API_URL, headers=HEADERS, json=payload, timeout=15)
808
- if response.status_code == 200:
809
- result = response.json()
810
- analysis_text = result.get("choices", [{}])[0].get("text", "").strip()
811
- if analysis_text and len(analysis_text) > 10:
812
- return analysis_text.split('\n')[0]
813
- return analysis_text
814
- else:
815
- return f"API Error {response.status_code}: Service temporarily unavailable"
816
- except Exception as e:
817
- return f"Analysis service error: {str(e)}"
818
 
819
  # === Enhanced UI with Multi-Agent Insights ===
820
  def create_enhanced_ui():
 
 
821
  with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
822
  gr.Markdown("""
823
  # 🧠 Enterprise Agentic Reliability Framework
@@ -838,12 +796,12 @@ def create_enhanced_ui():
838
  latency = gr.Slider(
839
  minimum=10, maximum=1000, value=100, step=1,
840
  label="Latency P99 (ms)",
841
- info="Alert threshold: >150ms (adaptive)"
842
  )
843
  error_rate = gr.Slider(
844
  minimum=0, maximum=0.5, value=0.02, step=0.001,
845
  label="Error Rate",
846
- info="Alert threshold: >0.05"
847
  )
848
  throughput = gr.Number(
849
  value=1000,
@@ -924,20 +882,40 @@ def create_enhanced_ui():
924
 
925
  gr.Markdown("\n\n".join(policy_info))
926
 
927
- async def submit_event_enhanced(component, latency, error_rate, throughput, cpu_util, memory_util):
 
 
928
  try:
 
929
  latency = float(latency)
930
  error_rate = float(error_rate)
931
  throughput = float(throughput) if throughput else 1000
932
  cpu_util = float(cpu_util) if cpu_util else None
933
  memory_util = float(memory_util) if memory_util else None
934
 
935
- result = await enhanced_engine.process_event_enhanced(
936
- component, latency, error_rate, throughput, cpu_util, memory_util
937
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
 
 
939
  table_data = []
940
- for event in events_history[-15:]:
941
  table_data.append([
942
  event.timestamp[:19],
943
  event.component,
@@ -945,31 +923,33 @@ def create_enhanced_ui():
945
  f"{event.error_rate:.3f}",
946
  event.throughput,
947
  event.severity.value.upper(),
948
- "Multi-agent analysis" if 'multi_agent_analysis' in result else 'N/A'
949
  ])
950
 
 
951
  status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
952
- output_msg = f"{status_emoji} {result['status']}"
953
 
954
  if "multi_agent_analysis" in result:
955
  analysis = result["multi_agent_analysis"]
956
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
957
- output_msg += f"\n🎯 Confidence: {confidence*100:.1f}%"
958
 
959
  predictive_data = analysis.get('predictive_insights', {})
960
  if predictive_data.get('critical_risk_count', 0) > 0:
961
- output_msg += f"\n🔮 PREDICTIVE: {predictive_data['critical_risk_count']} critical risks forecast"
962
 
963
  if analysis.get('recommended_actions'):
964
- output_msg += f"\n💡 Insights: {', '.join(analysis['recommended_actions'][:2])}"
 
965
 
966
  if result["business_impact"]:
967
  impact = result["business_impact"]
968
- output_msg += f"\n💰 Business Impact: ${impact['revenue_loss_estimate']} | 👥 {impact['affected_users_estimate']} users | 🚨 {impact['severity_level']}"
969
 
970
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
971
  actions = ", ".join(result["healing_actions"])
972
- output_msg += f"\n🔧 Auto-Actions: {actions}"
973
 
974
  agent_insights_data = result.get("multi_agent_analysis", {})
975
  predictive_insights_data = agent_insights_data.get('predictive_insights', {})
@@ -985,11 +965,18 @@ def create_enhanced_ui():
985
  )
986
  )
987
 
 
 
 
 
988
  except Exception as e:
989
- return f"โŒ Error processing event: {str(e)}", {}, {}, gr.Dataframe(value=[])
 
 
990
 
 
991
  submit_btn.click(
992
- fn=submit_event_enhanced,
993
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
994
  outputs=[output_text, agent_insights, predictive_insights, events_table]
995
  )
@@ -997,9 +984,22 @@ def create_enhanced_ui():
997
  return demo
998
 
999
  if __name__ == "__main__":
 
 
 
 
1000
  demo = create_enhanced_ui()
 
 
1001
  demo.launch(
1002
  server_name="0.0.0.0",
1003
  server_port=7860,
1004
  share=False
1005
- )
 
 
 
 
 
 
 
 
1
+ """
2
+ Enterprise Agentic Reliability Framework - Main Application
3
+ Multi-Agent AI System for Production Reliability Monitoring
4
+
5
+ This module provides the main Gradio UI and orchestrates the reliability
6
+ monitoring system with anomaly detection, predictive analytics, and auto-healing.
7
+ """
8
+
9
  import os
10
  import json
11
  import numpy as np
 
13
  import requests
14
  import pandas as pd
15
  import datetime
16
+ import threading
17
+ import logging
18
+ from typing import List, Dict, Any, Optional, Tuple
19
+ from collections import deque
20
+ from dataclasses import dataclass, asdict
21
  import hashlib
22
  import asyncio
 
 
23
 
24
  # Import our modules
25
  from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
26
  from healing_policies import PolicyEngine
27
+ from agent_orchestrator import OrchestrationManager
28
+
29
+ # === Logging Configuration ===
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33
+ )
34
+ logger = logging.getLogger(__name__)
35
 
36
  # === Configuration ===
37
class Config:
    """Centralized configuration for the reliability framework.

    Values are class attributes read through the module-level ``config``
    instance; nothing here is mutated at runtime by this class itself.
    """
    # Hugging Face API credentials; empty string when the env var is unset.
    HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
    HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"

    # Vector storage: FAISS index file plus a parallel JSON list of incident texts.
    VECTOR_DIM: int = 384  # embedding dimension expected from the sentence-transformer model
    INDEX_FILE: str = "incident_vectors.index"
    TEXTS_FILE: str = "incident_texts.json"

    # Thresholds -- latency in milliseconds; rates and utilizations are fractions in [0, 1].
    LATENCY_WARNING: float = 150.0
    LATENCY_CRITICAL: float = 300.0
    ERROR_RATE_WARNING: float = 0.05
    ERROR_RATE_CRITICAL: float = 0.15
    CPU_WARNING: float = 0.8
    CPU_CRITICAL: float = 0.9
    MEMORY_WARNING: float = 0.8
    MEMORY_CRITICAL: float = 0.9

    # Performance tuning
    HISTORY_WINDOW: int = 50        # telemetry points retained per service (deque maxlen)
    MAX_EVENTS_STORED: int = 1000   # cap for the global ThreadSafeEventStore
    AGENT_TIMEOUT: int = 10         # presumably seconds -- not consumed in this module's visible code; TODO confirm
    CACHE_EXPIRY_MINUTES: int = 15  # TTL for the prediction cache
62
+
63
+ config = Config()
64
+
65
+ HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
66
+
67
+ # === Thread-Safe Data Structures ===
68
class ThreadSafeEventStore:
    """Bounded, thread-safe collection of reliability events.

    Backed by a ``deque`` with a maximum length, so the oldest events are
    discarded automatically once the cap is reached. All access goes
    through an RLock.
    """

    def __init__(self, max_size: int = config.MAX_EVENTS_STORED):
        self._buffer = deque(maxlen=max_size)
        self._guard = threading.RLock()
        logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")

    def add(self, event: ReliabilityEvent) -> None:
        """Append one event under the lock."""
        with self._guard:
            self._buffer.append(event)
            logger.debug(f"Added event for {event.component}: {event.severity.value}")

    def get_recent(self, n: int = 15) -> List[ReliabilityEvent]:
        """Return up to the last *n* events (oldest first)."""
        with self._guard:
            if not self._buffer:
                return []
            return list(self._buffer)[-n:]

    def get_all(self) -> List[ReliabilityEvent]:
        """Return a snapshot copy of every stored event."""
        with self._guard:
            return list(self._buffer)

    def count(self) -> int:
        """Return how many events are currently stored."""
        with self._guard:
            return len(self._buffer)
96
+
97
class ThreadSafeFAISSIndex:
    """Thread-safe wrapper for FAISS index operations with batching.

    Vector/text pairs are buffered and flushed into the FAISS index in
    batches of 10. Disk writes are throttled to one per ``save_interval``,
    except through ``force_save()``, which always persists.
    """

    def __init__(self, index, texts: List[str]):
        self.index = index          # FAISS index (e.g. IndexFlatL2)
        self.texts = texts          # text payloads, kept parallel to index rows
        self._lock = threading.RLock()
        self.last_save = datetime.datetime.now()
        self.save_interval = datetime.timedelta(seconds=30)
        self.pending_vectors = []   # vectors queued but not yet in the index
        self.pending_texts = []     # texts parallel to pending_vectors
        logger.info(f"Initialized ThreadSafeFAISSIndex with {len(texts)} existing vectors")

    def add(self, vector: np.ndarray, text: str) -> None:
        """Queue a vector/text pair; flush to the index once 10 are pending."""
        with self._lock:
            self.pending_vectors.append(vector)
            self.pending_texts.append(text)

            # Flush if we have enough pending
            if len(self.pending_vectors) >= 10:
                self._flush()

    def _flush(self) -> None:
        """Move pending vectors into the FAISS index (caller must hold the lock).

        On failure the pending buffers are left intact so a later flush can retry.
        """
        if not self.pending_vectors:
            return

        try:
            vectors = np.vstack(self.pending_vectors)
            self.index.add(vectors)
            self.texts.extend(self.pending_texts)

            logger.info(f"Flushed {len(self.pending_vectors)} vectors to FAISS index")

            self.pending_vectors = []
            self.pending_texts = []

            # Throttled persistence: only hit the disk once per save_interval.
            if datetime.datetime.now() - self.last_save > self.save_interval:
                self._save()
        except Exception as e:
            logger.error(f"Error flushing vectors: {e}", exc_info=True)

    def _save(self) -> None:
        """Write the index and its texts to disk (best effort, errors logged)."""
        try:
            import faiss
            faiss.write_index(self.index, config.INDEX_FILE)
            with open(config.TEXTS_FILE, "w") as f:
                json.dump(self.texts, f)
            self.last_save = datetime.datetime.now()
            logger.info(f"Saved FAISS index with {len(self.texts)} vectors")
        except Exception as e:
            logger.error(f"Error saving index: {e}", exc_info=True)

    def get_count(self) -> int:
        """Total vectors, including those still waiting to be flushed."""
        with self._lock:
            return len(self.texts) + len(self.pending_texts)

    def force_save(self) -> None:
        """Flush pending vectors and persist to disk immediately.

        BUGFIX: previously this only called _flush(), whose disk write is
        throttled by save_interval -- a shutdown within 30s of the last save
        silently lost the most recent vectors. _save() is now unconditional.
        """
        with self._lock:
            self._flush()
            self._save()  # bypass the save_interval throttle on explicit request
162
 
163
# === FAISS & Embeddings Setup ===
# Builds the embedding model, the FAISS index and its thread-safe wrapper.
# On any failure, everything degrades to None and vector storage is disabled.
try:
    from sentence_transformers import SentenceTransformer
    import faiss

    logger.info("Loading SentenceTransformer model...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    logger.info("SentenceTransformer model loaded successfully")

    # Default to a fresh, empty index; only adopt the on-disk one when it
    # passes every consistency check below.
    index = faiss.IndexFlatL2(config.VECTOR_DIM)
    incident_texts = []

    if os.path.exists(config.INDEX_FILE):
        logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
        loaded_index = faiss.read_index(config.INDEX_FILE)

        if loaded_index.d != config.VECTOR_DIM:
            logger.warning(f"Index dimension mismatch: {loaded_index.d} != {config.VECTOR_DIM}. Creating new index.")
        elif not os.path.exists(config.TEXTS_FILE):
            # BUGFIX: previously a missing texts file raised FileNotFoundError
            # here and the generic except below disabled vector storage entirely.
            logger.warning(f"{config.TEXTS_FILE} missing for existing index. Creating new index.")
        else:
            with open(config.TEXTS_FILE, "r") as f:
                loaded_texts = json.load(f)
            if loaded_index.ntotal != len(loaded_texts):
                # Index rows and text payloads must stay parallel to be usable.
                logger.warning(f"Index/text count mismatch: {loaded_index.ntotal} != {len(loaded_texts)}. Creating new index.")
            else:
                index = loaded_index
                incident_texts = loaded_texts
                logger.info(f"Loaded {len(incident_texts)} incident texts")
    else:
        logger.info("Creating new FAISS index")

    thread_safe_index = ThreadSafeFAISSIndex(index, incident_texts)

except ImportError as e:
    logger.warning(f"FAISS or SentenceTransformers not available: {e}")
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
except Exception as e:
    logger.error(f"Error initializing FAISS: {e}", exc_info=True)
    index = None
    incident_texts = []
    model = None
    thread_safe_index = None
204
 
205
  # === Predictive Models ===
206
@dataclass
class ForecastResult:
    """Single-metric forecast produced by SimplePredictiveEngine."""
    metric: str                  # metric name, e.g. "latency" -- set by the individual _forecast_* helpers
    predicted_value: float       # forecast value at the lookahead horizon
    confidence: float
    trend: str  # "increasing", "decreasing", "stable"
    time_to_threshold: Optional[datetime.timedelta] = None  # ETA to the critical threshold, when computable
    risk_level: str = "low"  # low, medium, high, critical
215
 
216
  class SimplePredictiveEngine:
217
  """Lightweight forecasting engine optimized for Hugging Face Spaces"""
218
 
219
    def __init__(self, history_window: int = config.HISTORY_WINDOW):
        """Set up per-service telemetry buffers and the forecast cache.

        Args:
            history_window: maximum telemetry points retained per service.
        """
        self.history_window = history_window
        # service name -> deque of telemetry dicts, bounded by history_window
        self.service_history: Dict[str, deque] = {}
        # "<service>_<metric>" -> (ForecastResult, time it was cached)
        self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
        self.max_cache_age = datetime.timedelta(minutes=config.CACHE_EXPIRY_MINUTES)
        self._lock = threading.RLock()
        logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
226
+
227
    def add_telemetry(self, service: str, event_data: Dict) -> None:
        """Add telemetry data to service history.

        Args:
            service: component/service name the sample belongs to.
            event_data: raw metrics dict; missing numeric fields default to 0,
                while cpu_util/memory_util default to None when absent.
        """
        with self._lock:
            # Lazily create the bounded per-service buffer on first sample.
            if service not in self.service_history:
                self.service_history[service] = deque(maxlen=self.history_window)

            telemetry_point = {
                'timestamp': datetime.datetime.now(),
                'latency': event_data.get('latency_p99', 0),
                'error_rate': event_data.get('error_rate', 0),
                'throughput': event_data.get('throughput', 0),
                'cpu_util': event_data.get('cpu_util'),
                'memory_util': event_data.get('memory_util')
            }

            self.service_history[service].append(telemetry_point)

            # Clean expired cache entries while we already hold the lock.
            self._clean_cache()
246
+
247
+ def _clean_cache(self) -> None:
248
+ """Remove expired entries from prediction cache"""
249
+ now = datetime.datetime.now()
250
+ expired = [k for k, (_, ts) in self.prediction_cache.items()
251
+ if now - ts > self.max_cache_age]
252
+ for k in expired:
253
+ del self.prediction_cache[k]
254
+
255
+ if expired:
256
+ logger.debug(f"Cleaned {len(expired)} expired cache entries")
257
 
258
  def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
259
  """Forecast service health metrics"""
260
+ with self._lock:
261
+ if service not in self.service_history or len(self.service_history[service]) < 10:
262
+ return []
263
+
264
+ history = list(self.service_history[service])
265
 
 
266
  forecasts = []
267
 
268
  # Forecast latency
 
280
  forecasts.extend(resource_forecasts)
281
 
282
  # Cache results
283
+ with self._lock:
284
+ for forecast in forecasts:
285
+ cache_key = f"{service}_{forecast.metric}"
286
+ self.prediction_cache[cache_key] = (forecast, datetime.datetime.now())
287
 
288
  return forecasts
289
 
290
+ def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
291
  """Forecast latency using linear regression and trend analysis"""
292
  try:
293
  latencies = [point['latency'] for point in history[-20:]]
 
310
  # Determine trend
311
  if slope > 5:
312
  trend = "increasing"
313
+ risk = "high" if predicted_latency > config.LATENCY_CRITICAL else "medium"
314
  elif slope < -2:
315
  trend = "decreasing"
316
  risk = "low"
 
321
  # Calculate time to reach critical threshold (500ms)
322
  time_to_critical = None
323
  if slope > 0 and predicted_latency < 500:
324
+ denominator = predicted_latency - latencies[-1]
325
+ if abs(denominator) > 0.1: # Avoid division by very small numbers
326
+ time_to_critical = datetime.timedelta(
327
+ minutes=lookahead_minutes * (500 - predicted_latency) / denominator
328
+ )
329
 
330
  return ForecastResult(
331
  metric="latency",
 
337
  )
338
 
339
  except Exception as e:
340
+ logger.error(f"Latency forecast error: {e}", exc_info=True)
341
  return None
342
 
343
+ def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
344
  """Forecast error rate using exponential smoothing"""
345
  try:
346
  error_rates = [point['error_rate'] for point in history[-15:]]
 
381
  )
382
 
383
  except Exception as e:
384
+ logger.error(f"Error rate forecast error: {e}", exc_info=True)
385
  return None
386
 
387
  def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
 
396
  trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
397
 
398
  risk = "low"
399
+ if predicted_cpu > config.CPU_CRITICAL:
400
+ risk = "critical"
401
+ elif predicted_cpu > config.CPU_WARNING:
402
+ risk = "high"
403
  elif predicted_cpu > 0.7:
404
  risk = "medium"
405
 
 
411
  risk_level=risk
412
  ))
413
  except Exception as e:
414
+ logger.error(f"CPU forecast error: {e}", exc_info=True)
415
 
416
  # Memory forecast
417
  memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
 
421
  trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
422
 
423
  risk = "low"
424
+ if predicted_memory > config.MEMORY_CRITICAL:
425
+ risk = "critical"
426
+ elif predicted_memory > config.MEMORY_WARNING:
427
+ risk = "high"
428
  elif predicted_memory > 0.7:
429
  risk = "medium"
430
 
 
436
  risk_level=risk
437
  ))
438
  except Exception as e:
439
+ logger.error(f"Memory forecast error: {e}", exc_info=True)
440
 
441
  return forecasts
442
 
 
470
 
471
  return {
472
  'service': service,
473
+ 'forecasts': [asdict(f) for f in forecasts],
474
  'warnings': warnings[:3],
475
  'recommendations': list(dict.fromkeys(recommendations))[:3],
476
  'critical_risk_count': len(critical_risks),
 
479
 
480
  # === Core Engine Components ===
481
  policy_engine = PolicyEngine()
482
+ events_history_store = ThreadSafeEventStore()
483
+ predictive_engine = SimplePredictiveEngine()
484
 
485
  class BusinessImpactCalculator:
486
  """Calculate business impact of anomalies"""
487
 
488
  def __init__(self, revenue_per_request: float = 0.01):
489
  self.revenue_per_request = revenue_per_request
490
+ logger.info(f"Initialized BusinessImpactCalculator with revenue_per_request={revenue_per_request}")
491
 
492
  def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
493
+ """
494
+ Calculate business impact for a reliability event
495
+
496
+ Args:
497
+ event: The reliability event to analyze
498
+ duration_minutes: Assumed duration of the incident
499
+
500
+ Returns:
501
+ Dictionary containing impact estimates
502
+ """
503
  base_revenue_per_minute = 100
504
 
505
  impact_multiplier = 1.0
506
 
507
+ if event.latency_p99 > config.LATENCY_CRITICAL:
508
  impact_multiplier += 0.5
509
  if event.error_rate > 0.1:
510
  impact_multiplier += 0.8
511
+ if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
512
  impact_multiplier += 0.3
513
 
514
  revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
 
526
  else:
527
  severity = "LOW"
528
 
529
+ logger.info(f"Business impact calculated: ${revenue_loss:.2f} revenue loss, {affected_users} users affected, {severity} severity")
530
+
531
  return {
532
  'revenue_loss_estimate': round(revenue_loss, 2),
533
  'affected_users_estimate': affected_users,
 
541
  """Enhanced anomaly detection with adaptive thresholds"""
542
 
543
  def __init__(self):
544
+ self.historical_data = deque(maxlen=100)
545
  self.adaptive_thresholds = {
546
+ 'latency_p99': config.LATENCY_WARNING,
547
+ 'error_rate': config.ERROR_RATE_WARNING
548
  }
549
+ self._lock = threading.RLock()
550
+ logger.info("Initialized AdvancedAnomalyDetector")
551
 
552
  def detect_anomaly(self, event: ReliabilityEvent) -> bool:
553
+ """
554
+ Detect if event is anomalous
 
 
 
 
 
 
 
 
555
 
556
+ Args:
557
+ event: The reliability event to check
558
+
559
+ Returns:
560
+ True if anomaly detected, False otherwise
561
+ """
562
+ with self._lock:
563
+ latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
564
+ error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
565
+
566
+ resource_anomaly = False
567
+ if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
568
+ resource_anomaly = True
569
+ if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
570
+ resource_anomaly = True
571
+
572
+ self._update_thresholds(event)
573
+
574
+ is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
575
+
576
+ if is_anomaly:
577
+ logger.info(f"Anomaly detected for {event.component}: latency={latency_anomaly}, error={error_anomaly}, resource={resource_anomaly}")
578
+
579
+ return is_anomaly
580
 
581
+ def _update_thresholds(self, event: ReliabilityEvent) -> None:
582
+ """Update adaptive thresholds based on historical data"""
583
  self.historical_data.append(event)
584
 
 
 
 
585
  if len(self.historical_data) > 10:
586
+ recent_latencies = [e.latency_p99 for e in list(self.historical_data)[-20:]]
587
+ new_threshold = np.percentile(recent_latencies, 90)
588
+ self.adaptive_thresholds['latency_p99'] = new_threshold
589
+ logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
590
 
591
  anomaly_detector = AdvancedAnomalyDetector()
592
 
593
+ # === Predictive Agent Integration ===
594
+ class PredictiveAgent:
595
+ """Predictive agent that uses SimplePredictiveEngine"""
 
 
 
 
 
 
596
 
 
 
 
 
597
    def __init__(self):
        # Shares the module-level SimplePredictiveEngine instance rather
        # than owning a private one, so telemetry added elsewhere is visible.
        self.engine = predictive_engine
        logger.info("Initialized PredictiveAgent")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
 
601
  async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
602
  """Predictive analysis for future risks"""
 
612
  insights = self.engine.get_predictive_insights(event.component)
613
 
614
  return {
615
+ 'specialization': 'predictive_analytics',
616
  'confidence': 0.8 if insights['critical_risk_count'] > 0 else 0.5,
617
  'findings': insights,
618
  'recommendations': insights['recommendations']
619
  }
620
 
621
+ # Initialize orchestration with predictive agent
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
622
  orchestration_manager = OrchestrationManager()
623
+ orchestration_manager.agents['predictive_analytics'] = PredictiveAgent()
624
 
625
+ # === Enhanced Reliability Engine ===
626
  class EnhancedReliabilityEngine:
627
+ """Main engine for processing reliability events"""
628
+
629
    def __init__(self):
        """Initialize processing counters and their guarding lock."""
        # Counters are incremented under self._lock in process_event_enhanced.
        self.performance_metrics = {
            'total_incidents_processed': 0,
            'multi_agent_analyses': 0,
            'anomalies_detected': 0
        }
        self._lock = threading.RLock()
        logger.info("Initialized EnhancedReliabilityEngine")
637
 
638
+ async def process_event_enhanced(
639
+ self,
640
+ component: str,
641
+ latency: float,
642
+ error_rate: float,
643
+ throughput: float = 1000,
644
+ cpu_util: Optional[float] = None,
645
+ memory_util: Optional[float] = None
646
+ ) -> Dict[str, Any]:
647
+ """
648
+ Process a reliability event through the multi-agent system
649
+
650
+ Args:
651
+ component: Service component name
652
+ latency: P99 latency in milliseconds
653
+ error_rate: Error rate (0-1)
654
+ throughput: Requests per second
655
+ cpu_util: CPU utilization (0-1)
656
+ memory_util: Memory utilization (0-1)
657
+
658
+ Returns:
659
+ Dictionary containing analysis results
660
+ """
661
+ logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
662
 
663
+ # Create event
664
  event = ReliabilityEvent(
665
  component=component,
666
  latency_p99=latency,
 
671
  upstream_deps=["auth-service", "database"] if component == "api-service" else []
672
  )
673
 
674
+ # Multi-agent analysis
675
  agent_analysis = await orchestration_manager.orchestrate_analysis(event)
676
 
677
+ # Anomaly detection
678
  is_anomaly = anomaly_detector.detect_anomaly(event)
679
 
680
+ # Determine severity based on agent confidence
681
  agent_confidence = 0.0
682
  if agent_analysis and 'incident_summary' in agent_analysis:
683
  agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
 
693
  else:
694
  event.severity = EventSeverity.LOW
695
 
696
+ # Evaluate healing policies
697
  healing_actions = policy_engine.evaluate_policies(event)
698
 
699
+ # Calculate business impact
700
  business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
701
 
702
+ # Store in vector database
703
+ if thread_safe_index is not None and model is not None and is_anomaly:
704
+ try:
705
+ analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
706
+ vector_text = f"{component} {latency} {error_rate} {analysis_text}"
707
+ vec = model.encode([vector_text])
708
+ thread_safe_index.add(np.array(vec, dtype=np.float32), vector_text)
709
+ except Exception as e:
710
+ logger.error(f"Error storing vector: {e}", exc_info=True)
711
 
712
+ # Build result
713
  result = {
714
  "timestamp": event.timestamp,
715
  "component": component,
 
721
  "healing_actions": [action.value for action in healing_actions],
722
  "business_impact": business_impact,
723
  "severity": event.severity.value,
724
+ "similar_incidents_count": thread_safe_index.get_count() if thread_safe_index and is_anomaly else 0,
725
  "processing_metadata": {
726
  "agents_used": agent_analysis.get('agent_metadata', {}).get('participating_agents', []),
727
  "analysis_confidence": agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
728
  }
729
  }
730
 
731
+ # Store event
732
+ events_history_store.add(event)
733
+
734
+ # Update metrics
735
+ with self._lock:
736
+ self.performance_metrics['total_incidents_processed'] += 1
737
+ self.performance_metrics['multi_agent_analyses'] += 1
738
+ if is_anomaly:
739
+ self.performance_metrics['anomalies_detected'] += 1
740
+
741
+ logger.info(f"Event processed: {result['status']} with {result['severity']} severity")
742
 
743
  return result
744
 
745
  # Initialize enhanced engine
746
  enhanced_engine = EnhancedReliabilityEngine()
747
 
748
+ # === Input Validation ===
749
def validate_inputs(
    latency: float,
    error_rate: float,
    throughput: float,
    cpu_util: Optional[float],
    memory_util: Optional[float]
) -> Tuple[bool, str]:
    """Validate user-supplied telemetry values.

    Args:
        latency: P99 latency in ms, allowed 0-10000.
        error_rate: fraction in [0, 1].
        throughput: requests/second, must be non-negative.
        cpu_util: optional fraction in [0, 1].
        memory_util: optional fraction in [0, 1].

    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid.
    """
    # Checks run in declaration order; the first failure wins.
    checks = [
        (0 <= latency <= 10000,
         "โŒ Invalid latency: must be between 0-10000ms"),
        (0 <= error_rate <= 1,
         "โŒ Invalid error rate: must be between 0-1"),
        (throughput >= 0,
         "โŒ Invalid throughput: must be positive"),
        (cpu_util is None or 0 <= cpu_util <= 1,
         "โŒ Invalid CPU utilization: must be between 0-1"),
        (memory_util is None or 0 <= memory_util <= 1,
         "โŒ Invalid memory utilization: must be between 0-1"),
    ]
    for passed, message in checks:
        if not passed:
            return False, message

    return True, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
 
775
  # === Enhanced UI with Multi-Agent Insights ===
776
  def create_enhanced_ui():
777
+ """Create the Gradio UI for the reliability framework"""
778
+
779
  with gr.Blocks(title="๐Ÿง  Enterprise Agentic Reliability Framework", theme="soft") as demo:
780
  gr.Markdown("""
781
  # ๐Ÿง  Enterprise Agentic Reliability Framework
 
796
  latency = gr.Slider(
797
  minimum=10, maximum=1000, value=100, step=1,
798
  label="Latency P99 (ms)",
799
+ info=f"Alert threshold: >{config.LATENCY_WARNING}ms (adaptive)"
800
  )
801
  error_rate = gr.Slider(
802
  minimum=0, maximum=0.5, value=0.02, step=0.001,
803
  label="Error Rate",
804
+ info=f"Alert threshold: >{config.ERROR_RATE_WARNING}"
805
  )
806
  throughput = gr.Number(
807
  value=1000,
 
882
 
883
  gr.Markdown("\n\n".join(policy_info))
884
 
885
+ # โœ… FIXED: Synchronous wrapper for async function
886
+ def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
887
+ """Synchronous wrapper for async event processing - FIXES GRADIO ASYNC ISSUE"""
888
  try:
889
+ # Type conversion
890
  latency = float(latency)
891
  error_rate = float(error_rate)
892
  throughput = float(throughput) if throughput else 1000
893
  cpu_util = float(cpu_util) if cpu_util else None
894
  memory_util = float(memory_util) if memory_util else None
895
 
896
+ # Input validation
897
+ is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
898
+ if not is_valid:
899
+ logger.warning(f"Invalid input: {error_msg}")
900
+ return error_msg, {}, {}, gr.Dataframe(value=[])
901
+
902
+ # Create new event loop for async execution
903
+ loop = asyncio.new_event_loop()
904
+ asyncio.set_event_loop(loop)
905
+
906
+ try:
907
+ # Call async function
908
+ result = loop.run_until_complete(
909
+ enhanced_engine.process_event_enhanced(
910
+ component, latency, error_rate, throughput, cpu_util, memory_util
911
+ )
912
+ )
913
+ finally:
914
+ loop.close()
915
 
916
+ # Build table data
917
  table_data = []
918
+ for event in events_history_store.get_recent(15):
919
  table_data.append([
920
  event.timestamp[:19],
921
  event.component,
 
923
  f"{event.error_rate:.3f}",
924
  event.throughput,
925
  event.severity.value.upper(),
926
+ "Multi-agent analysis"
927
  ])
928
 
929
+ # Format output message
930
  status_emoji = "๐Ÿšจ" if result["status"] == "ANOMALY" else "โœ…"
931
+ output_msg = f"{status_emoji} **{result['status']}**"
932
 
933
  if "multi_agent_analysis" in result:
934
  analysis = result["multi_agent_analysis"]
935
  confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
936
+ output_msg += f"\n๐ŸŽฏ **Confidence**: {confidence*100:.1f}%"
937
 
938
  predictive_data = analysis.get('predictive_insights', {})
939
  if predictive_data.get('critical_risk_count', 0) > 0:
940
+ output_msg += f"\n๐Ÿ”ฎ **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast"
941
 
942
  if analysis.get('recommended_actions'):
943
+ actions_preview = ', '.join(analysis['recommended_actions'][:2])
944
+ output_msg += f"\n๐Ÿ’ก **Top Insights**: {actions_preview}"
945
 
946
  if result["business_impact"]:
947
  impact = result["business_impact"]
948
+ output_msg += f"\n๐Ÿ’ฐ **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | ๐Ÿ‘ฅ {impact['affected_users_estimate']} users | ๐Ÿšจ {impact['severity_level']}"
949
 
950
  if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
951
  actions = ", ".join(result["healing_actions"])
952
+ output_msg += f"\n๐Ÿ”ง **Auto-Actions**: {actions}"
953
 
954
  agent_insights_data = result.get("multi_agent_analysis", {})
955
  predictive_insights_data = agent_insights_data.get('predictive_insights', {})
 
965
  )
966
  )
967
 
968
+ except ValueError as e:
969
+ error_msg = f"โŒ Value error: {str(e)}"
970
+ logger.error(error_msg, exc_info=True)
971
+ return error_msg, {}, {}, gr.Dataframe(value=[])
972
  except Exception as e:
973
+ error_msg = f"โŒ Error processing event: {str(e)}"
974
+ logger.error(error_msg, exc_info=True)
975
+ return error_msg, {}, {}, gr.Dataframe(value=[])
976
 
977
+ # โœ… FIXED: Use sync wrapper instead of async function
978
  submit_btn.click(
979
+ fn=submit_event_enhanced_sync,
980
  inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
981
  outputs=[output_text, agent_insights, predictive_insights, events_table]
982
  )
 
984
  return demo
985
 
986
if __name__ == "__main__":
    logger.info("Starting Enterprise Agentic Reliability Framework...")
    logger.info(f"Total events in history: {events_history_store.count()}")
    logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")

    demo = create_enhanced_ui()

    logger.info("Launching Gradio UI...")
    try:
        # launch() blocks until the server is stopped (typically Ctrl+C).
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False
        )
    finally:
        # BUGFIX: run the shutdown save even when launch() exits via
        # KeyboardInterrupt or an exception -- previously these statements
        # came after launch() with no try/finally, so they were skipped
        # and pending vectors were lost.
        if thread_safe_index:
            logger.info("Saving pending vectors...")
            thread_safe_index.force_save()

        logger.info("Application shutdown complete")