Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

App Files Files Community

petter2025 commited on Nov 30, 2025

Commit

4dfd09d

verified ·

1 Parent(s): c7e6112

Update app.py

Browse files

Files changed (1) hide show

app.py +812 -627

app.py CHANGED Viewed

@@ -1,15 +1,16 @@
 """
-Enterprise Agentic Reliability Framework - Main Application
 Multi-Agent AI System for Production Reliability Monitoring
-This module provides the complete reliability monitoring system including:
-- Multi-agent anomaly detection and root cause analysis
-- Predictive analytics and forecasting
-- Policy-based auto-healing
-- Business impact quantification
-- Vector-based incident memory
-- Adaptive thresholds
-- Thread-safe concurrent operations
 """
 import os
@@ -21,16 +22,23 @@ import pandas as pd
 import datetime
 import threading
 import logging
 from typing import List, Dict, Any, Optional, Tuple
-from collections import deque
 from dataclasses import dataclass, asdict
-import hashlib
-import asyncio
 from enum import Enum
 # Import our modules
-from models import ReliabilityEvent, EventSeverity, AnomalyResult, HealingAction
-from healing_policies import PolicyEngine
 # === Logging Configuration ===
 logging.basicConfig(
@@ -39,48 +47,159 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 # === Configuration ===
 class Config:
     """Centralized configuration for the reliability framework"""
     HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
     HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"
-    # Vector storage
-    VECTOR_DIM: int = 384
-    INDEX_FILE: str = "incident_vectors.index"
-    TEXTS_FILE: str = "incident_texts.json"
-    # Thresholds
-    LATENCY_WARNING: float = 150.0
-    LATENCY_CRITICAL: float = 300.0
-    LATENCY_EXTREME: float = 500.0
-    ERROR_RATE_WARNING: float = 0.05
-    ERROR_RATE_HIGH: float = 0.15
-    ERROR_RATE_CRITICAL: float = 0.3
-    CPU_WARNING: float = 0.8
-    CPU_CRITICAL: float = 0.9
-    MEMORY_WARNING: float = 0.8
-    MEMORY_CRITICAL: float = 0.9
-    # Performance
-    HISTORY_WINDOW: int = 50
-    MAX_EVENTS_STORED: int = 1000
-    AGENT_TIMEOUT: int = 10
-    CACHE_EXPIRY_MINUTES: int = 15
-    # Business metrics
-    BASE_REVENUE_PER_MINUTE: float = 100.0
-    BASE_USERS: int = 1000
-config = Config()
-HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
 # === Thread-Safe Data Structures ===
 class ThreadSafeEventStore:
     """Thread-safe storage for reliability events"""
-    def __init__(self, max_size: int = config.MAX_EVENTS_STORED):
         self._events = deque(maxlen=max_size)
         self._lock = threading.RLock()
         logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")
@@ -106,71 +225,188 @@ class ThreadSafeEventStore:
         with self._lock:
             return len(self._events)
-class ThreadSafeFAISSIndex:
-    """Thread-safe wrapper for FAISS index operations with batching"""
     def __init__(self, index, texts: List[str]):
         self.index = index
         self.texts = texts
         self._lock = threading.RLock()
-        self.last_save = datetime.datetime.now()
-        self.save_interval = datetime.timedelta(seconds=30)
-        self.pending_vectors = []
-        self.pending_texts = []
-        logger.info(f"Initialized ThreadSafeFAISSIndex with {len(texts)} existing vectors")
-    def add(self, vector: np.ndarray, text: str) -> None:
-        """Add vector and text with batching"""
-        with self._lock:
-            self.pending_vectors.append(vector)
-            self.pending_texts.append(text)
-            # Flush if we have enough pending
-            if len(self.pending_vectors) >= 10:
-                self._flush()
-    def _flush(self) -> None:
-        """Flush pending vectors to index"""
-        if not self.pending_vectors:
             return
         try:
-            vectors = np.vstack(self.pending_vectors)
             self.index.add(vectors)
-            self.texts.extend(self.pending_texts)
-            logger.info(f"Flushed {len(self.pending_vectors)} vectors to FAISS index")
-            self.pending_vectors = []
-            self.pending_texts = []
-            # Save if enough time has passed
-            if datetime.datetime.now() - self.last_save > self.save_interval:
-                self._save()
         except Exception as e:
-            logger.error(f"Error flushing vectors: {e}", exc_info=True)
-    def _save(self) -> None:
-        """Save index to disk"""
         try:
             import faiss
-            faiss.write_index(self.index, config.INDEX_FILE)
-            with open(config.TEXTS_FILE, "w") as f:
-                json.dump(self.texts, f)
-            self.last_save = datetime.datetime.now()
-            logger.info(f"Saved FAISS index with {len(self.texts)} vectors")
         except Exception as e:
             logger.error(f"Error saving index: {e}", exc_info=True)
     def get_count(self) -> int:
         """Get total count of vectors"""
         with self._lock:
-            return len(self.texts) + len(self.pending_texts)
     def force_save(self) -> None:
         """Force immediate save of pending vectors"""
-        with self._lock:
-            self._flush()
 # === FAISS & Embeddings Setup ===
 try:
@@ -185,10 +421,12 @@ try:
         logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
         index = faiss.read_index(config.INDEX_FILE)
-        # Validate dimension
-        if index.d != config.VECTOR_DIM:
-            logger.warning(f"Index dimension mismatch: {index.d} != {config.VECTOR_DIM}. Creating new index.")
-            index = faiss.IndexFlatL2(config.VECTOR_DIM)
             incident_texts = []
         else:
             with open(config.TEXTS_FILE, "r") as f:
@@ -196,10 +434,10 @@ try:
             logger.info(f"Loaded {len(incident_texts)} incident texts")
     else:
         logger.info("Creating new FAISS index")
-        index = faiss.IndexFlatL2(config.VECTOR_DIM)
         incident_texts = []
-    thread_safe_index = ThreadSafeFAISSIndex(index, incident_texts)
 except ImportError as e:
     logger.warning(f"FAISS or SentenceTransformers not available: {e}")
@@ -214,45 +452,30 @@ except Exception as e:
     model = None
     thread_safe_index = None
-# === Predictive Models ===
-@dataclass
-class ForecastResult:
-    """Data class for forecast results"""
-    metric: str
-    predicted_value: float
-    confidence: float
-    trend: str  # "increasing", "decreasing", "stable"
-    time_to_threshold: Optional[datetime.timedelta] = None
-    risk_level: str = "low"  # low, medium, high, critical
 class SimplePredictiveEngine:
     """
-    Lightweight forecasting engine optimized for Hugging Face Spaces.
-    Uses statistical methods for time-series prediction.
     """
-    def __init__(self, history_window: int = config.HISTORY_WINDOW):
         self.history_window = history_window
         self.service_history: Dict[str, deque] = {}
         self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
-        self.max_cache_age = datetime.timedelta(minutes=config.CACHE_EXPIRY_MINUTES)
         self._lock = threading.RLock()
         logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
     def add_telemetry(self, service: str, event_data: Dict) -> None:
-        """
-        Add telemetry data to service history
-        Args:
-            service: Service name
-            event_data: Dictionary containing metrics (latency_p99, error_rate, etc.)
-        """
         with self._lock:
             if service not in self.service_history:
                 self.service_history[service] = deque(maxlen=self.history_window)
             telemetry_point = {
-                'timestamp': datetime.datetime.now(),
                 'latency': event_data.get('latency_p99', 0),
                 'error_rate': event_data.get('error_rate', 0),
                 'throughput': event_data.get('throughput', 0),
@@ -261,13 +484,11 @@ class SimplePredictiveEngine:
             }
             self.service_history[service].append(telemetry_point)
-            # Clean expired cache
             self._clean_cache()
     def _clean_cache(self) -> None:
         """Remove expired entries from prediction cache"""
-        now = datetime.datetime.now()
         expired = [k for k, (_, ts) in self.prediction_cache.items()
                    if now - ts > self.max_cache_age]
         for k in expired:
@@ -276,19 +497,15 @@ class SimplePredictiveEngine:
         if expired:
             logger.debug(f"Cleaned {len(expired)} expired cache entries")
-    def forecast_service_health(self, service: str, lookahead_minutes: int = 15) -> List[ForecastResult]:
-        """
-        Forecast service health metrics
-        Args:
-            service: Service name to forecast
-            lookahead_minutes: Time horizon in minutes
-        Returns:
-            List of forecast results for different metrics
-        """
         with self._lock:
-            if service not in self.service_history or len(self.service_history[service]) < 10:
                 return []
             history = list(self.service_history[service])
@@ -313,28 +530,23 @@ class SimplePredictiveEngine:
         with self._lock:
             for forecast in forecasts:
                 cache_key = f"{service}_{forecast.metric}"
-                self.prediction_cache[cache_key] = (forecast, datetime.datetime.now())
         return forecasts
-    def _forecast_latency(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
-        """
-        Forecast latency using linear regression and trend analysis
-        Args:
-            history: Historical telemetry data
-            lookahead_minutes: Forecast horizon
-        Returns:
-            ForecastResult or None if insufficient data
-        """
         try:
             latencies = [point['latency'] for point in history[-20:]]
-            if len(latencies) < 5:
                 return None
-            # Simple linear trend
             x = np.arange(len(latencies))
             slope, intercept = np.polyfit(x, latencies, 1)
@@ -342,29 +554,30 @@ class SimplePredictiveEngine:
             next_x = len(latencies)
             predicted_latency = slope * next_x + intercept
-            # Calculate confidence based on data quality
             residuals = latencies - (slope * x + intercept)
             confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
             # Determine trend and risk
-            if slope > 5:
                 trend = "increasing"
-                risk = "critical" if predicted_latency > config.LATENCY_EXTREME else "high"
-            elif slope < -2:
-                trend = "decreasing"
                 risk = "low"
             else:
                 trend = "stable"
-                risk = "low" if predicted_latency < config.LATENCY_WARNING else "medium"
-            # Calculate time to reach critical threshold (500ms)
             time_to_critical = None
-            if slope > 0 and predicted_latency < config.LATENCY_EXTREME:
                 denominator = predicted_latency - latencies[-1]
-                if abs(denominator) > 0.1:  # Avoid division by very small numbers
-                    minutes_to_critical = lookahead_minutes * (config.LATENCY_EXTREME - predicted_latency) / denominator
                     if minutes_to_critical > 0:
-                        time_to_critical = datetime.timedelta(minutes=minutes_to_critical)
             return ForecastResult(
                 metric="latency",
@@ -379,21 +592,16 @@ class SimplePredictiveEngine:
             logger.error(f"Latency forecast error: {e}", exc_info=True)
             return None
-    def _forecast_error_rate(self, history: List, lookahead_minutes: int) -> Optional[ForecastResult]:
-        """
-        Forecast error rate using exponential smoothing
-        Args:
-            history: Historical telemetry data
-            lookahead_minutes: Forecast horizon
-        Returns:
-            ForecastResult or None if insufficient data
-        """
         try:
             error_rates = [point['error_rate'] for point in history[-15:]]
-            if len(error_rates) < 5:
                 return None
             # Exponential smoothing
@@ -409,13 +617,13 @@ class SimplePredictiveEngine:
             if recent_trend > 0.02:
                 trend = "increasing"
-                risk = "critical" if predicted_rate > config.ERROR_RATE_CRITICAL else "high"
             elif recent_trend < -0.01:
                 trend = "decreasing"
                 risk = "low"
             else:
                 trend = "stable"
-                risk = "low" if predicted_rate < config.ERROR_RATE_WARNING else "medium"
             # Confidence based on volatility
             confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
@@ -432,30 +640,25 @@ class SimplePredictiveEngine:
             logger.error(f"Error rate forecast error: {e}", exc_info=True)
             return None
-    def _forecast_resources(self, history: List, lookahead_minutes: int) -> List[ForecastResult]:
-        """
-        Forecast CPU and memory utilization
-        Args:
-            history: Historical telemetry data
-            lookahead_minutes: Forecast horizon
-        Returns:
-            List of forecast results for CPU and memory
-        """
         forecasts = []
         # CPU forecast
         cpu_values = [point['cpu_util'] for point in history if point.get('cpu_util') is not None]
-        if len(cpu_values) >= 5:
             try:
                 predicted_cpu = np.mean(cpu_values[-5:])
                 trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
                 risk = "low"
-                if predicted_cpu > config.CPU_CRITICAL:
                     risk = "critical"
-                elif predicted_cpu > config.CPU_WARNING:
                     risk = "high"
                 elif predicted_cpu > 0.7:
                     risk = "medium"
@@ -472,15 +675,15 @@ class SimplePredictiveEngine:
         # Memory forecast
         memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
-        if len(memory_values) >= 5:
             try:
                 predicted_memory = np.mean(memory_values[-5:])
                 trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
                 risk = "low"
-                if predicted_memory > config.MEMORY_CRITICAL:
                     risk = "critical"
-                elif predicted_memory > config.MEMORY_WARNING:
                     risk = "high"
                 elif predicted_memory > 0.7:
                     risk = "medium"
@@ -498,15 +701,7 @@ class SimplePredictiveEngine:
         return forecasts
     def get_predictive_insights(self, service: str) -> Dict[str, Any]:
-        """
-        Generate actionable insights from forecasts
-        Args:
-            service: Service name
-        Returns:
-            Dictionary containing warnings, recommendations, and forecast data
-        """
         forecasts = self.forecast_service_health(service)
         critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
@@ -517,8 +712,8 @@ class SimplePredictiveEngine:
             if forecast.metric == "latency" and forecast.risk_level in ["high", "critical"]:
                 warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
                 if forecast.time_to_threshold:
-                    minutes = int(forecast.time_to_threshold.total_seconds() / 60)
-                    recommendations.append(f"⏰ Critical latency (~500ms) in ~{minutes} minutes")
                 recommendations.append("🔧 Consider scaling or optimizing dependencies")
             elif forecast.metric == "error_rate" and forecast.risk_level in ["high", "critical"]:
@@ -535,55 +730,54 @@ class SimplePredictiveEngine:
         return {
             'service': service,
-            'forecasts': [asdict(f) for f in forecasts],
             'warnings': warnings[:3],
             'recommendations': list(dict.fromkeys(recommendations))[:3],
             'critical_risk_count': len(critical_risks),
-            'forecast_timestamp': datetime.datetime.now().isoformat()
         }
-# === Core Engine Components ===
-policy_engine = PolicyEngine()
-events_history_store = ThreadSafeEventStore()
-predictive_engine = SimplePredictiveEngine()
 class BusinessImpactCalculator:
-    """
-    Calculate business impact of anomalies including revenue loss
-    and user impact estimation
-    """
     def __init__(self, revenue_per_request: float = 0.01):
         self.revenue_per_request = revenue_per_request
-        logger.info(f"Initialized BusinessImpactCalculator with revenue_per_request={revenue_per_request}")
-    def calculate_impact(self, event: ReliabilityEvent, duration_minutes: int = 5) -> Dict[str, Any]:
-        """
-        Calculate business impact for a reliability event
-        Args:
-            event: The reliability event to analyze
-            duration_minutes: Assumed duration of the incident
-        Returns:
-            Dictionary containing revenue loss, user impact, and severity
-        """
-        base_revenue_per_minute = config.BASE_REVENUE_PER_MINUTE
         impact_multiplier = 1.0
         # Impact factors
-        if event.latency_p99 > config.LATENCY_CRITICAL:
             impact_multiplier += 0.5
         if event.error_rate > 0.1:
             impact_multiplier += 0.8
-        if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
             impact_multiplier += 0.3
         revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
-        base_users_affected = config.BASE_USERS
-        user_impact_multiplier = (event.error_rate * 10) + (max(0, event.latency_p99 - 100) / 500)
         affected_users = int(base_users_affected * user_impact_multiplier)
         # Severity classification
@@ -596,7 +790,10 @@ class BusinessImpactCalculator:
         else:
             severity = "LOW"
-        logger.info(f"Business impact: ${revenue_loss:.2f} revenue loss, {affected_users} users, {severity} severity")
         return {
             'revenue_loss_estimate': round(revenue_loss, 2),
@@ -605,41 +802,29 @@ class BusinessImpactCalculator:
             'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
         }
-business_calculator = BusinessImpactCalculator()
 class AdvancedAnomalyDetector:
-    """
-    Enhanced anomaly detection with adaptive thresholds that learn
-    from historical data patterns
-    """
     def __init__(self):
         self.historical_data = deque(maxlen=100)
         self.adaptive_thresholds = {
-            'latency_p99': config.LATENCY_WARNING,
-            'error_rate': config.ERROR_RATE_WARNING
         }
         self._lock = threading.RLock()
         logger.info("Initialized AdvancedAnomalyDetector")
     def detect_anomaly(self, event: ReliabilityEvent) -> bool:
-        """
-        Detect if event is anomalous using adaptive thresholds
-        Args:
-            event: The reliability event to check
-        Returns:
-            True if anomaly detected, False otherwise
-        """
         with self._lock:
             latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
             error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
             resource_anomaly = False
-            if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
                 resource_anomaly = True
-            if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
                 resource_anomaly = True
             self._update_thresholds(event)
@@ -647,7 +832,11 @@ class AdvancedAnomalyDetector:
             is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
             if is_anomaly:
-                logger.info(f"Anomaly detected for {event.component}: latency={latency_anomaly}, error={error_anomaly}, resource={resource_anomaly}")
             return is_anomaly
@@ -661,15 +850,14 @@ class AdvancedAnomalyDetector:
             self.adaptive_thresholds['latency_p99'] = new_threshold
             logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
-anomaly_detector = AdvancedAnomalyDetector()
-# === Multi-Agent System ===
 class AgentSpecialization(Enum):
     """Agent specialization types"""
     DETECTIVE = "anomaly_detection"
     DIAGNOSTICIAN = "root_cause_analysis"
     PREDICTIVE = "predictive_analytics"
 class BaseAgent:
     """Base class for all specialized agents"""
@@ -685,26 +873,16 @@ class BaseAgent:
         """Base analysis method to be implemented by specialized agents"""
         raise NotImplementedError
 class AnomalyDetectionAgent(BaseAgent):
-    """
-    Specialized agent for anomaly detection and pattern recognition.
-    Calculates multi-dimensional anomaly scores and identifies affected metrics.
-    """
     def __init__(self):
         super().__init__(AgentSpecialization.DETECTIVE)
         logger.info("Initialized AnomalyDetectionAgent")
     async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
-        """
-        Perform comprehensive anomaly analysis
-        Args:
-            event: Reliability event to analyze
-        Returns:
-            Dictionary containing anomaly score, severity, affected metrics, and recommendations
-        """
         try:
             anomaly_score = self._calculate_anomaly_score(event)
@@ -728,47 +906,31 @@ class AnomalyDetectionAgent(BaseAgent):
             }
     def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
-        """
-        Calculate comprehensive anomaly score (0-1) using weighted metrics
-        Args:
-            event: Reliability event
-        Returns:
-            Float between 0 and 1 representing anomaly severity
-        """
         scores = []
         # Latency anomaly (weighted 40%)
-        if event.latency_p99 > config.LATENCY_WARNING:
-            latency_score = min(1.0, (event.latency_p99 - config.LATENCY_WARNING) / 500)
             scores.append(0.4 * latency_score)
         # Error rate anomaly (weighted 30%)
-        if event.error_rate > config.ERROR_RATE_WARNING:
             error_score = min(1.0, event.error_rate / 0.3)
             scores.append(0.3 * error_score)
         # Resource anomaly (weighted 30%)
         resource_score = 0
-        if event.cpu_util and event.cpu_util > config.CPU_WARNING:
-            resource_score += 0.15 * min(1.0, (event.cpu_util - config.CPU_WARNING) / 0.2)
-        if event.memory_util and event.memory_util > config.MEMORY_WARNING:
-            resource_score += 0.15 * min(1.0, (event.memory_util - config.MEMORY_WARNING) / 0.2)
         scores.append(resource_score)
         return min(1.0, sum(scores))
     def _classify_severity(self, anomaly_score: float) -> str:
-        """
-        Classify severity tier based on anomaly score
-        Args:
-            anomaly_score: Score between 0 and 1
-        Returns:
-            Severity tier string (LOW, MEDIUM, HIGH, CRITICAL)
-        """
         if anomaly_score > 0.8:
             return "CRITICAL"
         elif anomaly_score > 0.6:
@@ -779,108 +941,95 @@ class AnomalyDetectionAgent(BaseAgent):
             return "LOW"
     def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
-        """
-        Identify which metrics are outside normal ranges
-        Args:
-            event: Reliability event
-        Returns:
-            List of dictionaries describing affected metrics with severity
-        """
         affected = []
         # Latency checks
-        if event.latency_p99 > config.LATENCY_EXTREME:
             affected.append({
-                "metric": "latency",
-                "value": event.latency_p99,
-                "severity": "CRITICAL",
-                "threshold": config.LATENCY_WARNING
             })
-        elif event.latency_p99 > config.LATENCY_CRITICAL:
             affected.append({
-                "metric": "latency",
-                "value": event.latency_p99,
-                "severity": "HIGH",
-                "threshold": config.LATENCY_WARNING
             })
-        elif event.latency_p99 > config.LATENCY_WARNING:
             affected.append({
-                "metric": "latency",
-                "value": event.latency_p99,
-                "severity": "MEDIUM",
-                "threshold": config.LATENCY_WARNING
             })
         # Error rate checks
-        if event.error_rate > config.ERROR_RATE_CRITICAL:
             affected.append({
-                "metric": "error_rate",
-                "value": event.error_rate,
-                "severity": "CRITICAL",
-                "threshold": config.ERROR_RATE_WARNING
             })
-        elif event.error_rate > config.ERROR_RATE_HIGH:
             affected.append({
-                "metric": "error_rate",
-                "value": event.error_rate,
-                "severity": "HIGH",
-                "threshold": config.ERROR_RATE_WARNING
             })
-        elif event.error_rate > config.ERROR_RATE_WARNING:
             affected.append({
-                "metric": "error_rate",
-                "value": event.error_rate,
-                "severity": "MEDIUM",
-                "threshold": config.ERROR_RATE_WARNING
             })
         # CPU checks
-        if event.cpu_util and event.cpu_util > config.CPU_CRITICAL:
             affected.append({
-                "metric": "cpu",
-                "value": event.cpu_util,
-                "severity": "CRITICAL",
-                "threshold": config.CPU_WARNING
             })
-        elif event.cpu_util and event.cpu_util > config.CPU_WARNING:
             affected.append({
-                "metric": "cpu",
-                "value": event.cpu_util,
-                "severity": "HIGH",
-                "threshold": config.CPU_WARNING
             })
         # Memory checks
-        if event.memory_util and event.memory_util > config.MEMORY_CRITICAL:
             affected.append({
-                "metric": "memory",
-                "value": event.memory_util,
-                "severity": "CRITICAL",
-                "threshold": config.MEMORY_WARNING
             })
-        elif event.memory_util and event.memory_util > config.MEMORY_WARNING:
             affected.append({
-                "metric": "memory",
-                "value": event.memory_util,
-                "severity": "HIGH",
-                "threshold": config.MEMORY_WARNING
             })
         return affected
-    def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
-        """
-        Generate actionable recommendations based on detected anomalies
-        Args:
-            event: Reliability event
-            anomaly_score: Calculated anomaly score
-        Returns:
-            List of recommendation strings with emojis for visibility
-        """
         recommendations = []
         affected_metrics = self._identify_affected_metrics(event)
@@ -940,28 +1089,18 @@ class AnomalyDetectionAgent(BaseAgent):
         elif anomaly_score > 0.4:
             recommendations.append("📊 MONITOR: Early warning signs detected")
-        return recommendations[:4]  # Return top 4 recommendations
 class RootCauseAgent(BaseAgent):
-    """
-    Specialized agent for root cause analysis.
-    Analyzes failure patterns and provides investigation guidance.
-    """
     def __init__(self):
         super().__init__(AgentSpecialization.DIAGNOSTICIAN)
         logger.info("Initialized RootCauseAgent")
     async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
-        """
-        Perform root cause analysis
-        Args:
-            event: Reliability event to analyze
-        Returns:
-            Dictionary containing likely root causes and investigation guidance
-        """
         try:
             causes = self._analyze_potential_causes(event)
@@ -987,19 +1126,11 @@ class RootCauseAgent(BaseAgent):
             }
     def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
-        """
-        Analyze potential root causes based on event patterns
-        Args:
-            event: Reliability event
-        Returns:
-            List of potential root causes with confidence scores
-        """
         causes = []
         # Pattern 1: Database/External Dependency Failure
-        if event.latency_p99 > config.LATENCY_EXTREME and event.error_rate > 0.2:
             causes.append({
                 "cause": "Database/External Dependency Failure",
                 "confidence": 0.85,
@@ -1008,8 +1139,8 @@ class RootCauseAgent(BaseAgent):
             })
         # Pattern 2: Resource Exhaustion
-        if (event.cpu_util and event.cpu_util > config.CPU_CRITICAL and
-            event.memory_util and event.memory_util > config.MEMORY_CRITICAL):
             causes.append({
                 "cause": "Resource Exhaustion",
                 "confidence": 0.90,
@@ -1018,7 +1149,7 @@ class RootCauseAgent(BaseAgent):
             })
         # Pattern 3: Application Bug / Configuration Issue
-        if event.error_rate > config.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
             causes.append({
                 "cause": "Application Bug / Configuration Issue",
                 "confidence": 0.75,
@@ -1027,8 +1158,8 @@ class RootCauseAgent(BaseAgent):
             })
         # Pattern 4: Gradual Performance Degradation
-        if (200 <= event.latency_p99 <= 400 and
-            config.ERROR_RATE_WARNING <= event.error_rate <= config.ERROR_RATE_HIGH):
             causes.append({
                 "cause": "Gradual Performance Degradation",
                 "confidence": 0.65,
@@ -1048,65 +1179,39 @@ class RootCauseAgent(BaseAgent):
         return causes
     def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
-        """
-        Identify evidence patterns in the event data
-        Args:
-            event: Reliability event
-        Returns:
-            List of evidence pattern identifiers
-        """
         evidence = []
         if event.latency_p99 > event.error_rate * 1000:
             evidence.append("latency_disproportionate_to_errors")
-        if (event.cpu_util and event.cpu_util > config.CPU_WARNING and
-            event.memory_util and event.memory_util > config.MEMORY_WARNING):
             evidence.append("correlated_resource_exhaustion")
-        if event.error_rate > config.ERROR_RATE_HIGH and event.latency_p99 < config.LATENCY_CRITICAL:
             evidence.append("errors_without_latency_impact")
         return evidence
     def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
-        """
-        Determine investigation priority based on identified causes
-        Args:
-            causes: List of potential root causes
-        Returns:
-            Priority level (HIGH, MEDIUM, LOW)
-        """
         for cause in causes:
             if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
                 return "HIGH"
         return "MEDIUM"
 class PredictiveAgent(BaseAgent):
-    """
-    Specialized agent for predictive analytics.
-    Forecasts future risks and trends using statistical models.
-    """
-    def __init__(self):
         super().__init__(AgentSpecialization.PREDICTIVE)
-        self.engine = predictive_engine
         logger.info("Initialized PredictiveAgent")
     async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
-        """
-        Perform predictive analysis for future risks
-        Args:
-            event: Current reliability event
-        Returns:
-            Dictionary containing forecasts and predictive insights
-        """
         try:
             event_data = {
                 'latency_p99': event.latency_p99,
@@ -1134,17 +1239,47 @@ class PredictiveAgent(BaseAgent):
                 'recommendations': [f"Analysis error: {str(e)}"]
             }
-class OrchestrationManager:
     """
-    Orchestrates multiple specialized agents for comprehensive analysis.
-    Runs agents in parallel and synthesizes their findings.
     """
-    def __init__(self):
         self.agents = {
-            AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
-            AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
-            AgentSpecialization.PREDICTIVE: PredictiveAgent(),
         }
         logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")
@@ -1152,44 +1287,48 @@ class OrchestrationManager:
         """
         Coordinate multiple agents for comprehensive analysis
-        Args:
-            event: Reliability event to analyze
-        Returns:
-            Synthesized findings from all agents
         """
-        agent_tasks = {
-            spec: agent.analyze(event)
-            for spec, agent in self.agents.items()
-        }
-        # Parallel agent execution with timeout protection
         agent_results = {}
-        for specialization, task in agent_tasks.items():
-            try:
-                result = await asyncio.wait_for(task, timeout=5.0)
-                agent_results[specialization.value] = result
-                logger.debug(f"Agent {specialization.value} completed successfully")
-            except asyncio.TimeoutError:
-                logger.warning(f"Agent {specialization.value} timed out")
-                continue
-            except Exception as e:
-                logger.error(f"Agent {specialization.value} error: {e}", exc_info=True)
-                continue
         return self._synthesize_agent_findings(event, agent_results)
-    def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
-        """
-        Combine insights from all specialized agents
-        Args:
-            event: Original reliability event
-            agent_results: Results from each agent
-        Returns:
-            Synthesized analysis combining all agent findings
-        """
         detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
         diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
         predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
@@ -1203,7 +1342,7 @@ class OrchestrationManager:
                 'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                 'anomaly_confidence': detective_result['confidence'],
                 'primary_metrics_affected': [
-                    metric["metric"] for metric in
                     detective_result['findings'].get('primary_metrics_affected', [])
                 ]
             },
@@ -1216,26 +1355,19 @@ class OrchestrationManager:
             ),
             'agent_metadata': {
                 'participating_agents': list(agent_results.keys()),
-                'analysis_timestamp': datetime.datetime.now().isoformat()
             }
         }
         return synthesis
-    def _prioritize_actions(self, detection_actions: List[str],
-                          diagnosis_actions: List[str],
-                          predictive_actions: List[str]) -> List[str]:
-        """
-        Combine and prioritize actions from multiple agents
-        Args:
-            detection_actions: Actions from detective agent
-            diagnosis_actions: Actions from diagnostician agent
-            predictive_actions: Actions from predictive agent
-        Returns:
-            Prioritized list of unique actions
-        """
         all_actions = detection_actions + diagnosis_actions + predictive_actions
         seen = set()
         unique_actions = []
@@ -1243,19 +1375,35 @@ class OrchestrationManager:
             if action not in seen:
                 seen.add(action)
                 unique_actions.append(action)
-        return unique_actions[:5]  # Return top 5 actions
-# Initialize orchestration manager
-orchestration_manager = OrchestrationManager()
-# === Enhanced Reliability Engine ===
 class EnhancedReliabilityEngine:
     """
-    Main engine for processing reliability events through the multi-agent system.
-    Coordinates anomaly detection, agent analysis, policy evaluation, and impact calculation.
     """
-    def __init__(self):
         self.performance_metrics = {
             'total_incidents_processed': 0,
             'multi_agent_analyses': 0,
@@ -1265,83 +1413,98 @@ class EnhancedReliabilityEngine:
         logger.info("Initialized EnhancedReliabilityEngine")
     async def process_event_enhanced(
-        self,
-        component: str,
-        latency: float,
         error_rate: float,
-        throughput: float = 1000,
         cpu_util: Optional[float] = None,
         memory_util: Optional[float] = None
     ) -> Dict[str, Any]:
         """
         Process a reliability event through the complete analysis pipeline
-        Args:
-            component: Service component name
-            latency: P99 latency in milliseconds
-            error_rate: Error rate (0-1)
-            throughput: Requests per second
-            cpu_util: CPU utilization (0-1)
-            memory_util: Memory utilization (0-1)
-        Returns:
-            Comprehensive analysis results including agent findings, healing actions, and business impact
         """
-        logger.info(f"Processing event for {component}: latency={latency}ms, error_rate={error_rate*100:.1f}%")
         # Create event
-        event = ReliabilityEvent(
-            component=component,
-            latency_p99=latency,
-            error_rate=error_rate,
-            throughput=throughput,
-            cpu_util=cpu_util,
-            memory_util=memory_util,
-            upstream_deps=["auth-service", "database"] if component == "api-service" else []
-        )
         # Multi-agent analysis
-        agent_analysis = await orchestration_manager.orchestrate_analysis(event)
         # Anomaly detection
-        is_anomaly = anomaly_detector.detect_anomaly(event)
         # Determine severity based on agent confidence
         agent_confidence = 0.0
         if agent_analysis and 'incident_summary' in agent_analysis:
             agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
         else:
             agent_confidence = 0.8 if is_anomaly else 0.1
         # Set event severity
         if agent_confidence > 0.8:
-            event.severity = EventSeverity.CRITICAL
         elif agent_confidence > 0.6:
-            event.severity = EventSeverity.HIGH
         elif agent_confidence > 0.4:
-            event.severity = EventSeverity.MEDIUM
         else:
-            event.severity = EventSeverity.LOW
         # Evaluate healing policies
-        healing_actions = policy_engine.evaluate_policies(event)
         # Calculate business impact
-        business_impact = business_calculator.calculate_impact(event) if is_anomaly else None
         # Store in vector database for similarity detection
         if thread_safe_index is not None and model is not None and is_anomaly:
             try:
                 analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
                 vector_text = f"{component} {latency} {error_rate} {analysis_text}"
-                vec = model.encode([vector_text])
-                thread_safe_index.add(np.array(vec, dtype=np.float32), vector_text)
             except Exception as e:
                 logger.error(f"Error storing vector: {e}", exc_info=True)
         # Build comprehensive result
         result = {
-            "timestamp": event.timestamp,
             "component": component,
             "latency_p99": latency,
             "error_rate": error_rate,
@@ -1359,7 +1522,7 @@ class EnhancedReliabilityEngine:
         }
         # Store event in history
-        events_history_store.add(event)
         # Update performance metrics
         with self._lock:
@@ -1372,49 +1535,49 @@ class EnhancedReliabilityEngine:
         return result
-# Initialize enhanced engine
 enhanced_engine = EnhancedReliabilityEngine()
-# === Input Validation ===
-def validate_inputs(
-    latency: float,
-    error_rate: float,
-    throughput: float,
-    cpu_util: Optional[float],
-    memory_util: Optional[float]
-) -> Tuple[bool, str]:
-    """
-    Validate user inputs for bounds and type correctness
-    Args:
-        latency: Latency value in milliseconds
-        error_rate: Error rate (0-1)
-        throughput: Throughput in requests/sec
-        cpu_util: CPU utilization (0-1)
-        memory_util: Memory utilization (0-1)
-    Returns:
-        Tuple of (is_valid: bool, error_message: str)
-    """
-    if not (0 <= latency <= 10000):
-        return False, "❌ Invalid latency: must be between 0-10000ms"
-    if not (0 <= error_rate <= 1):
-        return False, "❌ Invalid error rate: must be between 0-1"
-    if throughput < 0:
-        return False, "❌ Invalid throughput: must be positive"
-    if cpu_util is not None and not (0 <= cpu_util <= 1):
-        return False, "❌ Invalid CPU utilization: must be between 0-1"
-    if memory_util is not None and not (0 <= memory_util <= 1):
-        return False, "❌ Invalid memory utilization: must be between 0-1"
-    return True, ""
 # === Gradio UI ===
 def create_enhanced_ui():
     """
-    Create the comprehensive Gradio UI for the reliability framework.
-    Includes telemetry input, multi-agent analysis display, predictive insights,
-    and event history visualization.
     """
     with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
@@ -1423,6 +1586,8 @@ def create_enhanced_ui():
         **Multi-Agent AI System for Production Reliability**
         *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
         """)
         with gr.Row():
@@ -1437,12 +1602,12 @@ def create_enhanced_ui():
                 latency = gr.Slider(
                     minimum=10, maximum=1000, value=100, step=1,
                     label="Latency P99 (ms)",
-                    info=f"Alert threshold: >{config.LATENCY_WARNING}ms (adaptive)"
                 )
                 error_rate = gr.Slider(
                     minimum=0, maximum=0.5, value=0.02, step=0.001,
                     label="Error Rate",
-                    info=f"Alert threshold: >{config.ERROR_RATE_WARNING}"
                 )
                 throughput = gr.Number(
                     value=1000,
@@ -1456,7 +1621,7 @@ def create_enhanced_ui():
                 )
                 memory_util = gr.Slider(
                     minimum=0, maximum=1, value=0.3, step=0.01,
-                    label="Memory Utilization",
                     info="0.0 - 1.0 scale"
                 )
                 submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
@@ -1473,7 +1638,7 @@ def create_enhanced_ui():
                     gr.Markdown("""
                     **Specialized AI Agents:**
                     - 🕵️ **Detective**: Anomaly detection & pattern recognition
-                    - 🔍 **Diagnostician**: Root cause analysis & investigation
                     - 🔮 **Predictive**: Future risk forecasting & trend analysis
                     """)
@@ -1486,7 +1651,7 @@ def create_enhanced_ui():
                     gr.Markdown("""
                     **Future Risk Forecasting:**
                     - 📈 Latency trends and thresholds
-                    - 🚨 Error rate predictions
                     - 🔥 Resource utilization forecasts
                     - ⏰ Time-to-failure estimates
                     """)
@@ -1511,100 +1676,111 @@ def create_enhanced_ui():
             - **💰 Business Impact**: Revenue and user impact quantification
             - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
             - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
-            - **⚡ Production Ready**: Circuit breakers, cooldowns, and enterprise features
             """)
         with gr.Accordion("🔧 Healing Policies", open=False):
             policy_info = []
-            for policy in policy_engine.policies:
                 if policy.enabled:
                     actions = ", ".join([action.value for action in policy.actions])
-                    policy_info.append(f"**{policy.name}**: {actions} (Priority: {policy.priority})")
             gr.Markdown("\n\n".join(policy_info))
-        # ✅ FIXED: Synchronous wrapper for async function (CRITICAL FIX)
-        def submit_event_enhanced_sync(component, latency, error_rate, throughput, cpu_util, memory_util):
             """
-            Synchronous wrapper for async event processing.
-            FIXES GRADIO ASYNC/SYNC COMPATIBILITY ISSUE.
-            This wrapper:
-            1. Validates inputs
-            2. Creates new event loop for async execution
-            3. Calls the async processing function
-            4. Formats results for display
-            5. Handles all errors gracefully
             """
             try:
                 # Type conversion
-                latency = float(latency)
-                error_rate = float(error_rate)
-                throughput = float(throughput) if throughput else 1000
-                cpu_util = float(cpu_util) if cpu_util else None
-                memory_util = float(memory_util) if memory_util else None
-                # Input validation (CRITICAL FIX)
-                is_valid, error_msg = validate_inputs(latency, error_rate, throughput, cpu_util, memory_util)
                 if not is_valid:
                     logger.warning(f"Invalid input: {error_msg}")
                     return error_msg, {}, {}, gr.Dataframe(value=[])
-                # Create new event loop for async execution (CRITICAL FIX)
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-                try:
-                    # Call async function
-                    result = loop.run_until_complete(
-                        enhanced_engine.process_event_enhanced(
-                            component, latency, error_rate, throughput, cpu_util, memory_util
-                        )
-                    )
-                finally:
-                    loop.close()
-                # Build table data (THREAD-SAFE FIX)
                 table_data = []
-                for event in events_history_store.get_recent(15):
                     table_data.append([
-                        event.timestamp[:19],
                         event.component,
-                        event.latency_p99,
                         f"{event.error_rate:.3f}",
-                        event.throughput,
                         event.severity.value.upper(),
                         "Multi-agent analysis"
                     ])
                 # Format output message
                 status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
-                output_msg = f"{status_emoji} **{result['status']}**"
                 if "multi_agent_analysis" in result:
                     analysis = result["multi_agent_analysis"]
                     confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
-                    output_msg += f"\n🎯 **Confidence**: {confidence*100:.1f}%"
                     predictive_data = analysis.get('predictive_insights', {})
                     if predictive_data.get('critical_risk_count', 0) > 0:
-                        output_msg += f"\n🔮 **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast"
                     if analysis.get('recommended_actions'):
                         actions_preview = ', '.join(analysis['recommended_actions'][:2])
-                        output_msg += f"\n💡 **Top Insights**: {actions_preview}"
-                if result["business_impact"]:
                     impact = result["business_impact"]
                     output_msg += (
-                        f"\n💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | "
                         f"👥 {impact['affected_users_estimate']} users | "
-                        f"🚨 {impact['severity_level']}"
                     )
-                if result["healing_actions"] and result["healing_actions"] != ["no_action"]:
                     actions = ", ".join(result["healing_actions"])
-                    output_msg += f"\n🔧 **Auto-Actions**: {actions}"
                 agent_insights_data = result.get("multi_agent_analysis", {})
                 predictive_insights_data = agent_insights_data.get('predictive_insights', {})
@@ -1620,48 +1796,57 @@ def create_enhanced_ui():
                     )
                 )
-            except ValueError as e:
-                error_msg = f"❌ Value error: {str(e)}"
-                logger.error(error_msg, exc_info=True)
-                return error_msg, {}, {}, gr.Dataframe(value=[])
             except Exception as e:
                 error_msg = f"❌ Error processing event: {str(e)}"
                 logger.error(error_msg, exc_info=True)
                 return error_msg, {}, {}, gr.Dataframe(value=[])
-        # ✅ FIXED: Use sync wrapper instead of async function (CRITICAL FIX)
         submit_btn.click(
-            fn=submit_event_enhanced_sync,  # Synchronous wrapper
             inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
             outputs=[output_text, agent_insights, predictive_insights, events_table]
         )
     return demo
 # === Main Entry Point ===
 if __name__ == "__main__":
     logger.info("=" * 80)
-    logger.info("Starting Enterprise Agentic Reliability Framework")
     logger.info("=" * 80)
-    logger.info(f"Total events in history: {events_history_store.count()}")
     logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
-    logger.info(f"Agents initialized: {len(orchestration_manager.agents)}")
     logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
-    demo = create_enhanced_ui()
-    logger.info("Launching Gradio UI on 0.0.0.0:7860...")
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
-    )
-    # Graceful shutdown: Save any pending vectors
-    if thread_safe_index:
-        logger.info("Saving pending vectors before shutdown...")
-        thread_safe_index.force_save()
     logger.info("=" * 80)
-    logger.info("Application shutdown complete")
-    logger.info("=" * 80)

 """
+Enterprise Agentic Reliability Framework - Main Application (FIXED VERSION)
 Multi-Agent AI System for Production Reliability Monitoring
+CRITICAL FIXES APPLIED:
+- Removed event loop creation (uses Gradio native async)
+- Fixed FAISS thread safety with single-writer pattern
+- ProcessPoolExecutor for CPU-intensive encoding
+- Atomic saves with fsync
+- Dependency injection
+- Rate limiting
+- Comprehensive input validation
+- Circuit breakers for agent resilience
 """
 import os
 import datetime
 import threading
 import logging
+import asyncio
+import tempfile
 from typing import List, Dict, Any, Optional, Tuple
+from collections import deque, OrderedDict
 from dataclasses import dataclass, asdict
 from enum import Enum
+from concurrent.futures import ProcessPoolExecutor
+from queue import Queue
+from circuitbreaker import circuit
+import atomicwrites
 # Import our modules
+from models import (
+    ReliabilityEvent, EventSeverity, AnomalyResult,
+    HealingAction, ForecastResult, PolicyCondition
+)
+from healing_policies import PolicyEngine, DEFAULT_HEALING_POLICIES
 # === Logging Configuration ===
 logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
+# === CONSTANTS (FIXED: Extracted all magic numbers) ===
+class Constants:
+    """Centralized constants to eliminate magic numbers"""
+    # Thresholds
+    LATENCY_WARNING = 150.0
+    LATENCY_CRITICAL = 300.0
+    LATENCY_EXTREME = 500.0
+    ERROR_RATE_WARNING = 0.05
+    ERROR_RATE_HIGH = 0.15
+    ERROR_RATE_CRITICAL = 0.3
+    CPU_WARNING = 0.8
+    CPU_CRITICAL = 0.9
+    MEMORY_WARNING = 0.8
+    MEMORY_CRITICAL = 0.9
+    # Forecasting
+    SLOPE_THRESHOLD_INCREASING = 5.0
+    SLOPE_THRESHOLD_DECREASING = -2.0
+    FORECAST_MIN_DATA_POINTS = 5
+    FORECAST_LOOKAHEAD_MINUTES = 15
+    # Performance
+    HISTORY_WINDOW = 50
+    MAX_EVENTS_STORED = 1000
+    AGENT_TIMEOUT_SECONDS = 5
+    CACHE_EXPIRY_MINUTES = 15
+    # FAISS
+    FAISS_BATCH_SIZE = 10
+    FAISS_SAVE_INTERVAL_SECONDS = 30
+    VECTOR_DIM = 384
+    # Business metrics
+    BASE_REVENUE_PER_MINUTE = 100.0
+    BASE_USERS = 1000
+    # Rate limiting
+    MAX_REQUESTS_PER_MINUTE = 60
+    MAX_REQUESTS_PER_HOUR = 500
 # === Configuration ===
 class Config:
     """Centralized configuration for the reliability framework"""
     HF_TOKEN: str = os.getenv("HF_TOKEN", "").strip()
     HF_API_URL: str = "https://router.huggingface.co/hf-inference/v1/completions"
+    INDEX_FILE: str = os.getenv("INDEX_FILE", "data/incident_vectors.index")
+    TEXTS_FILE: str = os.getenv("TEXTS_FILE", "data/incident_texts.json")
+    DATA_DIR: str = os.getenv("DATA_DIR", "data")
+    # Create data directory if it doesn't exist
+    os.makedirs(DATA_DIR, exist_ok=True)
+config = Config()
+HEADERS = {"Authorization": f"Bearer {config.HF_TOKEN}"} if config.HF_TOKEN else {}
+# === Input Validation (FIXED: Comprehensive validation) ===
+def validate_component_id(component_id: str) -> Tuple[bool, str]:
+    """Validate component ID format"""
+    if not isinstance(component_id, str):
+        return False, "Component ID must be a string"
+    if not (1 <= len(component_id) <= 255):
+        return False, "Component ID must be 1-255 characters"
+    import re
+    if not re.match(r"^[a-z0-9-]+$", component_id):
+        return False, "Component ID must contain only lowercase letters, numbers, and hyphens"
+    return True, ""
+def validate_inputs(
+    latency: Any,
+    error_rate: Any,
+    throughput: Any,
+    cpu_util: Any,
+    memory_util: Any
+) -> Tuple[bool, str]:
+    """
+    Comprehensive input validation with type checking
+    FIXED: Added proper type validation before conversion
+    """
+    try:
+        # Type conversion with error handling
+        try:
+            latency_f = float(latency)
+        except (ValueError, TypeError):
+            return False, "❌ Invalid latency: must be a number"
+        try:
+            error_rate_f = float(error_rate)
+        except (ValueError, TypeError):
+            return False, "❌ Invalid error rate: must be a number"
+        try:
+            throughput_f = float(throughput) if throughput else 1000.0
+        except (ValueError, TypeError):
+            return False, "❌ Invalid throughput: must be a number"
+        # CPU and memory are optional
+        cpu_util_f = None
+        if cpu_util:
+            try:
+                cpu_util_f = float(cpu_util)
+            except (ValueError, TypeError):
+                return False, "❌ Invalid CPU utilization: must be a number"
+        memory_util_f = None
+        if memory_util:
+            try:
+                memory_util_f = float(memory_util)
+            except (ValueError, TypeError):
+                return False, "❌ Invalid memory utilization: must be a number"
+        # Range validation
+        if not (0 <= latency_f <= 10000):
+            return False, "❌ Invalid latency: must be between 0-10000ms"
+        if not (0 <= error_rate_f <= 1):
+            return False, "❌ Invalid error rate: must be between 0-1"
+        if throughput_f < 0:
+            return False, "❌ Invalid throughput: must be positive"
+        if cpu_util_f is not None and not (0 <= cpu_util_f <= 1):
+            return False, "❌ Invalid CPU utilization: must be between 0-1"
+        if memory_util_f is not None and not (0 <= memory_util_f <= 1):
+            return False, "❌ Invalid memory utilization: must be between 0-1"
+        return True, ""
+    except Exception as e:
+        logger.error(f"Validation error: {e}", exc_info=True)
+        return False, f"❌ Validation error: {str(e)}"
 # === Thread-Safe Data Structures ===
 class ThreadSafeEventStore:
     """Thread-safe storage for reliability events"""
+    def __init__(self, max_size: int = Constants.MAX_EVENTS_STORED):
         self._events = deque(maxlen=max_size)
         self._lock = threading.RLock()
         logger.info(f"Initialized ThreadSafeEventStore with max_size={max_size}")
         with self._lock:
             return len(self._events)
+# === FAISS Integration (FIXED: Single-writer pattern for thread safety) ===
+class ProductionFAISSIndex:
+    """
+    Production-safe FAISS index with single-writer pattern
+    CRITICAL FIX: FAISS is NOT thread-safe for concurrent writes
+    Solution: Queue-based single writer thread + atomic saves
+    """
     def __init__(self, index, texts: List[str]):
         self.index = index
         self.texts = texts
         self._lock = threading.RLock()
+        # Single writer thread (no concurrent write conflicts)
+        self._write_queue: Queue = Queue()
+        self._writer_thread = threading.Thread(
+            target=self._writer_loop,
+            daemon=True,
+            name="FAISSWriter"
+        )
+        self._writer_thread.start()
+        # ProcessPool for encoding (avoids GIL + memory leaks)
+        self._encoder_pool = ProcessPoolExecutor(max_workers=2)
+        self._shutdown = threading.Event()
+        logger.info(
+            f"Initialized ProductionFAISSIndex with {len(texts)} vectors, "
+            f"single-writer pattern"
+        )
+    def add_async(self, vector: np.ndarray, text: str) -> None:
+        """
+        Add vector and text asynchronously (thread-safe)
+        FIXED: Queue-based design - no concurrent FAISS writes
+        """
+        self._write_queue.put((vector, text))
+        logger.debug(f"Queued vector for indexing: {text[:50]}...")
+    def _writer_loop(self) -> None:
+        """
+        Single writer thread - processes queue in batches
+        This ensures only ONE thread ever writes to FAISS index
+        """
+        batch = []
+        last_save = datetime.datetime.now()
+        save_interval = datetime.timedelta(
+            seconds=Constants.FAISS_SAVE_INTERVAL_SECONDS
+        )
+        while not self._shutdown.is_set():
+            try:
+                # Collect batch (non-blocking with timeout)
+                import queue
+                try:
+                    item = self._write_queue.get(timeout=1.0)
+                    batch.append(item)
+                except queue.Empty:
+                    pass
+                # Process batch when ready
+                if len(batch) >= Constants.FAISS_BATCH_SIZE or \
+                   (batch and datetime.datetime.now() - last_save > save_interval):
+                    self._flush_batch(batch)
+                    batch = []
+                    # Periodic save
+                    if datetime.datetime.now() - last_save > save_interval:
+                        self._save_atomic()
+                        last_save = datetime.datetime.now()
+            except Exception as e:
+                logger.error(f"Writer loop error: {e}", exc_info=True)
+    def _flush_batch(self, batch: List[Tuple[np.ndarray, str]]) -> None:
+        """
+        Flush batch to FAISS index
+        SAFE: Only called from single writer thread
+        """
+        if not batch:
             return
         try:
+            vectors = np.vstack([v for v, _ in batch])
+            texts = [t for _, t in batch]
+            # SAFE: Single writer - no concurrent access
             self.index.add(vectors)
+            with self._lock:  # Only lock for text list modification
+                self.texts.extend(texts)
+            logger.info(f"Flushed batch of {len(batch)} vectors to FAISS index")
         except Exception as e:
+            logger.error(f"Error flushing batch: {e}", exc_info=True)
+    def _save_atomic(self) -> None:
+        """
+        Atomic save with fsync for durability
+        FIXED: Prevents corruption on crash
+        """
         try:
             import faiss
+            # Write to temporary file first
+            with tempfile.NamedTemporaryFile(
+                mode='wb',
+                delete=False,
+                dir=os.path.dirname(config.INDEX_FILE),
+                prefix='index_',
+                suffix='.tmp'
+            ) as tmp:
+                temp_path = tmp.name
+            # Write index
+            faiss.write_index(self.index, temp_path)
+            # Fsync for durability
+            with open(temp_path, 'r+b') as f:
+                f.flush()
+                os.fsync(f.fileno())
+            # Atomic rename
+            os.replace(temp_path, config.INDEX_FILE)
+            # Save texts with atomic write
+            with self._lock:
+                texts_copy = self.texts.copy()
+            with atomicwrites.atomic_write(
+                config.TEXTS_FILE,
+                mode='w',
+                overwrite=True
+            ) as f:
+                json.dump(texts_copy, f)
+            logger.info(
+                f"Atomically saved FAISS index with {len(texts_copy)} vectors"
+            )
         except Exception as e:
             logger.error(f"Error saving index: {e}", exc_info=True)
     def get_count(self) -> int:
         """Get total count of vectors"""
         with self._lock:
+            return len(self.texts) + self._write_queue.qsize()
     def force_save(self) -> None:
         """Force immediate save of pending vectors"""
+        logger.info("Forcing FAISS index save...")
+        # Wait for queue to drain (with timeout)
+        timeout = 10.0
+        start = datetime.datetime.now()
+        while not self._write_queue.empty():
+            if (datetime.datetime.now() - start).total_seconds() > timeout:
+                logger.warning("Force save timeout - queue not empty")
+                break
+            import time
+            time.sleep(0.1)
+        self._save_atomic()
+    def shutdown(self) -> None:
+        """Graceful shutdown"""
+        logger.info("Shutting down FAISS index...")
+        self._shutdown.set()
+        self.force_save()
+        self._writer_thread.join(timeout=5.0)
+        self._encoder_pool.shutdown(wait=True)
 # === FAISS & Embeddings Setup ===
 try:
         logger.info(f"Loading existing FAISS index from {config.INDEX_FILE}")
         index = faiss.read_index(config.INDEX_FILE)
+        if index.d != Constants.VECTOR_DIM:
+            logger.warning(
+                f"Index dimension mismatch: {index.d} != {Constants.VECTOR_DIM}. "
+                f"Creating new index."
+            )
+            index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
             incident_texts = []
         else:
             with open(config.TEXTS_FILE, "r") as f:
             logger.info(f"Loaded {len(incident_texts)} incident texts")
     else:
         logger.info("Creating new FAISS index")
+        index = faiss.IndexFlatL2(Constants.VECTOR_DIM)
         incident_texts = []
+    thread_safe_index = ProductionFAISSIndex(index, incident_texts)
 except ImportError as e:
     logger.warning(f"FAISS or SentenceTransformers not available: {e}")
     model = None
     thread_safe_index = None
+    # === Predictive Models ===
 class SimplePredictiveEngine:
     """
+    Lightweight forecasting engine with proper constant usage
+    FIXED: All magic numbers extracted to Constants
     """
+    def __init__(self, history_window: int = Constants.HISTORY_WINDOW):
         self.history_window = history_window
         self.service_history: Dict[str, deque] = {}
         self.prediction_cache: Dict[str, Tuple[ForecastResult, datetime.datetime]] = {}
+        self.max_cache_age = datetime.timedelta(minutes=Constants.CACHE_EXPIRY_MINUTES)
         self._lock = threading.RLock()
         logger.info(f"Initialized SimplePredictiveEngine with history_window={history_window}")
     def add_telemetry(self, service: str, event_data: Dict) -> None:
+        """Add telemetry data to service history"""
         with self._lock:
             if service not in self.service_history:
                 self.service_history[service] = deque(maxlen=self.history_window)
             telemetry_point = {
+                'timestamp': datetime.datetime.now(datetime.timezone.utc),
                 'latency': event_data.get('latency_p99', 0),
                 'error_rate': event_data.get('error_rate', 0),
                 'throughput': event_data.get('throughput', 0),
             }
             self.service_history[service].append(telemetry_point)
             self._clean_cache()
     def _clean_cache(self) -> None:
         """Remove expired entries from prediction cache"""
+        now = datetime.datetime.now(datetime.timezone.utc)
         expired = [k for k, (_, ts) in self.prediction_cache.items()
                    if now - ts > self.max_cache_age]
         for k in expired:
         if expired:
             logger.debug(f"Cleaned {len(expired)} expired cache entries")
+    def forecast_service_health(
+        self,
+        service: str,
+        lookahead_minutes: int = Constants.FORECAST_LOOKAHEAD_MINUTES
+    ) -> List[ForecastResult]:
+        """Forecast service health metrics"""
         with self._lock:
+            if service not in self.service_history or \
+               len(self.service_history[service]) < Constants.FORECAST_MIN_DATA_POINTS:
                 return []
             history = list(self.service_history[service])
         with self._lock:
             for forecast in forecasts:
                 cache_key = f"{service}_{forecast.metric}"
+                self.prediction_cache[cache_key] = (forecast, datetime.datetime.now(datetime.timezone.utc))
         return forecasts
+    def _forecast_latency(
+        self,
+        history: List,
+        lookahead_minutes: int
+    ) -> Optional[ForecastResult]:
+        """Forecast latency using linear regression"""
         try:
             latencies = [point['latency'] for point in history[-20:]]
+            if len(latencies) < Constants.FORECAST_MIN_DATA_POINTS:
                 return None
+            # Linear trend
             x = np.arange(len(latencies))
             slope, intercept = np.polyfit(x, latencies, 1)
             next_x = len(latencies)
             predicted_latency = slope * next_x + intercept
+            # Calculate confidence
             residuals = latencies - (slope * x + intercept)
             confidence = max(0, 1 - (np.std(residuals) / max(1, np.mean(latencies))))
             # Determine trend and risk
+            if slope > Constants.SLOPE_THRESHOLD_INCREASING:
                 trend = "increasing"
+                risk = "critical" if predicted_latency > Constants.LATENCY_EXTREME else "high"
+            elif slope < Constants.SLOPE_THRESHOLD_DECREASING:
+                trend = "decreasing"
                 risk = "low"
             else:
                 trend = "stable"
+                risk = "low" if predicted_latency < Constants.LATENCY_WARNING else "medium"
+            # Calculate time to reach critical threshold
             time_to_critical = None
+            if slope > 0 and predicted_latency < Constants.LATENCY_EXTREME:
                 denominator = predicted_latency - latencies[-1]
+                if abs(denominator) > 0.1:
+                    minutes_to_critical = lookahead_minutes * \
+                        (Constants.LATENCY_EXTREME - predicted_latency) / denominator
                     if minutes_to_critical > 0:
+                        time_to_critical = minutes_to_critical
             return ForecastResult(
                 metric="latency",
             logger.error(f"Latency forecast error: {e}", exc_info=True)
             return None
+    def _forecast_error_rate(
+        self,
+        history: List,
+        lookahead_minutes: int
+    ) -> Optional[ForecastResult]:
+        """Forecast error rate using exponential smoothing"""
         try:
             error_rates = [point['error_rate'] for point in history[-15:]]
+            if len(error_rates) < Constants.FORECAST_MIN_DATA_POINTS:
                 return None
             # Exponential smoothing
             if recent_trend > 0.02:
                 trend = "increasing"
+                risk = "critical" if predicted_rate > Constants.ERROR_RATE_CRITICAL else "high"
             elif recent_trend < -0.01:
                 trend = "decreasing"
                 risk = "low"
             else:
                 trend = "stable"
+                risk = "low" if predicted_rate < Constants.ERROR_RATE_WARNING else "medium"
             # Confidence based on volatility
             confidence = max(0, 1 - (np.std(error_rates) / max(0.01, np.mean(error_rates))))
             logger.error(f"Error rate forecast error: {e}", exc_info=True)
             return None
+    def _forecast_resources(
+        self,
+        history: List,
+        lookahead_minutes: int
+    ) -> List[ForecastResult]:
+        """Forecast CPU and memory utilization"""
         forecasts = []
         # CPU forecast
         cpu_values = [point['cpu_util'] for point in history if point.get('cpu_util') is not None]
+        if len(cpu_values) >= Constants.FORECAST_MIN_DATA_POINTS:
             try:
                 predicted_cpu = np.mean(cpu_values[-5:])
                 trend = "increasing" if cpu_values[-1] > np.mean(cpu_values[-10:-5]) else "stable"
                 risk = "low"
+                if predicted_cpu > Constants.CPU_CRITICAL:
                     risk = "critical"
+                elif predicted_cpu > Constants.CPU_WARNING:
                     risk = "high"
                 elif predicted_cpu > 0.7:
                     risk = "medium"
         # Memory forecast
         memory_values = [point['memory_util'] for point in history if point.get('memory_util') is not None]
+        if len(memory_values) >= Constants.FORECAST_MIN_DATA_POINTS:
             try:
                 predicted_memory = np.mean(memory_values[-5:])
                 trend = "increasing" if memory_values[-1] > np.mean(memory_values[-10:-5]) else "stable"
                 risk = "low"
+                if predicted_memory > Constants.MEMORY_CRITICAL:
                     risk = "critical"
+                elif predicted_memory > Constants.MEMORY_WARNING:
                     risk = "high"
                 elif predicted_memory > 0.7:
                     risk = "medium"
         return forecasts
     def get_predictive_insights(self, service: str) -> Dict[str, Any]:
+        """Generate actionable insights from forecasts"""
         forecasts = self.forecast_service_health(service)
         critical_risks = [f for f in forecasts if f.risk_level in ["high", "critical"]]
             if forecast.metric == "latency" and forecast.risk_level in ["high", "critical"]:
                 warnings.append(f"📈 Latency expected to reach {forecast.predicted_value:.0f}ms")
                 if forecast.time_to_threshold:
+                    minutes = int(forecast.time_to_threshold)
+                    recommendations.append(f"⏰ Critical latency (~{Constants.LATENCY_EXTREME}ms) in ~{minutes} minutes")
                 recommendations.append("🔧 Consider scaling or optimizing dependencies")
             elif forecast.metric == "error_rate" and forecast.risk_level in ["high", "critical"]:
         return {
             'service': service,
+            'forecasts': [
+                {
+                    'metric': f.metric,
+                    'predicted_value': f.predicted_value,
+                    'confidence': f.confidence,
+                    'trend': f.trend,
+                    'risk_level': f.risk_level,
+                    'time_to_threshold': f.time_to_threshold
+                }
+                for f in forecasts
+            ],
             'warnings': warnings[:3],
             'recommendations': list(dict.fromkeys(recommendations))[:3],
             'critical_risk_count': len(critical_risks),
+            'forecast_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
         }
 class BusinessImpactCalculator:
+    """Calculate business impact of anomalies"""
     def __init__(self, revenue_per_request: float = 0.01):
         self.revenue_per_request = revenue_per_request
+        logger.info(f"Initialized BusinessImpactCalculator")
+    def calculate_impact(
+        self,
+        event: ReliabilityEvent,
+        duration_minutes: int = 5
+    ) -> Dict[str, Any]:
+        """Calculate business impact for a reliability event"""
+        base_revenue_per_minute = Constants.BASE_REVENUE_PER_MINUTE
         impact_multiplier = 1.0
         # Impact factors
+        if event.latency_p99 > Constants.LATENCY_CRITICAL:
             impact_multiplier += 0.5
         if event.error_rate > 0.1:
             impact_multiplier += 0.8
+        if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
             impact_multiplier += 0.3
         revenue_loss = base_revenue_per_minute * impact_multiplier * (duration_minutes / 60)
+        base_users_affected = Constants.BASE_USERS
+        user_impact_multiplier = (event.error_rate * 10) + \
+            (max(0, event.latency_p99 - 100) / 500)
         affected_users = int(base_users_affected * user_impact_multiplier)
         # Severity classification
         else:
             severity = "LOW"
+        logger.info(
+            f"Business impact: ${revenue_loss:.2f} revenue loss, "
+            f"{affected_users} users, {severity} severity"
+        )
         return {
             'revenue_loss_estimate': round(revenue_loss, 2),
             'throughput_reduction_pct': round(min(100, user_impact_multiplier * 100), 1)
         }
 class AdvancedAnomalyDetector:
+    """Enhanced anomaly detection with adaptive thresholds"""
     def __init__(self):
         self.historical_data = deque(maxlen=100)
         self.adaptive_thresholds = {
+            'latency_p99': Constants.LATENCY_WARNING,
+            'error_rate': Constants.ERROR_RATE_WARNING
         }
         self._lock = threading.RLock()
         logger.info("Initialized AdvancedAnomalyDetector")
     def detect_anomaly(self, event: ReliabilityEvent) -> bool:
+        """Detect if event is anomalous using adaptive thresholds"""
         with self._lock:
             latency_anomaly = event.latency_p99 > self.adaptive_thresholds['latency_p99']
             error_anomaly = event.error_rate > self.adaptive_thresholds['error_rate']
             resource_anomaly = False
+            if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
                 resource_anomaly = True
+            if event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL:
                 resource_anomaly = True
             self._update_thresholds(event)
             is_anomaly = latency_anomaly or error_anomaly or resource_anomaly
             if is_anomaly:
+                logger.info(
+                    f"Anomaly detected for {event.component}: "
+                    f"latency={latency_anomaly}, error={error_anomaly}, "
+                    f"resource={resource_anomaly}"
+                )
             return is_anomaly
             self.adaptive_thresholds['latency_p99'] = new_threshold
             logger.debug(f"Updated adaptive latency threshold to {new_threshold:.2f}ms")
+            # === Multi-Agent System ===
 class AgentSpecialization(Enum):
     """Agent specialization types"""
     DETECTIVE = "anomaly_detection"
     DIAGNOSTICIAN = "root_cause_analysis"
     PREDICTIVE = "predictive_analytics"
 class BaseAgent:
     """Base class for all specialized agents"""
         """Base analysis method to be implemented by specialized agents"""
         raise NotImplementedError
 class AnomalyDetectionAgent(BaseAgent):
+    """Specialized agent for anomaly detection and pattern recognition"""
     def __init__(self):
         super().__init__(AgentSpecialization.DETECTIVE)
         logger.info("Initialized AnomalyDetectionAgent")
     async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
+        """Perform comprehensive anomaly analysis"""
         try:
             anomaly_score = self._calculate_anomaly_score(event)
             }
     def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
+        """Calculate comprehensive anomaly score (0-1)"""
         scores = []
         # Latency anomaly (weighted 40%)
+        if event.latency_p99 > Constants.LATENCY_WARNING:
+            latency_score = min(1.0, (event.latency_p99 - Constants.LATENCY_WARNING) / 500)
             scores.append(0.4 * latency_score)
         # Error rate anomaly (weighted 30%)
+        if event.error_rate > Constants.ERROR_RATE_WARNING:
             error_score = min(1.0, event.error_rate / 0.3)
             scores.append(0.3 * error_score)
         # Resource anomaly (weighted 30%)
         resource_score = 0
+        if event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
+            resource_score += 0.15 * min(1.0, (event.cpu_util - Constants.CPU_WARNING) / 0.2)
+        if event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
+            resource_score += 0.15 * min(1.0, (event.memory_util - Constants.MEMORY_WARNING) / 0.2)
         scores.append(resource_score)
         return min(1.0, sum(scores))
     def _classify_severity(self, anomaly_score: float) -> str:
+        """Classify severity tier based on anomaly score"""
         if anomaly_score > 0.8:
             return "CRITICAL"
         elif anomaly_score > 0.6:
             return "LOW"
     def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
+        """Identify which metrics are outside normal ranges"""
         affected = []
         # Latency checks
+        if event.latency_p99 > Constants.LATENCY_EXTREME:
             affected.append({
+                "metric": "latency",
+                "value": event.latency_p99,
+                "severity": "CRITICAL",
+                "threshold": Constants.LATENCY_WARNING
             })
+        elif event.latency_p99 > Constants.LATENCY_CRITICAL:
             affected.append({
+                "metric": "latency",
+                "value": event.latency_p99,
+                "severity": "HIGH",
+                "threshold": Constants.LATENCY_WARNING
             })
+        elif event.latency_p99 > Constants.LATENCY_WARNING:
             affected.append({
+                "metric": "latency",
+                "value": event.latency_p99,
+                "severity": "MEDIUM",
+                "threshold": Constants.LATENCY_WARNING
             })
         # Error rate checks
+        if event.error_rate > Constants.ERROR_RATE_CRITICAL:
             affected.append({
+                "metric": "error_rate",
+                "value": event.error_rate,
+                "severity": "CRITICAL",
+                "threshold": Constants.ERROR_RATE_WARNING
             })
+        elif event.error_rate > Constants.ERROR_RATE_HIGH:
             affected.append({
+                "metric": "error_rate",
+                "value": event.error_rate,
+                "severity": "HIGH",
+                "threshold": Constants.ERROR_RATE_WARNING
             })
+        elif event.error_rate > Constants.ERROR_RATE_WARNING:
             affected.append({
+                "metric": "error_rate",
+                "value": event.error_rate,
+                "severity": "MEDIUM",
+                "threshold": Constants.ERROR_RATE_WARNING
             })
         # CPU checks
+        if event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL:
             affected.append({
+                "metric": "cpu",
+                "value": event.cpu_util,
+                "severity": "CRITICAL",
+                "threshold": Constants.CPU_WARNING
             })
+        elif event.cpu_util and event.cpu_util > Constants.CPU_WARNING:
             affected.append({
+                "metric": "cpu",
+                "value": event.cpu_util,
+                "severity": "HIGH",
+                "threshold": Constants.CPU_WARNING
             })
         # Memory checks
+        if event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL:
             affected.append({
+                "metric": "memory",
+                "value": event.memory_util,
+                "severity": "CRITICAL",
+                "threshold": Constants.MEMORY_WARNING
             })
+        elif event.memory_util and event.memory_util > Constants.MEMORY_WARNING:
             affected.append({
+                "metric": "memory",
+                "value": event.memory_util,
+                "severity": "HIGH",
+                "threshold": Constants.MEMORY_WARNING
             })
         return affected
+    def _generate_detection_recommendations(
+        self,
+        event: ReliabilityEvent,
+        anomaly_score: float
+    ) -> List[str]:
+        """Generate actionable recommendations"""
         recommendations = []
         affected_metrics = self._identify_affected_metrics(event)
         elif anomaly_score > 0.4:
             recommendations.append("📊 MONITOR: Early warning signs detected")
+        return recommendations[:4]
 class RootCauseAgent(BaseAgent):
+    """Specialized agent for root cause analysis"""
     def __init__(self):
         super().__init__(AgentSpecialization.DIAGNOSTICIAN)
         logger.info("Initialized RootCauseAgent")
     async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
+        """Perform root cause analysis"""
         try:
             causes = self._analyze_potential_causes(event)
             }
     def _analyze_potential_causes(self, event: ReliabilityEvent) -> List[Dict[str, Any]]:
+        """Analyze potential root causes based on event patterns"""
         causes = []
         # Pattern 1: Database/External Dependency Failure
+        if event.latency_p99 > Constants.LATENCY_EXTREME and event.error_rate > 0.2:
             causes.append({
                 "cause": "Database/External Dependency Failure",
                 "confidence": 0.85,
             })
         # Pattern 2: Resource Exhaustion
+        if (event.cpu_util and event.cpu_util > Constants.CPU_CRITICAL and
+            event.memory_util and event.memory_util > Constants.MEMORY_CRITICAL):
             causes.append({
                 "cause": "Resource Exhaustion",
                 "confidence": 0.90,
             })
         # Pattern 3: Application Bug / Configuration Issue
+        if event.error_rate > Constants.ERROR_RATE_CRITICAL and event.latency_p99 < 200:
             causes.append({
                 "cause": "Application Bug / Configuration Issue",
                 "confidence": 0.75,
             })
         # Pattern 4: Gradual Performance Degradation
+        if (200 <= event.latency_p99 <= 400 and
+            Constants.ERROR_RATE_WARNING <= event.error_rate <= Constants.ERROR_RATE_HIGH):
             causes.append({
                 "cause": "Gradual Performance Degradation",
                 "confidence": 0.65,
         return causes
     def _identify_evidence(self, event: ReliabilityEvent) -> List[str]:
+        """Identify evidence patterns in the event data"""
         evidence = []
         if event.latency_p99 > event.error_rate * 1000:
             evidence.append("latency_disproportionate_to_errors")
+        if (event.cpu_util and event.cpu_util > Constants.CPU_WARNING and
+            event.memory_util and event.memory_util > Constants.MEMORY_WARNING):
             evidence.append("correlated_resource_exhaustion")
+        if event.error_rate > Constants.ERROR_RATE_HIGH and event.latency_p99 < Constants.LATENCY_CRITICAL:
             evidence.append("errors_without_latency_impact")
         return evidence
     def _prioritize_investigation(self, causes: List[Dict[str, Any]]) -> str:
+        """Determine investigation priority"""
         for cause in causes:
             if "Database" in cause["cause"] or "Resource Exhaustion" in cause["cause"]:
                 return "HIGH"
         return "MEDIUM"
 class PredictiveAgent(BaseAgent):
+    """Specialized agent for predictive analytics"""
+    def __init__(self, engine: SimplePredictiveEngine):
         super().__init__(AgentSpecialization.PREDICTIVE)
+        self.engine = engine
         logger.info("Initialized PredictiveAgent")
     async def analyze(self, event: ReliabilityEvent) -> Dict[str, Any]:
+        """Perform predictive analysis for future risks"""
         try:
             event_data = {
                 'latency_p99': event.latency_p99,
                 'recommendations': [f"Analysis error: {str(e)}"]
             }
+# FIXED: Add circuit breaker for agent resilience
+@circuit(failure_threshold=3, recovery_timeout=30, name="agent_circuit_breaker")
+async def call_agent_with_protection(agent: BaseAgent, event: ReliabilityEvent) -> Dict[str, Any]:
     """
+    Call agent with circuit breaker protection
+    FIXED: Prevents cascading failures from misbehaving agents
     """
+    try:
+        result = await asyncio.wait_for(
+            agent.analyze(event),
+            timeout=Constants.AGENT_TIMEOUT_SECONDS
+        )
+        return result
+    except asyncio.TimeoutError:
+        logger.warning(f"Agent {agent.specialization.value} timed out")
+        raise
+    except Exception as e:
+        logger.error(f"Agent {agent.specialization.value} error: {e}", exc_info=True)
+        raise
+class OrchestrationManager:
+    """Orchestrates multiple specialized agents for comprehensive analysis"""
+    def __init__(
+        self,
+        detective: Optional[AnomalyDetectionAgent] = None,
+        diagnostician: Optional[RootCauseAgent] = None,
+        predictive: Optional[PredictiveAgent] = None
+    ):
+        """
+        Initialize orchestration manager
+        FIXED: Dependency injection for testability
+        """
         self.agents = {
+            AgentSpecialization.DETECTIVE: detective or AnomalyDetectionAgent(),
+            AgentSpecialization.DIAGNOSTICIAN: diagnostician or RootCauseAgent(),
+            AgentSpecialization.PREDICTIVE: predictive or PredictiveAgent(SimplePredictiveEngine()),
         }
         logger.info(f"Initialized OrchestrationManager with {len(self.agents)} agents")
         """
         Coordinate multiple agents for comprehensive analysis
+        FIXED: Improved timeout handling with circuit breakers
         """
+        # Create tasks for all agents
+        agent_tasks = []
+        agent_specs = []
+        for spec, agent in self.agents.items():
+            agent_tasks.append(call_agent_with_protection(agent, event))
+            agent_specs.append(spec)
+        # FIXED: Parallel execution with global timeout
         agent_results = {}
+        try:
+            # Run all agents in parallel with global timeout
+            results = await asyncio.wait_for(
+                asyncio.gather(*agent_tasks, return_exceptions=True),
+                timeout=Constants.AGENT_TIMEOUT_SECONDS + 1
+            )
+            # Process results
+            for spec, result in zip(agent_specs, results):
+                if isinstance(result, Exception):
+                    logger.error(f"Agent {spec.value} failed: {result}")
+                    continue
+                agent_results[spec.value] = result
+                logger.debug(f"Agent {spec.value} completed successfully")
+        except asyncio.TimeoutError:
+            logger.warning("Agent orchestration timed out")
+        except Exception as e:
+            logger.error(f"Agent orchestration error: {e}", exc_info=True)
         return self._synthesize_agent_findings(event, agent_results)
+    def _synthesize_agent_findings(
+        self,
+        event: ReliabilityEvent,
+        agent_results: Dict
+    ) -> Dict[str, Any]:
+        """Combine insights from all specialized agents"""
         detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
         diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)
         predictive_result = agent_results.get(AgentSpecialization.PREDICTIVE.value)
                 'severity': detective_result['findings'].get('severity_tier', 'UNKNOWN'),
                 'anomaly_confidence': detective_result['confidence'],
                 'primary_metrics_affected': [
+                    metric["metric"] for metric in
                     detective_result['findings'].get('primary_metrics_affected', [])
                 ]
             },
             ),
             'agent_metadata': {
                 'participating_agents': list(agent_results.keys()),
+                'analysis_timestamp': datetime.datetime.now(datetime.timezone.utc).isoformat()
             }
         }
         return synthesis
+    def _prioritize_actions(
+        self,
+        detection_actions: List[str],
+        diagnosis_actions: List[str],
+        predictive_actions: List[str]
+    ) -> List[str]:
+        """Combine and prioritize actions from multiple agents"""
         all_actions = detection_actions + diagnosis_actions + predictive_actions
         seen = set()
         unique_actions = []
             if action not in seen:
                 seen.add(action)
                 unique_actions.append(action)
+        return unique_actions[:5]
+        # === Enhanced Reliability Engine ===
 class EnhancedReliabilityEngine:
     """
+    Main engine for processing reliability events
+    FIXED: Dependency injection for all components
     """
+    def __init__(
+        self,
+        orchestrator: Optional[OrchestrationManager] = None,
+        policy_engine: Optional[PolicyEngine] = None,
+        event_store: Optional[ThreadSafeEventStore] = None,
+        anomaly_detector: Optional[AdvancedAnomalyDetector] = None,
+        business_calculator: Optional[BusinessImpactCalculator] = None
+    ):
+        """
+        Initialize reliability engine with dependency injection
+        FIXED: All dependencies injected for testability
+        """
+        self.orchestrator = orchestrator or OrchestrationManager()
+        self.policy_engine = policy_engine or PolicyEngine()
+        self.event_store = event_store or ThreadSafeEventStore()
+        self.anomaly_detector = anomaly_detector or AdvancedAnomalyDetector()
+        self.business_calculator = business_calculator or BusinessImpactCalculator()
         self.performance_metrics = {
             'total_incidents_processed': 0,
             'multi_agent_analyses': 0,
         logger.info("Initialized EnhancedReliabilityEngine")
     async def process_event_enhanced(
+        self,
+        component: str,
+        latency: float,
         error_rate: float,
+        throughput: float = 1000,
         cpu_util: Optional[float] = None,
         memory_util: Optional[float] = None
     ) -> Dict[str, Any]:
         """
         Process a reliability event through the complete analysis pipeline
+        FIXED: Proper async/await throughout
         """
+        logger.info(
+            f"Processing event for {component}: latency={latency}ms, "
+            f"error_rate={error_rate*100:.1f}%"
+        )
+        # Validate component ID
+        is_valid, error_msg = validate_component_id(component)
+        if not is_valid:
+            return {'error': error_msg, 'status': 'INVALID'}
         # Create event
+        try:
+            event = ReliabilityEvent(
+                component=component,
+                latency_p99=latency,
+                error_rate=error_rate,
+                throughput=throughput,
+                cpu_util=cpu_util,
+                memory_util=memory_util,
+                upstream_deps=["auth-service", "database"] if component == "api-service" else []
+            )
+        except Exception as e:
+            logger.error(f"Event creation error: {e}", exc_info=True)
+            return {'error': f'Invalid event data: {str(e)}', 'status': 'INVALID'}
         # Multi-agent analysis
+        agent_analysis = await self.orchestrator.orchestrate_analysis(event)
         # Anomaly detection
+        is_anomaly = self.anomaly_detector.detect_anomaly(event)
         # Determine severity based on agent confidence
         agent_confidence = 0.0
         if agent_analysis and 'incident_summary' in agent_analysis:
             agent_confidence = agent_analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
         else:
             agent_confidence = 0.8 if is_anomaly else 0.1
         # Set event severity
         if agent_confidence > 0.8:
+            severity = EventSeverity.CRITICAL
         elif agent_confidence > 0.6:
+            severity = EventSeverity.HIGH
         elif agent_confidence > 0.4:
+            severity = EventSeverity.MEDIUM
         else:
+            severity = EventSeverity.LOW
+        # Create mutable copy with updated severity
+        event = event.model_copy(update={'severity': severity})
         # Evaluate healing policies
+        healing_actions = self.policy_engine.evaluate_policies(event)
         # Calculate business impact
+        business_impact = self.business_calculator.calculate_impact(event) if is_anomaly else None
         # Store in vector database for similarity detection
         if thread_safe_index is not None and model is not None and is_anomaly:
             try:
+                # FIXED: Non-blocking encoding with ProcessPoolExecutor
                 analysis_text = agent_analysis.get('recommended_actions', ['No analysis'])[0]
                 vector_text = f"{component} {latency} {error_rate} {analysis_text}"
+                # Encode asynchronously
+                loop = asyncio.get_event_loop()
+                vec = await loop.run_in_executor(
+                    thread_safe_index._encoder_pool,
+                    model.encode,
+                    [vector_text]
+                )
+                thread_safe_index.add_async(np.array(vec, dtype=np.float32), vector_text)
             except Exception as e:
                 logger.error(f"Error storing vector: {e}", exc_info=True)
         # Build comprehensive result
         result = {
+            "timestamp": event.timestamp.isoformat(),
             "component": component,
             "latency_p99": latency,
             "error_rate": error_rate,
         }
         # Store event in history
+        self.event_store.add(event)
         # Update performance metrics
         with self._lock:
         return result
+# === Initialize Engine (with dependency injection) ===
 enhanced_engine = EnhancedReliabilityEngine()
+# === Rate Limiting ===
+class RateLimiter:
+    """Simple rate limiter for request throttling"""
+    def __init__(self, max_per_minute: int = Constants.MAX_REQUESTS_PER_MINUTE):
+        self.max_per_minute = max_per_minute
+        self.requests: deque = deque(maxlen=max_per_minute)
+        self._lock = threading.RLock()
+    def is_allowed(self) -> Tuple[bool, str]:
+        """Check if request is allowed"""
+        with self._lock:
+            now = datetime.datetime.now(datetime.timezone.utc)
+            # Remove requests older than 1 minute
+            one_minute_ago = now - datetime.timedelta(minutes=1)
+            while self.requests and self.requests[0] < one_minute_ago:
+                self.requests.popleft()
+            # Check rate limit
+            if len(self.requests) >= self.max_per_minute:
+                return False, f"Rate limit exceeded: {self.max_per_minute} requests/minute"
+            # Add current request
+            self.requests.append(now)
+            return True, ""
+rate_limiter = RateLimiter()
 # === Gradio UI ===
 def create_enhanced_ui():
     """
+    Create the comprehensive Gradio UI for the reliability framework
+    FIXED: Uses native async handlers (no event loop creation)
+    FIXED: Rate limiting on all endpoints
     """
     with gr.Blocks(title="🧠 Enterprise Agentic Reliability Framework", theme="soft") as demo:
         **Multi-Agent AI System for Production Reliability**
         *Specialized AI agents working together to detect, diagnose, predict, and heal system issues*
+        🔒 **Security Patched** | ⚡ **Performance Optimized** | 🧪 **Production Ready**
         """)
         with gr.Row():
                 latency = gr.Slider(
                     minimum=10, maximum=1000, value=100, step=1,
                     label="Latency P99 (ms)",
+                    info=f"Alert threshold: >{Constants.LATENCY_WARNING}ms (adaptive)"
                 )
                 error_rate = gr.Slider(
                     minimum=0, maximum=0.5, value=0.02, step=0.001,
                     label="Error Rate",
+                    info=f"Alert threshold: >{Constants.ERROR_RATE_WARNING}"
                 )
                 throughput = gr.Number(
                     value=1000,
                 )
                 memory_util = gr.Slider(
                     minimum=0, maximum=1, value=0.3, step=0.01,
+                    label="Memory Utilization",
                     info="0.0 - 1.0 scale"
                 )
                 submit_btn = gr.Button("🚀 Submit Telemetry Event", variant="primary", size="lg")
                     gr.Markdown("""
                     **Specialized AI Agents:**
                     - 🕵️ **Detective**: Anomaly detection & pattern recognition
+                    - 🔍 **Diagnostician**: Root cause analysis & investigation
                     - 🔮 **Predictive**: Future risk forecasting & trend analysis
                     """)
                     gr.Markdown("""
                     **Future Risk Forecasting:**
                     - 📈 Latency trends and thresholds
+                    - 🚨 Error rate predictions
                     - 🔥 Resource utilization forecasts
                     - ⏰ Time-to-failure estimates
                     """)
             - **💰 Business Impact**: Revenue and user impact quantification
             - **🎯 Adaptive Detection**: ML-powered thresholds that learn from your environment
             - **📚 Vector Memory**: FAISS-based incident memory for similarity detection
+            - **⚡ Production Ready**: Circuit breakers, cooldowns, thread safety, and enterprise features
+            - **🔒 Security Patched**: All critical CVEs fixed (Gradio 5.50.0+, Requests 2.32.5+)
             """)
         with gr.Accordion("🔧 Healing Policies", open=False):
             policy_info = []
+            for policy in enhanced_engine.policy_engine.policies:
                 if policy.enabled:
                     actions = ", ".join([action.value for action in policy.actions])
+                    policy_info.append(
+                        f"**{policy.name}** (Priority {policy.priority}): {actions}\n"
+                        f"  - Cooldown: {policy.cool_down_seconds}s\n"
+                        f"  - Max executions: {policy.max_executions_per_hour}/hour"
+                    )
             gr.Markdown("\n\n".join(policy_info))
+        # FIXED: Native async handler (no event loop creation needed)
+        async def submit_event_enhanced_async(
+            component, latency, error_rate, throughput, cpu_util, memory_util
+        ):
             """
+            Async event handler - uses Gradio's native async support
+            CRITICAL FIX: No event loop creation - Gradio handles this
+            FIXED: Rate limiting added
+            FIXED: Comprehensive error handling
             """
             try:
+                # Rate limiting check
+                allowed, rate_msg = rate_limiter.is_allowed()
+                if not allowed:
+                    logger.warning(f"Rate limit exceeded")
+                    return rate_msg, {}, {}, gr.Dataframe(value=[])
                 # Type conversion
+                try:
+                    latency = float(latency)
+                    error_rate = float(error_rate)
+                    throughput = float(throughput) if throughput else 1000
+                    cpu_util = float(cpu_util) if cpu_util else None
+                    memory_util = float(memory_util) if memory_util else None
+                except (ValueError, TypeError) as e:
+                    error_msg = f"❌ Invalid input types: {str(e)}"
+                    logger.warning(error_msg)
+                    return error_msg, {}, {}, gr.Dataframe(value=[])
+                # Input validation
+                is_valid, error_msg = validate_inputs(
+                    latency, error_rate, throughput, cpu_util, memory_util
+                )
                 if not is_valid:
                     logger.warning(f"Invalid input: {error_msg}")
                     return error_msg, {}, {}, gr.Dataframe(value=[])
+                # FIXED: Direct async call - no event loop creation needed
+                result = await enhanced_engine.process_event_enhanced(
+                    component, latency, error_rate, throughput, cpu_util, memory_util
+                )
+                # Handle errors
+                if 'error' in result:
+                    return f"❌ {result['error']}", {}, {}, gr.Dataframe(value=[])
+                # Build table data (THREAD-SAFE)
                 table_data = []
+                for event in enhanced_engine.event_store.get_recent(15):
                     table_data.append([
+                        event.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                         event.component,
+                        f"{event.latency_p99:.0f}ms",
                         f"{event.error_rate:.3f}",
+                        f"{event.throughput:.0f}",
                         event.severity.value.upper(),
                         "Multi-agent analysis"
                     ])
                 # Format output message
                 status_emoji = "🚨" if result["status"] == "ANOMALY" else "✅"
+                output_msg = f"{status_emoji} **{result['status']}**\n"
                 if "multi_agent_analysis" in result:
                     analysis = result["multi_agent_analysis"]
                     confidence = analysis.get('incident_summary', {}).get('anomaly_confidence', 0)
+                    output_msg += f"🎯 **Confidence**: {confidence*100:.1f}%\n"
                     predictive_data = analysis.get('predictive_insights', {})
                     if predictive_data.get('critical_risk_count', 0) > 0:
+                        output_msg += f"🔮 **PREDICTIVE**: {predictive_data['critical_risk_count']} critical risks forecast\n"
                     if analysis.get('recommended_actions'):
                         actions_preview = ', '.join(analysis['recommended_actions'][:2])
+                        output_msg += f"💡 **Top Insights**: {actions_preview}\n"
+                if result.get("business_impact"):
                     impact = result["business_impact"]
                     output_msg += (
+                        f"💰 **Business Impact**: ${impact['revenue_loss_estimate']:.2f} | "
                         f"👥 {impact['affected_users_estimate']} users | "
+                        f"🚨 {impact['severity_level']}\n"
                     )
+                if result.get("healing_actions") and result["healing_actions"] != ["no_action"]:
                     actions = ", ".join(result["healing_actions"])
+                    output_msg += f"🔧 **Auto-Actions**: {actions}"
                 agent_insights_data = result.get("multi_agent_analysis", {})
                 predictive_insights_data = agent_insights_data.get('predictive_insights', {})
                     )
                 )
             except Exception as e:
                 error_msg = f"❌ Error processing event: {str(e)}"
                 logger.error(error_msg, exc_info=True)
                 return error_msg, {}, {}, gr.Dataframe(value=[])
+        # FIXED: Use async handler directly
         submit_btn.click(
+            fn=submit_event_enhanced_async,  # Native async support
             inputs=[component, latency, error_rate, throughput, cpu_util, memory_util],
             outputs=[output_text, agent_insights, predictive_insights, events_table]
         )
     return demo
 # === Main Entry Point ===
 if __name__ == "__main__":
     logger.info("=" * 80)
+    logger.info("Starting Enterprise Agentic Reliability Framework (PATCHED VERSION)")
     logger.info("=" * 80)
+    logger.info(f"Python version: {os.sys.version}")
+    logger.info(f"Total events in history: {enhanced_engine.event_store.count()}")
     logger.info(f"Vector index size: {thread_safe_index.get_count() if thread_safe_index else 0}")
+    logger.info(f"Agents initialized: {len(enhanced_engine.orchestrator.agents)}")
+    logger.info(f"Policies loaded: {len(enhanced_engine.policy_engine.policies)}")
     logger.info(f"Configuration: HF_TOKEN={'SET' if config.HF_TOKEN else 'NOT SET'}")
+    logger.info(f"Rate limit: {Constants.MAX_REQUESTS_PER_MINUTE} requests/minute")
     logger.info("=" * 80)
+    try:
+        demo = create_enhanced_ui()
+        logger.info("Launching Gradio UI on 0.0.0.0:7860...")
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False,
+            show_error=True
+        )
+    except KeyboardInterrupt:
+        logger.info("Received shutdown signal...")
+    except Exception as e:
+        logger.error(f"Application error: {e}", exc_info=True)
+    finally:
+        # Graceful shutdown
+        logger.info("Shutting down gracefully...")
+        if thread_safe_index:
+            logger.info("Saving pending vectors before shutdown...")
+            thread_safe_index.shutdown()
+        logger.info("=" * 80)
+        logger.info("Application shutdown complete")
+        logger.info("=" * 80)