Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Running

App Files Files Community

petter2025 commited on Feb 28

Commit

e40632a

verified ·

1 Parent(s): 811ceea

Delete healing_policies.py

Browse files

Files changed (1) hide show

healing_policies.py +0 -375

healing_policies.py DELETED Viewed

@@ -1,375 +0,0 @@
-"""
-Policy Engine for Automated Healing Actions
-Fixed version with thread safety and memory leak prevention
-"""
-import datetime
-import threading
-import logging
-from collections import OrderedDict
-from typing import Dict, List, Optional
-from models import HealingPolicy, HealingAction, EventSeverity, ReliabilityEvent, PolicyCondition
-logger = logging.getLogger(__name__)
-# Default healing policies with structured conditions
-DEFAULT_HEALING_POLICIES = [
-    HealingPolicy(
-        name="high_latency_restart",
-        conditions=[
-            PolicyCondition(metric="latency_p99", operator="gt", threshold=500.0)
-        ],
-        actions=[HealingAction.RESTART_CONTAINER, HealingAction.ALERT_TEAM],
-        priority=1,
-        cool_down_seconds=300,
-        max_executions_per_hour=5
-    ),
-    HealingPolicy(
-        name="critical_error_rate_rollback",
-        conditions=[
-            PolicyCondition(metric="error_rate", operator="gt", threshold=0.3)
-        ],
-        actions=[HealingAction.ROLLBACK, HealingAction.CIRCUIT_BREAKER, HealingAction.ALERT_TEAM],
-        priority=1,
-        cool_down_seconds=600,
-        max_executions_per_hour=3
-    ),
-    HealingPolicy(
-        name="high_error_rate_traffic_shift",
-        conditions=[
-            PolicyCondition(metric="error_rate", operator="gt", threshold=0.15)
-        ],
-        actions=[HealingAction.TRAFFIC_SHIFT, HealingAction.ALERT_TEAM],
-        priority=2,
-        cool_down_seconds=300,
-        max_executions_per_hour=5
-    ),
-    HealingPolicy(
-        name="resource_exhaustion_scale",
-        conditions=[
-            PolicyCondition(metric="cpu_util", operator="gt", threshold=0.9),
-            PolicyCondition(metric="memory_util", operator="gt", threshold=0.9)
-        ],
-        actions=[HealingAction.SCALE_OUT],
-        priority=2,
-        cool_down_seconds=600,
-        max_executions_per_hour=10
-    ),
-    HealingPolicy(
-        name="moderate_latency_circuit_breaker",
-        conditions=[
-            PolicyCondition(metric="latency_p99", operator="gt", threshold=300.0)
-        ],
-        actions=[HealingAction.CIRCUIT_BREAKER],
-        priority=3,
-        cool_down_seconds=180,
-        max_executions_per_hour=8
-    )
-]
-class PolicyEngine:
-    """
-    Thread-safe policy engine with cooldown and rate limiting
-    CRITICAL FIXES:
-    - Added RLock for thread safety
-    - Fixed cooldown race condition (atomic check + update)
-    - Implemented LRU eviction to prevent memory leak
-    - Added priority-based policy evaluation
-    - Added rate limiting per policy
-    """
-    def __init__(
-        self,
-        policies: Optional[List[HealingPolicy]] = None,
-        max_cooldown_history: int = 10000,
-        max_execution_history: int = 1000
-    ):
-        """
-        Initialize policy engine
-        Args:
-            policies: List of healing policies (uses defaults if None)
-            max_cooldown_history: Maximum cooldown entries to keep (LRU)
-            max_execution_history: Maximum execution history per policy
-        """
-        self.policies = policies or DEFAULT_HEALING_POLICIES
-        # FIXED: Added RLock for thread safety
-        self._lock = threading.RLock()
-        # FIXED: Use OrderedDict for LRU eviction (prevents memory leak)
-        self.last_execution: OrderedDict[str, float] = OrderedDict()
-        self.max_cooldown_history = max_cooldown_history
-        # Rate limiting: track executions per hour per policy
-        self.execution_timestamps: Dict[str, List[float]] = {}
-        self.max_execution_history = max_execution_history
-        # Sort policies by priority (lower number = higher priority)
-        self.policies = sorted(self.policies, key=lambda p: p.priority)
-        logger.info(
-            f"Initialized PolicyEngine with {len(self.policies)} policies, "
-            f"max_cooldown_history={max_cooldown_history}"
-        )
-    def evaluate_policies(self, event: ReliabilityEvent) -> List[HealingAction]:
-        """
-        Evaluate all policies against the event and return matching actions
-        FIXED: Atomic check + update under lock (prevents race condition)
-        FIXED: Priority-based evaluation
-        Args:
-            event: Reliability event to evaluate
-        Returns:
-            List of healing actions to execute
-        """
-        applicable_actions = []
-        current_time = datetime.datetime.now(datetime.timezone.utc).timestamp()
-        # Evaluate policies in priority order
-        for policy in self.policies:
-            if not policy.enabled:
-                continue
-            policy_key = f"{policy.name}_{event.component}"
-            # FIXED: All cooldown operations under lock (atomic)
-            with self._lock:
-                # Check cooldown
-                last_exec = self.last_execution.get(policy_key, 0)
-                if current_time - last_exec < policy.cool_down_seconds:
-                    logger.debug(
-                        f"Policy {policy.name} for {event.component} on cooldown "
-                        f"({current_time - last_exec:.0f}s / {policy.cool_down_seconds}s)"
-                    )
-                    continue
-                # Check rate limit
-                if self._is_rate_limited(policy_key, policy, current_time):
-                    logger.warning(
-                        f"Policy {policy.name} for {event.component} rate limited "
-                        f"(max {policy.max_executions_per_hour}/hour)"
-                    )
-                    continue
-                # FIXED: Only update timestamp if conditions match
-                if self._evaluate_conditions(policy.conditions, event):
-                    applicable_actions.extend(policy.actions)
-                    # Update cooldown timestamp (INSIDE lock, AFTER condition check)
-                    self._update_cooldown(policy_key, current_time)
-                    # Track execution for rate limiting
-                    self._record_execution(policy_key, current_time)
-                    logger.info(
-                        f"Policy {policy.name} triggered for {event.component}: "
-                        f"actions={[a.value for a in policy.actions]}"
-                    )
-        # Deduplicate actions while preserving order
-        seen = set()
-        unique_actions = []
-        for action in applicable_actions:
-            if action not in seen:
-                seen.add(action)
-                unique_actions.append(action)
-        return unique_actions if unique_actions else [HealingAction.NO_ACTION]
-    def _evaluate_conditions(
-        self,
-        conditions: List[PolicyCondition],
-        event: ReliabilityEvent
-    ) -> bool:
-        """
-        Evaluate all conditions against event (AND logic)
-        Args:
-            conditions: List of policy conditions
-            event: Reliability event
-        Returns:
-            True if all conditions match, False otherwise
-        """
-        for condition in conditions:
-            # Get event value
-            event_value = getattr(event, condition.metric, None)
-            # Handle None values
-            if event_value is None:
-                logger.debug(
-                    f"Condition failed: {condition.metric} is None on event"
-                )
-                return False
-            # Evaluate operator
-            if not self._compare_values(
-                event_value,
-                condition.operator,
-                condition.threshold
-            ):
-                logger.debug(
-                    f"Condition failed: {event_value} {condition.operator} "
-                    f"{condition.threshold} = False"
-                )
-                return False
-        return True
-    def _compare_values(
-        self,
-        event_value: float,
-        operator: str,
-        threshold: float
-    ) -> bool:
-        """
-        Compare values based on operator with type safety
-        FIXED: Added type checking and better error handling
-        Args:
-            event_value: Value from event
-            operator: Comparison operator
-            threshold: Threshold value
-        Returns:
-            Comparison result
-        """
-        try:
-            # Type validation
-            if not isinstance(event_value, (int, float)):
-                logger.error(
-                    f"Invalid event_value type: {type(event_value)}, expected number"
-                )
-                return False
-            if not isinstance(threshold, (int, float)):
-                logger.error(
-                    f"Invalid threshold type: {type(threshold)}, expected number"
-                )
-                return False
-            # Operator evaluation
-            if operator == "gt":
-                return event_value > threshold
-            elif operator == "lt":
-                return event_value < threshold
-            elif operator == "eq":
-                return abs(event_value - threshold) < 1e-6  # Float equality
-            elif operator == "gte":
-                return event_value >= threshold
-            elif operator == "lte":
-                return event_value <= threshold
-            else:
-                logger.error(f"Unknown operator: {operator}")
-                return False
-        except (TypeError, ValueError) as e:
-            logger.error(f"Comparison error: {e}", exc_info=True)
-            return False
-    def _update_cooldown(self, policy_key: str, timestamp: float) -> None:
-        """
-        Update cooldown timestamp with LRU eviction
-        FIXED: Prevents unbounded memory growth
-        Args:
-            policy_key: Policy identifier
-            timestamp: Current timestamp
-        """
-        # Update timestamp
-        self.last_execution[policy_key] = timestamp
-        # Move to end (most recently used)
-        self.last_execution.move_to_end(policy_key)
-        # LRU eviction if too large
-        while len(self.last_execution) > self.max_cooldown_history:
-            evicted_key, _ = self.last_execution.popitem(last=False)
-            logger.debug(f"Evicted cooldown entry: {evicted_key}")
-    def _is_rate_limited(
-        self,
-        policy_key: str,
-        policy: HealingPolicy,
-        current_time: float
-    ) -> bool:
-        """
-        Check if policy is rate limited
-        Args:
-            policy_key: Policy identifier
-            policy: Policy configuration
-            current_time: Current timestamp
-        Returns:
-            True if rate limited, False otherwise
-        """
-        if policy_key not in self.execution_timestamps:
-            return False
-        # Remove executions older than 1 hour
-        one_hour_ago = current_time - 3600
-        recent_executions = [
-            ts for ts in self.execution_timestamps[policy_key]
-            if ts > one_hour_ago
-        ]
-        self.execution_timestamps[policy_key] = recent_executions
-        # Check rate limit
-        return len(recent_executions) >= policy.max_executions_per_hour
-    def _record_execution(self, policy_key: str, timestamp: float) -> None:
-        """
-        Record policy execution for rate limiting
-        Args:
-            policy_key: Policy identifier
-            timestamp: Execution timestamp
-        """
-        if policy_key not in self.execution_timestamps:
-            self.execution_timestamps[policy_key] = []
-        self.execution_timestamps[policy_key].append(timestamp)
-        # Limit history size (memory management)
-        if len(self.execution_timestamps[policy_key]) > self.max_execution_history:
-            self.execution_timestamps[policy_key] = \
-                self.execution_timestamps[policy_key][-self.max_execution_history:]
-    def get_policy_stats(self) -> Dict[str, Dict]:
-        """
-        Get statistics about policy execution
-        Returns:
-            Dictionary of policy statistics
-        """
-        with self._lock:
-            stats = {}
-            for policy in self.policies:
-                policy_stats = {
-                    "name": policy.name,
-                    "priority": policy.priority,
-                    "enabled": policy.enabled,
-                    "cooldown_seconds": policy.cool_down_seconds,
-                    "max_per_hour": policy.max_executions_per_hour,
-                    "total_components": sum(
-                        1 for key in self.last_execution.keys()
-                        if key.startswith(f"{policy.name}_")
-                    )
-                }
-                stats[policy.name] = policy_stats
-            return stats