Spaces:
No application file
No application file
| """ | |
| OOMKillCondition - Simulates memory-limit failures causing repeated restarts | |
| """ | |
| from typing import Dict, List, Any, Optional | |
| from ..coenv_environment import World | |
class OOMKillCondition:
    """Inject simulated OOMKill failures into the pods of a deployment.

    For every pod selected for failure the condition:

    * patches the pod with status ``"Running"`` and a high restart count,
    * patches the pod with a lowered ``mem_limit``,
    * appends a ``Warning`` / ``OOMKilling`` event to ``world.events``.
    """

    # Probability used when neither the caller nor the config supplies one.
    _DEFAULT_FAILURE_RATE = 0.6
    # Lower bound applied when halving an existing memory limit.
    _MIN_MEM_LIMIT = 64
    # Limit assigned to pods that have no (or a zero) memory limit.
    _FALLBACK_MEM_LIMIT = 128
    # Bounds handed to ``rng.integers`` for the simulated restart count.
    # NOTE(review): with a NumPy-style generator the upper bound is
    # exclusive, i.e. 10..30 restarts - confirm against the rng in use.
    _RESTARTS_LOW = 10
    _RESTARTS_HIGH = 31

    def __init__(self, world: "World", config: Dict[str, Any]):
        """Store the world to mutate and the configuration mapping.

        Args:
            world: Object exposing ``get_deployments()``, ``get_pods()``,
                ``apply_patch()``, ``rng`` and ``events``.
            config: Settings mapping; ``"oom_kill_failure_rate"`` is read
                by :meth:`inject` when no explicit rate is passed.
        """
        self.world = world
        self.config = config

    def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None):
        """Inject OOMKill failures into the pods of one deployment.

        Args:
            target_deployment: Name of the deployment to target. When
                ``None`` a deployment is picked with ``world.rng.choice``;
                nothing happens if the world has no deployments.
            failure_rate: Probability (0.0-1.0) that each pod fails. When
                ``None`` the value of ``config["oom_kill_failure_rate"]``
                is used, falling back to 0.6 if the key is absent.

        Raises:
            ValueError: If the resolved failure rate cannot be converted
                to ``float`` (e.g. a non-numeric string).
        """
        if failure_rate is None:
            failure_rate = self.config.get(
                "oom_kill_failure_rate", self._DEFAULT_FAILURE_RATE
            )
        # Coerce regardless of where the value came from, so a config value
        # such as "0.6" behaves the same as an explicit argument. A config
        # entry explicitly set to None stays None and disables injection.
        if failure_rate is not None:
            failure_rate = float(failure_rate)

        deployments = self.world.get_deployments()
        if target_deployment is not None:
            targets = [d for d in deployments if d.name == target_deployment]
        elif deployments:
            # No explicit target: pick one deployment at random.
            targets = [self.world.rng.choice(deployments)]
        else:
            targets = []

        for deployment in targets:
            # Snapshot the matching pods before any patch is applied.
            pods = [
                p for p in self.world.get_pods()
                if p.deployment == deployment.name
            ]
            for pod in pods:
                if failure_rate is not None and float(self.world.rng.random()) < failure_rate:
                    self._oom_kill_pod(pod)

    @classmethod
    def _reduced_mem_limit(cls, mem_limit):
        """Return the memory limit to apply to a pod being OOM-killed.

        A missing or zero limit becomes the fallback (128). Otherwise the
        limit is halved but not taken below the floor (64), and it is never
        raised above the pod's current limit.
        """
        if not mem_limit:
            return cls._FALLBACK_MEM_LIMIT
        halved = max(cls._MIN_MEM_LIMIT, mem_limit // 2)
        # Without this cap a pod whose limit is already below the floor
        # would have its limit *raised* to the floor.
        return min(mem_limit, halved)

    def _oom_kill_pod(self, pod):
        """Apply the OOMKill patches to a single pod and record the event."""
        # Simulate OOMKill by setting a high restart count.
        patch = {
            "status": "Running",  # OOMKill pods often show as Running but crash
            "restarts": int(
                self.world.rng.integers(self._RESTARTS_LOW, self._RESTARTS_HIGH)
            ),
        }
        self.world.apply_patch("pod", pod.name, patch)

        # Also reduce the pod's memory limit to simulate the condition
        # that caused the OOM.
        mem_patch = {"mem_limit": self._reduced_mem_limit(pod.mem_limit)}
        self.world.apply_patch("pod", pod.name, mem_patch)

        self._add_oom_event(pod.name)

    def _add_oom_event(self, pod_name: str):
        """Append a ``Warning`` / ``OOMKilling`` event for ``pod_name``.

        The event id is ``event-oom-<n>`` with ``n`` drawn from
        ``world.rng.integers(1000, 10000)``; the timestamp is the local
        wall-clock time in ISO format.
        """
        # Imported here, as in the original code, rather than at module level.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-oom-{int(self.world.rng.integers(1000, 10000))}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="OOMKilling",
            message=f"Container {pod_name} exceeded memory limit",
            involved_object=pod_name,
        )
        self.world.events.append(event)