openenv / server /conditions /oom_kill.py
Sandeep Suresh
Renamed COEnv with coenv
0394a5e
"""
OOMKillCondition - Simulates memory-limit failures causing repeated restarts
"""
from typing import Dict, List, Any, Optional
from ..coenv_environment import World
class OOMKillCondition:
"""Injects OOMKill failures into pods"""
def __init__(self, world: World, config: Dict[str, Any]):
self.world = world
self.config = config
def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None):
"""
Inject OOMKill failures into pods
Args:
target_deployment: Specific deployment to target (None for random)
failure_rate: Probability of each pod failing (0.0-1.0)
"""
if failure_rate is None:
failure_rate = self.config.get("oom_kill_failure_rate", 0.6)
else:
# Ensure failure_rate is a float
failure_rate = float(failure_rate)
deployments = self.world.get_deployments()
if target_deployment is not None:
target_deps = [d for d in deployments if d.name == target_deployment]
else:
# Target a random deployment
target_deps = [self.world.rng.choice(deployments)] if deployments else []
for deployment in target_deps:
# Get pods for this deployment
pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
for pod in pods:
if failure_rate is not None and float(self.world.rng.random()) < failure_rate:
# Simulate OOMKill by setting high memory usage and restart count
patch = {
"status": "Running", # OOMKill pods often show as Running but crash
"restarts": int(self.world.rng.integers(10, 31)) # High restart count from OOM
}
self.world.apply_patch("pod", pod.name, patch)
# Also reduce the pod's memory limit to simulate the condition that caused OOM
mem_patch = {
"mem_limit": max(64, pod.mem_limit // 2) if pod.mem_limit else 128
}
self.world.apply_patch("pod", pod.name, mem_patch)
# Add event
self._add_oom_event(pod.name)
def _add_oom_event(self, pod_name: str):
"""Add an OOMKill event"""
from ..models import ClusterEvent
from datetime import datetime
event = ClusterEvent(
event_id=f"event-oom-{int(self.world.rng.integers(1000, 10000))}",
timestamp=datetime.now().isoformat(),
type="Warning",
reason="OOMKilling",
message=f"Container {pod_name} exceeded memory limit",
involved_object=pod_name
)
self.world.events.append(event)