"""
OOMKillCondition - Simulates memory-limit failures causing repeated restarts
"""
from typing import Dict, List, Any, Optional
from ..coenv_environment import World
class OOMKillCondition:
    """Failure condition that makes pods look like they are being OOM-killed.

    Affected pods receive a high restart count and a lowered memory limit,
    and a matching ``OOMKilling`` warning event is appended to the world's
    event list.
    """

    def __init__(self, world: "World", config: Dict[str, Any]):
        """Keep references to the simulated world and the condition config.

        Args:
            world: Simulation state. This class uses ``get_deployments()``,
                ``get_pods()``, ``apply_patch()``, ``rng`` and ``events``.
            config: Settings mapping; ``"oom_kill_failure_rate"`` is read
                when ``inject`` is called without an explicit rate.
        """
        self.world = world
        self.config = config

    def inject(
        self,
        target_deployment: Optional[str] = None,
        failure_rate: Optional[float] = None,
    ) -> None:
        """Inject OOMKill failures into the pods of one deployment.

        Args:
            target_deployment: Name of the deployment to hit. ``None`` picks
                one deployment at random. An unknown name is a no-op.
            failure_rate: Probability (0.0-1.0) that each pod fails. ``None``
                falls back to ``config["oom_kill_failure_rate"]`` (default
                0.6). A config value of ``None`` disables injection.

        Raises:
            ValueError: If the resolved rate cannot be converted to a float.
        """
        rate = self._resolve_failure_rate(failure_rate)
        targets = self._select_targets(target_deployment)
        if rate is None:
            # Injection explicitly disabled through the config.
            return

        for deployment in targets:
            pods = [
                p for p in self.world.get_pods()
                if p.deployment == deployment.name
            ]
            for pod in pods:
                # One random draw per pod decides whether it is OOM-killed.
                if float(self.world.rng.random()) < rate:
                    self._oom_kill_pod(pod)

    def _resolve_failure_rate(self, failure_rate: Optional[float]) -> Optional[float]:
        """Return the effective failure rate as a float, or None if disabled."""
        if failure_rate is None:
            failure_rate = self.config.get("oom_kill_failure_rate", 0.6)
        if failure_rate is None:
            return None
        # Coerce both explicit and config-sourced values (e.g. "0.6" loaded
        # from a text config) so the comparison in inject() never mixes types.
        return float(failure_rate)

    def _select_targets(self, target_deployment: Optional[str]) -> list:
        """Return the deployments to hit: the named ones, or one random pick."""
        deployments = self.world.get_deployments()
        if target_deployment is not None:
            return [d for d in deployments if d.name == target_deployment]
        if not deployments:
            return []
        return [self.world.rng.choice(deployments)]

    @staticmethod
    def _reduced_mem_limit(current):
        """Return the lowered memory limit for a pod whose limit is *current*.

        A missing or zero limit becomes 128. Otherwise the limit is halved
        with a floor of 64, but never raised above its current value (a pod
        already below the floor keeps its limit instead of gaining memory).
        """
        if not current:
            return 128
        return min(current, max(64, current // 2))

    def _oom_kill_pod(self, pod) -> None:
        """Apply the OOMKill symptoms to a single pod and record the event."""
        # Simulate OOMKill through a high restart count; the pod is still
        # reported as Running because OOM-killed pods often show as Running
        # between crashes.
        # NOTE(review): the (10, 31) range excludes 31 if rng is a NumPy
        # Generator - confirm against the World implementation.
        restart_patch = {
            "status": "Running",
            "restarts": int(self.world.rng.integers(10, 31)),
        }
        self.world.apply_patch("pod", pod.name, restart_patch)

        # Lower the memory limit to mirror the condition that caused the OOM.
        # Kept as a second patch call, matching the original call sequence.
        mem_patch = {"mem_limit": self._reduced_mem_limit(pod.mem_limit)}
        self.world.apply_patch("pod", pod.name, mem_patch)

        self._add_oom_event(pod.name)

    def _add_oom_event(self, pod_name: str) -> None:
        """Append an ``OOMKilling`` warning event for *pod_name* to the world."""
        # Imported lazily, as in the original code (presumably to avoid an
        # import cycle with the models module - confirm before hoisting).
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-oom-{int(self.world.rng.integers(1000, 10000))}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="OOMKilling",
            message=f"Container {pod_name} exceeded memory limit",
            involved_object=pod_name,
        )
        self.world.events.append(event)