File size: 2,970 Bytes
6cc6670
 
 
 
 
0394a5e
6cc6670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b578a5d
6cc6670
 
 
 
 
 
b578a5d
6cc6670
 
 
b578a5d
6cc6670
 
 
 
 
 
 
 
 
 
 
 
 
 
b578a5d
6cc6670
 
 
b578a5d
6cc6670
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
OOMKillCondition - Simulates memory-limit failures causing repeated restarts
"""

from typing import Dict, List, Any, Optional
from ..coenv_environment import World


class OOMKillCondition:
    """Inject simulated OOMKill failures into the pods of a deployment.

    Each affected pod is patched with a high restart count and a lowered
    memory limit, and a ``Warning`` / ``OOMKilling`` event is appended to
    ``world.events``.
    """

    # Probability used when neither the caller nor the config supplies one.
    DEFAULT_FAILURE_RATE = 0.6
    # (low, high) arguments handed to ``world.rng.integers`` for the restart
    # count. NOTE(review): presumably a numpy Generator, i.e. high is
    # exclusive and restarts fall in 10..30 -- confirm against World.
    RESTART_RANGE = (10, 31)
    # Floor applied when halving an existing memory limit.
    MIN_MEM_LIMIT = 64
    # Limit assigned to pods whose current limit is missing or zero.
    DEFAULT_MEM_LIMIT = 128

    def __init__(self, world: World, config: Dict[str, Any]):
        """Store the simulated world and the condition configuration.

        Args:
            world: Simulation state; this class uses its ``rng``, ``events``,
                ``get_deployments``, ``get_pods`` and ``apply_patch``.
            config: Settings dict; ``"oom_kill_failure_rate"`` is read as the
                default per-pod failure probability.
        """
        self.world = world
        self.config = config

    def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None) -> None:
        """Inject OOMKill failures into the pods of one deployment.

        Args:
            target_deployment: Name of the deployment to target. When None,
                one deployment is picked at random via ``world.rng``; a name
                that matches nothing results in no changes.
            failure_rate: Probability (0.0-1.0) that each pod of the targeted
                deployment fails. When None, the config key
                ``"oom_kill_failure_rate"`` is used, falling back to
                ``DEFAULT_FAILURE_RATE``.

        Raises:
            TypeError, ValueError: If the resolved failure rate cannot be
                converted to ``float``.
        """
        if failure_rate is None:
            failure_rate = self.config.get("oom_kill_failure_rate", self.DEFAULT_FAILURE_RATE)
        if failure_rate is None:
            # The config explicitly holds None: keep treating that as
            # "inject nothing" rather than failing on float(None).
            return
        # Coerce both caller- and config-supplied values (e.g. "0.6") once,
        # so the per-pod comparison below cannot fail on a non-numeric type.
        failure_rate = float(failure_rate)

        deployments = self.world.get_deployments()

        if target_deployment is not None:
            target_deps = [d for d in deployments if d.name == target_deployment]
        else:
            # Target a random deployment (none if the world has no deployments)
            target_deps = [self.world.rng.choice(deployments)] if deployments else []

        for deployment in target_deps:
            # Get pods for this deployment
            pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]

            for pod in pods:
                if float(self.world.rng.random()) >= failure_rate:
                    continue

                # Simulate the crash loop with a high restart count.
                low, high = self.RESTART_RANGE
                patch = {
                    "status": "Running",  # OOMKill pods often show as Running but crash
                    "restarts": int(self.world.rng.integers(low, high)),
                }
                self.world.apply_patch("pod", pod.name, patch)

                # Also reduce the pod's memory limit to simulate the condition that caused OOM
                mem_patch = {"mem_limit": self._reduced_mem_limit(pod.mem_limit)}
                self.world.apply_patch("pod", pod.name, mem_patch)

                # Add event
                self._add_oom_event(pod.name)

    def _reduced_mem_limit(self, current):
        """Return the memory limit to apply to an OOM-killed pod.

        A missing or zero limit becomes ``DEFAULT_MEM_LIMIT``. Otherwise the
        limit is halved but not pushed below ``MIN_MEM_LIMIT``; the result is
        capped at ``current`` so a pod that is already under the floor never
        has its limit raised by the injection.
        """
        if not current:
            return self.DEFAULT_MEM_LIMIT
        return min(current, max(self.MIN_MEM_LIMIT, current // 2))

    def _add_oom_event(self, pod_name: str) -> None:
        """Append a ``Warning`` / ``OOMKilling`` event for ``pod_name`` to ``world.events``."""
        # Imported at call time, as in the original code.
        # NOTE(review): presumably to avoid an import cycle with ..models -- confirm.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            # The id suffix is drawn from world.rng, so ids are not guaranteed unique.
            event_id=f"event-oom-{int(self.world.rng.integers(1000, 10000))}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="OOMKilling",
            message=f"Container {pod_name} exceeded memory limit",
            involved_object=pod_name,
        )
        self.world.events.append(event)