Spaces:
No application file
No application file
File size: 1,916 Bytes
"""
Incident Task - Hard difficulty
Handle multi-service cascading incident
"""
from typing import Dict, Any
from ..coenv_environment import World
from ..conditions.cascade_failure import CascadeFailureCondition
class IncidentTask:
    """Hard-difficulty task: handle a multi-service cascading incident.

    The task wraps a ``World`` and a configuration dict. ``reset()`` puts the
    world back into a healthy state and then injects a
    ``CascadeFailureCondition`` rooted at ``auth-service``; ``is_complete()``
    reports whether enough of the key services are running again; and
    ``get_observation()`` returns the world's full state together with the
    current objective text.
    """

    # Services whose health decides whether the incident is resolved.
    _KEY_SERVICES = ("auth-service", "api-gateway", "frontend")
    # A service counts as healthy when at least this fraction of its desired
    # replicas has a pod in the "Running" state.
    _MIN_RUNNING_FRACTION = 0.8
    # Fraction of key services that must be healthy for completion.
    # With three key services the threshold is 3 * 0.67 = 2.01, so in practice
    # all three must be healthy.
    # NOTE(review): if "two out of three" was intended, this constant needs to
    # be <= 2/3 -- confirm with the task author before changing.
    _MIN_HEALTHY_FRACTION = 0.67

    def __init__(self, world: World, config: Dict[str, Any]):
        """Store the world and configuration and set the task metadata.

        Args:
            world: Environment the task operates on.
            config: Configuration forwarded to the injected failure condition.
        """
        self.world = world
        self.config = config
        self.task_id = "incident"
        self.description = "Handle multi-service cascading incident"
        # Defined up front so get_observation() works before the first
        # reset(); reset() replaces it with the detailed incident brief.
        self.objective = self.description

    def reset(self):
        """Reset the world to healthy, inject the cascade, set the objective."""
        self.world.reset_to_healthy()
        cascade_condition = CascadeFailureCondition(self.world, self.config)
        cascade_condition.inject(
            root_cause_service="auth-service",
            failure_probability=0.6,
        )
        self.objective = (
            "Auth-service OOMKill has caused cascading failures. "
            "Identify the root cause, fix memory limits, restart workloads, "
            "and verify downstream recovery."
        )

    def is_complete(self) -> bool:
        """Return True when enough key services are healthy again.

        A key service is healthy when its deployment exists and the number of
        its pods with status ``"Running"`` is at least 80% of the deployment's
        ``desired_replicas``. Key services without a deployment are not
        counted as healthy.
        """
        # Query the world once per check instead of once per service.
        deployments = list(self.world.get_deployments())
        pods = list(self.world.get_pods())

        healthy_services = 0
        for service_name in self._KEY_SERVICES:
            # First deployment with a matching name, if any.
            deployment = next(
                (d for d in deployments if d.name == service_name), None
            )
            if not deployment:
                continue
            running = sum(
                1
                for p in pods
                if p.deployment == service_name and p.status == "Running"
            )
            required = deployment.desired_replicas * self._MIN_RUNNING_FRACTION
            if running >= required:
                healthy_services += 1

        return healthy_services >= (
            len(self._KEY_SERVICES) * self._MIN_HEALTHY_FRACTION
        )

    def get_observation(self) -> Dict[str, Any]:
        """Return the world's full state with the objective added.

        The objective is stored under the ``"objective"`` key of the mapping
        returned by ``world.get_full_state()``. Before the first ``reset()``
        the objective is the task description.
        """
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
|