Sandeep Suresh committed on
Commit
b578a5d
·
1 Parent(s): 7796d67

feat: Refactor random number generation to use NumPy and enhance simulation logic

Browse files
.gitignore CHANGED
@@ -10,6 +10,7 @@ opencode.json
10
  venv/
11
  ENV/
12
  .env/
 
13
  .env.*
14
  .env
15
 
 
10
  venv/
11
  ENV/
12
  .env/
13
+ .env
14
  .env.*
15
  .env
16
 
pyproject.toml CHANGED
@@ -42,4 +42,8 @@ server = "COEnv.server.app:main"
42
  [tool.setuptools]
43
  include-package-data = true
44
  packages = ["COEnv", "COEnv.server"]
45
- package-dir = { "COEnv" = ".", "COEnv.server" = "server" }
 
 
 
 
 
42
  [tool.setuptools]
43
  include-package-data = true
44
  packages = ["COEnv", "COEnv.server"]
45
+ package-dir = { "COEnv" = ".", "COEnv.server" = "server" }
46
+
47
+ [tool.pytest.ini_options]
48
+ pythonpath = ["."]
49
+ testpaths = ["tests"]
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
- fastapi>=0.100.0
2
- uvicorn>=0.23.0
3
- pydantic>=2.0.0
4
- requests>=2.31.0
 
 
1
+ fastapi[standard]
2
+ uvicorn
3
+ pydantic
4
+ numpy
5
+ openenv-core
server/COEnv_environment.py CHANGED
@@ -7,7 +7,7 @@ This is the brain of the whole project.
7
 
8
  from typing import Dict, List, Any, Optional, Literal
9
  from datetime import datetime
10
- import random
11
  import time
12
 
13
  from .models import (
@@ -20,12 +20,19 @@ from .models import (
20
  class World:
21
  """In-memory Kubernetes cluster simulator"""
22
 
23
- def __init__(self, config: Dict[str, Any]):
24
  self.config = config
 
 
25
  self.cluster_state = self._initialize_healthy_cluster()
26
  self.step_count = 0
27
  self.events = []
28
  self._event_counter = 0
 
 
 
 
 
29
 
30
  def _initialize_healthy_cluster(self) -> Dict[str, List[Dict]]:
31
  """Initialize a healthy cluster state based on config"""
@@ -67,7 +74,7 @@ class World:
67
 
68
  # Create pods for this deployment
69
  for j in range(dep["replicas"]):
70
- pod_name = f"{dep['name']}-{random.randint(1000, 9999)}-{''.join([chr(random.randint(97, 122)) for _ in range(5)])}"
71
  pods.append({
72
  "name": pod_name,
73
  "status": "Running",
@@ -140,9 +147,28 @@ class World:
140
 
141
  def get_pods(self, namespace: Optional[str] = None, selector: Optional[Dict[str, str]] = None) -> List[PodStatus]:
142
  """Returns filtered pod list (mimics kubectl get pods)"""
143
- pods = [PodStatus(**pod) for pod in self.cluster_state["pods"]]
144
- # Simple filtering by namespace (not fully implemented - just returns all for now)
145
- return pods
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  def get_nodes(self) -> List[NodeStatus]:
148
  """Get all nodes as Pydantic models"""
@@ -230,7 +256,7 @@ class World:
230
  for i in range(desired_replicas - current_count):
231
  deployment = next((d for d in self.cluster_state["deployments"] if d["name"] == deployment_name), None)
232
  if deployment:
233
- pod_name = f"{deployment_name}-{random.randint(1000, 9999)}-{''.join([chr(random.randint(97, 122)) for _ in range(5)])}"
234
  node = nodes[i % len(nodes)] if nodes else None
235
  self.cluster_state["pods"].append({
236
  "name": pod_name,
@@ -266,7 +292,7 @@ class World:
266
 
267
  event_type: Literal["Normal"] = "Normal" # type: ignore
268
  event = ClusterEvent(
269
- event_id=f"event-delpod-{random.randint(1000, 9999)}",
270
  timestamp=datetime.now().isoformat(),
271
  type=event_type,
272
  reason="UserDeleted",
@@ -286,7 +312,7 @@ class World:
286
  for pod in pods_to_delete:
287
  event_type: Literal["Normal"] = "Normal" # type: ignore
288
  event = ClusterEvent(
289
- event_id=f"event-restart-{random.randint(1000, 9999)}",
290
  timestamp=datetime.now().isoformat(),
291
  type=event_type,
292
  reason="RolledOut",
@@ -299,6 +325,155 @@ class World:
299
  self.cluster_state["pods"] = [p for p in self.cluster_state["pods"] if p.get("deployment") != deployment]
300
 
301
  return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  def tick(self):
304
  """Advances simulated time by one step. Pods in CrashLoopBackOff increment their restart counter. Pending pods on ready nodes eventually transition to Running. Dead nodes stay dead unless drained."""
@@ -306,8 +481,8 @@ class World:
306
 
307
  # Simulate some natural changes in resource usage
308
  for node in self.cluster_state["nodes"]:
309
- node["cpu_usage"] = max(0, min(100, node["cpu_usage"] + random.uniform(-5, 5)))
310
- node["mem_usage"] = max(0, min(100, node["mem_usage"] + random.uniform(-5, 5)))
311
  node["last_updated"] = datetime.now().isoformat()
312
 
313
  # Update pod statuses based on node status
@@ -321,7 +496,7 @@ class World:
321
  elif pod["status"] == "Pending":
322
  pod["status"] = "Unknown"
323
  elif node and node["status"] == "Ready" and pod["status"] == "Pending":
324
- if random.random() > 0.7:
325
  pod["status"] = "Running"
326
  pod["last_updated"] = datetime.now().isoformat()
327
 
@@ -341,7 +516,7 @@ class World:
341
  if current_count < desired:
342
  nodes = self.cluster_state["nodes"]
343
  for i in range(desired - current_count):
344
- pod_name = f"{deployment['name']}-{random.randint(1000, 9999)}-{''.join([chr(random.randint(97, 122)) for _ in range(5)])}"
345
  node = nodes[i % len(nodes)] if nodes else None
346
  self.cluster_state["pods"].append({
347
  "name": pod_name,
@@ -357,7 +532,7 @@ class World:
357
  })
358
 
359
  # Generate occasional events
360
- if random.random() < 0.3:
361
  self._generate_event()
362
 
363
  def _generate_event(self):
@@ -373,7 +548,7 @@ class World:
373
  {"type": "Normal", "reason": "Killing", "message": "Stopping container"}
374
  ]
375
 
376
- event = random.choice(event_types)
377
  involved_objects = []
378
  involved_objects.extend([p["name"] for p in self.cluster_state["pods"][:3]])
379
  involved_objects.extend([d["name"] for d in self.cluster_state["deployments"][:3]])
@@ -389,7 +564,7 @@ class World:
389
  type=event_type,
390
  reason=event["reason"],
391
  message=event["message"],
392
- involved_object=random.choice(involved_objects)
393
  ))
394
  self._event_counter += 1
395
 
 
7
 
8
  from typing import Dict, List, Any, Optional, Literal
9
  from datetime import datetime
10
+ import numpy as np
11
  import time
12
 
13
  from .models import (
 
20
  class World:
21
  """In-memory Kubernetes cluster simulator"""
22
 
23
+ def __init__(self, config: Dict[str, Any], seed: Optional[int] = None):
24
  self.config = config
25
+ self.seed = seed
26
+ self.rng = np.random.default_rng(seed)
27
  self.cluster_state = self._initialize_healthy_cluster()
28
  self.step_count = 0
29
  self.events = []
30
  self._event_counter = 0
31
+
32
+ def _random_suffix(self, length: int = 5) -> str:
33
+ """Generate a random lowercase alphabetic suffix."""
34
+ letters = self.rng.integers(97, 123, size=length)
35
+ return "".join(chr(int(code)) for code in letters)
36
 
37
  def _initialize_healthy_cluster(self) -> Dict[str, List[Dict]]:
38
  """Initialize a healthy cluster state based on config"""
 
74
 
75
  # Create pods for this deployment
76
  for j in range(dep["replicas"]):
77
+ pod_name = f"{dep['name']}-{int(self.rng.integers(1000, 10000))}-{self._random_suffix()}"
78
  pods.append({
79
  "name": pod_name,
80
  "status": "Running",
 
147
 
148
  def get_pods(self, namespace: Optional[str] = None, selector: Optional[Dict[str, str]] = None) -> List[PodStatus]:
149
  """Returns filtered pod list (mimics kubectl get pods)"""
150
+ filtered_pods = self.cluster_state["pods"]
151
+
152
+ if namespace is not None:
153
+ filtered_pods = [
154
+ pod for pod in filtered_pods
155
+ if pod.get("namespace", "default") == namespace
156
+ ]
157
+
158
+ if selector:
159
+ for key, value in selector.items():
160
+ if key in {"app", "deployment"}:
161
+ filtered_pods = [
162
+ pod for pod in filtered_pods
163
+ if pod.get("deployment") == value
164
+ ]
165
+ else:
166
+ filtered_pods = [
167
+ pod for pod in filtered_pods
168
+ if pod.get("labels", {}).get(key) == value
169
+ ]
170
+
171
+ return [PodStatus(**pod) for pod in filtered_pods]
172
 
173
  def get_nodes(self) -> List[NodeStatus]:
174
  """Get all nodes as Pydantic models"""
 
256
  for i in range(desired_replicas - current_count):
257
  deployment = next((d for d in self.cluster_state["deployments"] if d["name"] == deployment_name), None)
258
  if deployment:
259
+ pod_name = f"{deployment_name}-{int(self.rng.integers(1000, 10000))}-{self._random_suffix()}"
260
  node = nodes[i % len(nodes)] if nodes else None
261
  self.cluster_state["pods"].append({
262
  "name": pod_name,
 
292
 
293
  event_type: Literal["Normal"] = "Normal" # type: ignore
294
  event = ClusterEvent(
295
+ event_id=f"event-delpod-{int(self.rng.integers(1000, 10000))}",
296
  timestamp=datetime.now().isoformat(),
297
  type=event_type,
298
  reason="UserDeleted",
 
312
  for pod in pods_to_delete:
313
  event_type: Literal["Normal"] = "Normal" # type: ignore
314
  event = ClusterEvent(
315
+ event_id=f"event-restart-{int(self.rng.integers(1000, 10000))}",
316
  timestamp=datetime.now().isoformat(),
317
  type=event_type,
318
  reason="RolledOut",
 
325
  self.cluster_state["pods"] = [p for p in self.cluster_state["pods"] if p.get("deployment") != deployment]
326
 
327
  return True
328
+
329
+ def set_hpa(self, deployment: str, min_replicas: int, max_replicas: int, cpu_target_percent: int) -> bool:
330
+ """Create or update an HPA configuration for a deployment."""
331
+ target_deployment = next(
332
+ (d for d in self.cluster_state["deployments"] if d["name"] == deployment),
333
+ None,
334
+ )
335
+ if target_deployment is None:
336
+ return False
337
+
338
+ hpa_name = f"{deployment}-hpa"
339
+ now = datetime.now().isoformat()
340
+
341
+ existing_hpa = next((h for h in self.cluster_state["hpas"] if h.get("name") == hpa_name), None)
342
+ if existing_hpa is None:
343
+ self.cluster_state["hpas"].append({
344
+ "name": hpa_name,
345
+ "min_replicas": min_replicas,
346
+ "max_replicas": max_replicas,
347
+ "current_replicas": max(min_replicas, min(target_deployment["desired_replicas"], max_replicas)),
348
+ "cpu_target_percent": cpu_target_percent,
349
+ "last_updated": now,
350
+ })
351
+ else:
352
+ existing_hpa.update({
353
+ "min_replicas": min_replicas,
354
+ "max_replicas": max_replicas,
355
+ "cpu_target_percent": cpu_target_percent,
356
+ "current_replicas": max(min_replicas, min(target_deployment["desired_replicas"], max_replicas)),
357
+ "last_updated": now,
358
+ })
359
+
360
+ # Keep the deployment desired replicas within configured HPA bounds.
361
+ bounded_replicas = max(min_replicas, min(target_deployment["desired_replicas"], max_replicas))
362
+ target_deployment["desired_replicas"] = bounded_replicas
363
+ target_deployment["last_updated"] = now
364
+
365
+ event_type: Literal["Normal"] = "Normal" # type: ignore
366
+ self.events.append(ClusterEvent(
367
+ event_id=f"event-hpa-{int(self.rng.integers(1000, 10000))}",
368
+ timestamp=now,
369
+ type=event_type,
370
+ reason="HorizontalPodAutoscalerUpdated",
371
+ message=(
372
+ f"HPA configured for deployment/{deployment}: "
373
+ f"min={min_replicas}, max={max_replicas}, cpu_target={cpu_target_percent}%"
374
+ ),
375
+ involved_object=deployment,
376
+ ))
377
+
378
+ return True
379
+
380
+ def drain_node(self, node_name: str) -> bool:
381
+ """Mark a node unschedulable and evict/reschedule pods currently on it."""
382
+ node = next((n for n in self.cluster_state["nodes"] if n["name"] == node_name), None)
383
+ if node is None:
384
+ return False
385
+
386
+ node["status"] = "SchedulingDisabled"
387
+ node["last_updated"] = datetime.now().isoformat()
388
+
389
+ candidate_nodes = [
390
+ n for n in self.cluster_state["nodes"]
391
+ if n["name"] != node_name and n.get("status") == "Ready"
392
+ ]
393
+
394
+ pods_on_node = [p for p in self.cluster_state["pods"] if p.get("node") == node_name]
395
+ for i, pod in enumerate(pods_on_node):
396
+ replacement = candidate_nodes[i % len(candidate_nodes)] if candidate_nodes else None
397
+ pod["node"] = replacement["name"] if replacement else None
398
+ pod["status"] = "Pending"
399
+ pod["last_updated"] = datetime.now().isoformat()
400
+
401
+ event_type: Literal["Normal"] = "Normal" # type: ignore
402
+ self.events.append(ClusterEvent(
403
+ event_id=f"event-evict-{int(self.rng.integers(1000, 10000))}",
404
+ timestamp=datetime.now().isoformat(),
405
+ type=event_type,
406
+ reason="Evicted",
407
+ message=f"pod/{pod['name']} evicted from drained node/{node_name}",
408
+ involved_object=pod["name"],
409
+ ))
410
+
411
+ event_type: Literal["Normal"] = "Normal" # type: ignore
412
+ self.events.append(ClusterEvent(
413
+ event_id=f"event-drain-{int(self.rng.integers(1000, 10000))}",
414
+ timestamp=datetime.now().isoformat(),
415
+ type=event_type,
416
+ reason="NodeDrained",
417
+ message=f"node/{node_name} cordoned and drained",
418
+ involved_object=node_name,
419
+ ))
420
+
421
+ return True
422
+
423
+ def describe(self, resource_type: str, name: str) -> Dict[str, Any]:
424
+ """Return kubectl-describe style details for a specific resource."""
425
+ collection_map = {
426
+ "deployment": "deployments",
427
+ "pod": "pods",
428
+ "node": "nodes",
429
+ "service": "services",
430
+ "configmap": "configmaps",
431
+ "hpa": "hpas",
432
+ }
433
+
434
+ collection_name = collection_map.get(resource_type)
435
+ if collection_name is None:
436
+ return {
437
+ "type": resource_type,
438
+ "name": name,
439
+ "found": False,
440
+ "error": f"Unsupported resource_type: {resource_type}",
441
+ }
442
+
443
+ resource = next(
444
+ (item for item in self.cluster_state.get(collection_name, []) if item.get("name") == name),
445
+ None,
446
+ )
447
+ if resource is None:
448
+ return {
449
+ "type": resource_type,
450
+ "name": name,
451
+ "found": False,
452
+ "error": f"{resource_type} '{name}' not found",
453
+ }
454
+
455
+ related_pods = []
456
+ if resource_type == "deployment":
457
+ related_pods = [p for p in self.cluster_state["pods"] if p.get("deployment") == name]
458
+ elif resource_type == "node":
459
+ related_pods = [p for p in self.cluster_state["pods"] if p.get("node") == name]
460
+ elif resource_type == "service":
461
+ selector_app = resource.get("selector", {}).get("app")
462
+ if selector_app:
463
+ related_pods = [p for p in self.cluster_state["pods"] if p.get("deployment") == selector_app]
464
+
465
+ related_events = [e.model_dump() for e in self.events if e.involved_object in {name, resource_type}]
466
+
467
+ return {
468
+ "type": resource_type,
469
+ "name": name,
470
+ "found": True,
471
+ "resource": dict(resource),
472
+ "related_pods": related_pods,
473
+ "recent_events": related_events[-10:],
474
+ "step": self.step_count,
475
+ "timestamp": datetime.now().isoformat(),
476
+ }
477
 
478
  def tick(self):
479
  """Advances simulated time by one step. Pods in CrashLoopBackOff increment their restart counter. Pending pods on ready nodes eventually transition to Running. Dead nodes stay dead unless drained."""
 
481
 
482
  # Simulate some natural changes in resource usage
483
  for node in self.cluster_state["nodes"]:
484
+ node["cpu_usage"] = max(0, min(100, node["cpu_usage"] + float(self.rng.uniform(-5, 5))))
485
+ node["mem_usage"] = max(0, min(100, node["mem_usage"] + float(self.rng.uniform(-5, 5))))
486
  node["last_updated"] = datetime.now().isoformat()
487
 
488
  # Update pod statuses based on node status
 
496
  elif pod["status"] == "Pending":
497
  pod["status"] = "Unknown"
498
  elif node and node["status"] == "Ready" and pod["status"] == "Pending":
499
+ if float(self.rng.random()) > 0.7:
500
  pod["status"] = "Running"
501
  pod["last_updated"] = datetime.now().isoformat()
502
 
 
516
  if current_count < desired:
517
  nodes = self.cluster_state["nodes"]
518
  for i in range(desired - current_count):
519
+ pod_name = f"{deployment['name']}-{int(self.rng.integers(1000, 10000))}-{self._random_suffix()}"
520
  node = nodes[i % len(nodes)] if nodes else None
521
  self.cluster_state["pods"].append({
522
  "name": pod_name,
 
532
  })
533
 
534
  # Generate occasional events
535
+ if float(self.rng.random()) < 0.3:
536
  self._generate_event()
537
 
538
  def _generate_event(self):
 
548
  {"type": "Normal", "reason": "Killing", "message": "Stopping container"}
549
  ]
550
 
551
+ event = self.rng.choice(event_types)
552
  involved_objects = []
553
  involved_objects.extend([p["name"] for p in self.cluster_state["pods"][:3]])
554
  involved_objects.extend([d["name"] for d in self.cluster_state["deployments"][:3]])
 
564
  type=event_type,
565
  reason=event["reason"],
566
  message=event["message"],
567
+ involved_object=str(self.rng.choice(involved_objects))
568
  ))
569
  self._event_counter += 1
570
 
server/app.py CHANGED
@@ -3,6 +3,7 @@ COEnv FastAPI Application
3
  Exposes /reset /step /state endpoints
4
  """
5
 
 
6
  from fastapi import FastAPI, HTTPException
7
  from pydantic import BaseModel, Field
8
  from typing import Dict, Any, Optional, List, Literal
@@ -11,8 +12,13 @@ import json
11
  import os
12
  import sys
13
 
14
- from .COEnv_environment import World
15
- from .models import ClusterObservation, RewardSignal, KubeAction
 
 
 
 
 
16
 
17
  app = FastAPI(title="COEnv", description="Kubernetes Simulator for OpenEnv")
18
 
@@ -56,13 +62,22 @@ def load_config():
56
  def get_condition_for_task(task_id: str):
57
  """Get the condition injector for a task"""
58
  if task_id == "pod_recovery":
59
- from .conditions.crash_loop import CrashLoopCondition
 
 
 
60
  return CrashLoopCondition(world_instance, config)
61
  elif task_id == "autoscaling":
62
- from .conditions.oom_kill import OOMKillCondition
 
 
 
63
  return OOMKillCondition(world_instance, config)
64
  elif task_id == "incident":
65
- from .conditions.cascade_failure import CascadeFailureCondition
 
 
 
66
  return CascadeFailureCondition(world_instance, config)
67
  return None
68
 
@@ -82,7 +97,7 @@ async def startup_event():
82
  """Initialize the world on startup"""
83
  global world_instance, current_task, current_objective
84
  load_config()
85
- world_instance = World(config)
86
  print("COEnv initialized")
87
 
88
 
@@ -279,5 +294,16 @@ async def get_state():
279
  return world_instance.get_observation(current_objective).model_dump()
280
 
281
 
 
 
 
 
 
 
 
 
 
 
 
282
  if __name__ == "__main__":
283
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
3
  Exposes /reset /step /state endpoints
4
  """
5
 
6
+ from openenv.core.env_server import create_app
7
  from fastapi import FastAPI, HTTPException
8
  from pydantic import BaseModel, Field
9
  from typing import Dict, Any, Optional, List, Literal
 
12
  import os
13
  import sys
14
 
15
+ try:
16
+ from .COEnv_environment import World
17
+ from .models import ClusterObservation, RewardSignal, KubeAction
18
+ except ImportError:
19
+ # Support running as a top-level module inside container images.
20
+ from COEnv_environment import World
21
+ from models import ClusterObservation, RewardSignal, KubeAction
22
 
23
  app = FastAPI(title="COEnv", description="Kubernetes Simulator for OpenEnv")
24
 
 
62
  def get_condition_for_task(task_id: str):
63
  """Get the condition injector for a task"""
64
  if task_id == "pod_recovery":
65
+ try:
66
+ from .conditions.crash_loop import CrashLoopCondition
67
+ except ImportError:
68
+ from conditions.crash_loop import CrashLoopCondition
69
  return CrashLoopCondition(world_instance, config)
70
  elif task_id == "autoscaling":
71
+ try:
72
+ from .conditions.oom_kill import OOMKillCondition
73
+ except ImportError:
74
+ from conditions.oom_kill import OOMKillCondition
75
  return OOMKillCondition(world_instance, config)
76
  elif task_id == "incident":
77
+ try:
78
+ from .conditions.cascade_failure import CascadeFailureCondition
79
+ except ImportError:
80
+ from conditions.cascade_failure import CascadeFailureCondition
81
  return CascadeFailureCondition(world_instance, config)
82
  return None
83
 
 
97
  """Initialize the world on startup"""
98
  global world_instance, current_task, current_objective
99
  load_config()
100
+ world_instance = World(config, seed=config.get("seed"))
101
  print("COEnv initialized")
102
 
103
 
 
294
  return world_instance.get_observation(current_objective).model_dump()
295
 
296
 
297
+ @app.get("/health")
298
+ async def health():
299
+ """Container health endpoint used by Docker health checks."""
300
+ return {"status": "ok"}
301
+
302
+
303
+ def main() -> None:
304
+ """Application entrypoint for local execution."""
305
+ uvicorn.run(app, host="0.0.0.0", port=8000)
306
+
307
+
308
  if __name__ == "__main__":
309
+ main()
server/conditions/cascade_failure.py CHANGED
@@ -4,7 +4,6 @@ CascadeFailureCondition - Simulates multi-service dependency failure
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
7
- import random
8
 
9
 
10
  class CascadeFailureCondition:
@@ -32,14 +31,14 @@ class CascadeFailureCondition:
32
  deployments = self.world.get_deployments()
33
  critical_deployments = [d for d in deployments if d.name in critical_services]
34
  if critical_deployments:
35
- root_cause_service = random.choice(critical_deployments).name
36
  else:
37
  deployments = self.world.get_deployments()
38
- root_cause_service = random.choice(deployments).name if deployments else "frontend"
39
 
40
  root_deployment = next((d for d in self.world.get_deployments() if d.name == root_cause_service), None)
41
  if root_deployment:
42
- from ..oom_kill import OOMKillCondition
43
  oom_condition = OOMKillCondition(self.world, self.config)
44
  oom_condition.inject(target_deployment=root_cause_service, failure_rate=0.8)
45
 
@@ -47,15 +46,15 @@ class CascadeFailureCondition:
47
 
48
  deployments = self.world.get_deployments()
49
  for deployment in deployments:
50
- if deployment.name != root_cause_service and failure_probability is not None and random.random() < failure_probability:
51
- failure_type = random.choice(["crashloop", "oom", "slow"])
52
 
53
  if failure_type == "crashloop":
54
- from ..crash_loop import CrashLoopCondition
55
  condition = CrashLoopCondition(self.world, self.config)
56
  condition.inject(target_deployment=deployment.name, failure_rate=0.6)
57
  elif failure_type == "oom":
58
- from ..oom_kill import OOMKillCondition
59
  condition = OOMKillCondition(self.world, self.config)
60
  condition.inject(target_deployment=deployment.name, failure_rate=0.6)
61
  else:
@@ -75,7 +74,7 @@ class CascadeFailureCondition:
75
  from datetime import datetime
76
 
77
  event = ClusterEvent(
78
- event_id=f"event-cascade-{random.randint(1000, 9999)}",
79
  timestamp=datetime.now().isoformat(),
80
  type=event_type,
81
  reason="CascadeFailure",
 
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
 
7
 
8
 
9
  class CascadeFailureCondition:
 
31
  deployments = self.world.get_deployments()
32
  critical_deployments = [d for d in deployments if d.name in critical_services]
33
  if critical_deployments:
34
+ root_cause_service = self.world.rng.choice(critical_deployments).name
35
  else:
36
  deployments = self.world.get_deployments()
37
+ root_cause_service = self.world.rng.choice(deployments).name if deployments else "frontend"
38
 
39
  root_deployment = next((d for d in self.world.get_deployments() if d.name == root_cause_service), None)
40
  if root_deployment:
41
+ from .oom_kill import OOMKillCondition
42
  oom_condition = OOMKillCondition(self.world, self.config)
43
  oom_condition.inject(target_deployment=root_cause_service, failure_rate=0.8)
44
 
 
46
 
47
  deployments = self.world.get_deployments()
48
  for deployment in deployments:
49
+ if deployment.name != root_cause_service and failure_probability is not None and float(self.world.rng.random()) < failure_probability:
50
+ failure_type = str(self.world.rng.choice(["crashloop", "oom", "slow"]))
51
 
52
  if failure_type == "crashloop":
53
+ from .crash_loop import CrashLoopCondition
54
  condition = CrashLoopCondition(self.world, self.config)
55
  condition.inject(target_deployment=deployment.name, failure_rate=0.6)
56
  elif failure_type == "oom":
57
+ from .oom_kill import OOMKillCondition
58
  condition = OOMKillCondition(self.world, self.config)
59
  condition.inject(target_deployment=deployment.name, failure_rate=0.6)
60
  else:
 
74
  from datetime import datetime
75
 
76
  event = ClusterEvent(
77
+ event_id=f"event-cascade-{int(self.world.rng.integers(1000, 10000))}",
78
  timestamp=datetime.now().isoformat(),
79
  type=event_type,
80
  reason="CascadeFailure",
server/conditions/crash_loop.py CHANGED
@@ -4,7 +4,6 @@ CrashLoopCondition - Simulates pods stuck in CrashLoopBackOff
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
7
- import random
8
 
9
 
10
  class CrashLoopCondition:
@@ -32,27 +31,27 @@ class CrashLoopCondition:
32
  if target_deployment is not None:
33
  target_deps = [d for d in deployments if d.name == target_deployment]
34
  else:
35
- target_deps = [random.choice(deployments)] if deployments else []
36
 
37
  for deployment in target_deps:
38
  pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
39
 
40
  for pod in pods:
41
- if failure_rate is not None and random.random() < failure_rate:
42
  patch = {
43
  "status": "CrashLoopBackOff",
44
- "restarts": random.randint(5, 20)
45
  }
46
  self.world.apply_patch("pod", pod.name, patch)
47
  self._add_crashloop_event(pod.name)
48
 
49
  def _add_crashloop_event(self, pod_name: str):
50
  """Add a crashloop event"""
51
- from .models import ClusterEvent
52
  from datetime import datetime
53
 
54
  event = ClusterEvent(
55
- event_id=f"event-crashloop-{random.randint(1000, 9999)}",
56
  timestamp=datetime.now().isoformat(),
57
  type="Warning",
58
  reason="BackOff",
 
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
 
7
 
8
 
9
  class CrashLoopCondition:
 
31
  if target_deployment is not None:
32
  target_deps = [d for d in deployments if d.name == target_deployment]
33
  else:
34
+ target_deps = [self.world.rng.choice(deployments)] if deployments else []
35
 
36
  for deployment in target_deps:
37
  pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
38
 
39
  for pod in pods:
40
+ if failure_rate is not None and float(self.world.rng.random()) < failure_rate:
41
  patch = {
42
  "status": "CrashLoopBackOff",
43
+ "restarts": int(self.world.rng.integers(5, 21))
44
  }
45
  self.world.apply_patch("pod", pod.name, patch)
46
  self._add_crashloop_event(pod.name)
47
 
48
  def _add_crashloop_event(self, pod_name: str):
49
  """Add a crashloop event"""
50
+ from ..models import ClusterEvent
51
  from datetime import datetime
52
 
53
  event = ClusterEvent(
54
+ event_id=f"event-crashloop-{int(self.world.rng.integers(1000, 10000))}",
55
  timestamp=datetime.now().isoformat(),
56
  type="Warning",
57
  reason="BackOff",
server/conditions/node_failure.py CHANGED
@@ -4,7 +4,6 @@ NodeFailureCondition - Simulates node outages and scheduling disruption
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
7
- import random
8
 
9
 
10
  class NodeFailureCondition:
@@ -32,7 +31,7 @@ class NodeFailureCondition:
32
  if target_node:
33
  target_nodes = [n for n in nodes if n.name == target_node]
34
  else:
35
- target_nodes = [n for n in nodes if failure_rate is not None and random.random() < failure_rate]
36
 
37
  for node in target_nodes:
38
  patch = {
@@ -54,11 +53,11 @@ class NodeFailureCondition:
54
 
55
  def _add_node_failure_event(self, node_name: str):
56
  """Add a node failure event"""
57
- from models import ClusterEvent
58
  from datetime import datetime
59
 
60
  event = ClusterEvent(
61
- event_id=f"event-nodefail-{random.randint(1000, 9999)}",
62
  timestamp=datetime.now().isoformat(),
63
  type="Warning",
64
  reason="NodeNotReady",
 
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
 
7
 
8
 
9
  class NodeFailureCondition:
 
31
  if target_node:
32
  target_nodes = [n for n in nodes if n.name == target_node]
33
  else:
34
+ target_nodes = [n for n in nodes if failure_rate is not None and float(self.world.rng.random()) < failure_rate]
35
 
36
  for node in target_nodes:
37
  patch = {
 
53
 
54
  def _add_node_failure_event(self, node_name: str):
55
  """Add a node failure event"""
56
+ from ..models import ClusterEvent
57
  from datetime import datetime
58
 
59
  event = ClusterEvent(
60
+ event_id=f"event-nodefail-{int(self.world.rng.integers(1000, 10000))}",
61
  timestamp=datetime.now().isoformat(),
62
  type="Warning",
63
  reason="NodeNotReady",
server/conditions/oom_kill.py CHANGED
@@ -4,7 +4,6 @@ OOMKillCondition - Simulates memory-limit failures causing repeated restarts
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
7
- import random
8
 
9
 
10
  class OOMKillCondition:
@@ -34,18 +33,18 @@ class OOMKillCondition:
34
  target_deps = [d for d in deployments if d.name == target_deployment]
35
  else:
36
  # Target a random deployment
37
- target_deps = [random.choice(deployments)] if deployments else []
38
 
39
  for deployment in target_deps:
40
  # Get pods for this deployment
41
  pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
42
 
43
  for pod in pods:
44
- if failure_rate is not None and random.random() < failure_rate:
45
  # Simulate OOMKill by setting high memory usage and restart count
46
  patch = {
47
  "status": "Running", # OOMKill pods often show as Running but crash
48
- "restarts": random.randint(10, 30) # High restart count from OOM
49
  }
50
  self.world.apply_patch("pod", pod.name, patch)
51
 
@@ -60,11 +59,11 @@ class OOMKillCondition:
60
 
61
  def _add_oom_event(self, pod_name: str):
62
  """Add an OOMKill event"""
63
- from .models import ClusterEvent
64
  from datetime import datetime
65
 
66
  event = ClusterEvent(
67
- event_id=f"event-oom-{random.randint(1000, 9999)}",
68
  timestamp=datetime.now().isoformat(),
69
  type="Warning",
70
  reason="OOMKilling",
 
4
 
5
  from typing import Dict, List, Any, Optional
6
  from ..COEnv_environment import World
 
7
 
8
 
9
  class OOMKillCondition:
 
33
  target_deps = [d for d in deployments if d.name == target_deployment]
34
  else:
35
  # Target a random deployment
36
+ target_deps = [self.world.rng.choice(deployments)] if deployments else []
37
 
38
  for deployment in target_deps:
39
  # Get pods for this deployment
40
  pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
41
 
42
  for pod in pods:
43
+ if failure_rate is not None and float(self.world.rng.random()) < failure_rate:
44
  # Simulate OOMKill by setting high memory usage and restart count
45
  patch = {
46
  "status": "Running", # OOMKill pods often show as Running but crash
47
+ "restarts": int(self.world.rng.integers(10, 31)) # High restart count from OOM
48
  }
49
  self.world.apply_patch("pod", pod.name, patch)
50
 
 
59
 
60
  def _add_oom_event(self, pod_name: str):
61
  """Add an OOMKill event"""
62
+ from ..models import ClusterEvent
63
  from datetime import datetime
64
 
65
  event = ClusterEvent(
66
+ event_id=f"event-oom-{int(self.world.rng.integers(1000, 10000))}",
67
  timestamp=datetime.now().isoformat(),
68
  type="Warning",
69
  reason="OOMKilling",
server/models.py CHANGED
@@ -88,7 +88,7 @@ All typed models are mandatory for OpenEnv spec compliance.
88
  Every endpoint uses these.
89
  """
90
 
91
- from pydantic import BaseModel, Field
92
  from typing import List, Dict, Any, Optional, Literal
93
  from datetime import datetime
94
 
@@ -171,7 +171,10 @@ class ClusterObservation(BaseModel):
171
  deployments: List[DeploymentStatus]
172
  services: List[ServiceStatus]
173
  configmaps: List[ConfigMapStatus]
174
- hpas: List[HPAStatus]
 
 
 
175
  events: List[ClusterEvent]
176
  step: int
177
  objective: str
 
88
  Every endpoint uses these.
89
  """
90
 
91
+ from pydantic import BaseModel, Field, AliasChoices
92
  from typing import List, Dict, Any, Optional, Literal
93
  from datetime import datetime
94
 
 
171
  deployments: List[DeploymentStatus]
172
  services: List[ServiceStatus]
173
  configmaps: List[ConfigMapStatus]
174
+ hpas: List[HPAStatus] = Field(
175
+ default_factory=list,
176
+ validation_alias=AliasChoices("hpa", "hpas")
177
+ )
178
  events: List[ClusterEvent]
179
  step: int
180
  objective: str
server/utils.py CHANGED
@@ -4,12 +4,21 @@ Random failure rate generators, latency simulators, resource usage curves.
4
  Makes the simulation feel realistic and non-deterministic in the right ways.
5
  """
6
 
7
- import random
8
  import math
9
  from typing import Dict, List, Any, Optional
10
  from datetime import datetime, timedelta
11
 
12
 
 
 
 
 
 
 
 
 
 
13
  class ProbabilityHelpers:
14
  """Helpers for generating realistic probabilities and distributions"""
15
 
@@ -17,23 +26,15 @@ class ProbabilityHelpers:
17
  def weighted_random_choice(choices: List[Any], weights: List[float]) -> Any:
18
  """Make a weighted random choice"""
19
  if not choices or not weights or len(choices) != len(weights):
20
- return random.choice(choices) if choices else None
21
 
22
  # Normalize weights
23
  total_weight = sum(weights)
24
  if total_weight == 0:
25
- return random.choice(choices)
26
 
27
  normalized_weights = [w / total_weight for w in weights]
28
-
29
- # Make choice
30
- r = random.random()
31
- cumulative_weight = 0
32
- for choice, weight in zip(choices, normalized_weights):
33
- cumulative_weight += weight
34
- if r <= cumulative_weight:
35
- return choice
36
- return choices[-1] # Fallback
37
 
38
  @staticmethod
39
  def exponential_backoff(attempt: int, base_delay: float = 1.0, max_delay: float = 60.0) -> float:
@@ -44,8 +45,7 @@ class ProbabilityHelpers:
44
  @staticmethod
45
  def poisson_arrival_rate(lambda_rate: float, time_window: float) -> int:
46
  """Generate number of events in time window using Poisson distribution"""
47
- # Simple approximation - in reality would use numpy.random.poisson
48
- return int(lambda_rate * time_window + random.gauss(0, math.sqrt(lambda_rate * time_window)))
49
 
50
  @staticmethod
51
  def failure_probability_over_time(base_rate: float, time_elapsed: float,
@@ -57,7 +57,7 @@ class ProbabilityHelpers:
57
  @staticmethod
58
  def random_failure_rate(min_rate: float = 0.1, max_rate: float = 0.9) -> float:
59
  """Generate a random failure rate within bounds"""
60
- return random.uniform(min_rate, max_rate)
61
 
62
 
63
  class LatencySimulator:
@@ -75,7 +75,7 @@ class LatencySimulator:
75
  """Get simulated latency in milliseconds"""
76
  # Base latency + load-dependent component + random jitter
77
  load_latency = self.base_latency_ms * (self.load_factor - 1.0) * 2
78
- jitter = random.gauss(0, self.base_latency_ms * 0.1)
79
  latency = self.base_latency_ms + max(0, load_latency) + jitter
80
  return max(1.0, latency) # Minimum 1ms latency
81
 
@@ -83,7 +83,7 @@ class LatencySimulator:
83
  spike_multiplier: float = 5.0) -> float:
84
  """Get latency with occasional spikes"""
85
  latency = self.get_latency()
86
- if random.random() < spike_probability:
87
  latency *= spike_multiplier
88
  return latency
89
 
@@ -92,7 +92,7 @@ class ResourceUsageSimulator:
92
  """Simulates realistic CPU and memory usage patterns"""
93
 
94
  def __init__(self):
95
- self.time_offset = random.uniform(0, 2 * math.pi)
96
 
97
  def get_cpu_usage(self, base_usage: float = 0.3,
98
  variation: float = 0.2) -> float:
@@ -102,7 +102,7 @@ class ResourceUsageSimulator:
102
  daily_pattern = 0.5 * math.sin(2 * math.pi * time_factor / 24) + 0.5
103
 
104
  usage = base_usage + variation * daily_pattern
105
- usage += random.gauss(0, 0.05) # Noise
106
  return max(0.0, min(1.0, usage)) * 100 # Clamp to 0-100%
107
 
108
  def get_memory_usage(self, base_usage: float = 0.4,
@@ -113,7 +113,7 @@ class ResourceUsageSimulator:
113
  leak_factor = 0.1 * time_factor # Slow leak over week
114
 
115
  usage = base_usage + leak_factor
116
- usage += random.gauss(0, 0.03) # Noise
117
  return max(0.0, min(1.0, usage)) * 100 # Clamp to 0-100%
118
 
119
  def get_resource_curve(self, resource_type: str,
@@ -121,11 +121,11 @@ class ResourceUsageSimulator:
121
  """Get resource usage following a specific curve"""
122
  if resource_type == "cpu":
123
  # CPU: periodic with bursts
124
- return 0.3 + 0.4 * math.sin(time_elapsed / 100) + 0.2 * random.random()
125
  elif resource_type == "memory":
126
  # Memory: gradual increase with occasional GC drops
127
  base = 0.2 + 0.6 * (1 - math.exp(-time_elapsed / 1000))
128
- gc_drop = 0.3 if random.random() < 0.01 else 0 # Occasional GC
129
  return max(0, base - gc_drop)
130
  elif resource_type == "disk":
131
  # Disk: steady growth
@@ -144,30 +144,30 @@ class NetworkSimulator:
144
 
145
  def simulate_partition(self) -> bool:
146
  """Return True if network partition is simulated"""
147
- return random.random() < self.partition_probability
148
 
149
  def get_latency(self) -> float:
150
  """Get network latency in milliseconds"""
151
  # Base latency with occasional spikes
152
- latency = self.latency_ms + random.gauss(0, self.latency_ms * 0.2)
153
- if random.random() < 0.05: # 5% chance of spike
154
- latency *= random.uniform(2, 10)
155
  return max(1.0, latency)
156
 
157
  def get_bandwidth(self) -> float:
158
  """Get available bandwidth in Mbps"""
159
  # Bandwidth varies with usage and conditions
160
- usage_factor = random.uniform(0.3, 0.9)
161
- condition_factor = random.uniform(0.8, 1.2)
162
  return self.bandwidth_mbps * usage_factor * condition_factor
163
 
164
 
165
  def generate_failure_scenario(config: Dict[str, Any]) -> Dict[str, Any]:
166
  """Generate a random failure scenario based on config"""
167
  scenario = {
168
- "type": random.choice(["crashloop", "oom", "node_failure", "cascade"]),
169
- "severity": random.uniform(0.3, 0.9),
170
- "duration": random.randint(30, 300), # seconds
171
  "affected_components": []
172
  }
173
 
@@ -186,5 +186,5 @@ def generate_failure_scenario(config: Dict[str, Any]) -> Dict[str, Any]:
186
 
187
  def apply_realistic_noise(value: float, noise_percent: float = 10.0) -> float:
188
  """Apply realistic noise to a value"""
189
- noise = random.gauss(0, value * (noise_percent / 100.0))
190
  return max(0, value + noise)
 
4
  Makes the simulation feel realistic and non-deterministic in the right ways.
5
  """
6
 
7
+ import numpy as np
8
  import math
9
  from typing import Dict, List, Any, Optional
10
  from datetime import datetime, timedelta
11
 
12
 
13
+ _RNG = np.random.default_rng()
14
+
15
+
16
+ def set_random_seed(seed: Optional[int]) -> None:
17
+ """Set module-level RNG seed for deterministic utility behavior."""
18
+ global _RNG
19
+ _RNG = np.random.default_rng(seed)
20
+
21
+
22
  class ProbabilityHelpers:
23
  """Helpers for generating realistic probabilities and distributions"""
24
 
 
26
  def weighted_random_choice(choices: List[Any], weights: List[float]) -> Any:
27
  """Make a weighted random choice"""
28
  if not choices or not weights or len(choices) != len(weights):
29
+ return _RNG.choice(choices) if choices else None
30
 
31
  # Normalize weights
32
  total_weight = sum(weights)
33
  if total_weight == 0:
34
+ return _RNG.choice(choices)
35
 
36
  normalized_weights = [w / total_weight for w in weights]
37
+ return _RNG.choice(choices, p=normalized_weights)
 
 
 
 
 
 
 
 
38
 
39
  @staticmethod
40
  def exponential_backoff(attempt: int, base_delay: float = 1.0, max_delay: float = 60.0) -> float:
 
45
  @staticmethod
46
  def poisson_arrival_rate(lambda_rate: float, time_window: float) -> int:
47
  """Generate number of events in time window using Poisson distribution"""
48
+ return int(_RNG.poisson(max(lambda_rate * time_window, 0)))
 
49
 
50
  @staticmethod
51
  def failure_probability_over_time(base_rate: float, time_elapsed: float,
 
57
  @staticmethod
58
  def random_failure_rate(min_rate: float = 0.1, max_rate: float = 0.9) -> float:
59
  """Generate a random failure rate within bounds"""
60
+ return float(_RNG.uniform(min_rate, max_rate))
61
 
62
 
63
  class LatencySimulator:
 
75
  """Get simulated latency in milliseconds"""
76
  # Base latency + load-dependent component + random jitter
77
  load_latency = self.base_latency_ms * (self.load_factor - 1.0) * 2
78
+ jitter = float(_RNG.normal(0, self.base_latency_ms * 0.1))
79
  latency = self.base_latency_ms + max(0, load_latency) + jitter
80
  return max(1.0, latency) # Minimum 1ms latency
81
 
 
83
  spike_multiplier: float = 5.0) -> float:
84
  """Get latency with occasional spikes"""
85
  latency = self.get_latency()
86
+ if float(_RNG.random()) < spike_probability:
87
  latency *= spike_multiplier
88
  return latency
89
 
 
92
  """Simulates realistic CPU and memory usage patterns"""
93
 
94
  def __init__(self):
95
+ self.time_offset = float(_RNG.uniform(0, 2 * math.pi))
96
 
97
  def get_cpu_usage(self, base_usage: float = 0.3,
98
  variation: float = 0.2) -> float:
 
102
  daily_pattern = 0.5 * math.sin(2 * math.pi * time_factor / 24) + 0.5
103
 
104
  usage = base_usage + variation * daily_pattern
105
+ usage += float(_RNG.normal(0, 0.05)) # Noise
106
  return max(0.0, min(1.0, usage)) * 100 # Clamp to 0-100%
107
 
108
  def get_memory_usage(self, base_usage: float = 0.4,
 
113
  leak_factor = 0.1 * time_factor # Slow leak over week
114
 
115
  usage = base_usage + leak_factor
116
+ usage += float(_RNG.normal(0, 0.03)) # Noise
117
  return max(0.0, min(1.0, usage)) * 100 # Clamp to 0-100%
118
 
119
  def get_resource_curve(self, resource_type: str,
 
121
  """Get resource usage following a specific curve"""
122
  if resource_type == "cpu":
123
  # CPU: periodic with bursts
124
+ return 0.3 + 0.4 * math.sin(time_elapsed / 100) + 0.2 * float(_RNG.random())
125
  elif resource_type == "memory":
126
  # Memory: gradual increase with occasional GC drops
127
  base = 0.2 + 0.6 * (1 - math.exp(-time_elapsed / 1000))
128
+ gc_drop = 0.3 if float(_RNG.random()) < 0.01 else 0 # Occasional GC
129
  return max(0, base - gc_drop)
130
  elif resource_type == "disk":
131
  # Disk: steady growth
 
144
 
145
  def simulate_partition(self) -> bool:
146
  """Return True if network partition is simulated"""
147
+ return float(_RNG.random()) < self.partition_probability
148
 
149
  def get_latency(self) -> float:
150
  """Get network latency in milliseconds"""
151
  # Base latency with occasional spikes
152
+ latency = self.latency_ms + float(_RNG.normal(0, self.latency_ms * 0.2))
153
+ if float(_RNG.random()) < 0.05: # 5% chance of spike
154
+ latency *= float(_RNG.uniform(2, 10))
155
  return max(1.0, latency)
156
 
157
  def get_bandwidth(self) -> float:
158
  """Get available bandwidth in Mbps"""
159
  # Bandwidth varies with usage and conditions
160
+ usage_factor = float(_RNG.uniform(0.3, 0.9))
161
+ condition_factor = float(_RNG.uniform(0.8, 1.2))
162
  return self.bandwidth_mbps * usage_factor * condition_factor
163
 
164
 
165
  def generate_failure_scenario(config: Dict[str, Any]) -> Dict[str, Any]:
166
  """Generate a random failure scenario based on config"""
167
  scenario = {
168
+ "type": str(_RNG.choice(["crashloop", "oom", "node_failure", "cascade"])),
169
+ "severity": float(_RNG.uniform(0.3, 0.9)),
170
+ "duration": int(_RNG.integers(30, 301)), # seconds
171
  "affected_components": []
172
  }
173
 
 
186
 
187
  def apply_realistic_noise(value: float, noise_percent: float = 10.0) -> float:
188
  """Apply realistic noise to a value"""
189
+ noise = float(_RNG.normal(0, value * (noise_percent / 100.0)))
190
  return max(0, value + noise)