arisu04 commited on
Commit
6cc6670
·
1 Parent(s): c8daa82

feat: Implement COEnv Executor and Graders for task management

Browse files

- Added `executor.py` to handle action execution and manage task states.
- Created grading modules for different tasks: `grader_autoscaling.py`, `grader_incident.py`, and `grader_pod_recovery.py`.
- Introduced Pydantic models in `models.py` for structured data representation.
- Removed outdated `requirements.txt`.
- Developed task implementations for autoscaling, incident handling, and pod recovery in `tasks` directory.
- Added utility functions in `utils.py` for realistic simulation of network and resource conditions.
- Implemented action validation in `validator.py` to ensure action integrity before execution.
- Created `worker.py` to manage episode execution and integrate with the executor and validator.
- Added tests for environment initialization in `test_environment.py`.

.opencode/agents/executor-agent.ts ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Agent } from "@opencode-ai/plugin"
2
+
3
+ export const ExecutorAgent: Agent = {
4
+ name: "executor",
5
+ description: "Handles general tasks",
6
+
7
+ async run(ctx) {
8
+ return `Executing task: ${ctx.input}`
9
+ }
10
+ }
11
+
.opencode/agents/memory-agent.ts ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Agent } from "@opencode-ai/plugin"
2
+
3
+ export const MemoryAgent: Agent = {
4
+ name: "memory",
5
+ description: "Handles memory storage and retrieval",
6
+
7
+ async run(ctx) {
8
+ const input = ctx.input
9
+
10
+ if (input.includes("remember")) {
11
+ return "Saved to memory"
12
+ }
13
+
14
+ if (input.includes("recall")) {
15
+ return "Here is your memory"
16
+ }
17
+
18
+ return null
19
+ }
20
+ }
.opencode/agents/planner-agent.ts ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Agent } from "@opencode-ai/plugin"
2
+
3
+ export const PlannerAgent: Agent = {
4
+ name: "planner",
5
+ description: "Decides which agent to call",
6
+
7
+ async run(ctx) {
8
+ const input = ctx.input
9
+
10
+ if (input.includes("remember")) {
11
+ return ctx.call("memory-agent", input)
12
+ }
13
+
14
+ if (input.includes("search")) {
15
+ return ctx.call("web-agent", input)
16
+ }
17
+
18
+ return ctx.call("executor-agent", input)
19
+ }
20
+ }
.opencode/agents/web-agent.ts ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import { Agent } from "@opencode-ai/plugin"
2
+
3
+ export const WebAgent: Agent = {
4
+ name: "web",
5
+ description: "Fetches web data",
6
+
7
+ async run(ctx) {
8
+ return `Searching web for: ${ctx.input}`
9
+ }
10
+ }
COEnv_Project_Documentation.md ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # COEnv — Project Documentation
2
+ ### Meta × Hugging Face OpenEnv RL Hackathon
3
+
4
+ ---
5
+
6
+ ## Table of Contents
7
+
8
+ 1. [What Is This Project?](#1-what-is-this-project)
9
+ 2. [Why Kubernetes?](#2-why-kubernetes)
10
+ 3. [How It Works — The Big Picture](#3-how-it-works--the-big-picture)
11
+ 4. [The Three Layers Explained](#4-the-three-layers-explained)
12
+ 5. [Team Ownership](#5-team-ownership)
13
+ 6. [Full Project Directory Structure](#6-full-project-directory-structure)
14
+ 7. [The Three Tasks (Easy → Medium → Hard)](#7-the-three-tasks-easy--medium--hard)
15
+ 8. [Reward & Grading Design](#8-reward--grading-design)
16
+ 9. [The Complete Episode Flow](#9-the-complete-episode-flow)
17
+ 10. [OpenEnv Spec Compliance Checklist](#10-openenv-spec-compliance-checklist)
18
+ 11. [Submission Checklist](#11-submission-checklist)
19
+ 12. [Key Technical Decisions](#12-key-technical-decisions)
20
+
21
+ ---
22
+
23
+ ## 1. What Is This Project?
24
+
25
+ **COEnv** is a Reinforcement Learning environment that simulates real-world Kubernetes cluster operations. An AI agent (LLM) is placed inside a broken or degraded Kubernetes cluster and must figure out the right sequence of operations to fix it — just like a real Site Reliability Engineer (SRE) would.
26
+
27
+ This is built for the **Meta × Hugging Face OpenEnv RL Hackathon**, which requires:
28
+ - A real-world task simulation (not games or toys)
29
+ - Full OpenEnv interface implementation (`step()`, `reset()`, `state()`)
30
+ - At least 3 tasks with programmatic graders (easy → medium → hard)
31
+ - A meaningful reward function that gives partial credit throughout the episode
32
+ - A working `inference.py` that runs an LLM agent and logs structured output
33
+ - Deployment on Hugging Face Spaces with a working Dockerfile
34
+
35
+ **In simple terms:** We fake a Kubernetes cluster in Python memory, break it in specific ways, and then let an LLM try to fix it step by step — scoring it on how well it does.
36
+
37
+ ---
38
+
39
+ ## 2. Why Kubernetes?
40
+
41
+ Kubernetes (k8s) is the industry-standard container orchestration system used by virtually every tech company running production software. Managing it is genuinely difficult and is a daily job for SREs and DevOps engineers worldwide.
42
+
43
+ **Why it's a perfect RL environment:**
44
+
45
+ | RL Concept | Kubernetes Equivalent |
46
+ |---|---|
47
+ | State | Cluster state (pod statuses, node health, resource usage) |
48
+ | Action | kubectl commands (scale, patch, delete, restart) |
49
+ | Reward | How close the cluster is to a healthy target state |
50
+ | Episode | One incident recovery scenario |
51
+ | Done | All SLOs restored / all pods healthy |
52
+
53
+ **Why it's novel for OpenEnv:** None of Meta's reference environments (calendar, REPL, browser, CARLA, reasoning gym) touch infrastructure operations. This fills a real gap.
54
+
55
+ **Why it's practical:** Companies would immediately use an environment like this to train or evaluate agents that assist SREs — the real-world utility score (30% of judging) is very high.
56
+
57
+ ---
58
+
59
+ ## 3. How It Works — The Big Picture
60
+
61
+ Think of the project as three concentric layers:
62
+
63
+ ```
64
+ ┌─────────────────────────────────────────────────────────┐
65
+ │ LAYER 1 — RL ENVIRONMENT │
66
+ │ inference.py ←→ main.py (FastAPI) ←→ tasks/graders │
67
+ │ (Sandeep) │
68
+ ├─────────────────────────────────────────────────────────┤
69
+ │ LAYER 2 — SIMULATION ENGINE │
70
+ │ world.py ←→ models.py ←→ conditions/ │
71
+ │ (You) │
72
+ ├─────────────────────────────────────────────────────────┤
73
+ │ LAYER 3 — ACTION SPACE │
74
+ │ worker.py ←→ executor.py ←→ actions/ ←→ validator│
75
+ │ (Third Person) │
76
+ └─────────────────────────────────────────────────────────┘
77
+ ```
78
+
79
+ **Layer 1 (Sandeep)** is what the judges see — the API endpoints, the inference script, the task definitions, the graders, the README.
80
+
81
+ **Layer 2 (You)** is the fake Kubernetes cluster. It holds the state of the cluster, knows how pods transition between statuses, and can inject failures. Everything sits in Python dictionaries — no real Kubernetes cluster runs.
82
+
83
+ **Layer 3 (Third Person)** is the action space — the specific operations the LLM agent is allowed to perform, and the validation/execution bridge that translates those actions into state changes in the simulator.
84
+
85
+ ---
86
+
87
+ ## 4. The Three Layers Explained
88
+
89
+ ### Layer 1 — RL Environment (Sandeep)
90
+
91
+ This layer is the **public contract** of the project. It's what OpenEnv's `validate` command checks, what the judges' scripts call, and what the LLM agent talks to.
92
+
93
+ **`main.py` — FastAPI application**
94
+
95
+ The central API server. It exposes exactly three mandatory endpoints:
96
+
97
+ - `POST /reset` — Starts a new episode. Sets up a broken cluster using one of the condition injectors. Returns the initial `ClusterObservation` (what the agent sees first).
98
+ - `POST /step` — Receives an action from the agent. Validates it, executes it on the simulated cluster, advances time by one tick, and returns the new observation + reward + done flag + info.
99
+ - `GET /state` — Returns the full current cluster state. Used for debugging and grading.
100
+
101
+ **`inference.py` — LLM agent runner**
102
+
103
+ This is the script the hackathon validators actually run. It:
104
+ 1. Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from environment variables
105
+ 2. Calls `/reset` to start an episode
106
+ 3. Feeds the observation to the LLM using the OpenAI client
107
+ 4. Parses the LLM's response as a structured action
108
+ 5. Calls `/step` with that action
109
+ 6. Prints structured stdout logs after every step:
110
+ ```
111
+ [START] task=pod-recovery env=coenv model=Qwen3-VL-30B
112
+ [STEP] step=1 action=delete_pod('frontend-7d9f-xkp2') reward=0.20 done=false error=null
113
+ [STEP] step=2 action=scale('frontend',3) reward=0.60 done=false error=null
114
+ [END] success=true steps=2 rewards=0.20,0.60
115
+ ```
116
+ 7. Repeats until `done=true` or `max_steps` is reached
117
+
118
+ **`openenv.yaml` — Spec metadata**
119
+
120
+ Required for `openenv validate` to pass. Contains:
121
+ - Environment name, version, description
122
+ - List of task IDs with difficulty labels
123
+ - References to the action schema and observation schema
124
+
125
+ **`classes/tasks/` — Task definitions**
126
+
127
+ Three Python files, each defining one task:
128
+ - What the broken state looks like (which condition to inject)
129
+ - What the agent's objective is (in plain English, passed to the LLM as a prompt)
130
+ - What counts as success
131
+ - Maximum number of steps allowed
132
+
133
+ **`classes/graders/` — Reward graders**
134
+
135
+ Three Python files, each implementing a `grade(world_state) -> float` function. Graders must be fully deterministic — same world state always returns same score. They implement partial credit: a grader doesn't just say "fixed or not fixed" but scores partial progress (e.g., 2 out of 5 pods fixed = 0.4).
136
+
137
+ **`Dockerfile`**
138
+
139
+ Single-stage Python container. Installs `requirements.txt`, copies the project, exposes port 8000, runs `uvicorn main:app`. Must build and run cleanly — this is a hard pass/fail gate.
140
+
141
+ **`README.md`**
142
+
143
+ Mandatory documentation. Must include: environment overview, motivation, action space definition, observation space definition, task descriptions with difficulty labels, setup instructions, baseline scores table.
144
+
145
+ ---
146
+
147
+ ### Layer 2 — Simulation Engine (You)
148
+
149
+ This is the **most important layer technically**. It's what makes the environment believable. Since we cannot run a real Kubernetes cluster inside a 2 vCPU / 8 GB HF Space container, the entire cluster is simulated as an in-memory Python object.
150
+
151
+ **`classes/world.py` — The cluster simulator**
152
+
153
+ This is the brain of the project. It maintains the complete cluster state as a Python dictionary, structured like a real Kubernetes API response:
154
+
155
+ ```python
156
+ cluster_state = {
157
+ "nodes": [
158
+ {"name": "node-1", "status": "Ready", "cpu_capacity": 4, "mem_capacity": 8192},
159
+ {"name": "node-2", "status": "NotReady", "cpu_capacity": 4, "mem_capacity": 8192}
160
+ ],
161
+ "deployments": [
162
+ {"name": "frontend", "desired_replicas": 3, "available_replicas": 1, "image": "nginx:1.21"}
163
+ ],
164
+ "pods": [
165
+ {"name": "frontend-7d9f-xkp2", "status": "CrashLoopBackOff", "node": "node-1", "restarts": 7},
166
+ {"name": "frontend-7d9f-ab3c", "status": "Running", "node": "node-1", "restarts": 0},
167
+ {"name": "frontend-7d9f-mn8x", "status": "Pending", "node": None, "restarts": 0}
168
+ ],
169
+ "services": [...],
170
+ "configmaps": [...],
171
+ "hpa": [...]
172
+ }
173
+ ```
174
+
175
+ Key methods:
176
+ - `reset(condition)` — Wipes state, injects a failure condition, returns initial observation
177
+ - `get_pods(namespace, selector)` — Returns filtered pod list (mimics `kubectl get pods`)
178
+ - `apply_patch(resource_type, name, patch)` — Applies a patch to a resource
179
+ - `scale(deployment_name, replicas)` — Changes replica count
180
+ - `delete_pod(pod_name)` — Removes a pod (it gets recreated by the deployment controller on next tick)
181
+ - `tick()` — Advances simulated time by one step. Pods in `CrashLoopBackOff` increment their restart counter. Pending pods on ready nodes eventually transition to `Running`. Dead nodes stay dead unless drained.
182
+ - `get_observation()` — Serialises the current state into a `ClusterObservation` Pydantic model
183
+
184
+ **`classes/models.py` — Pydantic typed models**
185
+
186
+ All data structures are defined here. This is mandatory for OpenEnv spec compliance — typed models enforce the action/observation contract.
187
+
188
+ ```python
189
+ class PodStatus(BaseModel):
190
+ name: str
191
+ status: Literal["Running", "Pending", "CrashLoopBackOff", "OOMKilled", "Terminating", "Unknown"]
192
+ node: Optional[str]
193
+ restarts: int
194
+ cpu_usage: float
195
+ mem_usage: float
196
+
197
+ class NodeStatus(BaseModel):
198
+ name: str
199
+ status: Literal["Ready", "NotReady", "SchedulingDisabled"]
200
+ cpu_capacity: float
201
+ mem_capacity: float
202
+ cpu_usage: float
203
+ mem_usage: float
204
+
205
+ class ClusterObservation(BaseModel):
206
+ nodes: List[NodeStatus]
207
+ pods: List[PodStatus]
208
+ deployments: List[DeploymentStatus]
209
+ services: List[ServiceStatus]
210
+ events: List[ClusterEvent] # recent k8s events (error messages, warnings)
211
+ step: int
212
+ objective: str # plain English description of what to fix
213
+
214
+ class RewardSignal(BaseModel):
215
+ reward: float # 0.0 to 1.0 incremental reward this step
216
+ cumulative: float # total reward so far
217
+ done: bool
218
+ info: Dict[str, Any] # breakdown: why this reward was given
219
+ ```
220
+
221
+ **`classes/conditions/` — Failure injectors**
222
+
223
+ Each condition is a Python class with a single `inject(cluster_state) -> cluster_state` method that takes a healthy cluster and returns a broken one. This is how each task starts with a specific failure scenario:
224
+
225
+ - `crash_loop.py` — Sets 3 pods to `CrashLoopBackOff` with high restart counts. Simulates a bad image tag or missing environment variable.
226
+ - `oom_kill.py` — Sets pods to `OOMKilled`. Memory limits are set too low in the deployment spec. Pods keep restarting.
227
+ - `node_failure.py` — Sets one node to `NotReady`. All pods on that node go to `Unknown`. New pods are `Pending` (no space to schedule).
228
+ - `cascade_failure.py` — Combines multiple failures: one OOMKilled service causes downstream 503s in two dependent services, creating a cascading failure across 3 deployments.
229
+
230
+ **`classes/utils.py` — Probability and simulation helpers**
231
+
232
+ Utility functions that make the simulation feel realistic:
233
+ - `sample_cpu_usage(base_load, noise_factor)` — Returns a slightly randomised CPU % (real clusters are never exactly at baseline)
234
+ - `sample_latency(healthy_latency, degradation_factor)` — Simulates p95 request latency under load
235
+ - `should_pod_recover(restarts, backoff_seconds)` — Determines if a `CrashLoopBackOff` pod would naturally recover (it usually won't — that's the point)
236
+ - `generate_cluster_events(pod_list)` — Creates realistic k8s event messages like `"Back-off restarting failed container"` or `"OOMKilled: container exceeded memory limit"`
237
+
238
+ **`config.json` — Cluster defaults**
239
+
240
+ Single source of truth for all simulation parameters:
241
+
242
+ ```json
243
+ {
244
+ "cluster": {
245
+ "num_nodes": 3,
246
+ "cpu_per_node": 4,
247
+ "mem_per_node_gb": 8
248
+ },
249
+ "tasks": {
250
+ "pod_recovery": { "max_steps": 15, "success_threshold": 0.9 },
251
+ "autoscaling": { "max_steps": 20, "success_threshold": 0.85 },
252
+ "incident": { "max_steps": 30, "success_threshold": 0.80 }
253
+ },
254
+ "simulation": {
255
+ "tick_interval_seconds": 30,
256
+ "crash_backoff_max_seconds": 300,
257
+ "hpa_cooldown_seconds": 180
258
+ }
259
+ }
260
+ ```
261
+
262
+ ---
263
+
264
+ ### Layer 3 — Action Space & Workers (Third Person)
265
+
266
+ This layer defines what the LLM is allowed to do, makes sure it's valid, and executes it against the simulator.
267
+
268
+ **`classes/actions/` — Typed action definitions**
269
+
270
+ Each action is a Pydantic model. The LLM must output one of these (Sandeep's inference.py prompts it to respond in JSON matching one of these schemas):
271
+
272
+ ```python
273
+ class ScaleAction(BaseModel):
274
+ action_type: Literal["scale"]
275
+ deployment: str # e.g. "frontend"
276
+ replicas: int # e.g. 3
277
+
278
+ class DeletePodAction(BaseModel):
279
+ action_type: Literal["delete_pod"]
280
+ pod_name: str # e.g. "frontend-7d9f-xkp2"
281
+
282
+ class PatchAction(BaseModel):
283
+ action_type: Literal["patch"]
284
+ resource_type: str # "deployment" | "configmap" | "service"
285
+ name: str
286
+ patch: Dict[str, Any] # the fields to update
287
+
288
+ class RolloutRestartAction(BaseModel):
289
+ action_type: Literal["rollout_restart"]
290
+ deployment: str
291
+
292
+ class SetHPAAction(BaseModel):
293
+ action_type: Literal["set_hpa"]
294
+ deployment: str
295
+ min_replicas: int
296
+ max_replicas: int
297
+ cpu_target_percent: int
298
+
299
+ class DrainNodeAction(BaseModel):
300
+ action_type: Literal["drain_node"]
301
+ node_name: str
302
+
303
+ class DescribeAction(BaseModel):
304
+ action_type: Literal["describe"]
305
+ resource_type: str
306
+ name: str # "investigation" action — no state change, returns detail
307
+ ```
308
+
309
+ **`classes/validator.py` — Action validation**
310
+
311
+ Before any action touches the world state, the validator checks it:
312
+ - Does the target resource exist? (Can't delete a pod that doesn't exist)
313
+ - Is the scale value sane? (Can't scale to 0 or to 1000 replicas)
314
+ - Is the node already drained? (Can't drain twice)
315
+ - Is the deployment name a real deployment?
316
+
317
+ If validation fails, it returns an error string. This flows directly into the `[STEP] error=` field in stdout logs. The step still counts against the agent's limit — bad actions are penalised by wasting steps.
318
+
319
+ **`classes/executor.py` — Action execution bridge**
320
+
321
+ Maps each validated action type to the correct `world.py` method call:
322
+
323
+ ```python
324
+ def execute(action: KubeAction, world: World) -> ExecutionResult:
325
+ if action.action_type == "scale":
326
+ world.scale(action.deployment, action.replicas)
327
+ elif action.action_type == "delete_pod":
328
+ world.delete_pod(action.pod_name)
329
+ elif action.action_type == "rollout_restart":
330
+ world.rollout_restart(action.deployment)
331
+ ...
332
+ world.tick() # always advance time after an action
333
+ return ExecutionResult(observation=world.get_observation(), ...)
334
+ ```
335
+
336
+ **`classes/worker.py` — Agent episode loop**
337
+
338
+ Manages the full lifecycle of a single episode. Sandeep's `inference.py` calls this:
339
+
340
+ ```python
341
+ class Worker:
342
+ def run_episode(self, task_id, world, max_steps) -> EpisodeResult:
343
+ obs = world.reset(task=task_id)
344
+ rewards = []
345
+ for step in range(1, max_steps + 1):
346
+ action = self.get_action(obs) # calls LLM
347
+ result = executor.execute(action, world)
348
+ rewards.append(result.reward)
349
+ if result.done:
350
+ break
351
+ return EpisodeResult(rewards=rewards, steps=step, success=result.done)
352
+ ```
353
+
354
+ ---
355
+
356
+ ## 5. Team Ownership
357
+
358
+ | Module | Owner | Why It's Their Responsibility |
359
+ |---|---|---|
360
+ | `main.py` | Sandeep | He owns the public API contract |
361
+ | `inference.py` | Sandeep | He owns the hackathon submission script |
362
+ | `openenv.yaml` | Sandeep | He owns spec compliance |
363
+ | `Dockerfile` | Sandeep | He owns deployment |
364
+ | `README.md` | Sandeep | He owns documentation |
365
+ | `classes/tasks/` | Sandeep | He defines what success looks like |
366
+ | `classes/graders/` | Sandeep | He owns the scoring logic |
367
+ | `classes/world.py` | You | You own the cluster simulator |
368
+ | `classes/models.py` | You | You own all typed data models |
369
+ | `classes/utils.py` | You | You own simulation helpers |
370
+ | `classes/conditions/` | You | You own failure injection |
371
+ | `config.json` | You | You own all parameters |
372
+ | `classes/worker.py` | Third person | They own the episode loop |
373
+ | `classes/actions/` | Third person | They own the action space |
374
+ | `classes/executor.py` | Third person | They own action execution |
375
+ | `classes/validator.py` | Third person | They own action validation |
376
+ | `tests/` | All three | Each writes tests for their own module |
377
+
378
+ ---
379
+
380
+ ## 6. Full Project Directory Structure
381
+
382
+ ```text
383
+ COEnv/
384
+ ├── .dockerignore # Docker build exclusions
385
+ ├── __init__.py # Module exports
386
+ ├── README.md # Project documentation
387
+ ├── openenv.yaml # OpenEnv manifest
388
+ ├── pyproject.toml # Project metadata and dependencies
389
+ ├── uv.lock # Locked dependencies
390
+ ├── client.py # CoenvEnv client / inference-side runner
391
+ ├── models.py # Shared action and observation models
392
+ ├── config.json # Cluster defaults and simulation params
393
+ ├── mkdocs.yml # Docs site configuration
394
+ ├── tests/ # End-to-end and unit tests
395
+ │ ├── test_environment.py # From test_world.py
396
+ │ ├── test_conditions.py # From test_conditions.py
397
+ │ ├── test_models.py # From test_models.py
398
+ │ ├── test_actions.py # From test_actions.py
399
+ │ ├── test_executor.py # From test_executor.py
400
+ │ ├── test_graders.py # From test_graders.py
401
+ │ ├── test_tasks.py # From test_tasks.py
402
+ │ └── test_integration.py # End-to-end reset→step→state flow
403
+ └── server/
404
+ ├── __init__.py # Server module exports
405
+ ├── COEnv_environment.py # Core environment logic
406
+ ├── app.py # FastAPI app exposing /reset /step /state
407
+ ├── Dockerfile # Container image definition
408
+ ├── utils.py # Simulation helpers
409
+ ├── validator.py # Action validation
410
+ ├── executor.py # Action execution bridge
411
+ ├── worker.py # Episode loop manager
412
+ ├── tasks/
413
+ │ ├── __init__.py
414
+ │ ├── task_pod_recovery.py
415
+ │ ├── task_autoscaling.py
416
+ │ └── task_incident.py
417
+ ├── graders/
418
+ │ ├── __init__.py
419
+ │ ├── grader_pod_recovery.py
420
+ │ ├── grader_autoscaling.py
421
+ │ └── grader_incident.py
422
+ ├── conditions/
423
+ │ ├── __init__.py
424
+ │ ├── crash_loop.py
425
+ │ ├── oom_kill.py
426
+ │ ├── node_failure.py
427
+ │ └── cascade_failure.py
428
+ └── actions/
429
+ ├── __init__.py
430
+ ├── scale_action.py
431
+ ├── patch_action.py
432
+ ├── delete_pod_action.py
433
+ ├── rollout_action.py
434
+ ├── hpa_action.py
435
+ ├── drain_action.py
436
+ └── describe_action.py
437
+ ```
438
+
439
+ ---
440
+
441
+ ## 7. The Three Tasks (Easy → Medium → Hard)
442
+
443
+ ### Task 1 — Pod Recovery (Easy)
444
+
445
+ **What's broken:** A frontend deployment has 3 pods stuck in `CrashLoopBackOff`. The restart count is climbing. The root cause is a wrong environment variable in the deployment spec pointing to a database host that doesn't exist.
446
+
447
+ **What the agent must do:**
448
+ 1. Observe the broken pods and read the k8s events (which mention a connection refused error)
449
+ 2. Identify the bad `DB_HOST` environment variable using a `describe` or `patch` inspect action
450
+ 3. Patch the deployment with the correct `DB_HOST` value
451
+ 4. Optionally delete the crash-looping pods to speed up recovery (they'll get recreated with the new config)
452
+ 5. Verify all 3 pods reach `Running` state
453
+
454
+ **Objective string shown to agent:** *"The frontend deployment is crash-looping. Diagnose and fix the root cause so that all pods reach Running state."*
455
+
456
+ **Max steps:** 15
457
+ **Success threshold:** All 3 pods in `Running` state (score ≥ 0.9)
458
+
459
+ **Partial rewards:**
460
+ - +0.1 for each pod that stops crash-looping
461
+ - +0.2 for correctly patching the environment variable
462
+ - +0.3 bonus for all pods Running within 10 steps
463
+
464
+ ---
465
+
466
+ ### Task 2 — HPA Autoscaling Under Traffic Spike (Medium)
467
+
468
+ **What's broken:** The cluster is healthy but receiving 10× normal traffic. The deployment has no HPA configured, is running on fixed 2 replicas, and is already at 95% CPU. Request latency is climbing past the SLO threshold.
469
+
470
+ **What the agent must do:**
471
+ 1. Observe high CPU usage and rising latency in the observation
472
+ 2. Immediately scale up the deployment to handle current load
473
+ 3. Configure a HorizontalPodAutoscaler (HPA) with appropriate min/max replicas and CPU target
474
+ 4. Set correct CPU resource requests/limits on the deployment so HPA has a baseline to work with
475
+ 5. Verify that latency drops back below the SLO threshold
476
+
477
+ **Objective string shown to agent:** *"Traffic has spiked 10×. The api-server deployment is overloaded. Configure autoscaling and ensure p95 latency stays below 500ms."*
478
+
479
+ **Max steps:** 20
480
+ **Success threshold:** p95 latency < 500ms, HPA configured, replicas ≥ 4 (score ≥ 0.85)
481
+
482
+ **Partial rewards:**
483
+ - +0.15 for scaling up replicas immediately (within 3 steps)
484
+ - +0.20 for configuring HPA correctly
485
+ - +0.25 for latency dropping below 1000ms
486
+ - +0.30 for latency dropping below 500ms (SLO met)
487
+ - -0.10 penalty for scaling beyond 12 replicas unnecessarily (resource waste)
488
+
489
+ ---
490
+
491
+ ### Task 3 — Multi-Service Cascading Incident (Hard)
492
+
493
+ **What's broken:** The `auth-service` deployment has pods getting OOMKilled because memory limits are set 4× too low relative to actual usage. This causes the `api-gateway` to fail authentication checks and return 503s. Downstream, the `data-processor` service is also throwing errors because it depends on the gateway. Three services are degraded simultaneously.
494
+
495
+ **What the agent must do:**
496
+ 1. Identify the blast radius — which services are affected and why
497
+ 2. Investigate `auth-service` to find the OOMKill root cause (memory limits too low)
498
+ 3. Patch `auth-service` deployment with correct memory limits
499
+ 4. Rollout restart `auth-service` so new pods come up with correct limits
500
+ 5. Drain the partially-failed node where most OOMKilled pods were running, to force clean rescheduling
501
+ 6. Verify `api-gateway` 503 errors stop (automatically once auth recovers)
502
+ 7. Verify `data-processor` error rate drops (automatically once gateway recovers)
503
+ 8. Confirm all three services are fully healthy
504
+
505
+ **Objective string shown to agent:** *"A cascading incident has degraded auth-service, api-gateway, and data-processor. Identify the root cause and restore all three services to healthy state without data loss."*
506
+
507
+ **Max steps:** 30
508
+ **Success threshold:** All 3 services healthy, error rate < 0.1% (score ≥ 0.80)
509
+
510
+ **Partial rewards:**
511
+ - +0.10 for correctly identifying `auth-service` as the root cause (within 5 steps)
512
+ - +0.15 for patching memory limits correctly
513
+ - +0.15 for auth-service pods reaching Running
514
+ - +0.20 for api-gateway 503s stopping
515
+ - +0.20 for data-processor errors resolving
516
+ - +0.10 for draining the bad node cleanly
517
+ - -0.15 penalty for deleting services or breaking healthy components
518
+
519
+ ---
520
+
521
+ ## 8. Reward & Grading Design
522
+
523
+ The grading philosophy follows what the PS requires: reward signal over the **full trajectory**, not just at the end.
524
+
525
+ ### Reward Principles
526
+
527
+ **Partial progress is always rewarded.** If the agent fixes 1 out of 3 broken pods, it gets 1/3 of the maximum reward for that milestone — not zero.
528
+
529
+ **Speed bonus.** Fixing the issue in fewer steps earns a small bonus. This incentivises efficient reasoning.
530
+
531
+ **Waste penalty.** Unnecessary destructive actions (scaling to 0, deleting healthy pods, draining a healthy node) subtract from the reward. This teaches the agent to be surgical.
532
+
533
+ **Idempotency.** Repeating the same correct action doesn't give extra reward but doesn't penalise either (except for wasted steps).
534
+
535
+ ### Grader Implementation Pattern
536
+
537
+ Each grader implements:
538
+
539
+ ```python
540
+ def grade(world_state: dict, step: int, max_steps: int) -> float:
541
+ score = 0.0
542
+
543
+ # Milestone 1: Partial progress
544
+ running_pods = [p for p in world_state["pods"] if p["status"] == "Running"]
545
+ score += (len(running_pods) / total_expected_pods) * 0.5
546
+
547
+ # Milestone 2: Full success
548
+ if all(p["status"] == "Running" for p in world_state["pods"]):
549
+ score += 0.4
550
+
551
+ # Speed bonus
552
+ efficiency = 1.0 - (step / max_steps)
553
+ score += efficiency * 0.1
554
+
555
+ return min(score, 1.0) # always clamp to [0, 1]
556
+ ```
557
+
558
+ ---
559
+
560
+ ## 9. The Complete Episode Flow
561
+
562
+ Here is the full step-by-step flow of one complete episode, from start to finish:
563
+
564
+ ```
565
+ 1. JUDGE / VALIDATOR runs:
566
+ python inference.py
567
+
568
+ 2. inference.py reads env vars:
569
+ API_BASE_URL, MODEL_NAME, HF_TOKEN
570
+
571
+ 3. inference.py calls:
572
+ POST /reset { "task": "pod_recovery" }
573
+
574
+ 4. main.py receives /reset:
575
+ → Calls task_pod_recovery.get_condition() → crash_loop.inject(cluster_state)
576
+ → world.reset(broken_state)
577
+ → Returns ClusterObservation (3 CrashLoopBackOff pods, events, objective string)
578
+
579
+ 5. stdout prints:
580
+ [START] task=pod-recovery env=coenv model=Qwen3-30B
581
+
582
+ 6. inference.py builds LLM prompt:
583
+ "You are an SRE. Current cluster state: [observation JSON].
584
+ Objective: Fix the frontend deployment crash loop.
585
+ Respond with a JSON action from the available action types."
586
+
587
+ 7. LLM responds:
588
+ { "action_type": "describe", "resource_type": "deployment", "name": "frontend" }
589
+
590
+ 8. inference.py calls:
591
+ POST /step { action }
592
+
593
+ 9. main.py receives /step:
594
+ → validator.validate(action, world) → OK
595
+ → executor.execute(action, world)
596
+ → world.tick()
597
+ → grader.grade(world.state, step=1) → reward=0.00 (just investigating)
598
+ → Returns observation, reward=0.00, done=false, info={...}
599
+
600
+ 10. stdout prints:
601
+ [STEP] step=1 action=describe('deployment','frontend') reward=0.00 done=false error=null
602
+
603
+ 11. LLM sees deployment spec, notices DB_HOST=wrong-host.internal
604
+ LLM responds: { "action_type": "patch", "resource_type": "deployment",
605
+ "name": "frontend",
606
+ "patch": {"env": [{"name": "DB_HOST", "value": "db.prod.internal"}]} }
607
+
608
+ 12. POST /step { patch action }
609
+ → executor patches deployment in world state
610
+ → world.tick() — pods begin restarting with new config
611
+ → grader → reward=0.20 (correct patch applied)
612
+
613
+ 13. [STEP] step=2 action=patch('frontend',{env...}) reward=0.20 done=false error=null
614
+
615
+ 14. LLM responds: { "action_type": "delete_pod", "pod_name": "frontend-7d9f-xkp2" }
616
+ → world deletes pod, recreates with correct env, status → Running
617
+ → grader → reward=0.40
618
+
619
+ 15. Repeat for remaining 2 pods...
620
+
621
+ 16. All 3 pods Running. grader → reward=1.0, done=true
622
+
623
+ 17. stdout prints:
624
+ [END] success=true steps=8 rewards=0.00,0.20,0.40,0.55,0.70,0.85,0.95,1.00
625
+ ```
626
+
627
+ ---
628
+
629
+ ## 10. OpenEnv Spec Compliance Checklist
630
+
631
+ | Requirement | File | Status |
632
+ |---|---|---|
633
+ | Typed Observation model | `classes/models.py` → `ClusterObservation` | Required |
634
+ | Typed Action model | `classes/models.py` → `KubeAction` | Required |
635
+ | Typed Reward model | `classes/models.py` → `RewardSignal` | Required |
636
+ | `step(action) → (obs, reward, done, info)` | `main.py` → `POST /step` | Required |
637
+ | `reset() → initial_observation` | `main.py` → `POST /reset` | Required |
638
+ | `state() → current_state` | `main.py` → `GET /state` | Required |
639
+ | `openenv.yaml` with metadata | `openenv.yaml` | Required |
640
+ | `openenv validate` passes | Tested via pre-validation script | Required |
641
+ | Min 3 tasks | `classes/tasks/` — 3 files | Required |
642
+ | Easy → medium → hard difficulty | task_pod_recovery / task_autoscaling / task_incident | Required |
643
+ | Graders return 0.0–1.0 | `classes/graders/` — 3 graders | Required |
644
+ | Graders are deterministic | Pure functions, no randomness | Required |
645
+ | Partial reward signals | All 3 graders implement milestone scoring | Required |
646
+ | Penalise bad actions | validator.py + grader penalty terms | Required |
647
+ | `inference.py` in root | `inference.py` | Required |
648
+ | `[START]` log line | `inference.py` → `log_start()` | Required |
649
+ | `[STEP]` log per step | `inference.py` → `log_step()` | Required |
650
+ | `[END]` log always emitted | `inference.py` → `finally: log_end()` | Required |
651
+ | Reads `API_BASE_URL` with default | `inference.py` | Required |
652
+ | Reads `MODEL_NAME` with default | `inference.py` | Required |
653
+ | Reads `HF_TOKEN` (no default) | `inference.py` | Required |
654
+ | Uses OpenAI client | `from openai import OpenAI` | Required |
655
+ | `Dockerfile` builds cleanly | `Dockerfile` | Required |
656
+ | HF Space deploys and responds | Deployed on Hugging Face | Required |
657
+ | Inference runs in < 20 min | Max 30 steps × ~20s/step = ~10 min | Required |
658
+ | Runs in 2 vCPU / 8 GB RAM | Pure Python in-memory sim, no real k8s | Required |
659
+ | README with all required sections | `README.md` | Required |
660
+
661
+ ---
662
+
663
+ ## 11. Submission Checklist
664
+
665
+ Before submitting, verify all of these:
666
+
667
+ - [ ] `inference.py` is in the **root directory** (not inside `classes/`)
668
+ - [ ] `inference.py` has default values for `API_BASE_URL` and `MODEL_NAME`
669
+ - [ ] `inference.py` raises `ValueError` if `HF_TOKEN` is missing
670
+ - [ ] `[START]`, `[STEP]`, `[END]` format matches the spec **exactly** (field names, order, lowercase booleans)
671
+ - [ ] `openenv validate` passes locally
672
+ - [ ] `docker build` completes without errors
673
+ - [ ] `docker run` starts the server and responds to `GET /state`
674
+ - [ ] HF Space is in **Running** state (not Building, not Stopped)
675
+ - [ ] All 3 tasks can be reset and stepped without crashing
676
+ - [ ] All 3 graders return a float between 0.0 and 1.0
677
+ - [ ] Running `inference.py` end-to-end completes in under 20 minutes
678
+ - [ ] `README.md` includes baseline scores table
679
+ - [ ] `tests/test_integration.py` passes cleanly
680
+
681
+ ---
682
+
683
+ ## 12. Key Technical Decisions
684
+
685
+ ### Why a simulated cluster, not a real one?
686
+
687
+ Running `kind` or `minikube` inside a Hugging Face Space container with 2 vCPU / 8 GB RAM is not feasible. The Kubernetes control plane alone (etcd + apiserver + scheduler + controller-manager) consumes ~1.5–2 GB RAM before any workloads run. An in-memory Python simulator is the only viable approach within the hardware constraints. It is also faster (no scheduling latency), fully deterministic (same input = same output), and easier to test.
688
+
689
+ ### Why a constrained action space?
690
+
691
+ Free-form kubectl text strings are nearly impossible to grade deterministically. By defining ~7 typed Pydantic action models, we make the action space clear to the LLM (easier to prompt), easy to validate (Pydantic does the type checking), and easy to grade (executor calls predictable world methods). This also keeps the action space small enough that the LLM can reason about it effectively without getting lost in kubectl's hundreds of sub-commands.
692
+
693
+ ### Why FastAPI?
694
+
695
+ OpenEnv environments are expected to be HTTP servers. FastAPI gives automatic OpenAPI documentation (at `/docs`), Pydantic integration for request/response validation, async support for when we need it, and a clean decorator syntax that makes `main.py` easy to read. It is also trivial to run with `uvicorn` inside a Docker container.
696
+
697
+ ### Why partial rewards matter for the hackathon
698
+
699
+ The PS explicitly states: *"The reward function must provide feedback throughout the task trajectory, not just at completion."* Binary rewards (0 until success, then 1) are explicitly penalised in the environment design score. Our graders implement milestone-based partial rewards, which also makes the environment more useful for actual RL training — sparse rewards make training slow and unstable.
700
+
701
+ ---
702
+
703
+ *COEnv — Meta × Hugging Face OpenEnv RL Hackathon*
704
+ *Team: Sandeep (RL environment) · You (Simulation) · Third Person (Actions & Workers)*
config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_nodes": 5,
3
+ "node_cpu_capacity": 4,
4
+ "node_mem_capacity": 8192,
5
+ "pod_cpu_request": 250,
6
+ "pod_mem_request": 128,
7
+ "pod_cpu_limit": 500,
8
+ "pod_mem_limit": 256,
9
+ "crash_loop_failure_rate": 0.7,
10
+ "oom_kill_failure_rate": 0.6,
11
+ "node_failure_rate": 0.3,
12
+ "cascade_failure_probability": 0.5,
13
+ "task_timeout_values": 300
14
+ }
inference.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ COEnv Inference Script
3
+ Used by validators to run episodes with LLMs
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import argparse
10
+ import requests
11
+ from typing import Dict, Any, Optional
12
+
13
+ API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
14
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen3-30B")
15
+ HF_TOKEN = os.getenv("HF_TOKEN")
16
+
17
+
18
def _format_rewards(rewards):
    """Render rewards as the spec's comma-separated list of 2-decimal floats.

    The validator spec requires ``rewards=0.00,0.20,...`` — printing the raw
    Python list (``[0.0, 0.2]``) does not match and breaks log parsing.
    """
    return ",".join(f"{r:.2f}" for r in rewards)


def main():
    """Run a single COEnv episode against the HTTP environment server.

    Emits the validator-required ``[START]`` / ``[STEP]`` / ``[END]`` log
    lines and returns a process exit code: 0 when the episode ran to
    completion (success or step budget exhausted), 1 when the environment
    could not be reset.

    Raises:
        ValueError: if no Hugging Face token is supplied (the submission
            spec requires failing fast when ``HF_TOKEN`` is missing).
    """
    parser = argparse.ArgumentParser(description='Run COEnv inference')
    parser.add_argument('--api-base-url', type=str, default=API_BASE_URL,
                        help='Base URL for the COEnv API')
    parser.add_argument('--model-name', type=str, default=MODEL_NAME,
                        help='Name of the model to use')
    parser.add_argument('--hf-token', type=str, default=HF_TOKEN,
                        help='Hugging Face token (if needed)')
    parser.add_argument('--task-id', type=str, default='pod_recovery',
                        help='Task ID to run')
    parser.add_argument('--max-steps', type=int, default=15,
                        help='Maximum steps per episode')

    args = parser.parse_args()

    api_base_url = args.api_base_url.rstrip('/')
    model_name = args.model_name
    hf_token = args.hf_token or HF_TOKEN
    if not hf_token:
        # Spec: HF_TOKEN has no default and its absence must be a hard error.
        raise ValueError("HF_TOKEN is not set; export HF_TOKEN or pass --hf-token")
    task_id = args.task_id
    max_steps = args.max_steps

    print(f"[START] task={task_id} env=coenv model={model_name}")

    try:
        # Timeout keeps the validator from hanging forever on a dead Space.
        response = requests.post(f"{api_base_url}/reset", json={"task": task_id}, timeout=60)
        response.raise_for_status()
        observation = response.json()
    except Exception as e:
        print(f"[ERROR] Failed to reset environment: {e}")
        return 1

    total_reward = []

    for step in range(1, max_steps + 1):
        # Placeholder policy: always inspect the frontend deployment.
        action = {
            "action_type": "describe",
            "resource_type": "deployment",
            "name": "frontend"
        }
        action_str = "describe('deployment','frontend')"

        try:
            response = requests.post(f"{api_base_url}/step", json={"action": action}, timeout=60)
            response.raise_for_status()
            result = response.json()
        except Exception as e:
            print(f"[ERROR] Failed to step environment: {e}")
            print(f"[STEP] step={step} action={action_str} reward=0.00 done=false error=\"{e}\"")
            continue

        reward = result.get('reward', 0.0)
        done = result.get('done', False)
        info = result.get('info', {})

        error_str = "null"
        if info.get('error'):
            error_str = f"\"{info['error']}\""

        total_reward.append(reward)

        # Spec requires lowercase booleans in the done= field.
        print(f"[STEP] step={step} action={action_str} reward={reward:.2f} "
              f"done={str(done).lower()} error={error_str}")

        if done:
            print(f"[END] success={str(done).lower()} steps={step} "
                  f"rewards={_format_rewards(total_reward)}")
            return 0

    # Step budget exhausted without the environment signalling done.
    print(f"[END] success=false steps={max_steps} rewards={_format_rewards(total_reward)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
opencode.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "https://opencode.ai/config.json",
3
+
4
+ "agent": {
5
+ "planner": {
6
+ "entry": "./.opencode/agents/planner-agent.ts"
7
+ },
8
+ "memory": {
9
+ "entry": "./.opencode/agents/memory-agent.ts"
10
+ },
11
+ "executor": {
12
+ "entry": "./.opencode/agents/executor-agent.ts"
13
+ },
14
+ "web": {
15
+ "entry": "./.opencode/agents/web-agent.ts"
16
+ }
17
+ }
18
+ }
openenv.yaml CHANGED
@@ -1,7 +1,20 @@
1
- spec_version: 1
2
- name: COEnv
3
- type: space
4
- runtime: fastapi
5
- app: server.app:app
6
- port: 8000
7
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: coenv
2
+ version: 0.1.0
3
+ description: Kubernetes Cluster Simulator for OpenEnv RL Hackathon
4
+ entrypoint: server/app.py
5
+ inference: inference.py
6
+ endpoints:
7
+ reset: /reset
8
+ step: /step
9
+ state: /state
10
+ tasks:
11
+ - id: pod_recovery
12
+ difficulty: easy
13
+ - id: autoscaling
14
+ difficulty: medium
15
+ - id: incident
16
+ difficulty: hard
17
+ artifacts:
18
+ action_schema: server/actions/
19
+ observation_schema: server/models.py
20
+ reward_schema: server/models.py
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi>=0.100.0
2
+ uvicorn>=0.23.0
3
+ pydantic>=2.0.0
4
+ requests>=2.31.0
server/COEnv_environment.py CHANGED
@@ -1,104 +1,430 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
  """
8
- Coenv Environment Implementation.
9
-
10
- A simple test environment that echoes back messages sent to it.
11
- Perfect for testing HTTP server infrastructure.
12
  """
13
 
14
- from uuid import uuid4
15
-
16
- from openenv.core.env_server.interfaces import Environment
17
- from openenv.core.env_server.types import State
18
-
19
- try:
20
- from ..models import CoenvAction, CoenvObservation
21
- except ImportError:
22
- from models import CoenvAction, CoenvObservation
23
-
24
-
25
- class CoenvEnvironment(Environment):
26
- """
27
- A simple echo environment that echoes back messages.
28
-
29
- This environment is designed for testing the HTTP server infrastructure.
30
- It maintains minimal state and simply echoes back whatever message it receives.
31
-
32
- Example:
33
- >>> env = CoenvEnvironment()
34
- >>> obs = env.reset()
35
- >>> print(obs.echoed_message) # "Coenv environment ready!"
36
- >>>
37
- >>> obs = env.step(CoenvAction(message="Hello"))
38
- >>> print(obs.echoed_message) # "Hello"
39
- >>> print(obs.message_length) # 5
40
- """
41
-
42
- # Enable concurrent WebSocket sessions.
43
- # Set to True if your environment isolates state between instances.
44
- # When True, multiple WebSocket clients can connect simultaneously, each
45
- # getting their own environment instance (when using factory mode in app.py).
46
- SUPPORTS_CONCURRENT_SESSIONS: bool = True
47
-
48
- def __init__(self):
49
- """Initialize the COEnv environment."""
50
- self._state = State(episode_id=str(uuid4()), step_count=0)
51
- self._reset_count = 0
52
-
53
- def reset(self) -> CoenvObservation:
54
- """
55
- Reset the environment.
56
-
57
- Returns:
58
- CoenvObservation with a ready message
59
- """
60
- self._state = State(episode_id=str(uuid4()), step_count=0)
61
- self._reset_count += 1
62
-
63
- return CoenvObservation(
64
- echoed_message="Coenv environment ready!",
65
- message_length=0,
66
- done=False,
67
- reward=0.0,
68
- )
69
-
70
- def step(self, action: CoenvAction) -> CoenvObservation: # type: ignore[override]
71
- """
72
- Execute a step in the environment by echoing the message.
73
-
74
- Args:
75
- action: CoenvAction containing the message to echo
76
-
77
- Returns:
78
- CoenvObservation with the echoed message and its length
79
- """
80
- self._state.step_count += 1
81
-
82
- message = action.message
83
- length = len(message)
84
-
85
- # Simple reward: longer messages get higher rewards
86
- reward = length * 0.1
87
 
88
- return CoenvObservation(
89
- echoed_message=message,
90
- message_length=length,
91
- done=False,
92
- reward=reward,
93
- metadata={"original_message": message, "step": self._state.step_count},
94
- )
95
 
96
- @property
97
- def state(self) -> State:
98
- """
99
- Get the current environment state.
100
 
101
- Returns:
102
- Current State with episode_id and step_count
103
- """
104
- return self._state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ COEnv Environment - Cluster Simulator
3
+ In-memory dict that holds cluster state: nodes, pods, deployments, services.
4
+ Has methods like get_pods(), apply_patch(), tick() to advance time.
5
+ This is the brain of the whole project.
6
  """
7
 
8
+ from typing import Dict, List, Any, Optional, Literal
9
+ from datetime import datetime
10
+ import random
11
+ import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ from .models import (
14
+ NodeStatus, PodStatus, DeploymentStatus, ServiceStatus,
15
+ ClusterEvent, ClusterObservation, KubeAction, RewardSignal,
16
+ ConfigMapStatus, HPAStatus
17
+ )
 
 
18
 
 
 
 
 
19
 
20
class World:
    """In-memory Kubernetes cluster simulator.

    Holds the entire cluster state (nodes, pods, deployments, services,
    configmaps, HPAs) as plain dicts, exposes kubectl-style read methods
    that serialise into the Pydantic models from ``models.py``, and
    advances simulated time one step at a time via :meth:`tick`.
    This is the brain of the whole project.
    """

    # Maps an action's resource_type to the cluster_state key that holds it.
    _RESOURCE_KEYS = {
        "deployment": "deployments",
        "pod": "pods",
        "node": "nodes",
        "service": "services",
        "configmap": "configmaps",
        "hpa": "hpas",
    }

    def __init__(self, config: Dict[str, Any]):
        """Create a healthy cluster sized by *config*.

        Args:
            config: Simulator knobs (``num_nodes``, ``node_cpu_capacity``,
                pod requests/limits, ...). Missing keys fall back to
                built-in defaults.
        """
        self.config = config
        self.cluster_state = self._initialize_healthy_cluster()
        self.step_count = 0       # number of tick() calls since last reset
        self.events = []          # rolling window of ClusterEvent models
        self._event_counter = 0   # sequence number for generated event ids

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _pod_name(self, deployment_name: str) -> str:
        """Return a k8s-looking pod name, e.g. ``frontend-7421-kqzpx``."""
        suffix = "".join(chr(random.randint(97, 122)) for _ in range(5))
        return f"{deployment_name}-{random.randint(1000, 9999)}-{suffix}"

    def _make_pod(self, deployment_name: str, node_name: Optional[str],
                  status: str = "Running") -> Dict[str, Any]:
        """Build one pod dict for *deployment_name* scheduled on *node_name*.

        Single source of truth for the pod template, which was previously
        duplicated verbatim in three places (initial build, scale-up and
        tick-time re-creation).
        """
        return {
            "name": self._pod_name(deployment_name),
            "status": status,
            "node": node_name,
            "restarts": 0,
            "cpu_request": self.config.get("pod_cpu_request", 500),
            "mem_request": self.config.get("pod_mem_request", 256),
            "cpu_limit": self.config.get("pod_cpu_limit", 1000),
            "mem_limit": self.config.get("pod_mem_limit", 512),
            "deployment": deployment_name,
            "last_updated": datetime.now().isoformat(),
        }

    def _initialize_healthy_cluster(self) -> Dict[str, List[Dict]]:
        """Build and return a fully healthy cluster state based on config."""
        nodes = [
            {
                "name": f"node-{i + 1}",
                "status": "Ready",
                "cpu_capacity": self.config.get("node_cpu_capacity", 4),
                "mem_capacity": self.config.get("node_mem_capacity", 8192),
                "cpu_usage": 0.0,
                "mem_usage": 0.0,
                "last_updated": datetime.now().isoformat(),
            }
            for i in range(self.config.get("num_nodes", 3))
        ]

        default_deployments = [
            {"name": "frontend", "image": "nginx:1.21", "replicas": 3},
            {"name": "backend", "image": "python:3.9", "replicas": 2},
            {"name": "database", "image": "postgres:13", "replicas": 1},
            {"name": "auth-service", "image": "auth:latest", "replicas": 2},
            {"name": "api-gateway", "image": "nginx:alpine", "replicas": 2},
        ]

        deployments = []
        pods = []
        for dep in default_deployments:
            deployments.append({
                "name": dep["name"],
                "desired_replicas": dep["replicas"],
                "available_replicas": dep["replicas"],
                "image": dep["image"],
                "last_updated": datetime.now().isoformat(),
            })
            # Spread this deployment's pods round-robin across the nodes.
            for j in range(dep["replicas"]):
                node_name = nodes[j % len(nodes)]["name"] if nodes else None
                pods.append(self._make_pod(dep["name"], node_name))

        default_services = [
            {"name": "frontend-service", "type": "ClusterIP", "ports": [{"port": 80, "targetPort": 80}]},
            {"name": "backend-service", "type": "ClusterIP", "ports": [{"port": 8080, "targetPort": 8080}]},
            {"name": "database-service", "type": "ClusterIP", "ports": [{"port": 5432, "targetPort": 5432}]},
            {"name": "auth-service-service", "type": "ClusterIP", "ports": [{"port": 8000, "targetPort": 8000}]},
        ]

        services = []
        for svc in default_services:
            services.append({
                "name": svc["name"],
                "type": svc["type"],
                "ports": svc["ports"],
                # App label is the service name without its "-service" suffix.
                "selector": {"app": svc["name"].replace("-service", "")},
                # Fake-but-unique ClusterIP derived from position in the list.
                "cluster_ip": f"10.96.{len(services) + 1}.{len(services) + 1}",
                "last_updated": datetime.now().isoformat(),
            })

        default_configmaps = [
            {"name": "frontend-config", "data": {"DB_HOST": "db.prod.internal", "DB_PORT": "5432"}},
            {"name": "backend-config", "data": {"LOG_LEVEL": "info", "CACHE_SIZE": "100"}},
            {"name": "database-config", "data": {"MAX_CONNECTIONS": "100", "TIMEOUT": "30"}},
        ]

        configmaps = [
            {
                "name": cm["name"],
                "data": cm["data"],
                "last_updated": datetime.now().isoformat(),
            }
            for cm in default_configmaps
        ]

        default_hpas = [
            {"name": "frontend-hpa", "min_replicas": 2, "max_replicas": 10, "cpu_target_percent": 70},
            {"name": "backend-hpa", "min_replicas": 1, "max_replicas": 5, "cpu_target_percent": 80},
        ]

        hpas = [
            {
                "name": hpa["name"],
                "min_replicas": hpa["min_replicas"],
                "max_replicas": hpa["max_replicas"],
                "current_replicas": hpa["min_replicas"],
                "cpu_target_percent": hpa["cpu_target_percent"],
                "last_updated": datetime.now().isoformat(),
            }
            for hpa in default_hpas
        ]

        return {
            "nodes": nodes,
            "pods": pods,
            "deployments": deployments,
            "services": services,
            "configmaps": configmaps,
            "hpas": hpas,
        }

    # ------------------------------------------------------------------
    # Read accessors (kubectl get ...)
    # ------------------------------------------------------------------

    def get_pods(self, namespace: Optional[str] = None,
                 selector: Optional[Dict[str, str]] = None) -> List[PodStatus]:
        """Return all pods as PodStatus models (mimics ``kubectl get pods``).

        NOTE(review): namespace/selector filtering is not implemented yet —
        all pods are returned regardless of the arguments.
        """
        return [PodStatus(**pod) for pod in self.cluster_state["pods"]]

    def get_nodes(self) -> List[NodeStatus]:
        """Return all nodes as NodeStatus models."""
        return [NodeStatus(**node) for node in self.cluster_state["nodes"]]

    def get_deployments(self) -> List[DeploymentStatus]:
        """Return all deployments as DeploymentStatus models."""
        return [DeploymentStatus(**dep) for dep in self.cluster_state["deployments"]]

    def get_services(self) -> List[ServiceStatus]:
        """Return all services as ServiceStatus models."""
        return [ServiceStatus(**svc) for svc in self.cluster_state["services"]]

    def get_configmaps(self) -> List[ConfigMapStatus]:
        """Return all configmaps as ConfigMapStatus models."""
        return [ConfigMapStatus(**cm) for cm in self.cluster_state["configmaps"]]

    def get_hpas(self) -> List[HPAStatus]:
        """Return all HPAs as HPAStatus models."""
        return [HPAStatus(**hpa) for hpa in self.cluster_state["hpas"]]

    def get_events(self) -> List[ClusterEvent]:
        """Return a shallow copy of the event log."""
        return self.events.copy()

    # ------------------------------------------------------------------
    # Mutations
    # ------------------------------------------------------------------

    def apply_patch(self, resource_type: str, name: str, patch: Dict[str, Any]) -> bool:
        """Shallow-merge *patch* into the named resource.

        Returns:
            True when the resource was found and patched; False for an
            unknown resource type or name.
        """
        key = self._RESOURCE_KEYS.get(resource_type)
        if key is None:
            return False
        try:
            for resource in self.cluster_state[key]:
                if resource["name"] != name:
                    continue
                resource.update(patch)
                resource["last_updated"] = datetime.now().isoformat()
                # Reconcile pods only when the *desired* replica count was
                # patched. available_replicas is derived state and must not
                # trigger scaling (the old code reconciled on either key and
                # used a no-op dep.get(k, dep[k]) fallback).
                if resource_type == "deployment" and "desired_replicas" in patch:
                    self._update_pods_for_deployment(name, resource["desired_replicas"])
                return True
            return False
        except Exception as e:
            # Best-effort: a malformed patch reports failure, never raises.
            print(f"Error applying patch: {e}")
            return False

    def _update_pods_for_deployment(self, deployment_name: str, desired_replicas: int):
        """Grow or shrink a deployment's pod set toward *desired_replicas*.

        Newly scheduled pods start Pending; tick() later promotes them to
        Running.
        """
        current_pods = [p for p in self.cluster_state["pods"]
                        if p.get("deployment") == deployment_name]
        current_count = len(current_pods)

        if desired_replicas > current_count:
            # Deployment lookup is loop-invariant — hoisted out of the loop.
            deployment = next((d for d in self.cluster_state["deployments"]
                               if d["name"] == deployment_name), None)
            if deployment is None:
                return
            nodes = self.cluster_state["nodes"]
            for i in range(desired_replicas - current_count):
                node_name = nodes[i % len(nodes)]["name"] if nodes else None
                self.cluster_state["pods"].append(
                    self._make_pod(deployment_name, node_name, status="Pending"))
        elif desired_replicas < current_count:
            # Remove the surplus pods from the tail of the current list.
            for pod in current_pods[desired_replicas:]:
                self.cluster_state["pods"].remove(pod)

    def scale(self, deployment_name: str, replicas: int) -> bool:
        """Change a deployment's replica count (``kubectl scale`` equivalent)."""
        return self.apply_patch("deployment", deployment_name,
                                {"desired_replicas": replicas})

    def delete_pod(self, pod_name: str) -> bool:
        """Delete a pod by name.

        The deployment controller re-creates it on the next tick. Returns
        False when no pod with that name exists.
        """
        for i, pod in enumerate(self.cluster_state["pods"]):
            if pod["name"] == pod_name:
                del self.cluster_state["pods"][i]
                self.events.append(ClusterEvent(
                    event_id=f"event-delpod-{random.randint(1000, 9999)}",
                    timestamp=datetime.now().isoformat(),
                    type="Normal",
                    reason="UserDeleted",
                    message=f"pod/{pod_name} deleted by user",
                    involved_object=pod_name,
                ))
                return True
        return False

    def rollout_restart(self, deployment: str) -> bool:
        """Delete every pod of *deployment* so tick() re-creates them fresh."""
        doomed = [p for p in self.cluster_state["pods"]
                  if p.get("deployment") == deployment]

        # One event per deleted pod, mirroring real rollout chatter.
        for _ in doomed:
            self.events.append(ClusterEvent(
                event_id=f"event-restart-{random.randint(1000, 9999)}",
                timestamp=datetime.now().isoformat(),
                type="Normal",
                reason="RolledOut",
                message=f"Deployment {deployment} rollout restart triggered",
                involved_object=deployment,
            ))

        self.cluster_state["pods"] = [p for p in self.cluster_state["pods"]
                                      if p.get("deployment") != deployment]
        return True

    # ------------------------------------------------------------------
    # Simulation step
    # ------------------------------------------------------------------

    def tick(self):
        """Advance simulated time by one step.

        Per step: node usage drifts randomly; pods on NotReady nodes go
        Unknown; Pending pods on Ready nodes may transition to Running;
        deployment available_replicas is recomputed; missing pods are
        re-created by the deployment controller; and a background event is
        occasionally emitted.
        """
        self.step_count += 1

        # Random drift in node resource usage, clamped to 0..100 percent.
        for node in self.cluster_state["nodes"]:
            node["cpu_usage"] = max(0, min(100, node["cpu_usage"] + random.uniform(-5, 5)))
            node["mem_usage"] = max(0, min(100, node["mem_usage"] + random.uniform(-5, 5)))
            node["last_updated"] = datetime.now().isoformat()

        # Propagate node health into pod status.
        for pod in self.cluster_state["pods"]:
            node_name = pod.get("node")
            if not node_name:
                continue
            node = next((n for n in self.cluster_state["nodes"]
                         if n["name"] == node_name), None)
            if node is None:
                continue
            if node["status"] != "Ready":
                # Pods on an unhealthy node lose contact with the kubelet.
                if pod["status"] in ("Running", "Pending"):
                    pod["status"] = "Unknown"
            elif pod["status"] == "Pending" and random.random() > 0.7:
                # ~30% chance per tick for a Pending pod to start.
                pod["status"] = "Running"
                pod["last_updated"] = datetime.now().isoformat()

        # Recompute derived deployment state from the live pod list.
        for deployment in self.cluster_state["deployments"]:
            running = [p for p in self.cluster_state["pods"]
                       if p.get("deployment") == deployment["name"]
                       and p["status"] == "Running"]
            deployment["available_replicas"] = len(running)
            deployment["last_updated"] = datetime.now().isoformat()

        # Deployment controller: re-create any missing pods (they come up
        # Running directly, unlike user-initiated scale-ups).
        nodes = self.cluster_state["nodes"]
        for deployment in self.cluster_state["deployments"]:
            desired = deployment.get("desired_replicas", 0)
            current = [p for p in self.cluster_state["pods"]
                       if p.get("deployment") == deployment["name"]]
            for i in range(desired - len(current)):
                node_name = nodes[i % len(nodes)]["name"] if nodes else None
                self.cluster_state["pods"].append(
                    self._make_pod(deployment["name"], node_name))

        # Occasionally emit a background cluster event.
        if random.random() < 0.3:
            self._generate_event()

    def _generate_event(self):
        """Append one plausible background event to the event log."""
        templates = [
            {"type": "Normal", "reason": "Scheduled", "message": "Successfully assigned node"},
            {"type": "Warning", "reason": "FailedScheduling", "message": "0/3 nodes are available: 3 Insufficient cpu."},
            {"type": "Normal", "reason": "Pulling", "message": "Pulling image \"nginx:1.21\""},
            {"type": "Normal", "reason": "Pulled", "message": "Successfully pulled image \"nginx:1.21\""},
            {"type": "Normal", "reason": "Created", "message": "Created container"},
            {"type": "Normal", "reason": "Started", "message": "Started container"},
            {"type": "Warning", "reason": "BackOff", "message": "Back-off restarting failed container"},
            {"type": "Normal", "reason": "Killing", "message": "Stopping container"},
        ]
        chosen = random.choice(templates)

        # Candidate subjects: up to three of each resource kind.
        involved_objects = [p["name"] for p in self.cluster_state["pods"][:3]]
        involved_objects += [d["name"] for d in self.cluster_state["deployments"][:3]]
        involved_objects += [n["name"] for n in self.cluster_state["nodes"][:3]]
        if not involved_objects:
            involved_objects = ["cluster"]

        self.events.append(ClusterEvent(
            event_id=f"event-{self._event_counter:04d}",
            timestamp=datetime.now().isoformat(),
            type=chosen["type"],
            reason=chosen["reason"],
            message=chosen["message"],
            involved_object=random.choice(involved_objects),
        ))
        self._event_counter += 1

        # Keep the log bounded: once it exceeds 100 entries, keep the last 50.
        if len(self.events) > 100:
            self.events = self.events[-50:]

    # ------------------------------------------------------------------
    # State export / reset
    # ------------------------------------------------------------------

    def get_full_state(self) -> Dict[str, Any]:
        """Return every resource list (as Pydantic models) plus the step counter."""
        return {
            "nodes": self.get_nodes(),
            "pods": self.get_pods(),
            "deployments": self.get_deployments(),
            "services": self.get_services(),
            "configmaps": self.get_configmaps(),
            "hpas": self.get_hpas(),
            "events": self.get_events(),
            "step": self.step_count,
        }

    def reset_to_healthy(self):
        """Rebuild a pristine cluster and clear counters and events."""
        self.cluster_state = self._initialize_healthy_cluster()
        self.step_count = 0
        self.events = []
        self._event_counter = 0

    def reset(self, condition=None):
        """Reset the world and optionally inject a failure *condition*.

        Args:
            condition: Object exposing ``inject(world)`` (a task's failure
                injector), or None for a healthy cluster.

        Returns:
            The initial ClusterObservation.
        """
        self.reset_to_healthy()
        if condition:
            condition.inject(self)
        return self.get_observation()

    def get_observation(self, objective: str = "Maintain cluster health"):
        """Serialise the current state into a ClusterObservation model."""
        observation_dict = self.get_full_state()
        observation_dict["objective"] = objective
        return ClusterObservation(**observation_dict)
server/Dockerfile DELETED
@@ -1,80 +0,0 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- # Multi-stage build using openenv-base
8
- # This Dockerfile is flexible and works for both:
9
- # - In-repo environments (with local OpenEnv sources)
10
- # - Standalone environments (with openenv from PyPI/Git)
11
- # The build script (openenv build) handles context detection and sets appropriate build args.
12
-
13
- ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
- FROM ${BASE_IMAGE} AS builder
15
-
16
- WORKDIR /app
17
-
18
- # Ensure git is available (required for installing dependencies from VCS)
19
- RUN apt-get update && \
20
- apt-get install -y --no-install-recommends git && \
21
- rm -rf /var/lib/apt/lists/*
22
-
23
- # Build argument to control whether we're building standalone or in-repo
24
- ARG BUILD_MODE=in-repo
25
- ARG ENV_NAME=COEnv
26
-
27
- # Copy environment code (always at root of build context)
28
- COPY . /app/env
29
-
30
- # For in-repo builds, openenv is already vendored in the build context
31
- # For standalone builds, openenv will be installed via pyproject.toml
32
- WORKDIR /app/env
33
-
34
- # Ensure uv is available (for local builds where base image lacks it)
35
- RUN if ! command -v uv >/dev/null 2>&1; then \
36
- curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
- mv /root/.local/bin/uv /usr/local/bin/uv && \
38
- mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
- fi
40
-
41
- # Install dependencies using uv sync
42
- # If uv.lock exists, use it; otherwise resolve on the fly
43
- RUN --mount=type=cache,target=/root/.cache/uv \
44
- if [ -f uv.lock ]; then \
45
- uv sync --frozen --no-install-project --no-editable; \
46
- else \
47
- uv sync --no-install-project --no-editable; \
48
- fi
49
-
50
- RUN --mount=type=cache,target=/root/.cache/uv \
51
- if [ -f uv.lock ]; then \
52
- uv sync --frozen --no-editable; \
53
- else \
54
- uv sync --no-editable; \
55
- fi
56
-
57
- # Final runtime stage
58
- FROM ${BASE_IMAGE}
59
-
60
- WORKDIR /app
61
-
62
- # Copy the virtual environment from builder
63
- COPY --from=builder /app/env/.venv /app/.venv
64
-
65
- # Copy the environment code
66
- COPY --from=builder /app/env /app/env
67
-
68
- # Set PATH to use the virtual environment
69
- ENV PATH="/app/.venv/bin:$PATH"
70
-
71
- # Set PYTHONPATH so imports work correctly
72
- ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
-
74
- # Health check
75
- HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
- CMD curl -f http://localhost:8000/health || exit 1
77
-
78
- # Run the FastAPI server
79
- # The module path is constructed to work with the /app/env structure
80
- CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server/__init__.py CHANGED
@@ -1,11 +1,25 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
 
7
- """Coenv environment server components."""
8
 
9
- from .COEnv_environment import CoenvEnvironment
 
 
 
 
 
 
 
 
 
10
 
11
- __all__ = ["CoenvEnvironment"]
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv - Kubernetes Cluster Simulator for OpenEnv"""
 
 
 
 
2
 
3
+ __version__ = "0.1.0"
4
 
5
+ from .COEnv_environment import World
6
+ from .models import (
7
+ ClusterObservation,
8
+ RewardSignal,
9
+ KubeAction,
10
+ PodStatus,
11
+ NodeStatus,
12
+ DeploymentStatus,
13
+ ServiceStatus
14
+ )
15
 
16
+ __all__ = [
17
+ "World",
18
+ "ClusterObservation",
19
+ "RewardSignal",
20
+ "KubeAction",
21
+ "PodStatus",
22
+ "NodeStatus",
23
+ "DeploymentStatus",
24
+ "ServiceStatus"
25
+ ]
server/actions/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Action definitions"""
2
+
3
+ __all__ = ["scale_action", "delete_pod_action", "patch_action", "rollout_action", "hpa_action", "drain_action", "describe_action"]
4
+
5
+ from .scale_action import ScaleAction
6
+ from .delete_pod_action import DeletePodAction
7
+ from .patch_action import PatchAction
8
+ from .rollout_action import RolloutRestartAction
9
+ from .hpa_action import SetHPAAction
10
+ from .drain_action import DrainNodeAction
11
+ from .describe_action import DescribeAction
12
+
13
+ __all__ += ["ScaleAction", "DeletePodAction", "PatchAction", "RolloutRestartAction", "SetHPAAction", "DrainNodeAction", "DescribeAction"]
server/actions/delete_pod_action.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Delete pod action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+
7
class DeletePodAction(BaseModel):
    """Delete a specific pod"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step).
    action_type: Literal["delete_pod"] = "delete_pod"
    pod_name: str = Field(..., description="Pod name to delete")
server/actions/describe_action.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Describe action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+
7
class DescribeAction(BaseModel):
    """Describe/get details of a resource"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step).
    # Describe is an investigation action: the /step handler performs no state
    # change for it.
    action_type: Literal["describe"] = "describe"
    resource_type: Literal["deployment", "pod", "node", "service", "configmap"] = Field(..., description="Resource type")
    name: str = Field(..., description="Resource name")
server/actions/drain_action.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Drain action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+
7
class DrainNodeAction(BaseModel):
    """Drain a node (evict all pods)"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step);
    # the handler implements drain as a node patch to "SchedulingDisabled".
    action_type: Literal["drain_node"] = "drain_node"
    node_name: str = Field(..., description="Node name to drain")
server/actions/hpa_action.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - HPA action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+
7
class SetHPAAction(BaseModel):
    """Set HorizontalPodAutoscaler for a deployment"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step).
    action_type: Literal["set_hpa"] = "set_hpa"
    deployment: str = Field(..., description="Deployment name")
    # NOTE(review): nothing enforces min_replicas <= max_replicas across
    # fields; a model-level validator would reject inverted ranges — confirm
    # whether the simulator tolerates them before adding one.
    min_replicas: int = Field(..., ge=1, le=50, description="Minimum replicas")
    max_replicas: int = Field(..., ge=1, le=100, description="Maximum replicas")
    cpu_target_percent: int = Field(..., ge=1, le=100, description="CPU target percentage")
server/actions/patch_action.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Patch action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal, Dict, Any
5
+
6
+
7
class PatchAction(BaseModel):
    """Patch a resource with specific changes"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step).
    action_type: Literal["patch"] = "patch"
    resource_type: Literal["deployment", "pod", "node", "service", "configmap"] = Field(..., description="Resource type")
    name: str = Field(..., description="Resource name")
    # Free-form field/value mapping forwarded to World.apply_patch.
    patch: Dict[str, Any] = Field(..., description="Patch to apply")
server/actions/rollout_action.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Rollout restart action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+
7
class RolloutRestartAction(BaseModel):
    """Restart a deployment rollout"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step).
    action_type: Literal["rollout_restart"] = "rollout_restart"
    deployment: str = Field(..., description="Deployment name to restart")
server/actions/scale_action.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Actions - Scale action"""
2
+
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+
7
class ScaleAction(BaseModel):
    """Scale a deployment to a specific replica count"""
    # Discriminator consumed by the action dispatcher in server/app.py (/step).
    action_type: Literal["scale"] = "scale"
    deployment: str = Field(..., description="Deployment name to scale")
    # ge=0 permits scale-to-zero; le=100 matches the HPA max_replicas cap.
    replicas: int = Field(..., ge=0, le=100, description="Number of replicas")
server/app.py CHANGED
@@ -1,84 +1,283 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
  """
8
- FastAPI application for the Coenv Environment.
 
 
9
 
10
- This module creates an HTTP server that exposes the CoenvEnvironment
11
- over HTTP and WebSocket endpoints, compatible with EnvClient.
 
 
 
 
 
12
 
13
- Endpoints:
14
- - POST /reset: Reset the environment
15
- - POST /step: Execute an action
16
- - GET /state: Get current environment state
17
- - GET /schema: Get action/observation schemas
18
- - WS /ws: WebSocket endpoint for persistent sessions
19
 
20
- Usage:
21
- # Development (with auto-reload):
22
- uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
 
24
- # Production:
25
- uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
 
 
 
26
 
27
- # Or run directly:
28
- python -m server.app
29
- """
30
 
31
- try:
32
- from openenv.core.env_server.http_server import create_app
33
- except Exception as e: # pragma: no cover
34
- raise ImportError(
35
- "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
36
- ) from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- try:
39
- from ..models import CoenvAction, CoenvObservation
40
- from .COEnv_environment import CoenvEnvironment
41
- except ModuleNotFoundError:
42
- from models import CoenvAction, CoenvObservation
43
- from server.COEnv_environment import CoenvEnvironment
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- # Create the app with web interface and README integration
47
- app = create_app(
48
- CoenvEnvironment,
49
- CoenvAction,
50
- CoenvObservation,
51
- env_name="COEnv",
52
- max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
53
- )
54
 
 
 
 
 
 
 
 
 
55
 
56
- def main(host: str = "0.0.0.0", port: int = 8000):
57
- """
58
- Entry point for direct execution via uv run or python -m.
59
 
60
- This function enables running the server without Docker:
61
- uv run --project . server
62
- uv run --project . server --port 8001
63
- python -m COEnv.server.app
 
 
 
64
 
65
- Args:
66
- host: Host address to bind to (default: "0.0.0.0")
67
- port: Port number to listen on (default: 8000)
68
 
69
- For production deployments, consider using uvicorn directly with
70
- multiple workers:
71
- uvicorn COEnv.server.app:app --workers 4
72
- """
73
- import uvicorn
74
 
75
- uvicorn.run(app, host=host, port=port)
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- if __name__ == "__main__":
79
- import argparse
80
 
81
- parser = argparse.ArgumentParser()
82
- parser.add_argument("--port", type=int, default=8000)
83
- args = parser.parse_args()
84
- main(port=args.port)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ COEnv FastAPI Application
3
+ Exposes /reset /step /state endpoints
4
+ """
5
 
6
+ from fastapi import FastAPI, HTTPException
7
+ from pydantic import BaseModel, Field
8
+ from typing import Dict, Any, Optional, List, Literal
9
+ import uvicorn
10
+ import json
11
+ import os
12
+ import sys
13
 
14
+ from .COEnv_environment import World
15
+ from .models import ClusterObservation, RewardSignal, KubeAction
 
 
 
 
16
 
17
+ app = FastAPI(title="COEnv", description="Kubernetes Simulator for OpenEnv")
 
 
18
 
19
+ # Global world instance
20
+ world_instance: Optional[World] = None
21
+ config: Dict[str, Any] = {}
22
+ current_task: Optional[str] = None
23
+ current_objective: str = ""
24
 
 
 
 
25
 
26
def load_config():
    """Load simulator configuration into the module-level ``config`` dict.

    Reads ``config.json`` from the directory one level above this package.
    Falls back to built-in defaults when the file is missing or contains
    invalid JSON, so server startup never fails on a bad config file
    (previously a malformed config.json raised JSONDecodeError at startup).
    """
    global config
    config_path = os.path.join(os.path.dirname(__file__), "..", "config.json")
    try:
        with open(config_path, 'r') as f:
            config = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Defaults mirror the expected config.json layout; per-task limits
        # live under "tasks".
        config = {
            "num_nodes": 3,
            "node_cpu_capacity": 4,
            "node_mem_capacity": 8192,
            "pod_cpu_request": 250,
            "pod_mem_request": 128,
            "pod_cpu_limit": 500,
            "pod_mem_limit": 256,
            "crash_loop_failure_rate": 0.7,
            "oom_kill_failure_rate": 0.6,
            "node_failure_rate": 0.3,
            "cascade_failure_probability": 0.5,
            "task_timeout_values": 300,
            "tasks": {
                "pod_recovery": {"max_steps": 15, "success_threshold": 0.9},
                "autoscaling": {"max_steps": 20, "success_threshold": 0.85},
                "incident": {"max_steps": 30, "success_threshold": 0.80}
            }
        }
53
 
 
 
 
 
 
 
54
 
55
+ # Import conditions for task injection
56
def get_condition_for_task(task_id: str):
    """Build the failure-condition injector configured for *task_id*.

    Condition modules are imported lazily so only the needed one is loaded.
    Returns None for an unknown task id.
    """
    if task_id == "pod_recovery":
        from .conditions.crash_loop import CrashLoopCondition
        condition_cls = CrashLoopCondition
    elif task_id == "autoscaling":
        from .conditions.oom_kill import OOMKillCondition
        condition_cls = OOMKillCondition
    elif task_id == "incident":
        from .conditions.cascade_failure import CascadeFailureCondition
        condition_cls = CascadeFailureCondition
    else:
        return None
    return condition_cls(world_instance, config)
68
 
 
 
 
 
 
 
 
 
69
 
70
def get_objective_for_task(task_id: str) -> str:
    """Return the natural-language objective shown to the agent for a task."""
    if task_id == "pod_recovery":
        return "The frontend deployment is crash-looping. Diagnose and fix the root cause so that all pods reach Running state."
    if task_id == "autoscaling":
        return "Traffic has spiked 10×. The api-server deployment is overloaded. Configure autoscaling and ensure p95 latency stays below 500ms."
    if task_id == "incident":
        return "A cascading incident has degraded auth-service, api-gateway, and data-processor. Identify the root cause and restore all three services to healthy state without data loss."
    # Fallback objective for unknown task ids.
    return "Maintain cluster health"
78
 
 
 
 
79
 
80
@app.on_event("startup")
async def startup_event():
    """Initialize the world on startup"""
    # NOTE(review): on_event-style handlers are deprecated in newer FastAPI
    # in favour of lifespan handlers — kept as-is to preserve behavior.
    global world_instance, current_task, current_objective
    # Config must be loaded before the World is constructed from it.
    load_config()
    world_instance = World(config)
    print("COEnv initialized")
87
 
 
 
 
88
 
89
class ResetRequest(BaseModel):
    """Request body for /reset endpoint"""
    # Defaults to the simplest task so a bare POST /reset still works.
    task: Optional[str] = Field(default="pod_recovery", description="Task ID to initialize")
 
 
92
 
 
93
 
94
@app.post("/reset")
async def reset(request: ResetRequest = ResetRequest()):
    """Reset the environment and return initial observation"""
    global world_instance, current_task, current_objective

    if world_instance is None:
        raise HTTPException(status_code=500, detail="World not initialized")

    # Remember the active task/objective so /step can grade against them.
    current_task = request.task
    current_objective = get_objective_for_task(request.task)

    # Get condition for the task and inject it
    condition = get_condition_for_task(request.task)

    # Reset with the condition
    observation = world_instance.reset(condition)

    # FastAPI serialises the returned Pydantic observation model itself.
    return observation
112
+
113
+
114
class StepRequest(BaseModel):
    """Request body for /step endpoint"""
    # Kept as a free-form dict; the "action_type" key selects the handler
    # branch in the /step endpoint.
    action: Dict[str, Any] = Field(..., description="Action to execute")
117
+
118
+
119
@app.post("/step")
async def step(request: StepRequest):
    """Apply an action and return next observation, reward, done, info.

    The action dict must carry an ``action_type`` discriminator matching one
    of the action models in ``server/actions``. Invalid or failing actions
    incur a -0.1 penalty on top of the state-based reward.
    """
    global world_instance, current_task, current_objective

    if world_instance is None:
        raise HTTPException(status_code=500, detail="World not initialized")

    action = request.action
    action_type = action.get("action_type", "")

    # Execute action
    info = {}
    penalty = 0.0  # Negative adjustment applied after grading (see below).
    done = False

    try:
        if action_type == "scale":
            deployment = action.get("deployment", "")
            replicas = action.get("replicas", 1)
            world_instance.scale(deployment, replicas)
            info["scaled"] = deployment
            info["replicas"] = replicas

        elif action_type == "delete_pod":
            pod_name = action.get("pod_name", "")
            world_instance.delete_pod(pod_name)
            info["deleted"] = pod_name

        elif action_type == "patch":
            resource_type = action.get("resource_type", "")
            name = action.get("name", "")
            patch = action.get("patch", {})
            world_instance.apply_patch(resource_type, name, patch)
            info["patched"] = f"{resource_type}/{name}"

        elif action_type == "rollout_restart":
            deployment = action.get("deployment", "")
            world_instance.rollout_restart(deployment)
            info["restarted"] = deployment

        elif action_type == "drain_node":
            # Drain is modelled as a node patch; pod eviction is left to the
            # simulator's tick.
            node_name = action.get("node_name", "")
            world_instance.apply_patch("node", node_name, {"status": "SchedulingDisabled"})
            info["drained"] = node_name

        elif action_type == "set_hpa":
            deployment = action.get("deployment", "")
            min_replicas = action.get("min_replicas", 1)
            max_replicas = action.get("max_replicas", 10)
            cpu_target = action.get("cpu_target_percent", 80)
            hpa_name = f"{deployment}-hpa"
            world_instance.apply_patch("hpa", hpa_name, {
                "min_replicas": min_replicas,
                "max_replicas": max_replicas,
                "cpu_target_percent": cpu_target
            })
            info["hpa_set"] = deployment

        elif action_type == "describe":
            # Investigation action - no state change
            resource_type = action.get("resource_type", "")
            name = action.get("name", "")
            info["described"] = f"{resource_type}/{name}"

        else:
            info["error"] = f"Unknown action type: {action_type}"
            penalty = -0.1  # Penalty for invalid action

    except Exception as e:
        info["error"] = str(e)
        penalty = -0.1

    # Always tick after an action
    world_instance.tick()

    # Bug fix: the penalty used to be assigned directly to ``reward`` and was
    # then unconditionally overwritten by calculate_reward(), so invalid or
    # failing actions were never actually penalised. Apply it on top of the
    # state-based reward instead.
    reward = calculate_reward(world_instance, current_task) + penalty

    # Check if task is done
    max_steps = config.get("tasks", {}).get(current_task, {}).get("max_steps", 15)
    if world_instance.step_count >= max_steps:
        done = True

    # Check if all pods are running (simplified done check)
    if check_task_complete(world_instance, current_task):
        done = True

    observation = world_instance.get_observation(current_objective)

    reward_signal = RewardSignal(reward=reward, done=done, info=info)

    return {
        "observation": observation.model_dump(),
        "reward": reward_signal.reward,
        "done": reward_signal.done,
        "info": reward_signal.info
    }
217
 
 
 
218
 
219
def calculate_reward(world: World, task_id: str) -> float:
    """Grade the current cluster state for the active task.

    Returns a score in [0, 1]; unknown tasks and empty deployments score 0.0.
    """

    def _running_fraction(pods, deployment_name):
        # Fraction of the deployment's pods in Running state, or None when
        # the deployment currently has no pods at all.
        members = [p for p in pods if p.deployment == deployment_name]
        if not members:
            return None
        healthy = [p for p in members if p.status == "Running"]
        return len(healthy) / len(members)

    if task_id == "pod_recovery":
        fraction = _running_fraction(world.get_pods(), "frontend")
        if fraction is not None:
            return fraction
    elif task_id == "autoscaling":
        fraction = _running_fraction(world.get_pods(), "backend")
        if fraction is not None:
            return min(fraction, 1.0)
    elif task_id == "incident":
        pods = world.get_pods()
        key_services = ["auth-service", "api-gateway", "frontend"]
        healthy_count = 0
        for svc in key_services:
            svc_pods = [p for p in pods if p.deployment == svc]
            running = [p for p in svc_pods if p.status == "Running"]
            # A service counts as healthy when >= 80% of its pods are Running.
            if svc_pods and len(running) >= len(svc_pods) * 0.8:
                healthy_count += 1
        return healthy_count / len(key_services) if key_services else 0.0

    return 0.0
245
+
246
+
247
def check_task_complete(world: World, task_id: str) -> bool:
    """Return True once the active task's success criteria are met."""
    if task_id == "pod_recovery":
        # Every frontend pod Running, and at least one pod must exist.
        frontend = [p for p in world.get_pods() if p.deployment == "frontend"]
        return bool(frontend) and all(p.status == "Running" for p in frontend)
    if task_id == "autoscaling":
        # At least two backend replicas, at least two of them Running.
        backend = [p for p in world.get_pods() if p.deployment == "backend"]
        running = [p for p in backend if p.status == "Running"]
        return len(backend) >= 2 and len(running) >= 2
    if task_id == "incident":
        # Each key service must have >= 80% of its pods Running.
        pods = world.get_pods()
        for svc in ["auth-service", "api-gateway", "frontend"]:
            members = [p for p in pods if p.deployment == svc]
            running = [p for p in members if p.status == "Running"]
            if members and len(running) < len(members) * 0.8:
                return False
        return True
    return False
269
+
270
+
271
@app.get("/state")
async def get_state():
    """Return full current simulator state"""
    global world_instance, current_objective

    if world_instance is None:
        raise HTTPException(status_code=500, detail="World not initialized")

    # model_dump() serialises the ClusterObservation Pydantic model to a dict.
    return world_instance.get_observation(current_objective).model_dump()
280
+
281
+
282
+ if __name__ == "__main__":
283
+ uvicorn.run(app, host="0.0.0.0", port=8000)
server/conditions/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Conditions - Failure injectors"""
2
+
3
+ __all__ = ["crash_loop", "oom_kill", "node_failure", "cascade_failure"]
4
+
5
+ from .crash_loop import CrashLoopCondition
6
+ from .oom_kill import OOMKillCondition
7
+ from .node_failure import NodeFailureCondition
8
+ from .cascade_failure import CascadeFailureCondition
9
+
10
+ __all__ += [
11
+ "CrashLoopCondition",
12
+ "OOMKillCondition",
13
+ "NodeFailureCondition",
14
+ "CascadeFailureCondition"
15
+ ]
server/conditions/cascade_failure.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CascadeFailureCondition - Simulates multi-service dependency failure
3
+ """
4
+
5
+ from typing import Dict, List, Any, Optional
6
+ from ..COEnv_environment import World
7
+ import random
8
+
9
+
10
class CascadeFailureCondition:
    """Injects cascading failures across multiple services"""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, root_cause_service: Optional[str] = None, failure_probability: Optional[float] = None):
        """
        Inject cascading failures starting from a root cause service

        Args:
            root_cause_service: Specific service to start failure (None for random)
            failure_probability: Probability of failure propagating to dependencies (0.0-1.0)
        """
        if failure_probability is None:
            failure_probability = self.config.get("cascade_failure_probability", 0.7)
        else:
            failure_probability = float(failure_probability)

        # Pick a root cause: prefer one of the critical services, otherwise
        # any deployment (falling back to "frontend" on an empty cluster).
        if root_cause_service is None:
            critical_services = ["auth-service", "database", "api-gateway"]
            deployments = self.world.get_deployments()
            critical_deployments = [d for d in deployments if d.name in critical_services]
            if critical_deployments:
                root_cause_service = random.choice(critical_deployments).name
            else:
                deployments = self.world.get_deployments()
                root_cause_service = random.choice(deployments).name if deployments else "frontend"

        root_deployment = next((d for d in self.world.get_deployments() if d.name == root_cause_service), None)
        if root_deployment:
            # Bug fix: crash_loop/oom_kill are siblings of this module inside
            # the ``server.conditions`` package, so the relative import is one
            # level (``.oom_kill``); the previous ``..oom_kill`` resolved to
            # the nonexistent ``server.oom_kill`` and raised
            # ModuleNotFoundError at injection time.
            from .oom_kill import OOMKillCondition
            oom_condition = OOMKillCondition(self.world, self.config)
            oom_condition.inject(target_deployment=root_cause_service, failure_rate=0.8)

        self._add_cascade_event(f"Root cause failure in {root_cause_service}", "Warning")

        # Propagate a random failure mode to other deployments.
        deployments = self.world.get_deployments()
        for deployment in deployments:
            if deployment.name != root_cause_service and failure_probability is not None and random.random() < failure_probability:
                failure_type = random.choice(["crashloop", "oom", "slow"])

                if failure_type == "crashloop":
                    # Sibling module (bug fix: was ``..crash_loop``).
                    from .crash_loop import CrashLoopCondition
                    condition = CrashLoopCondition(self.world, self.config)
                    condition.inject(target_deployment=deployment.name, failure_rate=0.6)
                elif failure_type == "oom":
                    # Sibling module (bug fix: was ``..oom_kill``).
                    from .oom_kill import OOMKillCondition
                    condition = OOMKillCondition(self.world, self.config)
                    condition.inject(target_deployment=deployment.name, failure_rate=0.6)
                else:
                    # "slow": inflate one pod's resource requests to simulate
                    # a degraded-but-running service.
                    pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
                    for pod in pods[:1]:
                        patch = {
                            "cpu_request": int(pod.cpu_request * 1.5) if pod.cpu_request else 750,
                            "mem_request": int(pod.mem_request * 1.5) if pod.mem_request else 384
                        }
                        self.world.apply_patch("pod", pod.name, patch)

                self._add_cascade_event(f"Cascading failure detected in {deployment.name}", "Warning")

    def _add_cascade_event(self, message: str, event_type: str):
        """Add a cascade failure event"""
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-cascade-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type=event_type,
            reason="CascadeFailure",
            message=message,
            involved_object="cluster"
        )
        self.world.events.append(event)
server/conditions/crash_loop.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CrashLoopCondition - Simulates pods stuck in CrashLoopBackOff
3
+ """
4
+
5
+ from typing import Dict, List, Any, Optional
6
+ from ..COEnv_environment import World
7
+ import random
8
+
9
+
10
class CrashLoopCondition:
    """Injects CrashLoopBackOff failures into pods"""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None):
        """
        Inject crash loop failures into pods

        Args:
            target_deployment: Specific deployment to target (None for random)
            failure_rate: Probability of each pod failing (0.0-1.0)
        """
        if failure_rate is None:
            failure_rate = self.config.get("crash_loop_failure_rate", 0.8)
        else:
            failure_rate = float(failure_rate)

        deployments = self.world.get_deployments()

        if target_deployment is not None:
            target_deps = [d for d in deployments if d.name == target_deployment]
        else:
            # No explicit target: pick one deployment at random.
            target_deps = [random.choice(deployments)] if deployments else []

        for deployment in target_deps:
            pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]

            for pod in pods:
                if failure_rate is not None and random.random() < failure_rate:
                    patch = {
                        "status": "CrashLoopBackOff",
                        "restarts": random.randint(5, 20)
                    }
                    self.world.apply_patch("pod", pod.name, patch)
                    self._add_crashloop_event(pod.name)

    def _add_crashloop_event(self, pod_name: str):
        """Add a crashloop event"""
        # Bug fix: ``models`` lives in the parent ``server`` package, one
        # level above ``server.conditions``, so the import must be
        # ``..models``. The previous ``from .models import ClusterEvent``
        # looked for the nonexistent ``server.conditions.models`` and raised
        # ModuleNotFoundError the first time a pod was failed.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-crashloop-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="BackOff",
            message=f"Back-off restarting failed container pod/{pod_name}",
            involved_object=pod_name
        )
        self.world.events.append(event)
server/conditions/node_failure.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NodeFailureCondition - Simulates node outages and scheduling disruption
3
+ """
4
+
5
+ from typing import Dict, List, Any, Optional
6
+ from ..COEnv_environment import World
7
+ import random
8
+
9
+
10
class NodeFailureCondition:
    """Injects node failures into the cluster"""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, target_node: Optional[str] = None, failure_rate: Optional[float] = None):
        """
        Inject node failures

        Args:
            target_node: Specific node to target (None for random)
            failure_rate: Probability of node failing (0.0-1.0)
        """
        if failure_rate is None:
            failure_rate = self.config.get("node_failure_rate", 0.3)
        else:
            failure_rate = float(failure_rate)

        nodes = self.world.get_nodes()

        if target_node:
            target_nodes = [n for n in nodes if n.name == target_node]
        else:
            # No explicit target: each node fails independently with the
            # configured probability.
            target_nodes = [n for n in nodes if failure_rate is not None and random.random() < failure_rate]

        for node in target_nodes:
            # Mark the node NotReady with zeroed resource usage.
            patch = {
                "status": "NotReady",
                "cpu_usage": 0.0,
                "mem_usage": 0.0
            }
            self.world.apply_patch("node", node.name, patch)

            # Evict every pod scheduled on the failed node back to Pending.
            pods_on_node = [p for p in self.world.get_pods() if p.node == node.name]
            for pod in pods_on_node:
                patch = {
                    "node": None,
                    "status": "Pending"
                }
                self.world.apply_patch("pod", pod.name, patch)

            self._add_node_failure_event(node.name)

    def _add_node_failure_event(self, node_name: str):
        """Add a node failure event"""
        # Bug fix: the previous ``from models import ClusterEvent`` was a
        # bare absolute import that fails inside the ``server.conditions``
        # package; models.py lives one package up, so use ``..models``
        # (consistent with cascade_failure.py).
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-nodefail-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="NodeNotReady",
            message=f"Node {node_name} status is now: NodeNotReady",
            involved_object=node_name
        )
        self.world.events.append(event)
server/conditions/oom_kill.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OOMKillCondition - Simulates memory-limit failures causing repeated restarts
3
+ """
4
+
5
+ from typing import Dict, List, Any, Optional
6
+ from ..COEnv_environment import World
7
+ import random
8
+
9
+
10
class OOMKillCondition:
    """Injects OOMKill failures into pods"""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None):
        """
        Inject OOMKill failures into pods

        Args:
            target_deployment: Specific deployment to target (None for random)
            failure_rate: Probability of each pod failing (0.0-1.0)
        """
        if failure_rate is None:
            failure_rate = self.config.get("oom_kill_failure_rate", 0.6)
        else:
            # Ensure failure_rate is a float
            failure_rate = float(failure_rate)

        deployments = self.world.get_deployments()

        if target_deployment is not None:
            target_deps = [d for d in deployments if d.name == target_deployment]
        else:
            # Target a random deployment
            target_deps = [random.choice(deployments)] if deployments else []

        for deployment in target_deps:
            # Get pods for this deployment
            pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]

            for pod in pods:
                if failure_rate is not None and random.random() < failure_rate:
                    # Simulate OOMKill by setting high memory usage and restart count
                    patch = {
                        "status": "Running",  # OOMKill pods often show as Running but crash
                        "restarts": random.randint(10, 30)  # High restart count from OOM
                    }
                    self.world.apply_patch("pod", pod.name, patch)

                    # Also reduce the pod's memory limit to simulate the condition that caused OOM
                    mem_patch = {
                        "mem_limit": max(64, pod.mem_limit // 2) if pod.mem_limit else 128
                    }
                    self.world.apply_patch("pod", pod.name, mem_patch)

                    # Add event
                    self._add_oom_event(pod.name)

    def _add_oom_event(self, pod_name: str):
        """Add an OOMKill event"""
        # Bug fix: ``models`` lives in the parent ``server`` package, so the
        # import must be ``..models``. The previous ``from .models import
        # ClusterEvent`` looked for the nonexistent
        # ``server.conditions.models`` and raised ModuleNotFoundError.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-oom-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="OOMKilling",
            message=f"Container {pod_name} exceeded memory limit",
            involved_object=pod_name
        )
        self.world.events.append(event)
server/executor.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Executor - Action execution bridge"""
2
+
3
+ from typing import Dict, Any, Optional
4
+
5
+
6
class ExecutionResult:
    """Result of action execution"""

    def __init__(self, observation, reward: float, done: bool, info: Dict[str, Any]):
        self.observation = observation  # post-action cluster observation
        self.reward = reward            # graded reward, including any action penalty
        self.done = done                # True when the episode has ended
        self.info = info                # per-action diagnostics (or an "error" entry)


class Executor:
    """Maps validated actions to world method calls.

    Each call to execute() applies one action to the simulated world,
    advances simulated time by one tick, grades the resulting state, and
    reports whether the episode is finished.
    """

    def __init__(self, world, grader):
        self.world = world
        self.grader = grader

    def execute(self, action: Dict[str, Any], task_id: str, max_steps: int) -> ExecutionResult:
        """
        Execute an action and return the result

        Args:
            action: The action to execute
            task_id: Current task ID for grading
            max_steps: Maximum steps for the episode

        Returns:
            ExecutionResult with observation, reward, done, and info
        """
        action_type = action.get("action_type", "")
        info: Dict[str, Any] = {}
        # Penalty applied on top of the graded reward for failed/unknown
        # actions.  (Bug fix: previously the penalty was assigned to `reward`
        # and then unconditionally overwritten by the grader's score below,
        # so invalid actions were never actually penalized.)
        penalty = 0.0
        done = False

        try:
            if action_type == "scale":
                deployment = action.get("deployment", "")
                replicas = action.get("replicas", 1)
                self.world.scale(deployment, replicas)
                info["scaled"] = deployment
                info["replicas"] = replicas

            elif action_type == "delete_pod":
                pod_name = action.get("pod_name", "")
                self.world.delete_pod(pod_name)
                info["deleted"] = pod_name

            elif action_type == "patch":
                resource_type = action.get("resource_type", "")
                name = action.get("name", "")
                patch = action.get("patch", {})
                self.world.apply_patch(resource_type, name, patch)
                info["patched"] = f"{resource_type}/{name}"

            elif action_type == "rollout_restart":
                deployment = action.get("deployment", "")
                self.world.rollout_restart(deployment)
                info["restarted"] = deployment

            elif action_type == "drain_node":
                # Draining is modeled as disabling scheduling on the node.
                node_name = action.get("node_name", "")
                self.world.apply_patch("node", node_name, {"status": "SchedulingDisabled"})
                info["drained"] = node_name

            elif action_type == "set_hpa":
                deployment = action.get("deployment", "")
                min_replicas = action.get("min_replicas", 1)
                max_replicas = action.get("max_replicas", 10)
                cpu_target = action.get("cpu_target_percent", 80)
                hpa_name = f"{deployment}-hpa"
                self.world.apply_patch("hpa", hpa_name, {
                    "min_replicas": min_replicas,
                    "max_replicas": max_replicas,
                    "cpu_target_percent": cpu_target
                })
                info["hpa_set"] = deployment

            elif action_type == "describe":
                # Investigation action - no state change
                resource_type = action.get("resource_type", "")
                name = action.get("name", "")
                info["described"] = f"{resource_type}/{name}"

            else:
                info["error"] = f"Unknown action type: {action_type}"
                penalty = -0.1

        except Exception as e:
            # Surface the failure to the agent instead of crashing the episode.
            info["error"] = str(e)
            penalty = -0.1

        # Always advance time after an action
        self.world.tick()

        # Calculate reward: graded world state plus any action penalty.
        world_state = self.world.get_full_state()
        reward = self.grader.grade(world_state, self.world.step_count, max_steps) + penalty

        # Episode ends when the step budget is exhausted...
        if self.world.step_count >= max_steps:
            done = True

        # ...or when the current task's success condition is met.
        if self._check_task_complete(task_id):
            done = True

        observation = self.world.get_observation()

        return ExecutionResult(
            observation=observation,
            reward=reward,
            done=done,
            info=info
        )

    def _check_task_complete(self, task_id: str) -> bool:
        """Check whether the named task's success condition currently holds."""
        pods = self.world.get_pods()

        if task_id == "pod_recovery":
            # All frontend pods (at least one) must be Running.
            frontend_pods = [p for p in pods if p.deployment == "frontend"]
            running = [p for p in frontend_pods if p.status == "Running"]
            return len(frontend_pods) > 0 and len(running) == len(frontend_pods)

        elif task_id == "autoscaling":
            # At least two backend pods exist and at least two are Running.
            backend_pods = [p for p in pods if p.deployment == "backend"]
            running = [p for p in backend_pods if p.status == "Running"]
            return len(backend_pods) >= 2 and len(running) >= 2

        elif task_id == "incident":
            # Every key service with pods must have >= 80% of them Running.
            key_services = ["auth-service", "api-gateway", "frontend"]
            for svc in key_services:
                svc_pods = [p for p in pods if p.deployment == svc]
                running = [p for p in svc_pods if p.status == "Running"]
                if svc_pods and len(running) < len(svc_pods) * 0.8:
                    return False
            return True

        return False
server/graders/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Graders"""
2
+
3
+ __all__ = ["grader_pod_recovery", "grader_autoscaling", "grader_incident"]
4
+
5
+ from .grader_pod_recovery import grade as pod_recovery_grade
6
+ from .grader_autoscaling import grade as autoscaling_grade
7
+ from .grader_incident import grade as incident_grade
8
+
9
+ __all__ += ["pod_recovery_grade", "autoscaling_grade", "incident_grade"]
server/graders/grader_autoscaling.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader for Autoscaling Task
3
+ """
4
+
5
+ from typing import Dict, Any
6
+
7
+
8
def grade(world_state: Dict[str, Any], step: int, max_steps: int) -> float:
    """Grade the autoscaling task"""
    deployments = world_state.get("deployments", [])
    backend = next((d for d in deployments if d.get("name") == "backend"), None)
    # Without a backend deployment there is nothing to score.
    if backend is None:
        return 0.0

    score = 0.0

    # 40% of the score: available replicas relative to the desired count.
    desired = backend.get("desired_replicas", 0)
    available = backend.get("available_replicas", 0)
    if desired > 0:
        score += min(available / desired, 1.0) * 0.4

    # 40% of the score: fraction of backend pods that are actually Running.
    all_pods = world_state.get("pods", [])
    backend_pods = [p for p in all_pods if p.get("deployment") == "backend"]
    if backend_pods:
        running = sum(1 for p in backend_pods if p.get("status") == "Running")
        score += (running / len(backend_pods)) * 0.4

    # 20% of the score: efficiency — finishing in fewer steps earns more.
    if max_steps > 0:
        score += (1.0 - step / max_steps) * 0.2

    return min(score, 1.0)
server/graders/grader_incident.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader for Incident Task
3
+ """
4
+
5
+ from typing import Dict, Any
6
+
7
+
8
def grade(world_state: Dict[str, Any], step: int, max_steps: int) -> float:
    """Grade the incident task"""
    key_services = ["auth-service", "api-gateway", "frontend"]
    deployments = world_state.get("deployments", [])
    pods = world_state.get("pods", [])

    # 60% of the score: fraction of key services that are healthy, where
    # healthy means >= 80% of desired replicas are available.
    healthy = 0
    for name in key_services:
        dep = next((d for d in deployments if d.get("name") == name), None)
        if dep is None:
            continue
        desired = dep.get("desired_replicas", 0)
        if desired > 0 and dep.get("available_replicas", 0) / desired >= 0.8:
            healthy += 1
    score = (healthy / len(key_services)) * 0.6

    # 30% of the score: rewarded in proportion to how few key-service pods
    # are stuck in CrashLoopBackOff.
    key_pods = [p for p in pods if p.get("deployment") in key_services]
    if key_pods:
        crashing = sum(1 for p in key_pods if p.get("status") == "CrashLoopBackOff")
        score += (1.0 - crashing / len(key_pods)) * 0.3

    # 10% of the score: efficiency bonus for resolving the incident early.
    if max_steps > 0:
        score += (1.0 - step / max_steps) * 0.1

    return min(score, 1.0)
server/graders/grader_pod_recovery.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Grader for Pod Recovery Task
3
+ """
4
+
5
+ from typing import Dict, Any
6
+
7
+
8
def grade(world_state: Dict[str, Any], step: int, max_steps: int) -> float:
    """Grade the pod recovery task"""
    pods = world_state.get("pods", [])
    frontend = [p for p in pods if p.get("deployment") == "frontend"]
    running = [p for p in frontend if p.get("status") == "Running"]

    score = 0.0

    if frontend:
        # Up to 50%: fraction of frontend pods that are Running.
        score += (len(running) / len(frontend)) * 0.5
        # Extra 40% only when *every* frontend pod has recovered.
        if len(running) == len(frontend):
            score += 0.4

    # Up to 10%: efficiency bonus — fewer steps used, bigger bonus.
    if max_steps > 0:
        score += (1.0 - step / max_steps) * 0.1

    return min(score, 1.0)
server/models.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ KubeSimEnv Models - Pydantic models for OpenEnv compliance
3
+ All typed models are mandatory for OpenEnv spec compliance.
4
+ Every endpoint uses these.
5
+ """
6
+
7
+ from pydantic import BaseModel, Field
8
+ from typing import List, Dict, Any, Optional, Literal
9
+ from datetime import datetime
10
+
11
+
12
class NodeStatus(BaseModel):
    """Status of a Kubernetes node"""
    name: str
    status: Literal["Ready", "NotReady", "Unknown", "SchedulingDisabled"]
    cpu_capacity: int  # in cores
    mem_capacity: int  # in MB
    cpu_usage: float = Field(ge=0, le=100)  # percentage
    mem_usage: float = Field(ge=0, le=100)  # percentage
    last_updated: str  # ISO timestamp


class PodStatus(BaseModel):
    """Status of a Kubernetes pod"""
    name: str
    # "CrashLoopBackOff" is included as a status value so graders/tasks can
    # detect crash-looping pods directly from the pod record.
    status: Literal["Pending", "Running", "Succeeded", "Failed", "Unknown", "CrashLoopBackOff"]
    node: Optional[str] = None  # node the pod is scheduled on, if any
    restarts: int = 0
    cpu_request: int = Field(default=0)  # in millicores
    mem_request: int = Field(default=0)  # in MB
    cpu_limit: Optional[int] = Field(default=None)  # in millicores
    mem_limit: Optional[int] = Field(default=None)  # in MB
    deployment: Optional[str] = None  # owning deployment's name, if any
    last_updated: str  # ISO timestamp


class DeploymentStatus(BaseModel):
    """Status of a Kubernetes deployment"""
    name: str
    desired_replicas: int
    available_replicas: int
    image: str
    last_updated: str  # ISO timestamp


class ServiceStatus(BaseModel):
    """Status of a Kubernetes service"""
    name: str
    type: Literal["ClusterIP", "NodePort", "LoadBalancer", "ExternalName"]
    ports: List[Dict[str, Any]]
    selector: Optional[Dict[str, str]] = None  # label selector for backing pods
    cluster_ip: Optional[str] = None
    last_updated: str  # ISO timestamp


class ConfigMapStatus(BaseModel):
    """Status of a Kubernetes ConfigMap"""
    name: str
    data: Dict[str, str]  # key/value configuration entries
    last_updated: str  # ISO timestamp


class HPAStatus(BaseModel):
    """Status of a HorizontalPodAutoscaler"""
    name: str
    min_replicas: int
    max_replicas: int
    current_replicas: int
    cpu_target_percent: int  # target CPU utilization, percent
    last_updated: str  # ISO timestamp


class ClusterEvent(BaseModel):
    """Kubernetes-style event"""
    event_id: str
    timestamp: str  # ISO timestamp
    type: Literal["Normal", "Warning"]
    reason: str
    message: str
    involved_object: str  # name of the resource the event refers to
81
+
82
+
83
class ClusterObservation(BaseModel):
    """Main observation model - typed cluster snapshot"""
    nodes: List[NodeStatus]
    pods: List[PodStatus]
    deployments: List[DeploymentStatus]
    services: List[ServiceStatus]
    configmaps: List[ConfigMapStatus]
    hpas: List[HPAStatus]
    events: List[ClusterEvent]
    step: int  # simulation step at which the snapshot was taken
    objective: str  # natural-language objective of the current task


class RewardSignal(BaseModel):
    """Reward signal returned by step()"""
    reward: float
    done: bool  # True when the episode has ended
    info: Dict[str, Any] = Field(default_factory=dict)  # extra diagnostics
101
+
102
+
103
# Action Models - These represent the structured action space
class KubeAction(BaseModel):
    """Base action model.

    action_type discriminates which concrete action subclass applies.
    """
    action_type: Literal[
        "scale", "delete_pod", "patch", "rollout_restart",
        "set_hpa", "drain_node", "describe"
    ]


class ScaleAction(KubeAction):
    """Scale a deployment to a specific replica count"""
    deployment: str
    replicas: int


class DeletePodAction(KubeAction):
    """Delete a specific pod"""
    pod_name: str


class PatchAction(KubeAction):
    """Patch a resource with specific changes"""
    resource_type: Literal["deployment", "pod", "node", "service"]
    name: str
    patch: Dict[str, Any]  # partial field updates applied to the resource


class RolloutRestartAction(KubeAction):
    """Restart a deployment rollout"""
    deployment: str


class SetHPAAction(KubeAction):
    """Set HorizontalPodAutoscaler for a deployment"""
    deployment: str
    min_replicas: int
    max_replicas: int
    cpu_target_percent: int  # target CPU utilization, percent


class DrainNodeAction(KubeAction):
    """Drain a node (evict all pods)"""
    node_name: str


class DescribeAction(KubeAction):
    """Describe/get details of a resource"""
    resource_type: Literal["deployment", "pod", "node", "service"]
    name: str
server/requirements.txt DELETED
@@ -1,6 +0,0 @@
1
- openenv[core]>=0.2.0
2
- fastapi>=0.115.0
3
- uvicorn>=0.24.0
4
-
5
-
6
-
 
 
 
 
 
 
 
server/tasks/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Tasks"""
2
+
3
+ __all__ = ["task_pod_recovery", "task_autoscaling", "task_incident"]
4
+
5
+ from .task_pod_recovery import PodRecoveryTask
6
+ from .task_autoscaling import AutoscalingTask
7
+ from .task_incident import IncidentTask
8
+
9
+ __all__ += ["PodRecoveryTask", "AutoscalingTask", "IncidentTask"]
server/tasks/task_autoscaling.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Autoscaling Task - Medium difficulty
3
+ Configure HPA to handle traffic spike
4
+ """
5
+
6
+ from typing import Dict, Any
7
+ from ..COEnv_environment import World
8
+
9
+
10
class AutoscalingTask:
    """Autoscaling task implementation"""

    def __init__(self, world: "World", config: Dict[str, Any]):
        self.world = world
        self.config = config
        self.task_id = "autoscaling"
        self.description = "Configure HPA to handle traffic spike"
        # Default objective so get_observation() is safe to call even before
        # reset() has run (previously it raised AttributeError in that case).
        self.objective = self.description

    def reset(self):
        """Reset the task to initial state"""
        self.world.reset_to_healthy()

        # Simulate a traffic spike by tripling resource demand on backend pods.
        backend_pods = [p for p in self.world.get_pods() if p.deployment == "backend"]
        for pod in backend_pods:
            patch = {
                "cpu_request": int(pod.cpu_request * 3) if pod.cpu_request else 750,
                "mem_request": int(pod.mem_request * 3) if pod.mem_request else 384
            }
            self.world.apply_patch("pod", pod.name, patch)

        self.objective = "Backend service is overloaded due to traffic spike. Configure HPA to automatically scale the backend deployment based on CPU utilization."

    def is_complete(self) -> bool:
        """Complete when a backend deployment exists and >= 2 backend pods are Running."""
        backend_deployment = next((d for d in self.world.get_deployments() if d.name == "backend"), None)
        if not backend_deployment:
            return False

        backend_pods = [p for p in self.world.get_pods()
                        if p.deployment == "backend" and p.status == "Running"]
        return len(backend_pods) >= 2

    def get_observation(self) -> Dict[str, Any]:
        """Get current observation for the task (full state plus objective)."""
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
server/tasks/task_incident.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Incident Task - Hard difficulty
3
+ Handle multi-service cascading incident
4
+ """
5
+
6
+ from typing import Dict, Any
7
+ from ..COEnv_environment import World
8
+ from ..conditions.cascade_failure import CascadeFailureCondition
9
+
10
+
11
class IncidentTask:
    """Incident task implementation"""

    def __init__(self, world: "World", config: Dict[str, Any]):
        self.world = world
        self.config = config
        self.task_id = "incident"
        self.description = "Handle multi-service cascading incident"
        # Default objective so get_observation() is safe to call even before
        # reset() has run (previously it raised AttributeError in that case).
        self.objective = self.description

    def reset(self):
        """Reset the task to initial state"""
        self.world.reset_to_healthy()

        # Seed the incident: auth-service failures cascade downstream.
        cascade_condition = CascadeFailureCondition(self.world, self.config)
        cascade_condition.inject(root_cause_service="auth-service", failure_probability=0.6)

        self.objective = "Auth-service OOMKill has caused cascading failures. Identify the root cause, fix memory limits, restart workloads, and verify downstream recovery."

    def is_complete(self) -> bool:
        """Complete when ~2/3 of the key services are healthy.

        A service counts as healthy when at least 80% of its desired
        replicas have Running pods.
        """
        key_services = ["auth-service", "api-gateway", "frontend"]
        healthy_services = 0

        for service_name in key_services:
            deployment = next((d for d in self.world.get_deployments() if d.name == service_name), None)
            if deployment:
                running_pods = [p for p in self.world.get_pods()
                                if p.deployment == service_name and p.status == "Running"]
                if len(running_pods) >= deployment.desired_replicas * 0.8:
                    healthy_services += 1

        return healthy_services >= len(key_services) * 0.67

    def get_observation(self) -> Dict[str, Any]:
        """Get current observation for the task (full state plus objective)."""
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
server/tasks/task_pod_recovery.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pod Recovery Task - Easy difficulty
3
+ Fix crash-looping pods by identifying and patching bad configuration
4
+ """
5
+
6
+ from typing import Dict, Any
7
+ from ..COEnv_environment import World
8
+ from ..conditions.crash_loop import CrashLoopCondition
9
+
10
+
11
class PodRecoveryTask:
    """Pod recovery task implementation"""

    def __init__(self, world: "World", config: Dict[str, Any]):
        self.world = world
        self.config = config
        self.task_id = "pod_recovery"
        self.description = "Fix crash-looping pods by identifying and patching bad configuration"
        # Default objective so get_observation() is safe to call even before
        # reset() has run (previously it raised AttributeError in that case).
        self.objective = self.description

    def reset(self):
        """Reset the task to initial state"""
        self.world.reset_to_healthy()

        # Break most of the frontend pods so the agent has something to fix.
        crash_loop_condition = CrashLoopCondition(self.world, self.config)
        crash_loop_condition.inject(target_deployment="frontend", failure_rate=0.8)

        self.objective = "All frontend pods should be running. Investigate the CrashLoopBackOff pods and fix the configuration issue."

    def is_complete(self) -> bool:
        """Complete when every frontend pod (at least one) is Running."""
        frontend_pods = [p for p in self.world.get_pods()
                        if p.deployment == "frontend" and p.status == "Running"]
        total_frontend_pods = [p for p in self.world.get_pods()
                              if p.deployment == "frontend"]

        if not total_frontend_pods:
            return False

        return len(frontend_pods) == len(total_frontend_pods)

    def get_observation(self) -> Dict[str, Any]:
        """Get current observation for the task (full state plus objective)."""
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
server/utils.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ KubeSimEnv Utils - Probability helpers and simulation utilities
3
+ Random failure rate generators, latency simulators, resource usage curves.
4
+ Makes the simulation feel realistic and non-deterministic in the right ways.
5
+ """
6
+
7
+ import random
8
+ import math
9
+ from typing import Dict, List, Any, Optional
10
+ from datetime import datetime, timedelta
11
+
12
+
13
class ProbabilityHelpers:
    """Helpers for generating realistic probabilities and distributions"""

    @staticmethod
    def weighted_random_choice(choices: List[Any], weights: List[float]) -> Any:
        """Make a weighted random choice.

        Falls back to a uniform choice when weights are missing, mismatched
        in length, or sum to zero; returns None when there are no choices.
        """
        if not choices or not weights or len(choices) != len(weights):
            return random.choice(choices) if choices else None

        # Normalize weights so they form a probability distribution.
        total_weight = sum(weights)
        if total_weight == 0:
            return random.choice(choices)

        normalized_weights = [w / total_weight for w in weights]

        # Walk the cumulative distribution until it covers the sample.
        r = random.random()
        cumulative_weight = 0
        for choice, weight in zip(choices, normalized_weights):
            cumulative_weight += weight
            if r <= cumulative_weight:
                return choice
        return choices[-1]  # Fallback guards against float round-off

    @staticmethod
    def exponential_backoff(attempt: int, base_delay: float = 1.0, max_delay: float = 60.0) -> float:
        """Calculate exponential backoff delay, capped at max_delay."""
        delay = base_delay * (2 ** attempt)
        return min(delay, max_delay)

    @staticmethod
    def poisson_arrival_rate(lambda_rate: float, time_window: float) -> int:
        """Generate number of events in time window using Poisson distribution.

        Uses a normal approximation (mean = variance = lambda * t) instead of
        numpy.random.poisson.  The result is clamped at zero: the Gaussian
        noise term could previously drive the count negative, which is
        impossible for an event count (bug fix).
        """
        expected = lambda_rate * time_window
        sample = expected + random.gauss(0, math.sqrt(expected))
        return max(0, int(sample))

    @staticmethod
    def failure_probability_over_time(base_rate: float, time_elapsed: float,
                                      max_rate: float = 1.0) -> float:
        """Calculate failure probability that increases over time"""
        # Logarithmic growth: rises quickly at first, then flattens.
        probability = base_rate * (1 + math.log(1 + time_elapsed))
        return min(probability, max_rate)

    @staticmethod
    def random_failure_rate(min_rate: float = 0.1, max_rate: float = 0.9) -> float:
        """Generate a random failure rate within bounds"""
        return random.uniform(min_rate, max_rate)
61
+
62
+
63
class LatencySimulator:
    """Simulates network and service latency"""

    def __init__(self, base_latency_ms: float = 50.0):
        self.base_latency_ms = base_latency_ms
        self.load_factor = 1.0

    def set_load(self, load_factor: float):
        """Set system load factor (1.0 = normal, >1.0 = overloaded); floored at 0.1."""
        self.load_factor = max(0.1, load_factor)

    def get_latency(self) -> float:
        """Get simulated latency in milliseconds"""
        # Three components: constant base, extra latency proportional to
        # overload (only when load_factor > 1), and Gaussian jitter.
        overload = self.base_latency_ms * (self.load_factor - 1.0) * 2
        jitter = random.gauss(0, self.base_latency_ms * 0.1)
        total = self.base_latency_ms + max(0, overload) + jitter
        return max(1.0, total)  # Minimum 1ms latency

    def get_latency_with_spike(self, spike_probability: float = 0.05,
                               spike_multiplier: float = 5.0) -> float:
        """Get latency with occasional spikes"""
        base = self.get_latency()
        return base * spike_multiplier if random.random() < spike_probability else base
89
+
90
+
91
class ResourceUsageSimulator:
    """Simulates realistic CPU and memory usage patterns"""

    def __init__(self):
        # Random phase shift so independent simulators are not in lock-step.
        self.time_offset = random.uniform(0, 2 * math.pi)

    def get_cpu_usage(self, base_usage: float = 0.3,
                      variation: float = 0.2) -> float:
        """Get CPU usage as percentage (0-100)"""
        # Sinusoidal daily pattern keyed to the current hour, plus noise.
        hour_of_day = (datetime.now().timestamp() / 3600) % 24
        daily_pattern = 0.5 * math.sin(2 * math.pi * hour_of_day / 24) + 0.5
        raw = base_usage + variation * daily_pattern + random.gauss(0, 0.05)
        return min(1.0, max(0.0, raw)) * 100  # Clamp to 0-100%

    def get_memory_usage(self, base_usage: float = 0.4,
                         variation: float = 0.15) -> float:
        """Get memory usage as percentage (0-100)"""
        # Usage creeps upward over the week to mimic a slow leak.
        week_fraction = min((datetime.now().timestamp() / 86400) % 7, 1.0)
        raw = base_usage + 0.1 * week_fraction + random.gauss(0, 0.03)
        return min(1.0, max(0.0, raw)) * 100  # Clamp to 0-100%

    def get_resource_curve(self, resource_type: str,
                           time_elapsed: float) -> float:
        """Get resource usage following a specific curve"""
        if resource_type == "cpu":
            # Periodic load with random bursts.
            return 0.3 + 0.4 * math.sin(time_elapsed / 100) + 0.2 * random.random()
        if resource_type == "memory":
            # Asymptotic growth, occasionally knocked down by a simulated GC.
            level = 0.2 + 0.6 * (1 - math.exp(-time_elapsed / 1000))
            gc_drop = 0.3 if random.random() < 0.01 else 0
            return max(0, level - gc_drop)
        if resource_type == "disk":
            # Linear fill until full.
            return 0.1 + 0.8 * min(time_elapsed / 10000, 1.0)
        return 0.5
135
+
136
+
137
class NetworkSimulator:
    """Simulates network conditions and partitions"""

    def __init__(self):
        self.partition_probability = 0.01
        self.latency_ms = 10.0
        self.bandwidth_mbps = 1000.0

    def simulate_partition(self) -> bool:
        """Return True if network partition is simulated"""
        return random.random() < self.partition_probability

    def get_latency(self) -> float:
        """Get network latency in milliseconds"""
        # Gaussian jitter around the base latency, with rare large spikes.
        jittered = self.latency_ms + random.gauss(0, self.latency_ms * 0.2)
        if random.random() < 0.05:  # 5% chance of spike
            jittered *= random.uniform(2, 10)
        return max(1.0, jittered)

    def get_bandwidth(self) -> float:
        """Get available bandwidth in Mbps"""
        # Effective throughput = nominal capacity scaled by current
        # utilization and link conditions.
        utilization = random.uniform(0.3, 0.9)
        conditions = random.uniform(0.8, 1.2)
        return self.bandwidth_mbps * utilization * conditions
163
+
164
+
165
def generate_failure_scenario(config: Dict[str, Any]) -> Dict[str, Any]:
    """Generate a random failure scenario based on config"""
    scenario = {
        "type": random.choice(["crashloop", "oom", "node_failure", "cascade"]),
        "severity": random.uniform(0.3, 0.9),
        "duration": random.randint(30, 300),  # seconds
        "affected_components": []
    }

    # Per-type tuning knob, read from config with a sane default.
    kind = scenario["type"]
    if kind == "crashloop":
        scenario["failure_rate"] = config.get("crash_loop_failure_rate", 0.7)
    elif kind == "oom":
        scenario["failure_rate"] = config.get("oom_kill_failure_rate", 0.6)
    elif kind == "node_failure":
        scenario["failure_rate"] = config.get("node_failure_rate", 0.4)
    else:  # cascade
        scenario["probability"] = config.get("cascade_failure_probability", 0.5)

    return scenario
185
+
186
+
187
def apply_realistic_noise(value: float, noise_percent: float = 10.0) -> float:
    """Apply realistic noise to a value.

    Perturbs *value* with zero-mean Gaussian noise whose standard deviation
    is noise_percent percent of the value; the result is floored at zero.
    """
    sigma = value * (noise_percent / 100.0)
    return max(0, value + random.gauss(0, sigma))
server/validator.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Validator - Action validation"""
2
+
3
+ from typing import Dict, Any, Optional, Tuple
4
+
5
+
6
class Validator:
    """Validates actions before execution"""

    def __init__(self, world):
        self.world = world

    def validate(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate an action before execution

        Returns:
            (is_valid, error_message)
        """
        # Dispatch table: one validator per known action type.
        validators = {
            "scale": self._validate_scale,
            "delete_pod": self._validate_delete_pod,
            "patch": self._validate_patch,
            "rollout_restart": self._validate_rollout_restart,
            "set_hpa": self._validate_set_hpa,
            "drain_node": self._validate_drain_node,
            "describe": self._validate_describe,
        }
        action_type = action.get("action_type", "")
        validator = validators.get(action_type)
        if validator is None:
            return False, f"Unknown action type: {action_type}"
        return validator(action)

    def _deployment_exists(self, name: str) -> bool:
        # Membership test against the world's current deployments.
        return any(d.name == name for d in self.world.get_deployments())

    def _validate_scale(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        deployment = action.get("deployment", "")
        replicas = action.get("replicas", 0)

        if not deployment:
            return False, "Deployment name is required"
        if not 0 <= replicas <= 100:
            return False, "Replicas must be between 0 and 100"
        if not self._deployment_exists(deployment):
            return False, f"Deployment '{deployment}' does not exist"
        return True, None

    def _validate_delete_pod(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        pod_name = action.get("pod_name", "")

        if not pod_name:
            return False, "Pod name is required"
        if all(p.name != pod_name for p in self.world.get_pods()):
            return False, f"Pod '{pod_name}' does not exist"
        return True, None

    def _validate_patch(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        resource_type = action.get("resource_type", "")
        name = action.get("name", "")

        if not resource_type:
            return False, "Resource type is required"
        if not name:
            return False, "Resource name is required"

        # For known resource types, verify the target actually exists.
        # Unrecognized resource types fall through and are accepted as-is.
        lookups = {
            "deployment": (self.world.get_deployments, "Deployment"),
            "pod": (self.world.get_pods, "Pod"),
            "node": (self.world.get_nodes, "Node"),
            "service": (self.world.get_services, "Service"),
            "configmap": (self.world.get_configmaps, "ConfigMap"),
        }
        entry = lookups.get(resource_type)
        if entry is not None:
            getter, label = entry
            if all(resource.name != name for resource in getter()):
                return False, f"{label} '{name}' does not exist"
        return True, None

    def _validate_rollout_restart(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        deployment = action.get("deployment", "")

        if not deployment:
            return False, "Deployment name is required"
        if not self._deployment_exists(deployment):
            return False, f"Deployment '{deployment}' does not exist"
        return True, None

    def _validate_set_hpa(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        deployment = action.get("deployment", "")
        min_replicas = action.get("min_replicas", 0)
        max_replicas = action.get("max_replicas", 0)

        if not deployment:
            return False, "Deployment name is required"
        if not 1 <= min_replicas <= 50:
            return False, "Min replicas must be between 1 and 50"
        if not 1 <= max_replicas <= 100:
            return False, "Max replicas must be between 1 and 100"
        if min_replicas > max_replicas:
            return False, "Min replicas cannot be greater than max replicas"
        if not self._deployment_exists(deployment):
            return False, f"Deployment '{deployment}' does not exist"
        return True, None

    def _validate_drain_node(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        node_name = action.get("node_name", "")

        if not node_name:
            return False, "Node name is required"

        # Find the node once; reject missing or already-drained nodes.
        node = next((n for n in self.world.get_nodes() if n.name == node_name), None)
        if node is None:
            return False, f"Node '{node_name}' does not exist"
        if node.status == "SchedulingDisabled":
            return False, f"Node '{node_name}' is already drained"
        return True, None

    def _validate_describe(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        # Describe is read-only: only require that the target is named.
        if not action.get("resource_type", ""):
            return False, "Resource type is required"
        if not action.get("name", ""):
            return False, "Resource name is required"
        return True, None
server/worker.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """COEnv Worker - Episode loop manager"""
2
+
3
+ from typing import Dict, Any, Optional, List
4
+
5
+
6
class EpisodeResult:
    """Result of a single episode.

    Attributes:
        rewards: Per-step rewards in the order they were earned.
        steps: Number of steps actually executed.
        success: Whether the episode terminated successfully (executor reported done).
        info: Free-form diagnostics (e.g. "error", "final_reward").
    """

    def __init__(self, rewards: List[float], steps: int, success: bool, info: Dict[str, Any]):
        self.rewards = rewards
        self.steps = steps
        self.success = success
        self.info = info

    def __repr__(self) -> str:
        # Added for debuggability: episodes show up readably in logs/REPL.
        return (f"{type(self).__name__}(steps={self.steps}, "
                f"success={self.success}, total_reward={sum(self.rewards)})")
14
+
15
+
16
class Worker:
    """Manages the full lifecycle of a single episode."""

    def __init__(self, world, executor, validator):
        # Collaborators are injected so the worker is easy to test and swap.
        self.world = world
        self.executor = executor
        self.validator = validator

    def run_episode(self, task_id: str, task_objective: str, max_steps: int,
                    get_action_fn=None) -> EpisodeResult:
        """
        Run a single episode.

        Args:
            task_id: Task ID to run
            task_objective: Objective string for the task
            max_steps: Maximum steps for the episode
            get_action_fn: Callable mapping observation -> action; when None,
                a random action is sampled each step (testing only).

        Returns:
            EpisodeResult with rewards, steps, success, and info
        """
        # Reset world with task condition.
        # NOTE(review): the observation is never refreshed inside the loop, so
        # the agent always sees the reset-time observation — confirm whether
        # executor results are meant to carry an updated observation.
        observation = self.world.reset(task_objective)
        rewards = []
        steps = 0
        info = {}

        for step in range(1, max_steps + 1):
            steps = step

            # Get action from agent (or random for now).
            if get_action_fn:
                action = get_action_fn(observation)
            else:
                action = self._random_action()

            # Validate before executing; invalid actions are still executed
            # but their reward is overridden with a flat penalty.
            is_valid, error_msg = self.validator.validate(action)
            result = self.executor.execute(action, task_id, max_steps)
            if not is_valid:
                info["error"] = error_msg
                result.reward = -0.1  # Penalty for invalid action

            rewards.append(result.reward)

            if result.done:
                info["success"] = True
                info["final_reward"] = result.reward
                break
        else:
            # Loop exhausted without `done`: the episode ran out of steps.
            info["success"] = False
            info["final_reward"] = rewards[-1] if rewards else 0.0

        return EpisodeResult(
            rewards=rewards,
            steps=steps,
            success=info.get("success", False),
            info=info,
        )

    def _random_action(self) -> Dict[str, Any]:
        """Generate a random action for testing."""
        # Local import: `random` is only needed for this test-only helper.
        import random

        action_types = [
            {"action_type": "describe", "resource_type": "deployment", "name": "frontend"},
            {"action_type": "describe", "resource_type": "pod", "name": "frontend-abc123"},
            {"action_type": "scale", "deployment": "frontend", "replicas": random.randint(1, 5)},
            {"action_type": "scale", "deployment": "backend", "replicas": random.randint(1, 5)},
            {"action_type": "delete_pod", "pod_name": "frontend-xyz789"},
            {"action_type": "patch", "resource_type": "deployment", "name": "frontend",
             "patch": {"desired_replicas": random.randint(1, 5)}},
            {"action_type": "rollout_restart", "deployment": "frontend"},
            {"action_type": "set_hpa", "deployment": "backend",
             "min_replicas": 1, "max_replicas": 10, "cpu_target_percent": 80},
        ]

        return random.choice(action_types)
tests/test_environment.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test - Environment (from test_world.py)
3
+ """
4
+
5
+ import sys
6
+ import os
7
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'server'))
8
+
9
+ from COEnv_environment import World
10
+
11
+
12
def test_world_initialization():
    """Smoke-test that World boots and exposes the expected initial resources."""
    config = {
        "num_nodes": 2,
        "node_cpu_capacity": 4,
        "node_mem_capacity": 8192,
        "pod_cpu_request": 250,
        "pod_mem_request": 128,
        "pod_cpu_limit": 500,
        "pod_mem_limit": 256,
    }

    world = World(config)
    print("World initialized successfully")

    # Gather each resource collection once, in a fixed, report-friendly order.
    resources = {
        "Nodes": world.get_nodes(),
        "Pods": world.get_pods(),
        "Deployments": world.get_deployments(),
        "Services": world.get_services(),
    }
    for label, items in resources.items():
        print(f"{label}: {len(items)}")

    assert len(resources["Nodes"]) == 2
    assert len(resources["Pods"]) > 0
    assert len(resources["Deployments"]) > 0
    assert len(resources["Services"]) > 0

    print("All tests passed!")
43
+
44
+
45
# Allow running this file directly (outside a test runner) as a quick smoke test.
if __name__ == "__main__":
    test_world_initialization()