Spaces:
No application file
feat: Implement COEnv Executor and Graders for task management
Browse files- Added `executor.py` to handle action execution and manage task states.
- Created grading modules for different tasks: `grader_autoscaling.py`, `grader_incident.py`, and `grader_pod_recovery.py`.
- Introduced Pydantic models in `models.py` for structured data representation.
- Removed outdated `requirements.txt`.
- Developed task implementations for autoscaling, incident handling, and pod recovery in `tasks` directory.
- Added utility functions in `utils.py` for realistic simulation of network and resource conditions.
- Implemented action validation in `validator.py` to ensure action integrity before execution.
- Created `worker.py` to manage episode execution and integrate with the executor and validator.
- Added tests for environment initialization in `test_environment.py`.
- .opencode/agents/executor-agent.ts +11 -0
- .opencode/agents/memory-agent.ts +20 -0
- .opencode/agents/planner-agent.ts +20 -0
- .opencode/agents/web-agent.ts +10 -0
- COEnv_Project_Documentation.md +704 -0
- config.json +14 -0
- inference.py +86 -0
- opencode.json +18 -0
- openenv.yaml +20 -7
- requirements.txt +4 -0
- server/COEnv_environment.py +424 -98
- server/Dockerfile +0 -80
- server/__init__.py +22 -8
- server/actions/__init__.py +13 -0
- server/actions/delete_pod_action.py +10 -0
- server/actions/describe_action.py +11 -0
- server/actions/drain_action.py +10 -0
- server/actions/hpa_action.py +13 -0
- server/actions/patch_action.py +12 -0
- server/actions/rollout_action.py +10 -0
- server/actions/scale_action.py +11 -0
- server/app.py +264 -65
- server/conditions/__init__.py +15 -0
- server/conditions/cascade_failure.py +85 -0
- server/conditions/crash_loop.py +62 -0
- server/conditions/node_failure.py +68 -0
- server/conditions/oom_kill.py +74 -0
- server/executor.py +144 -0
- server/graders/__init__.py +9 -0
- server/graders/grader_autoscaling.py +40 -0
- server/graders/grader_incident.py +50 -0
- server/graders/grader_pod_recovery.py +31 -0
- server/models.py +151 -0
- server/requirements.txt +0 -6
- server/tasks/__init__.py +9 -0
- server/tasks/task_autoscaling.py +47 -0
- server/tasks/task_incident.py +48 -0
- server/tasks/task_pod_recovery.py +45 -0
- server/utils.py +190 -0
- server/validator.py +167 -0
- server/worker.py +101 -0
- tests/test_environment.py +46 -0
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Agent } from "@opencode-ai/plugin"
|
| 2 |
+
|
| 3 |
+
export const ExecutorAgent: Agent = {
|
| 4 |
+
name: "executor",
|
| 5 |
+
description: "Handles general tasks",
|
| 6 |
+
|
| 7 |
+
async run(ctx) {
|
| 8 |
+
return `Executing task: ${ctx.input}`
|
| 9 |
+
}
|
| 10 |
+
}
|
| 11 |
+
|
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Agent } from "@opencode-ai/plugin"
|
| 2 |
+
|
| 3 |
+
export const MemoryAgent: Agent = {
|
| 4 |
+
name: "memory",
|
| 5 |
+
description: "Handles memory storage and retrieval",
|
| 6 |
+
|
| 7 |
+
async run(ctx) {
|
| 8 |
+
const input = ctx.input
|
| 9 |
+
|
| 10 |
+
if (input.includes("remember")) {
|
| 11 |
+
return "Saved to memory"
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
if (input.includes("recall")) {
|
| 15 |
+
return "Here is your memory"
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
return null
|
| 19 |
+
}
|
| 20 |
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Agent } from "@opencode-ai/plugin"
|
| 2 |
+
|
| 3 |
+
export const PlannerAgent: Agent = {
|
| 4 |
+
name: "planner",
|
| 5 |
+
description: "Decides which agent to call",
|
| 6 |
+
|
| 7 |
+
async run(ctx) {
|
| 8 |
+
const input = ctx.input
|
| 9 |
+
|
| 10 |
+
if (input.includes("remember")) {
|
| 11 |
+
return ctx.call("memory-agent", input)
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
if (input.includes("search")) {
|
| 15 |
+
return ctx.call("web-agent", input)
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
return ctx.call("executor-agent", input)
|
| 19 |
+
}
|
| 20 |
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Agent } from "@opencode-ai/plugin"
|
| 2 |
+
|
| 3 |
+
export const WebAgent: Agent = {
|
| 4 |
+
name: "web",
|
| 5 |
+
description: "Fetches web data",
|
| 6 |
+
|
| 7 |
+
async run(ctx) {
|
| 8 |
+
return `Searching web for: ${ctx.input}`
|
| 9 |
+
}
|
| 10 |
+
}
|
|
@@ -0,0 +1,704 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# COEnv — Project Documentation
|
| 2 |
+
### Meta × Hugging Face OpenEnv RL Hackathon
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## Table of Contents
|
| 7 |
+
|
| 8 |
+
1. [What Is This Project?](#1-what-is-this-project)
|
| 9 |
+
2. [Why Kubernetes?](#2-why-kubernetes)
|
| 10 |
+
3. [How It Works — The Big Picture](#3-how-it-works--the-big-picture)
|
| 11 |
+
4. [The Three Layers Explained](#4-the-three-layers-explained)
|
| 12 |
+
5. [Team Ownership](#5-team-ownership)
|
| 13 |
+
6. [Full Project Directory Structure](#6-full-project-directory-structure)
|
| 14 |
+
7. [The Three Tasks (Easy → Medium → Hard)](#7-the-three-tasks-easy--medium--hard)
|
| 15 |
+
8. [Reward & Grading Design](#8-reward--grading-design)
|
| 16 |
+
9. [The Complete Episode Flow](#9-the-complete-episode-flow)
|
| 17 |
+
10. [OpenEnv Spec Compliance Checklist](#10-openenv-spec-compliance-checklist)
|
| 18 |
+
11. [Submission Checklist](#11-submission-checklist)
|
| 19 |
+
12. [Key Technical Decisions](#12-key-technical-decisions)
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## 1. What Is This Project?
|
| 24 |
+
|
| 25 |
+
**COEnv** is a Reinforcement Learning environment that simulates real-world Kubernetes cluster operations. An AI agent (LLM) is placed inside a broken or degraded Kubernetes cluster and must figure out the right sequence of operations to fix it — just like a real Site Reliability Engineer (SRE) would.
|
| 26 |
+
|
| 27 |
+
This is built for the **Meta × Hugging Face OpenEnv RL Hackathon**, which requires:
|
| 28 |
+
- A real-world task simulation (not games or toys)
|
| 29 |
+
- Full OpenEnv interface implementation (`step()`, `reset()`, `state()`)
|
| 30 |
+
- At least 3 tasks with programmatic graders (easy → medium → hard)
|
| 31 |
+
- A meaningful reward function that gives partial credit throughout the episode
|
| 32 |
+
- A working `inference.py` that runs an LLM agent and logs structured output
|
| 33 |
+
- Deployment on Hugging Face Spaces with a working Dockerfile
|
| 34 |
+
|
| 35 |
+
**In simple terms:** We fake a Kubernetes cluster in Python memory, break it in specific ways, and then let an LLM try to fix it step by step — scoring it on how well it does.
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
## 2. Why Kubernetes?
|
| 40 |
+
|
| 41 |
+
Kubernetes (k8s) is the industry-standard container orchestration system used by virtually every tech company running production software. Managing it is genuinely difficult and is a daily job for SREs and DevOps engineers worldwide.
|
| 42 |
+
|
| 43 |
+
**Why it's a perfect RL environment:**
|
| 44 |
+
|
| 45 |
+
| RL Concept | Kubernetes Equivalent |
|
| 46 |
+
|---|---|
|
| 47 |
+
| State | Cluster state (pod statuses, node health, resource usage) |
|
| 48 |
+
| Action | kubectl commands (scale, patch, delete, restart) |
|
| 49 |
+
| Reward | How close the cluster is to a healthy target state |
|
| 50 |
+
| Episode | One incident recovery scenario |
|
| 51 |
+
| Done | All SLOs restored / all pods healthy |
|
| 52 |
+
|
| 53 |
+
**Why it's novel for OpenEnv:** None of Meta's reference environments (calendar, REPL, browser, CARLA, reasoning gym) touch infrastructure operations. This fills a real gap.
|
| 54 |
+
|
| 55 |
+
**Why it's practical:** Companies would immediately use an environment like this to train or evaluate agents that assist SREs — the real-world utility score (30% of judging) is very high.
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
## 3. How It Works — The Big Picture
|
| 60 |
+
|
| 61 |
+
Think of the project as three concentric layers:
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
┌─────────────────────────────────────────────────────────┐
|
| 65 |
+
│ LAYER 1 — RL ENVIRONMENT │
|
| 66 |
+
│ inference.py ←→ main.py (FastAPI) ←→ tasks/graders │
|
| 67 |
+
│ (Sandeep) │
|
| 68 |
+
├─────────────────────────────────────────────────────────┤
|
| 69 |
+
│ LAYER 2 — SIMULATION ENGINE │
|
| 70 |
+
│ world.py ←→ models.py ←→ conditions/ │
|
| 71 |
+
│ (You) │
|
| 72 |
+
├─────────────────────────────────────────────────────────┤
|
| 73 |
+
│ LAYER 3 — ACTION SPACE │
|
| 74 |
+
│ worker.py ←→ executor.py ←→ actions/ ←→ validator│
|
| 75 |
+
│ (Third Person) │
|
| 76 |
+
└─────────────────────────────────────────────────────────┘
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
**Layer 1 (Sandeep)** is what the judges see — the API endpoints, the inference script, the task definitions, the graders, the README.
|
| 80 |
+
|
| 81 |
+
**Layer 2 (You)** is the fake Kubernetes cluster. It holds the state of the cluster, knows how pods transition between statuses, and can inject failures. Everything sits in Python dictionaries — no real Kubernetes cluster runs.
|
| 82 |
+
|
| 83 |
+
**Layer 3 (Third Person)** is the action space — the specific operations the LLM agent is allowed to perform, and the validation/execution bridge that translates those actions into state changes in the simulator.
|
| 84 |
+
|
| 85 |
+
---
|
| 86 |
+
|
| 87 |
+
## 4. The Three Layers Explained
|
| 88 |
+
|
| 89 |
+
### Layer 1 — RL Environment (Sandeep)
|
| 90 |
+
|
| 91 |
+
This layer is the **public contract** of the project. It's what OpenEnv's `validate` command checks, what the judges' scripts call, and what the LLM agent talks to.
|
| 92 |
+
|
| 93 |
+
**`main.py` — FastAPI application**
|
| 94 |
+
|
| 95 |
+
The central API server. It exposes exactly three mandatory endpoints:
|
| 96 |
+
|
| 97 |
+
- `POST /reset` — Starts a new episode. Sets up a broken cluster using one of the condition injectors. Returns the initial `ClusterObservation` (what the agent sees first).
|
| 98 |
+
- `POST /step` — Receives an action from the agent. Validates it, executes it on the simulated cluster, advances time by one tick, and returns the new observation + reward + done flag + info.
|
| 99 |
+
- `GET /state` — Returns the full current cluster state. Used for debugging and grading.
|
| 100 |
+
|
| 101 |
+
**`inference.py` — LLM agent runner**
|
| 102 |
+
|
| 103 |
+
This is the script the hackathon validators actually run. It:
|
| 104 |
+
1. Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from environment variables
|
| 105 |
+
2. Calls `/reset` to start an episode
|
| 106 |
+
3. Feeds the observation to the LLM using the OpenAI client
|
| 107 |
+
4. Parses the LLM's response as a structured action
|
| 108 |
+
5. Calls `/step` with that action
|
| 109 |
+
6. Prints structured stdout logs after every step:
|
| 110 |
+
```
|
| 111 |
+
[START] task=pod-recovery env=coenv model=Qwen3-VL-30B
|
| 112 |
+
[STEP] step=1 action=delete_pod('frontend-7d9f-xkp2') reward=0.20 done=false error=null
|
| 113 |
+
[STEP] step=2 action=scale('frontend',3) reward=0.60 done=false error=null
|
| 114 |
+
[END] success=true steps=2 rewards=0.20,0.60
|
| 115 |
+
```
|
| 116 |
+
7. Repeats until `done=true` or `max_steps` is reached
|
| 117 |
+
|
| 118 |
+
**`openenv.yaml` — Spec metadata**
|
| 119 |
+
|
| 120 |
+
Required for `openenv validate` to pass. Contains:
|
| 121 |
+
- Environment name, version, description
|
| 122 |
+
- List of task IDs with difficulty labels
|
| 123 |
+
- References to the action schema and observation schema
|
| 124 |
+
|
| 125 |
+
**`classes/tasks/` — Task definitions**
|
| 126 |
+
|
| 127 |
+
Three Python files, each defining one task:
|
| 128 |
+
- What the broken state looks like (which condition to inject)
|
| 129 |
+
- What the agent's objective is (in plain English, passed to the LLM as a prompt)
|
| 130 |
+
- What counts as success
|
| 131 |
+
- Maximum number of steps allowed
|
| 132 |
+
|
| 133 |
+
**`classes/graders/` — Reward graders**
|
| 134 |
+
|
| 135 |
+
Three Python files, each implementing a `grade(world_state) -> float` function. Graders must be fully deterministic — same world state always returns same score. They implement partial credit: a grader doesn't just say "fixed or not fixed" but scores partial progress (e.g., 2 out of 5 pods fixed = 0.4).
|
| 136 |
+
|
| 137 |
+
**`Dockerfile`**
|
| 138 |
+
|
| 139 |
+
Single-stage Python container. Installs `requirements.txt`, copies the project, exposes port 8000, runs `uvicorn main:app`. Must build and run cleanly — this is a hard pass/fail gate.
|
| 140 |
+
|
| 141 |
+
**`README.md`**
|
| 142 |
+
|
| 143 |
+
Mandatory documentation. Must include: environment overview, motivation, action space definition, observation space definition, task descriptions with difficulty labels, setup instructions, baseline scores table.
|
| 144 |
+
|
| 145 |
+
---
|
| 146 |
+
|
| 147 |
+
### Layer 2 — Simulation Engine (You)
|
| 148 |
+
|
| 149 |
+
This is the **most important layer technically**. It's what makes the environment believable. Since we cannot run a real Kubernetes cluster inside a 2 vCPU / 8 GB HF Space container, the entire cluster is simulated as an in-memory Python object.
|
| 150 |
+
|
| 151 |
+
**`classes/world.py` — The cluster simulator**
|
| 152 |
+
|
| 153 |
+
This is the brain of the project. It maintains the complete cluster state as a Python dictionary, structured like a real Kubernetes API response:
|
| 154 |
+
|
| 155 |
+
```python
|
| 156 |
+
cluster_state = {
|
| 157 |
+
"nodes": [
|
| 158 |
+
{"name": "node-1", "status": "Ready", "cpu_capacity": 4, "mem_capacity": 8192},
|
| 159 |
+
{"name": "node-2", "status": "NotReady", "cpu_capacity": 4, "mem_capacity": 8192}
|
| 160 |
+
],
|
| 161 |
+
"deployments": [
|
| 162 |
+
{"name": "frontend", "desired_replicas": 3, "available_replicas": 1, "image": "nginx:1.21"}
|
| 163 |
+
],
|
| 164 |
+
"pods": [
|
| 165 |
+
{"name": "frontend-7d9f-xkp2", "status": "CrashLoopBackOff", "node": "node-1", "restarts": 7},
|
| 166 |
+
{"name": "frontend-7d9f-ab3c", "status": "Running", "node": "node-1", "restarts": 0},
|
| 167 |
+
{"name": "frontend-7d9f-mn8x", "status": "Pending", "node": None, "restarts": 0}
|
| 168 |
+
],
|
| 169 |
+
"services": [...],
|
| 170 |
+
"configmaps": [...],
|
| 171 |
+
"hpa": [...]
|
| 172 |
+
}
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
Key methods:
|
| 176 |
+
- `reset(condition)` — Wipes state, injects a failure condition, returns initial observation
|
| 177 |
+
- `get_pods(namespace, selector)` — Returns filtered pod list (mimics `kubectl get pods`)
|
| 178 |
+
- `apply_patch(resource_type, name, patch)` — Applies a patch to a resource
|
| 179 |
+
- `scale(deployment_name, replicas)` — Changes replica count
|
| 180 |
+
- `delete_pod(pod_name)` — Removes a pod (it gets recreated by the deployment controller on next tick)
|
| 181 |
+
- `tick()` — Advances simulated time by one step. Pods in `CrashLoopBackOff` increment their restart counter. Pending pods on ready nodes eventually transition to `Running`. Dead nodes stay dead unless drained.
|
| 182 |
+
- `get_observation()` — Serialises the current state into a `ClusterObservation` Pydantic model
|
| 183 |
+
|
| 184 |
+
**`classes/models.py` — Pydantic typed models**
|
| 185 |
+
|
| 186 |
+
All data structures are defined here. This is mandatory for OpenEnv spec compliance — typed models enforce the action/observation contract.
|
| 187 |
+
|
| 188 |
+
```python
|
| 189 |
+
class PodStatus(BaseModel):
|
| 190 |
+
name: str
|
| 191 |
+
status: Literal["Running", "Pending", "CrashLoopBackOff", "OOMKilled", "Terminating", "Unknown"]
|
| 192 |
+
node: Optional[str]
|
| 193 |
+
restarts: int
|
| 194 |
+
cpu_usage: float
|
| 195 |
+
mem_usage: float
|
| 196 |
+
|
| 197 |
+
class NodeStatus(BaseModel):
|
| 198 |
+
name: str
|
| 199 |
+
status: Literal["Ready", "NotReady", "SchedulingDisabled"]
|
| 200 |
+
cpu_capacity: float
|
| 201 |
+
mem_capacity: float
|
| 202 |
+
cpu_usage: float
|
| 203 |
+
mem_usage: float
|
| 204 |
+
|
| 205 |
+
class ClusterObservation(BaseModel):
|
| 206 |
+
nodes: List[NodeStatus]
|
| 207 |
+
pods: List[PodStatus]
|
| 208 |
+
deployments: List[DeploymentStatus]
|
| 209 |
+
services: List[ServiceStatus]
|
| 210 |
+
events: List[ClusterEvent] # recent k8s events (error messages, warnings)
|
| 211 |
+
step: int
|
| 212 |
+
objective: str # plain English description of what to fix
|
| 213 |
+
|
| 214 |
+
class RewardSignal(BaseModel):
|
| 215 |
+
reward: float # 0.0 to 1.0 incremental reward this step
|
| 216 |
+
cumulative: float # total reward so far
|
| 217 |
+
done: bool
|
| 218 |
+
info: Dict[str, Any] # breakdown: why this reward was given
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
**`classes/conditions/` — Failure injectors**
|
| 222 |
+
|
| 223 |
+
Each condition is a Python class with a single `inject(cluster_state) -> cluster_state` method that takes a healthy cluster and returns a broken one. This is how each task starts with a specific failure scenario:
|
| 224 |
+
|
| 225 |
+
- `crash_loop.py` — Sets 3 pods to `CrashLoopBackOff` with high restart counts. Simulates a bad image tag or missing environment variable.
|
| 226 |
+
- `oom_kill.py` — Sets pods to `OOMKilled`. Memory limits are set too low in the deployment spec. Pods keep restarting.
|
| 227 |
+
- `node_failure.py` — Sets one node to `NotReady`. All pods on that node go to `Unknown`. New pods are `Pending` (no space to schedule).
|
| 228 |
+
- `cascade_failure.py` — Combines multiple failures: one OOMKilled service causes downstream 503s in two dependent services, creating a cascading failure across 3 deployments.
|
| 229 |
+
|
| 230 |
+
**`classes/utils.py` — Probability and simulation helpers**
|
| 231 |
+
|
| 232 |
+
Utility functions that make the simulation feel realistic:
|
| 233 |
+
- `sample_cpu_usage(base_load, noise_factor)` — Returns a slightly randomised CPU % (real clusters are never exactly at baseline)
|
| 234 |
+
- `sample_latency(healthy_latency, degradation_factor)` — Simulates p95 request latency under load
|
| 235 |
+
- `should_pod_recover(restarts, backoff_seconds)` — Determines if a `CrashLoopBackOff` pod would naturally recover (it usually won't — that's the point)
|
| 236 |
+
- `generate_cluster_events(pod_list)` — Creates realistic k8s event messages like `"Back-off restarting failed container"` or `"OOMKilled: container exceeded memory limit"`
|
| 237 |
+
|
| 238 |
+
**`config.json` — Cluster defaults**
|
| 239 |
+
|
| 240 |
+
Single source of truth for all simulation parameters:
|
| 241 |
+
|
| 242 |
+
```json
|
| 243 |
+
{
|
| 244 |
+
"cluster": {
|
| 245 |
+
"num_nodes": 3,
|
| 246 |
+
"cpu_per_node": 4,
|
| 247 |
+
"mem_per_node_gb": 8
|
| 248 |
+
},
|
| 249 |
+
"tasks": {
|
| 250 |
+
"pod_recovery": { "max_steps": 15, "success_threshold": 0.9 },
|
| 251 |
+
"autoscaling": { "max_steps": 20, "success_threshold": 0.85 },
|
| 252 |
+
"incident": { "max_steps": 30, "success_threshold": 0.80 }
|
| 253 |
+
},
|
| 254 |
+
"simulation": {
|
| 255 |
+
"tick_interval_seconds": 30,
|
| 256 |
+
"crash_backoff_max_seconds": 300,
|
| 257 |
+
"hpa_cooldown_seconds": 180
|
| 258 |
+
}
|
| 259 |
+
}
|
| 260 |
+
```
|
| 261 |
+
|
| 262 |
+
---
|
| 263 |
+
|
| 264 |
+
### Layer 3 — Action Space & Workers (Third Person)
|
| 265 |
+
|
| 266 |
+
This layer defines what the LLM is allowed to do, makes sure it's valid, and executes it against the simulator.
|
| 267 |
+
|
| 268 |
+
**`classes/actions/` — Typed action definitions**
|
| 269 |
+
|
| 270 |
+
Each action is a Pydantic model. The LLM must output one of these (Sandeep's inference.py prompts it to respond in JSON matching one of these schemas):
|
| 271 |
+
|
| 272 |
+
```python
|
| 273 |
+
class ScaleAction(BaseModel):
|
| 274 |
+
action_type: Literal["scale"]
|
| 275 |
+
deployment: str # e.g. "frontend"
|
| 276 |
+
replicas: int # e.g. 3
|
| 277 |
+
|
| 278 |
+
class DeletePodAction(BaseModel):
|
| 279 |
+
action_type: Literal["delete_pod"]
|
| 280 |
+
pod_name: str # e.g. "frontend-7d9f-xkp2"
|
| 281 |
+
|
| 282 |
+
class PatchAction(BaseModel):
|
| 283 |
+
action_type: Literal["patch"]
|
| 284 |
+
resource_type: str # "deployment" | "configmap" | "service"
|
| 285 |
+
name: str
|
| 286 |
+
patch: Dict[str, Any] # the fields to update
|
| 287 |
+
|
| 288 |
+
class RolloutRestartAction(BaseModel):
|
| 289 |
+
action_type: Literal["rollout_restart"]
|
| 290 |
+
deployment: str
|
| 291 |
+
|
| 292 |
+
class SetHPAAction(BaseModel):
|
| 293 |
+
action_type: Literal["set_hpa"]
|
| 294 |
+
deployment: str
|
| 295 |
+
min_replicas: int
|
| 296 |
+
max_replicas: int
|
| 297 |
+
cpu_target_percent: int
|
| 298 |
+
|
| 299 |
+
class DrainNodeAction(BaseModel):
|
| 300 |
+
action_type: Literal["drain_node"]
|
| 301 |
+
node_name: str
|
| 302 |
+
|
| 303 |
+
class DescribeAction(BaseModel):
|
| 304 |
+
action_type: Literal["describe"]
|
| 305 |
+
resource_type: str
|
| 306 |
+
name: str # "investigation" action — no state change, returns detail
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
**`classes/validator.py` — Action validation**
|
| 310 |
+
|
| 311 |
+
Before any action touches the world state, the validator checks it:
|
| 312 |
+
- Does the target resource exist? (Can't delete a pod that doesn't exist)
|
| 313 |
+
- Is the scale value sane? (Can't scale to 0 or to 1000 replicas)
|
| 314 |
+
- Is the node already drained? (Can't drain twice)
|
| 315 |
+
- Is the deployment name a real deployment?
|
| 316 |
+
|
| 317 |
+
If validation fails, it returns an error string. This flows directly into the `[STEP] error=` field in stdout logs. The step still counts against the agent's limit — bad actions are penalised by wasting steps.
|
| 318 |
+
|
| 319 |
+
**`classes/executor.py` — Action execution bridge**
|
| 320 |
+
|
| 321 |
+
Maps each validated action type to the correct `world.py` method call:
|
| 322 |
+
|
| 323 |
+
```python
|
| 324 |
+
def execute(action: KubeAction, world: World) -> ExecutionResult:
|
| 325 |
+
if action.action_type == "scale":
|
| 326 |
+
world.scale(action.deployment, action.replicas)
|
| 327 |
+
elif action.action_type == "delete_pod":
|
| 328 |
+
world.delete_pod(action.pod_name)
|
| 329 |
+
elif action.action_type == "rollout_restart":
|
| 330 |
+
world.rollout_restart(action.deployment)
|
| 331 |
+
...
|
| 332 |
+
world.tick() # always advance time after an action
|
| 333 |
+
return ExecutionResult(observation=world.get_observation(), ...)
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
**`classes/worker.py` — Agent episode loop**
|
| 337 |
+
|
| 338 |
+
Manages the full lifecycle of a single episode. Sandeep's `inference.py` calls this:
|
| 339 |
+
|
| 340 |
+
```python
|
| 341 |
+
class Worker:
|
| 342 |
+
def run_episode(self, task_id, world, max_steps) -> EpisodeResult:
|
| 343 |
+
obs = world.reset(task=task_id)
|
| 344 |
+
rewards = []
|
| 345 |
+
for step in range(1, max_steps + 1):
|
| 346 |
+
action = self.get_action(obs) # calls LLM
|
| 347 |
+
result = executor.execute(action, world)
|
| 348 |
+
rewards.append(result.reward)
|
| 349 |
+
if result.done:
|
| 350 |
+
break
|
| 351 |
+
return EpisodeResult(rewards=rewards, steps=step, success=result.done)
|
| 352 |
+
```
|
| 353 |
+
|
| 354 |
+
---
|
| 355 |
+
|
| 356 |
+
## 5. Team Ownership
|
| 357 |
+
|
| 358 |
+
| Module | Owner | Why It's Their Responsibility |
|
| 359 |
+
|---|---|---|
|
| 360 |
+
| `main.py` | Sandeep | He owns the public API contract |
|
| 361 |
+
| `inference.py` | Sandeep | He owns the hackathon submission script |
|
| 362 |
+
| `openenv.yaml` | Sandeep | He owns spec compliance |
|
| 363 |
+
| `Dockerfile` | Sandeep | He owns deployment |
|
| 364 |
+
| `README.md` | Sandeep | He owns documentation |
|
| 365 |
+
| `classes/tasks/` | Sandeep | He defines what success looks like |
|
| 366 |
+
| `classes/graders/` | Sandeep | He owns the scoring logic |
|
| 367 |
+
| `classes/world.py` | You | You own the cluster simulator |
|
| 368 |
+
| `classes/models.py` | You | You own all typed data models |
|
| 369 |
+
| `classes/utils.py` | You | You own simulation helpers |
|
| 370 |
+
| `classes/conditions/` | You | You own failure injection |
|
| 371 |
+
| `config.json` | You | You own all parameters |
|
| 372 |
+
| `classes/worker.py` | Third person | They own the episode loop |
|
| 373 |
+
| `classes/actions/` | Third person | They own the action space |
|
| 374 |
+
| `classes/executor.py` | Third person | They own action execution |
|
| 375 |
+
| `classes/validator.py` | Third person | They own action validation |
|
| 376 |
+
| `tests/` | All three | Each writes tests for their own module |
|
| 377 |
+
|
| 378 |
+
---
|
| 379 |
+
|
| 380 |
+
## 6. Full Project Directory Structure
|
| 381 |
+
|
| 382 |
+
```text
|
| 383 |
+
COEnv/
|
| 384 |
+
├── .dockerignore # Docker build exclusions
|
| 385 |
+
├── __init__.py # Module exports
|
| 386 |
+
├── README.md # Project documentation
|
| 387 |
+
├── openenv.yaml # OpenEnv manifest
|
| 388 |
+
├── pyproject.toml # Project metadata and dependencies
|
| 389 |
+
├── uv.lock # Locked dependencies
|
| 390 |
+
├── client.py # CoenvEnv client / inference-side runner
|
| 391 |
+
├── models.py # Shared action and observation models
|
| 392 |
+
├── config.json # Cluster defaults and simulation params
|
| 393 |
+
├── mkdocs.yml # Docs site configuration
|
| 394 |
+
├── tests/ # End-to-end and unit tests
|
| 395 |
+
│ ├── test_environment.py # From test_world.py
|
| 396 |
+
│ ├── test_conditions.py # From test_conditions.py
|
| 397 |
+
│ ├── test_models.py # From test_models.py
|
| 398 |
+
│ ├── test_actions.py # From test_actions.py
|
| 399 |
+
│ ├── test_executor.py # From test_executor.py
|
| 400 |
+
│ ├── test_graders.py # From test_graders.py
|
| 401 |
+
│ ├── test_tasks.py # From test_tasks.py
|
| 402 |
+
│ └── test_integration.py # End-to-end reset→step→state flow
|
| 403 |
+
└── server/
|
| 404 |
+
├── __init__.py # Server module exports
|
| 405 |
+
├── COEnv_environment.py # Core environment logic
|
| 406 |
+
├── app.py # FastAPI app exposing /reset /step /state
|
| 407 |
+
├── Dockerfile # Container image definition
|
| 408 |
+
├── utils.py # Simulation helpers
|
| 409 |
+
├── validator.py # Action validation
|
| 410 |
+
├── executor.py # Action execution bridge
|
| 411 |
+
├── worker.py # Episode loop manager
|
| 412 |
+
├── tasks/
|
| 413 |
+
│ ├── __init__.py
|
| 414 |
+
│ ├── task_pod_recovery.py
|
| 415 |
+
│ ├── task_autoscaling.py
|
| 416 |
+
│ └── task_incident.py
|
| 417 |
+
├── graders/
|
| 418 |
+
│ ├── __init__.py
|
| 419 |
+
│ ├── grader_pod_recovery.py
|
| 420 |
+
│ ├── grader_autoscaling.py
|
| 421 |
+
│ └── grader_incident.py
|
| 422 |
+
├── conditions/
|
| 423 |
+
│ ├── __init__.py
|
| 424 |
+
│ ├── crash_loop.py
|
| 425 |
+
│ ├── oom_kill.py
|
| 426 |
+
│ ├── node_failure.py
|
| 427 |
+
│ └── cascade_failure.py
|
| 428 |
+
└── actions/
|
| 429 |
+
├── __init__.py
|
| 430 |
+
├── scale_action.py
|
| 431 |
+
├── patch_action.py
|
| 432 |
+
├── delete_pod_action.py
|
| 433 |
+
├── rollout_action.py
|
| 434 |
+
├── hpa_action.py
|
| 435 |
+
├── drain_action.py
|
| 436 |
+
└── describe_action.py
|
| 437 |
+
```
|
| 438 |
+
|
| 439 |
+
---
|
| 440 |
+
|
| 441 |
+
## 7. The Three Tasks (Easy → Medium → Hard)
|
| 442 |
+
|
| 443 |
+
### Task 1 — Pod Recovery (Easy)
|
| 444 |
+
|
| 445 |
+
**What's broken:** A frontend deployment has 3 pods stuck in `CrashLoopBackOff`. The restart count is climbing. The root cause is a wrong environment variable in the deployment spec pointing to a database host that doesn't exist.
|
| 446 |
+
|
| 447 |
+
**What the agent must do:**
|
| 448 |
+
1. Observe the broken pods and read the k8s events (which mention a connection refused error)
|
| 449 |
+
2. Identify the bad `DB_HOST` environment variable using a `describe` or `patch` inspect action
|
| 450 |
+
3. Patch the deployment with the correct `DB_HOST` value
|
| 451 |
+
4. Optionally delete the crash-looping pods to speed up recovery (they'll get recreated with the new config)
|
| 452 |
+
5. Verify all 3 pods reach `Running` state
|
| 453 |
+
|
| 454 |
+
**Objective string shown to agent:** *"The frontend deployment is crash-looping. Diagnose and fix the root cause so that all pods reach Running state."*
|
| 455 |
+
|
| 456 |
+
**Max steps:** 15
|
| 457 |
+
**Success threshold:** All 3 pods in `Running` state (score ≥ 0.9)
|
| 458 |
+
|
| 459 |
+
**Partial rewards:**
|
| 460 |
+
- +0.1 for each pod that stops crash-looping
|
| 461 |
+
- +0.2 for correctly patching the environment variable
|
| 462 |
+
- +0.3 bonus for all pods Running within 10 steps
|
| 463 |
+
|
| 464 |
+
---
|
| 465 |
+
|
| 466 |
+
### Task 2 — HPA Autoscaling Under Traffic Spike (Medium)
|
| 467 |
+
|
| 468 |
+
**What's broken:** The cluster is healthy but receiving 10× normal traffic. The deployment has no HPA configured, is running on fixed 2 replicas, and is already at 95% CPU. Request latency is climbing past the SLO threshold.
|
| 469 |
+
|
| 470 |
+
**What the agent must do:**
|
| 471 |
+
1. Observe high CPU usage and rising latency in the observation
|
| 472 |
+
2. Immediately scale up the deployment to handle current load
|
| 473 |
+
3. Configure a HorizontalPodAutoscaler (HPA) with appropriate min/max replicas and CPU target
|
| 474 |
+
4. Set correct CPU resource requests/limits on the deployment so HPA has a baseline to work with
|
| 475 |
+
5. Verify that latency drops back below the SLO threshold
|
| 476 |
+
|
| 477 |
+
**Objective string shown to agent:** *"Traffic has spiked 10×. The api-server deployment is overloaded. Configure autoscaling and ensure p95 latency stays below 500ms."*
|
| 478 |
+
|
| 479 |
+
**Max steps:** 20
|
| 480 |
+
**Success threshold:** p95 latency < 500ms, HPA configured, replicas ≥ 4 (score ≥ 0.85)
|
| 481 |
+
|
| 482 |
+
**Partial rewards:**
|
| 483 |
+
- +0.15 for scaling up replicas immediately (within 3 steps)
|
| 484 |
+
- +0.20 for configuring HPA correctly
|
| 485 |
+
- +0.25 for latency dropping below 1000ms
|
| 486 |
+
- +0.30 for latency dropping below 500ms (SLO met)
|
| 487 |
+
- -0.10 penalty for scaling beyond 12 replicas unnecessarily (resource waste)
|
| 488 |
+
|
| 489 |
+
---
|
| 490 |
+
|
| 491 |
+
### Task 3 — Multi-Service Cascading Incident (Hard)
|
| 492 |
+
|
| 493 |
+
**What's broken:** The `auth-service` deployment has pods getting OOMKilled because memory limits are set 4× too low relative to actual usage. This causes the `api-gateway` to fail authentication checks and return 503s. Downstream, the `data-processor` service is also throwing errors because it depends on the gateway. Three services are degraded simultaneously.
|
| 494 |
+
|
| 495 |
+
**What the agent must do:**
|
| 496 |
+
1. Identify the blast radius — which services are affected and why
|
| 497 |
+
2. Investigate `auth-service` to find the OOMKill root cause (memory limits too low)
|
| 498 |
+
3. Patch `auth-service` deployment with correct memory limits
|
| 499 |
+
4. Rollout restart `auth-service` so new pods come up with correct limits
|
| 500 |
+
5. Drain the partially-failed node where most OOMKilled pods were running, to force clean rescheduling
|
| 501 |
+
6. Verify `api-gateway` 503 errors stop (automatically once auth recovers)
|
| 502 |
+
7. Verify `data-processor` error rate drops (automatically once gateway recovers)
|
| 503 |
+
8. Confirm all three services are fully healthy
|
| 504 |
+
|
| 505 |
+
**Objective string shown to agent:** *"A cascading incident has degraded auth-service, api-gateway, and data-processor. Identify the root cause and restore all three services to healthy state without data loss."*
|
| 506 |
+
|
| 507 |
+
**Max steps:** 30
|
| 508 |
+
**Success threshold:** All 3 services healthy, error rate < 0.1% (score ≥ 0.80)
|
| 509 |
+
|
| 510 |
+
**Partial rewards:**
|
| 511 |
+
- +0.10 for correctly identifying `auth-service` as the root cause (within 5 steps)
|
| 512 |
+
- +0.15 for patching memory limits correctly
|
| 513 |
+
- +0.15 for auth-service pods reaching Running
|
| 514 |
+
- +0.20 for api-gateway 503s stopping
|
| 515 |
+
- +0.20 for data-processor errors resolving
|
| 516 |
+
- +0.10 for draining the bad node cleanly
|
| 517 |
+
- -0.15 penalty for deleting services or breaking healthy components
|
| 518 |
+
|
| 519 |
+
---
|
| 520 |
+
|
| 521 |
+
## 8. Reward & Grading Design
|
| 522 |
+
|
| 523 |
+
The grading philosophy follows what the PS requires: reward signal over the **full trajectory**, not just at the end.
|
| 524 |
+
|
| 525 |
+
### Reward Principles
|
| 526 |
+
|
| 527 |
+
**Partial progress is always rewarded.** If the agent fixes 1 out of 3 broken pods, it gets 1/3 of the maximum reward for that milestone — not zero.
|
| 528 |
+
|
| 529 |
+
**Speed bonus.** Fixing the issue in fewer steps earns a small bonus. This incentivises efficient reasoning.
|
| 530 |
+
|
| 531 |
+
**Waste penalty.** Unnecessary destructive actions (scaling to 0, deleting healthy pods, draining a healthy node) subtract from the reward. This teaches the agent to be surgical.
|
| 532 |
+
|
| 533 |
+
**Idempotency.** Repeating the same correct action doesn't give extra reward but doesn't penalise either (except for wasted steps).
|
| 534 |
+
|
| 535 |
+
### Grader Implementation Pattern
|
| 536 |
+
|
| 537 |
+
Each grader implements:
|
| 538 |
+
|
| 539 |
+
```python
|
| 540 |
+
def grade(world_state: dict, step: int, max_steps: int) -> float:
|
| 541 |
+
score = 0.0
|
| 542 |
+
|
| 543 |
+
# Milestone 1: Partial progress
|
| 544 |
+
running_pods = [p for p in world_state["pods"] if p["status"] == "Running"]
|
| 545 |
+
score += (len(running_pods) / total_expected_pods) * 0.5
|
| 546 |
+
|
| 547 |
+
# Milestone 2: Full success
|
| 548 |
+
if all(p["status"] == "Running" for p in world_state["pods"]):
|
| 549 |
+
score += 0.4
|
| 550 |
+
|
| 551 |
+
# Speed bonus
|
| 552 |
+
efficiency = 1.0 - (step / max_steps)
|
| 553 |
+
score += efficiency * 0.1
|
| 554 |
+
|
| 555 |
+
return min(score, 1.0) # always clamp to [0, 1]
|
| 556 |
+
```
|
| 557 |
+
|
| 558 |
+
---
|
| 559 |
+
|
| 560 |
+
## 9. The Complete Episode Flow
|
| 561 |
+
|
| 562 |
+
Here is the full step-by-step flow of one complete episode, from start to finish:
|
| 563 |
+
|
| 564 |
+
```
|
| 565 |
+
1. JUDGE / VALIDATOR runs:
|
| 566 |
+
python inference.py
|
| 567 |
+
|
| 568 |
+
2. inference.py reads env vars:
|
| 569 |
+
API_BASE_URL, MODEL_NAME, HF_TOKEN
|
| 570 |
+
|
| 571 |
+
3. inference.py calls:
|
| 572 |
+
POST /reset { "task": "pod_recovery" }
|
| 573 |
+
|
| 574 |
+
4. main.py receives /reset:
|
| 575 |
+
→ Calls task_pod_recovery.get_condition() → crash_loop.inject(cluster_state)
|
| 576 |
+
→ world.reset(broken_state)
|
| 577 |
+
→ Returns ClusterObservation (3 CrashLoopBackOff pods, events, objective string)
|
| 578 |
+
|
| 579 |
+
5. stdout prints:
|
| 580 |
+
[START] task=pod-recovery env=coenv model=Qwen3-30B
|
| 581 |
+
|
| 582 |
+
6. inference.py builds LLM prompt:
|
| 583 |
+
"You are an SRE. Current cluster state: [observation JSON].
|
| 584 |
+
Objective: Fix the frontend deployment crash loop.
|
| 585 |
+
Respond with a JSON action from the available action types."
|
| 586 |
+
|
| 587 |
+
7. LLM responds:
|
| 588 |
+
{ "action_type": "describe", "resource_type": "deployment", "name": "frontend" }
|
| 589 |
+
|
| 590 |
+
8. inference.py calls:
|
| 591 |
+
POST /step { action }
|
| 592 |
+
|
| 593 |
+
9. main.py receives /step:
|
| 594 |
+
→ validator.validate(action, world) → OK
|
| 595 |
+
→ executor.execute(action, world)
|
| 596 |
+
→ world.tick()
|
| 597 |
+
→ grader.grade(world.state, step=1) → reward=0.00 (just investigating)
|
| 598 |
+
→ Returns observation, reward=0.00, done=false, info={...}
|
| 599 |
+
|
| 600 |
+
10. stdout prints:
|
| 601 |
+
[STEP] step=1 action=describe('deployment','frontend') reward=0.00 done=false error=null
|
| 602 |
+
|
| 603 |
+
11. LLM sees deployment spec, notices DB_HOST=wrong-host.internal
|
| 604 |
+
LLM responds: { "action_type": "patch", "resource_type": "deployment",
|
| 605 |
+
"name": "frontend",
|
| 606 |
+
"patch": {"env": [{"name": "DB_HOST", "value": "db.prod.internal"}]} }
|
| 607 |
+
|
| 608 |
+
12. POST /step { patch action }
|
| 609 |
+
→ executor patches deployment in world state
|
| 610 |
+
→ world.tick() — pods begin restarting with new config
|
| 611 |
+
→ grader → reward=0.20 (correct patch applied)
|
| 612 |
+
|
| 613 |
+
13. [STEP] step=2 action=patch('frontend',{env...}) reward=0.20 done=false error=null
|
| 614 |
+
|
| 615 |
+
14. LLM responds: { "action_type": "delete_pod", "pod_name": "frontend-7d9f-xkp2" }
|
| 616 |
+
→ world deletes pod, recreates with correct env, status → Running
|
| 617 |
+
→ grader → reward=0.40
|
| 618 |
+
|
| 619 |
+
15. Repeat for remaining 2 pods...
|
| 620 |
+
|
| 621 |
+
16. All 3 pods Running. grader → reward=1.0, done=true
|
| 622 |
+
|
| 623 |
+
17. stdout prints:
|
| 624 |
+
[END] success=true steps=8 rewards=0.00,0.20,0.40,0.55,0.70,0.85,0.95,1.00
|
| 625 |
+
```
|
| 626 |
+
|
| 627 |
+
---
|
| 628 |
+
|
| 629 |
+
## 10. OpenEnv Spec Compliance Checklist
|
| 630 |
+
|
| 631 |
+
| Requirement | File | Status |
|
| 632 |
+
|---|---|---|
|
| 633 |
+
| Typed Observation model | `server/models.py` → `ClusterObservation` | Required |
|
| 634 |
+
| Typed Action model | `server/models.py` → `KubeAction` | Required |
|
| 635 |
+
| Typed Reward model | `server/models.py` → `RewardSignal` | Required |
|
| 636 |
+
| `step(action) → (obs, reward, done, info)` | `server/app.py` → `POST /step` | Required |
|
| 637 |
+
| `reset() → initial_observation` | `server/app.py` → `POST /reset` | Required |
|
| 638 |
+
| `state() → current_state` | `server/app.py` → `GET /state` | Required |
|
| 639 |
+
| `openenv.yaml` with metadata | `openenv.yaml` | Required |
|
| 640 |
+
| `openenv validate` passes | Tested via pre-validation script | Required |
|
| 641 |
+
| Min 3 tasks | `server/tasks/` — 3 files | Required |
|
| 642 |
+
| Easy → medium → hard difficulty | task_pod_recovery / task_autoscaling / task_incident | Required |
|
| 643 |
+
| Graders return 0.0–1.0 | `server/graders/` — 3 graders | Required |
|
| 644 |
+
| Graders are deterministic | Pure functions, no randomness | Required |
|
| 645 |
+
| Partial reward signals | All 3 graders implement milestone scoring | Required |
|
| 646 |
+
| Penalise bad actions | validator.py + grader penalty terms | Required |
|
| 647 |
+
| `inference.py` in root | `inference.py` | Required |
|
| 648 |
+
| `[START]` log line | `inference.py` → `log_start()` | Required |
|
| 649 |
+
| `[STEP]` log per step | `inference.py` → `log_step()` | Required |
|
| 650 |
+
| `[END]` log always emitted | `inference.py` → `finally: log_end()` | Required |
|
| 651 |
+
| Reads `API_BASE_URL` with default | `inference.py` | Required |
|
| 652 |
+
| Reads `MODEL_NAME` with default | `inference.py` | Required |
|
| 653 |
+
| Reads `HF_TOKEN` (no default) | `inference.py` | Required |
|
| 654 |
+
| Uses OpenAI client | `from openai import OpenAI` | Required |
|
| 655 |
+
| `Dockerfile` builds cleanly | `Dockerfile` | Required |
|
| 656 |
+
| HF Space deploys and responds | Deployed on Hugging Face | Required |
|
| 657 |
+
| Inference runs in < 20 min | Max 30 steps × ~20s/step = ~10 min | Required |
|
| 658 |
+
| Runs in 2 vCPU / 8 GB RAM | Pure Python in-memory sim, no real k8s | Required |
|
| 659 |
+
| README with all required sections | `README.md` | Required |
|
| 660 |
+
|
| 661 |
+
---
|
| 662 |
+
|
| 663 |
+
## 11. Submission Checklist
|
| 664 |
+
|
| 665 |
+
Before submitting, verify all of these:
|
| 666 |
+
|
| 667 |
+
- [ ] `inference.py` is in the **root directory** (not inside `server/`)
|
| 668 |
+
- [ ] `inference.py` has default values for `API_BASE_URL` and `MODEL_NAME`
|
| 669 |
+
- [ ] `inference.py` raises `ValueError` if `HF_TOKEN` is missing
|
| 670 |
+
- [ ] `[START]`, `[STEP]`, `[END]` format matches the spec **exactly** (field names, order, lowercase booleans)
|
| 671 |
+
- [ ] `openenv validate` passes locally
|
| 672 |
+
- [ ] `docker build` completes without errors
|
| 673 |
+
- [ ] `docker run` starts the server and responds to `GET /state`
|
| 674 |
+
- [ ] HF Space is in **Running** state (not Building, not Stopped)
|
| 675 |
+
- [ ] All 3 tasks can be reset and stepped without crashing
|
| 676 |
+
- [ ] All 3 graders return a float between 0.0 and 1.0
|
| 677 |
+
- [ ] Running `inference.py` end-to-end completes in under 20 minutes
|
| 678 |
+
- [ ] `README.md` includes baseline scores table
|
| 679 |
+
- [ ] `tests/test_integration.py` passes cleanly
|
| 680 |
+
|
| 681 |
+
---
|
| 682 |
+
|
| 683 |
+
## 12. Key Technical Decisions
|
| 684 |
+
|
| 685 |
+
### Why a simulated cluster, not a real one?
|
| 686 |
+
|
| 687 |
+
Running `kind` or `minikube` inside a Hugging Face Space container with 2 vCPU / 8 GB RAM is not feasible. The Kubernetes control plane alone (etcd + apiserver + scheduler + controller-manager) consumes ~1.5–2 GB RAM before any workloads run. An in-memory Python simulator is the only viable approach within the hardware constraints. It is also faster (no scheduling latency), fully deterministic (same input = same output), and easier to test.
|
| 688 |
+
|
| 689 |
+
### Why a constrained action space?
|
| 690 |
+
|
| 691 |
+
Free-form kubectl text strings are nearly impossible to grade deterministically. By defining ~7 typed Pydantic action models, we make the action space clear to the LLM (easier to prompt), easy to validate (Pydantic does the type checking), and easy to grade (executor calls predictable world methods). This also keeps the action space small enough that the LLM can reason about it effectively without getting lost in kubectl's hundreds of sub-commands.
|
| 692 |
+
|
| 693 |
+
### Why FastAPI?
|
| 694 |
+
|
| 695 |
+
OpenEnv environments are expected to be HTTP servers. FastAPI gives automatic OpenAPI documentation (at `/docs`), Pydantic integration for request/response validation, async support for when we need it, and a clean decorator syntax that makes `main.py` easy to read. It is also trivial to run with `uvicorn` inside a Docker container.
|
| 696 |
+
|
| 697 |
+
### Why partial rewards matter for the hackathon
|
| 698 |
+
|
| 699 |
+
The PS explicitly states: *"The reward function must provide feedback throughout the task trajectory, not just at completion."* Binary rewards (0 until success, then 1) are explicitly penalised in the environment design score. Our graders implement milestone-based partial rewards, which also makes the environment more useful for actual RL training — sparse rewards make training slow and unstable.
|
| 700 |
+
|
| 701 |
+
---
|
| 702 |
+
|
| 703 |
+
*COEnv — Meta × Hugging Face OpenEnv RL Hackathon*
|
| 704 |
+
*Team: Sandeep (RL environment) · You (Simulation) · Third Person (Actions & Workers)*
|
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_nodes": 5,
|
| 3 |
+
"node_cpu_capacity": 4,
|
| 4 |
+
"node_mem_capacity": 8192,
|
| 5 |
+
"pod_cpu_request": 250,
|
| 6 |
+
"pod_mem_request": 128,
|
| 7 |
+
"pod_cpu_limit": 500,
|
| 8 |
+
"pod_mem_limit": 256,
|
| 9 |
+
"crash_loop_failure_rate": 0.7,
|
| 10 |
+
"oom_kill_failure_rate": 0.6,
|
| 11 |
+
"node_failure_rate": 0.3,
|
| 12 |
+
"cascade_failure_probability": 0.5,
|
| 13 |
+
"task_timeout_values": 300
|
| 14 |
+
}
|
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
COEnv Inference Script
|
| 3 |
+
Used by validators to run episodes with LLMs
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import json
|
| 9 |
+
import argparse
|
| 10 |
+
import requests
|
| 11 |
+
from typing import Dict, Any, Optional
|
| 12 |
+
|
| 13 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "http://localhost:8000")
|
| 14 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen3-30B")
|
| 15 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def main():
    """Run a single COEnv episode against the environment HTTP API.

    Reads connection settings from CLI flags (each falling back to the
    API_BASE_URL / MODEL_NAME / HF_TOKEN environment variables), resets
    the environment for the chosen task, then steps it up to --max-steps
    times, emitting the [START] / [STEP] / [END] log lines the validator
    parses.

    Returns:
        int: process exit code (0 on a completed episode, 1 when the
        environment could not be reset).

    Raises:
        ValueError: if no Hugging Face token is supplied (the submission
        checklist requires a hard failure when HF_TOKEN is missing).
    """
    parser = argparse.ArgumentParser(description='Run COEnv inference')
    parser.add_argument('--api-base-url', type=str, default=API_BASE_URL, help='Base URL for the COEnv API')
    parser.add_argument('--model-name', type=str, default=MODEL_NAME, help='Name of the model to use')
    parser.add_argument('--hf-token', type=str, default=HF_TOKEN, help='Hugging Face token (if needed)')
    parser.add_argument('--task-id', type=str, default='pod_recovery', help='Task ID to run')
    parser.add_argument('--max-steps', type=int, default=15, help='Maximum steps per episode')

    args = parser.parse_args()

    api_base_url = args.api_base_url.rstrip('/')
    model_name = args.model_name
    hf_token = args.hf_token or HF_TOKEN
    task_id = args.task_id
    max_steps = args.max_steps

    # Spec requirement: HF_TOKEN has no default and its absence is an error.
    if not hf_token:
        raise ValueError("HF_TOKEN is required (set the env var or pass --hf-token)")

    print(f"[START] task={task_id} env=coenv model={model_name}")

    total_reward = []
    steps_taken = 0
    success = False

    try:
        try:
            response = requests.post(f"{api_base_url}/reset", json={"task": task_id}, timeout=30)
            response.raise_for_status()
            observation = response.json()  # initial observation (unused by this stub policy)
        except Exception as e:
            print(f"[ERROR] Failed to reset environment: {e}")
            return 1

        for step in range(1, max_steps + 1):
            steps_taken = step

            # Placeholder policy: always describe the frontend deployment.
            # A real run would build the LLM prompt from the observation here.
            action = {
                "action_type": "describe",
                "resource_type": "deployment",
                "name": "frontend"
            }
            action_str = "describe('deployment','frontend')"

            try:
                response = requests.post(f"{api_base_url}/step", json={"action": action}, timeout=30)
                response.raise_for_status()
                result = response.json()
            except Exception as e:
                print(f"[ERROR] Failed to step environment: {e}")
                print(f"[STEP] step={step} action={action_str} reward=0.00 done=false error=\"{e}\"")
                continue

            reward = result.get('reward', 0.0)
            done = result.get('done', False)
            info = result.get('info', {})
            # error is "null" or a quoted message, per the log spec.
            error_str = f"\"{info['error']}\"" if info.get('error') else "null"

            total_reward.append(reward)

            # Spec requires lowercase booleans in the log lines.
            print(f"[STEP] step={step} action={action_str} reward={reward:.2f} done={str(done).lower()} error={error_str}")

            if done:
                success = True
                break

        return 0
    finally:
        # [END] must always be emitted, and rewards are comma-separated
        # 2-decimal floats (e.g. rewards=0.00,0.20,0.40), not a list repr.
        rewards_str = ",".join(f"{r:.2f}" for r in total_reward)
        print(f"[END] success={str(success).lower()} steps={steps_taken} rewards={rewards_str}")


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"$schema": "https://opencode.ai/config.json",
|
| 3 |
+
|
| 4 |
+
"agent": {
|
| 5 |
+
"planner": {
|
| 6 |
+
"entry": "./.opencode/agents/planner-agent.ts"
|
| 7 |
+
},
|
| 8 |
+
"memory": {
|
| 9 |
+
"entry": "./.opencode/agents/memory-agent.ts"
|
| 10 |
+
},
|
| 11 |
+
"executor": {
|
| 12 |
+
"entry": "./.opencode/agents/executor-agent.ts"
|
| 13 |
+
},
|
| 14 |
+
"web": {
|
| 15 |
+
"entry": "./.opencode/agents/web-agent.ts"
|
| 16 |
+
}
|
| 17 |
+
}
|
| 18 |
+
}
|
|
@@ -1,7 +1,20 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: coenv
|
| 2 |
+
version: 0.1.0
|
| 3 |
+
description: Kubernetes Cluster Simulator for OpenEnv RL Hackathon
|
| 4 |
+
entrypoint: server/app.py
|
| 5 |
+
inference: inference.py
|
| 6 |
+
endpoints:
|
| 7 |
+
reset: /reset
|
| 8 |
+
step: /step
|
| 9 |
+
state: /state
|
| 10 |
+
tasks:
|
| 11 |
+
- id: pod_recovery
|
| 12 |
+
difficulty: easy
|
| 13 |
+
- id: autoscaling
|
| 14 |
+
difficulty: medium
|
| 15 |
+
- id: incident
|
| 16 |
+
difficulty: hard
|
| 17 |
+
artifacts:
|
| 18 |
+
action_schema: server/actions/
|
| 19 |
+
observation_schema: server/models.py
|
| 20 |
+
reward_schema: server/models.py
|
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.100.0
|
| 2 |
+
uvicorn>=0.23.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
requests>=2.31.0
|
|
@@ -1,104 +1,430 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
"""
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
"""
|
| 13 |
|
| 14 |
-
from
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
try:
|
| 20 |
-
from ..models import CoenvAction, CoenvObservation
|
| 21 |
-
except ImportError:
|
| 22 |
-
from models import CoenvAction, CoenvObservation
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
class CoenvEnvironment(Environment):
|
| 26 |
-
"""
|
| 27 |
-
A simple echo environment that echoes back messages.
|
| 28 |
-
|
| 29 |
-
This environment is designed for testing the HTTP server infrastructure.
|
| 30 |
-
It maintains minimal state and simply echoes back whatever message it receives.
|
| 31 |
-
|
| 32 |
-
Example:
|
| 33 |
-
>>> env = CoenvEnvironment()
|
| 34 |
-
>>> obs = env.reset()
|
| 35 |
-
>>> print(obs.echoed_message) # "Coenv environment ready!"
|
| 36 |
-
>>>
|
| 37 |
-
>>> obs = env.step(CoenvAction(message="Hello"))
|
| 38 |
-
>>> print(obs.echoed_message) # "Hello"
|
| 39 |
-
>>> print(obs.message_length) # 5
|
| 40 |
-
"""
|
| 41 |
-
|
| 42 |
-
# Enable concurrent WebSocket sessions.
|
| 43 |
-
# Set to True if your environment isolates state between instances.
|
| 44 |
-
# When True, multiple WebSocket clients can connect simultaneously, each
|
| 45 |
-
# getting their own environment instance (when using factory mode in app.py).
|
| 46 |
-
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 47 |
-
|
| 48 |
-
def __init__(self):
|
| 49 |
-
"""Initialize the COEnv environment."""
|
| 50 |
-
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 51 |
-
self._reset_count = 0
|
| 52 |
-
|
| 53 |
-
def reset(self) -> CoenvObservation:
|
| 54 |
-
"""
|
| 55 |
-
Reset the environment.
|
| 56 |
-
|
| 57 |
-
Returns:
|
| 58 |
-
CoenvObservation with a ready message
|
| 59 |
-
"""
|
| 60 |
-
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 61 |
-
self._reset_count += 1
|
| 62 |
-
|
| 63 |
-
return CoenvObservation(
|
| 64 |
-
echoed_message="Coenv environment ready!",
|
| 65 |
-
message_length=0,
|
| 66 |
-
done=False,
|
| 67 |
-
reward=0.0,
|
| 68 |
-
)
|
| 69 |
-
|
| 70 |
-
def step(self, action: CoenvAction) -> CoenvObservation: # type: ignore[override]
|
| 71 |
-
"""
|
| 72 |
-
Execute a step in the environment by echoing the message.
|
| 73 |
-
|
| 74 |
-
Args:
|
| 75 |
-
action: CoenvAction containing the message to echo
|
| 76 |
-
|
| 77 |
-
Returns:
|
| 78 |
-
CoenvObservation with the echoed message and its length
|
| 79 |
-
"""
|
| 80 |
-
self._state.step_count += 1
|
| 81 |
-
|
| 82 |
-
message = action.message
|
| 83 |
-
length = len(message)
|
| 84 |
-
|
| 85 |
-
# Simple reward: longer messages get higher rewards
|
| 86 |
-
reward = length * 0.1
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
metadata={"original_message": message, "step": self._state.step_count},
|
| 94 |
-
)
|
| 95 |
|
| 96 |
-
@property
|
| 97 |
-
def state(self) -> State:
|
| 98 |
-
"""
|
| 99 |
-
Get the current environment state.
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
COEnv Environment - Cluster Simulator
|
| 3 |
+
In-memory dict that holds cluster state: nodes, pods, deployments, services.
|
| 4 |
+
Has methods like get_pods(), apply_patch(), tick() to advance time.
|
| 5 |
+
This is the brain of the whole project.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
from typing import Dict, List, Any, Optional, Literal
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import random
|
| 11 |
+
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
from .models import (
|
| 14 |
+
NodeStatus, PodStatus, DeploymentStatus, ServiceStatus,
|
| 15 |
+
ClusterEvent, ClusterObservation, KubeAction, RewardSignal,
|
| 16 |
+
ConfigMapStatus, HPAStatus
|
| 17 |
+
)
|
|
|
|
|
|
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
+
class World:
|
| 21 |
+
"""In-memory Kubernetes cluster simulator"""
|
| 22 |
+
|
| 23 |
+
def __init__(self, config: Dict[str, Any]):
|
| 24 |
+
self.config = config
|
| 25 |
+
self.cluster_state = self._initialize_healthy_cluster()
|
| 26 |
+
self.step_count = 0
|
| 27 |
+
self.events = []
|
| 28 |
+
self._event_counter = 0
|
| 29 |
+
|
| 30 |
+
    def _initialize_healthy_cluster(self) -> Dict[str, List[Dict]]:
        """Build and return the initial healthy cluster state.

        Constructs plain dicts (not Pydantic models) for nodes, pods,
        deployments, services, configmaps and HPAs, driven by ``self.config``
        with hard-coded fallbacks. Pod names include random suffixes, so the
        output depends on the ``random`` module state.

        Returns:
            Dict mapping "nodes" / "pods" / "deployments" / "services" /
            "configmaps" / "hpas" to lists of resource dicts.
        """
        # Nodes: all start Ready with zero usage.
        nodes = []
        for i in range(self.config.get("num_nodes", 3)):
            nodes.append({
                "name": f"node-{i+1}",
                "status": "Ready",
                "cpu_capacity": self.config.get("node_cpu_capacity", 4),
                "mem_capacity": self.config.get("node_mem_capacity", 8192),
                "cpu_usage": 0.0,
                "mem_usage": 0.0,
                "last_updated": datetime.now().isoformat()
            })

        pods = []
        deployments = []
        services = []
        configmaps = []
        hpas = []

        # Create some default deployments and their pods
        default_deployments = [
            {"name": "frontend", "image": "nginx:1.21", "replicas": 3},
            {"name": "backend", "image": "python:3.9", "replicas": 2},
            {"name": "database", "image": "postgres:13", "replicas": 1},
            {"name": "auth-service", "image": "auth:latest", "replicas": 2},
            {"name": "api-gateway", "image": "nginx:alpine", "replicas": 2}
        ]

        for dep in default_deployments:
            # Healthy baseline: available == desired replicas.
            deployments.append({
                "name": dep["name"],
                "desired_replicas": dep["replicas"],
                "available_replicas": dep["replicas"],
                "image": dep["image"],
                "last_updated": datetime.now().isoformat()
            })

            # Create pods for this deployment
            for j in range(dep["replicas"]):
                # k8s-style name: <deployment>-<4 digits>-<5 random lowercase letters>.
                pod_name = f"{dep['name']}-{random.randint(1000, 9999)}-{''.join([chr(random.randint(97, 122)) for _ in range(5)])}"
                pods.append({
                    "name": pod_name,
                    "status": "Running",
                    # Round-robin placement across nodes by replica index.
                    "node": nodes[j % len(nodes)]["name"] if nodes else None,
                    "restarts": 0,
                    # NOTE(review): these fallbacks (500/256/1000/512) differ from
                    # config.json (250/128/500/256) — confirm which is intended.
                    "cpu_request": self.config.get("pod_cpu_request", 500),
                    "mem_request": self.config.get("pod_mem_request", 256),
                    "cpu_limit": self.config.get("pod_cpu_limit", 1000),
                    "mem_limit": self.config.get("pod_mem_limit", 512),
                    "deployment": dep["name"],
                    "last_updated": datetime.now().isoformat()
                })

        # Create some default services
        default_services = [
            {"name": "frontend-service", "type": "ClusterIP", "ports": [{"port": 80, "targetPort": 80}]},
            {"name": "backend-service", "type": "ClusterIP", "ports": [{"port": 8080, "targetPort": 8080}]},
            {"name": "database-service", "type": "ClusterIP", "ports": [{"port": 5432, "targetPort": 5432}]},
            {"name": "auth-service-service", "type": "ClusterIP", "ports": [{"port": 8000, "targetPort": 8000}]}
        ]

        for svc in default_services:
            services.append({
                "name": svc["name"],
                "type": svc["type"],
                "ports": svc["ports"],
                # Selector derived by stripping the "-service" suffix from the name.
                "selector": {"app": svc["name"].replace("-service", "")},
                # Sequential fake IPs: len(services) grows as we append, so
                # services get 10.96.1.1, 10.96.2.2, ...
                "cluster_ip": f"10.96.{len(services)+1}.{len(services)+1}",
                "last_updated": datetime.now().isoformat()
            })

        # Create some default configmaps
        default_configmaps = [
            {"name": "frontend-config", "data": {"DB_HOST": "db.prod.internal", "DB_PORT": "5432"}},
            {"name": "backend-config", "data": {"LOG_LEVEL": "info", "CACHE_SIZE": "100"}},
            {"name": "database-config", "data": {"MAX_CONNECTIONS": "100", "TIMEOUT": "30"}}
        ]

        for cm in default_configmaps:
            configmaps.append({
                "name": cm["name"],
                "data": cm["data"],
                "last_updated": datetime.now().isoformat()
            })

        # Create some default HPAs
        default_hpas = [
            {"name": "frontend-hpa", "min_replicas": 2, "max_replicas": 10, "cpu_target_percent": 70},
            {"name": "backend-hpa", "min_replicas": 1, "max_replicas": 5, "cpu_target_percent": 80}
        ]

        for hpa in default_hpas:
            hpas.append({
                "name": hpa["name"],
                "min_replicas": hpa["min_replicas"],
                "max_replicas": hpa["max_replicas"],
                # HPAs start at their minimum replica count.
                "current_replicas": hpa["min_replicas"],
                "cpu_target_percent": hpa["cpu_target_percent"],
                "last_updated": datetime.now().isoformat()
            })

        return {
            "nodes": nodes,
            "pods": pods,
            "deployments": deployments,
            "services": services,
            "configmaps": configmaps,
            "hpas": hpas
        }
|
| 140 |
+
|
| 141 |
+
def get_pods(self, namespace: Optional[str] = None, selector: Optional[Dict[str, str]] = None) -> List[PodStatus]:
    """Return all pods as PodStatus models (mimics ``kubectl get pods``).

    NOTE(review): ``namespace`` and ``selector`` are accepted but not yet
    applied — every pod in the simulated cluster is returned regardless.
    """
    raw_pods = self.cluster_state["pods"]
    return [PodStatus(**raw) for raw in raw_pods]
|
| 146 |
+
|
| 147 |
+
def get_nodes(self) -> List[NodeStatus]:
    """Return every simulated node as a NodeStatus model."""
    return [NodeStatus(**raw) for raw in self.cluster_state["nodes"]]
|
| 150 |
+
|
| 151 |
+
def get_deployments(self) -> List[DeploymentStatus]:
    """Return every simulated deployment as a DeploymentStatus model."""
    return [DeploymentStatus(**raw) for raw in self.cluster_state["deployments"]]
|
| 154 |
+
|
| 155 |
+
def get_services(self) -> List[ServiceStatus]:
    """Return every simulated service as a ServiceStatus model."""
    return [ServiceStatus(**raw) for raw in self.cluster_state["services"]]
|
| 158 |
+
|
| 159 |
+
def get_configmaps(self) -> List[ConfigMapStatus]:
    """Return every simulated configmap as a ConfigMapStatus model."""
    return [ConfigMapStatus(**raw) for raw in self.cluster_state["configmaps"]]
|
| 162 |
+
|
| 163 |
+
def get_hpas(self) -> List[HPAStatus]:
    """Return every simulated HorizontalPodAutoscaler as an HPAStatus model."""
    return [HPAStatus(**raw) for raw in self.cluster_state["hpas"]]
|
| 166 |
+
|
| 167 |
+
def get_events(self) -> List[ClusterEvent]:
    """Return a defensive copy of the recorded cluster events."""
    return list(self.events)
|
| 170 |
+
|
| 171 |
+
def apply_patch(self, resource_type: str, name: str, patch: Dict[str, Any]) -> bool:
    """Apply a dict patch to the named resource (mimics ``kubectl patch``).

    The resource's fields are updated in place and ``last_updated`` is
    refreshed. Patching a deployment's replica counts also reconciles the
    pod list via ``_update_pods_for_deployment``.

    Args:
        resource_type: One of "deployment", "pod", "node", "service",
            "configmap", "hpa".
        name: Name of the resource to patch.
        patch: Field updates merged into the resource dict.

    Returns:
        True if a matching resource was found and patched, False otherwise
        (including on unknown resource types or internal errors).
    """
    # Table-driven dispatch replaces six duplicated per-resource loops.
    state_keys = {
        "deployment": "deployments",
        "pod": "pods",
        "node": "nodes",
        "service": "services",
        "configmap": "configmaps",
        "hpa": "hpas",
    }
    state_key = state_keys.get(resource_type)
    if state_key is None:
        return False

    try:
        for resource in self.cluster_state[state_key]:
            if resource["name"] != name:
                continue
            resource.update(patch)
            resource["last_updated"] = datetime.now().isoformat()
            # Changing replica counts must reconcile the simulated pod list.
            if resource_type == "deployment" and (
                "desired_replicas" in patch or "available_replicas" in patch
            ):
                self._update_pods_for_deployment(name, resource["desired_replicas"])
            return True
        return False
    except Exception as e:
        # Best-effort semantics preserved: log and report failure rather
        # than propagating to the caller.
        print(f"Error applying patch: {e}")
        return False
|
| 222 |
+
|
| 223 |
+
def _update_pods_for_deployment(self, deployment_name: str, desired_replicas: int):
|
| 224 |
+
"""Update pods count for a deployment"""
|
| 225 |
+
current_pods = [p for p in self.cluster_state["pods"] if p.get("deployment") == deployment_name]
|
| 226 |
+
current_count = len(current_pods)
|
| 227 |
+
|
| 228 |
+
if desired_replicas > current_count:
|
| 229 |
+
nodes = self.cluster_state["nodes"]
|
| 230 |
+
for i in range(desired_replicas - current_count):
|
| 231 |
+
deployment = next((d for d in self.cluster_state["deployments"] if d["name"] == deployment_name), None)
|
| 232 |
+
if deployment:
|
| 233 |
+
pod_name = f"{deployment_name}-{random.randint(1000, 9999)}-{''.join([chr(random.randint(97, 122)) for _ in range(5)])}"
|
| 234 |
+
node = nodes[i % len(nodes)] if nodes else None
|
| 235 |
+
self.cluster_state["pods"].append({
|
| 236 |
+
"name": pod_name,
|
| 237 |
+
"status": "Pending",
|
| 238 |
+
"node": node["name"] if node else None,
|
| 239 |
+
"restarts": 0,
|
| 240 |
+
"cpu_request": self.config.get("pod_cpu_request", 500),
|
| 241 |
+
"mem_request": self.config.get("pod_mem_request", 256),
|
| 242 |
+
"cpu_limit": self.config.get("pod_cpu_limit", 1000),
|
| 243 |
+
"mem_limit": self.config.get("pod_mem_limit", 512),
|
| 244 |
+
"deployment": deployment_name,
|
| 245 |
+
"last_updated": datetime.now().isoformat()
|
| 246 |
+
})
|
| 247 |
+
elif desired_replicas < current_count:
|
| 248 |
+
pods_to_remove = current_pods[desired_replicas:]
|
| 249 |
+
for pod in pods_to_remove:
|
| 250 |
+
self.cluster_state["pods"].remove(pod)
|
| 251 |
+
|
| 252 |
+
def scale(self, deployment_name: str, replicas: int) -> bool:
    """Set the desired replica count of a deployment (``kubectl scale``)."""
    replica_patch = {"desired_replicas": replicas}
    return self.apply_patch("deployment", deployment_name, replica_patch)
|
| 255 |
+
|
| 256 |
+
def delete_pod(self, pod_name: str) -> bool:
    """Remove a pod by name and record a ``UserDeleted`` event.

    The deployment controller recreates the pod on a later tick. Returns
    True if a pod was removed, False if no pod matched.
    """
    pods = self.cluster_state["pods"]
    idx = next((i for i, p in enumerate(pods) if p["name"] == pod_name), None)
    if idx is None:
        return False

    del pods[idx]

    kind: Literal["Normal"] = "Normal"  # type: ignore
    self.events.append(ClusterEvent(
        event_id=f"event-delpod-{random.randint(1000, 9999)}",
        timestamp=datetime.now().isoformat(),
        type=kind,
        reason="UserDeleted",
        message=f"pod/{pod_name} deleted by user",
        involved_object=pod_name
    ))
    return True
|
| 280 |
+
|
| 281 |
+
def rollout_restart(self, deployment: str) -> bool:
    """Trigger a rollout restart for a deployment.

    Records one ``RolledOut`` event per affected pod, then removes those
    pods so the controller recreates them (with fresh config) on the next
    tick. Always returns True.
    """
    affected = [p for p in self.cluster_state["pods"] if p.get("deployment") == deployment]

    for _ in affected:
        kind: Literal["Normal"] = "Normal"  # type: ignore
        self.events.append(ClusterEvent(
            event_id=f"event-restart-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type=kind,
            reason="RolledOut",
            message=f"Deployment {deployment} rollout restart triggered",
            involved_object=deployment
        ))

    # Keep only pods belonging to other deployments.
    survivors = [p for p in self.cluster_state["pods"] if p.get("deployment") != deployment]
    self.cluster_state["pods"] = survivors

    return True
|
| 302 |
+
|
| 303 |
+
def tick(self):
    """Advances simulated time by one step. Pods in CrashLoopBackOff increment their restart counter. Pending pods on ready nodes eventually transition to Running. Dead nodes stay dead unless drained.

    NOTE(review): the restart-counter increment described above is not
    visible in this body -- presumably handled by a condition module; confirm.
    """
    self.step_count += 1

    # Simulate some natural changes in resource usage:
    # random walk of +/-5 points, clamped to the valid 0..100 range.
    for node in self.cluster_state["nodes"]:
        node["cpu_usage"] = max(0, min(100, node["cpu_usage"] + random.uniform(-5, 5)))
        node["mem_usage"] = max(0, min(100, node["mem_usage"] + random.uniform(-5, 5)))
        node["last_updated"] = datetime.now().isoformat()

    # Update pod statuses based on node status.
    for pod in self.cluster_state["pods"]:
        node_name = pod.get("node")
        if node_name:
            node = next((n for n in self.cluster_state["nodes"] if n["name"] == node_name), None)
            if node and node["status"] != "Ready":
                # Pods on an unhealthy node lose contact -> "Unknown".
                if pod["status"] == "Running":
                    pod["status"] = "Unknown"
                elif pod["status"] == "Pending":
                    pod["status"] = "Unknown"
            elif node and node["status"] == "Ready" and pod["status"] == "Pending":
                # ~30% chance per tick that a Pending pod finishes starting.
                if random.random() > 0.7:
                    pod["status"] = "Running"
                    pod["last_updated"] = datetime.now().isoformat()

    # Update deployment available replicas based on running pods.
    for deployment in self.cluster_state["deployments"]:
        running_pods = [p for p in self.cluster_state["pods"]
                        if p.get("deployment") == deployment["name"] and p["status"] == "Running"]
        deployment["available_replicas"] = len(running_pods)
        deployment["last_updated"] = datetime.now().isoformat()

    # Re-create pods for deployments that need them.
    # NOTE(review): duplicates _update_pods_for_deployment, but pods created
    # here start as "Running" rather than "Pending" -- confirm intentional.
    for deployment in self.cluster_state["deployments"]:
        desired = deployment.get("desired_replicas", 0)
        current_pods = [p for p in self.cluster_state["pods"] if p.get("deployment") == deployment["name"]]
        current_count = len(current_pods)

        if current_count < desired:
            nodes = self.cluster_state["nodes"]
            for i in range(desired - current_count):
                # kubectl-style pod name with random hash segments.
                pod_name = f"{deployment['name']}-{random.randint(1000, 9999)}-{''.join([chr(random.randint(97, 122)) for _ in range(5)])}"
                node = nodes[i % len(nodes)] if nodes else None
                self.cluster_state["pods"].append({
                    "name": pod_name,
                    "status": "Running",
                    "node": node["name"] if node else None,
                    "restarts": 0,
                    "cpu_request": self.config.get("pod_cpu_request", 500),
                    "mem_request": self.config.get("pod_mem_request", 256),
                    "cpu_limit": self.config.get("pod_cpu_limit", 1000),
                    "mem_limit": self.config.get("pod_mem_limit", 512),
                    "deployment": deployment["name"],
                    "last_updated": datetime.now().isoformat()
                })

    # Generate occasional events (30% chance per tick).
    if random.random() < 0.3:
        self._generate_event()
|
| 362 |
+
|
| 363 |
+
def _generate_event(self):
    """Append one randomly chosen, realistic-looking cluster event.

    The involved object is drawn from the first few pods, deployments and
    nodes (falling back to "cluster" when the state is empty); the event
    log is truncated to its most recent 50 entries once it exceeds 100.
    """
    templates = [
        {"type": "Normal", "reason": "Scheduled", "message": "Successfully assigned node"},
        {"type": "Warning", "reason": "FailedScheduling", "message": "0/3 nodes are available: 3 Insufficient cpu."},
        {"type": "Normal", "reason": "Pulling", "message": "Pulling image \"nginx:1.21\""},
        {"type": "Normal", "reason": "Pulled", "message": "Successfully pulled image \"nginx:1.21\""},
        {"type": "Normal", "reason": "Created", "message": "Created container"},
        {"type": "Normal", "reason": "Started", "message": "Started container"},
        {"type": "Warning", "reason": "BackOff", "message": "Back-off restarting failed container"},
        {"type": "Normal", "reason": "Killing", "message": "Stopping container"}
    ]

    chosen = random.choice(templates)

    candidates = (
        [p["name"] for p in self.cluster_state["pods"][:3]]
        + [d["name"] for d in self.cluster_state["deployments"][:3]]
        + [n["name"] for n in self.cluster_state["nodes"][:3]]
    )
    if not candidates:
        candidates = ["cluster"]

    kind: Literal["Normal", "Warning"] = chosen["type"]  # type: ignore
    self.events.append(ClusterEvent(
        event_id=f"event-{self._event_counter:04d}",
        timestamp=datetime.now().isoformat(),
        type=kind,
        reason=chosen["reason"],
        message=chosen["message"],
        involved_object=random.choice(candidates)
    ))
    self._event_counter += 1

    # Bound the in-memory event log.
    if len(self.events) > 100:
        self.events = self.events[-50:]
|
| 398 |
+
|
| 399 |
+
def get_full_state(self) -> Dict[str, Any]:
    """Assemble a full snapshot of the simulated cluster for debugging."""
    snapshot: Dict[str, Any] = {
        "nodes": self.get_nodes(),
        "pods": self.get_pods(),
        "deployments": self.get_deployments(),
        "services": self.get_services(),
        "configmaps": self.get_configmaps(),
        "hpas": self.get_hpas(),
        "events": self.get_events(),
        "step": self.step_count,
    }
    return snapshot
|
| 411 |
+
|
| 412 |
+
def reset_to_healthy(self):
    """Rebuild the cluster in its pristine state and clear all bookkeeping."""
    # Drop accumulated history first, then regenerate the world.
    self.step_count = 0
    self.events = []
    self._event_counter = 0
    self.cluster_state = self._initialize_healthy_cluster()
|
| 418 |
+
|
| 419 |
+
def reset(self, condition=None):
    """Restore a healthy cluster, optionally inject a failure condition,
    and return the resulting observation."""
    self.reset_to_healthy()
    if condition:
        # The condition object mutates the world to create the scenario.
        condition.inject(self)
    return self.get_observation()
|
| 425 |
+
|
| 426 |
+
def get_observation(self, objective: str = "Maintain cluster health"):
    """Serialise the current world state into a ClusterObservation model,
    stamped with the given task objective."""
    payload = self.get_full_state()
    payload["objective"] = objective
    return ClusterObservation(**payload)
|
|
@@ -1,80 +0,0 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
# Multi-stage build using openenv-base
|
| 8 |
-
# This Dockerfile is flexible and works for both:
|
| 9 |
-
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
-
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
-
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
-
|
| 13 |
-
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
-
FROM ${BASE_IMAGE} AS builder
|
| 15 |
-
|
| 16 |
-
WORKDIR /app
|
| 17 |
-
|
| 18 |
-
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
-
RUN apt-get update && \
|
| 20 |
-
apt-get install -y --no-install-recommends git && \
|
| 21 |
-
rm -rf /var/lib/apt/lists/*
|
| 22 |
-
|
| 23 |
-
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
-
ARG BUILD_MODE=in-repo
|
| 25 |
-
ARG ENV_NAME=COEnv
|
| 26 |
-
|
| 27 |
-
# Copy environment code (always at root of build context)
|
| 28 |
-
COPY . /app/env
|
| 29 |
-
|
| 30 |
-
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
-
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
-
WORKDIR /app/env
|
| 33 |
-
|
| 34 |
-
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
-
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
-
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
-
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
-
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
-
fi
|
| 40 |
-
|
| 41 |
-
# Install dependencies using uv sync
|
| 42 |
-
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
-
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
-
if [ -f uv.lock ]; then \
|
| 45 |
-
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
-
else \
|
| 47 |
-
uv sync --no-install-project --no-editable; \
|
| 48 |
-
fi
|
| 49 |
-
|
| 50 |
-
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
-
if [ -f uv.lock ]; then \
|
| 52 |
-
uv sync --frozen --no-editable; \
|
| 53 |
-
else \
|
| 54 |
-
uv sync --no-editable; \
|
| 55 |
-
fi
|
| 56 |
-
|
| 57 |
-
# Final runtime stage
|
| 58 |
-
FROM ${BASE_IMAGE}
|
| 59 |
-
|
| 60 |
-
WORKDIR /app
|
| 61 |
-
|
| 62 |
-
# Copy the virtual environment from builder
|
| 63 |
-
COPY --from=builder /app/env/.venv /app/.venv
|
| 64 |
-
|
| 65 |
-
# Copy the environment code
|
| 66 |
-
COPY --from=builder /app/env /app/env
|
| 67 |
-
|
| 68 |
-
# Set PATH to use the virtual environment
|
| 69 |
-
ENV PATH="/app/.venv/bin:$PATH"
|
| 70 |
-
|
| 71 |
-
# Set PYTHONPATH so imports work correctly
|
| 72 |
-
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 73 |
-
|
| 74 |
-
# Health check
|
| 75 |
-
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 76 |
-
CMD curl -f http://localhost:8000/health || exit 1
|
| 77 |
-
|
| 78 |
-
# Run the FastAPI server
|
| 79 |
-
# The module path is constructed to work with the /app/env structure
|
| 80 |
-
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -1,11 +1,25 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
|
| 9 |
-
from .COEnv_environment import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv - Kubernetes Cluster Simulator for OpenEnv"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
__version__ = "0.1.0"
|
| 4 |
|
| 5 |
+
from .COEnv_environment import World
|
| 6 |
+
from .models import (
|
| 7 |
+
ClusterObservation,
|
| 8 |
+
RewardSignal,
|
| 9 |
+
KubeAction,
|
| 10 |
+
PodStatus,
|
| 11 |
+
NodeStatus,
|
| 12 |
+
DeploymentStatus,
|
| 13 |
+
ServiceStatus
|
| 14 |
+
)
|
| 15 |
|
| 16 |
+
__all__ = [
|
| 17 |
+
"World",
|
| 18 |
+
"ClusterObservation",
|
| 19 |
+
"RewardSignal",
|
| 20 |
+
"KubeAction",
|
| 21 |
+
"PodStatus",
|
| 22 |
+
"NodeStatus",
|
| 23 |
+
"DeploymentStatus",
|
| 24 |
+
"ServiceStatus"
|
| 25 |
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Action definitions"""
|
| 2 |
+
|
| 3 |
+
__all__ = ["scale_action", "delete_pod_action", "patch_action", "rollout_action", "hpa_action", "drain_action", "describe_action"]
|
| 4 |
+
|
| 5 |
+
from .scale_action import ScaleAction
|
| 6 |
+
from .delete_pod_action import DeletePodAction
|
| 7 |
+
from .patch_action import PatchAction
|
| 8 |
+
from .rollout_action import RolloutRestartAction
|
| 9 |
+
from .hpa_action import SetHPAAction
|
| 10 |
+
from .drain_action import DrainNodeAction
|
| 11 |
+
from .describe_action import DescribeAction
|
| 12 |
+
|
| 13 |
+
__all__ += ["ScaleAction", "DeletePodAction", "PatchAction", "RolloutRestartAction", "SetHPAAction", "DrainNodeAction", "DescribeAction"]
|
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Delete pod action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DeletePodAction(BaseModel):
    """Delete a specific pod"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["delete_pod"] = "delete_pod"
    pod_name: str = Field(..., description="Pod name to delete")
|
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Describe action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DescribeAction(BaseModel):
    """Describe/get details of a resource"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["describe"] = "describe"
    # NOTE(review): "hpa" is absent from this Literal although the cluster
    # simulator tracks HPAs -- confirm whether describing HPAs is intended.
    resource_type: Literal["deployment", "pod", "node", "service", "configmap"] = Field(..., description="Resource type")
    name: str = Field(..., description="Resource name")
|
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Drain action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DrainNodeAction(BaseModel):
    """Drain a node (evict all pods)"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["drain_node"] = "drain_node"
    node_name: str = Field(..., description="Node name to drain")
|
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - HPA action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SetHPAAction(BaseModel):
    """Set HorizontalPodAutoscaler for a deployment"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["set_hpa"] = "set_hpa"
    deployment: str = Field(..., description="Deployment name")
    # NOTE(review): nothing here enforces min_replicas <= max_replicas;
    # presumably the validator/executor rejects inverted ranges -- confirm.
    min_replicas: int = Field(..., ge=1, le=50, description="Minimum replicas")
    max_replicas: int = Field(..., ge=1, le=100, description="Maximum replicas")
    cpu_target_percent: int = Field(..., ge=1, le=100, description="CPU target percentage")
|
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Patch action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal, Dict, Any
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class PatchAction(BaseModel):
    """Patch a resource with specific changes"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["patch"] = "patch"
    # NOTE(review): "hpa" is absent from this Literal even though the
    # simulator's apply_patch handles it -- confirm intended.
    resource_type: Literal["deployment", "pod", "node", "service", "configmap"] = Field(..., description="Resource type")
    name: str = Field(..., description="Resource name")
    # Free-form field updates merged into the target resource dict.
    patch: Dict[str, Any] = Field(..., description="Patch to apply")
|
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Rollout restart action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class RolloutRestartAction(BaseModel):
    """Restart a deployment rollout"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["rollout_restart"] = "rollout_restart"
    deployment: str = Field(..., description="Deployment name to restart")
|
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Actions - Scale action"""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from typing import Literal
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ScaleAction(BaseModel):
    """Scale a deployment to a specific replica count"""
    # Discriminator used by the /step dispatcher to route this action.
    action_type: Literal["scale"] = "scale"
    deployment: str = Field(..., description="Deployment name to scale")
    # Replica bounds match the simulator's limits (0 allowed: scale to zero).
    replicas: int = Field(..., ge=0, le=100, description="Number of replicas")
|
|
@@ -1,84 +1,283 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
"""
|
| 8 |
-
FastAPI
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
- POST /step: Execute an action
|
| 16 |
-
- GET /state: Get current environment state
|
| 17 |
-
- GET /schema: Get action/observation schemas
|
| 18 |
-
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
|
| 20 |
-
|
| 21 |
-
# Development (with auto-reload):
|
| 22 |
-
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
# Or run directly:
|
| 28 |
-
python -m server.app
|
| 29 |
-
"""
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
try:
|
| 39 |
-
from ..models import CoenvAction, CoenvObservation
|
| 40 |
-
from .COEnv_environment import CoenvEnvironment
|
| 41 |
-
except ModuleNotFoundError:
|
| 42 |
-
from models import CoenvAction, CoenvObservation
|
| 43 |
-
from server.COEnv_environment import CoenvEnvironment
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
-
# Create the app with web interface and README integration
|
| 47 |
-
app = create_app(
|
| 48 |
-
CoenvEnvironment,
|
| 49 |
-
CoenvAction,
|
| 50 |
-
CoenvObservation,
|
| 51 |
-
env_name="COEnv",
|
| 52 |
-
max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
|
| 53 |
-
)
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 57 |
-
"""
|
| 58 |
-
Entry point for direct execution via uv run or python -m.
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
Args:
|
| 66 |
-
host: Host address to bind to (default: "0.0.0.0")
|
| 67 |
-
port: Port number to listen on (default: 8000)
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
"""
|
| 73 |
-
import uvicorn
|
| 74 |
|
| 75 |
-
uvicorn.run(app, host=host, port=port)
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
if __name__ == "__main__":
|
| 79 |
-
import argparse
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
COEnv FastAPI Application
|
| 3 |
+
Exposes /reset /step /state endpoints
|
| 4 |
+
"""
|
| 5 |
|
| 6 |
+
from fastapi import FastAPI, HTTPException
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from typing import Dict, Any, Optional, List, Literal
|
| 9 |
+
import uvicorn
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
|
| 14 |
+
from .COEnv_environment import World
|
| 15 |
+
from .models import ClusterObservation, RewardSignal, KubeAction
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
app = FastAPI(title="COEnv", description="Kubernetes Simulator for OpenEnv")
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
# Global world instance
|
| 20 |
+
world_instance: Optional[World] = None
|
| 21 |
+
config: Dict[str, Any] = {}
|
| 22 |
+
current_task: Optional[str] = None
|
| 23 |
+
current_objective: str = ""
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
def load_config():
    """Load runtime configuration from the repo-root config.json.

    Falls back to built-in defaults when the file is missing OR contains
    invalid JSON (previously a malformed file crashed server startup).
    The module-level ``config`` global is updated in place for existing
    callers; the loaded dict is also returned for convenience/testability.
    """
    global config
    config_path = os.path.join(os.path.dirname(__file__), "..", "config.json")
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Keep the server bootable: fall back to sane defaults.
        config = {
            "num_nodes": 3,
            "node_cpu_capacity": 4,
            "node_mem_capacity": 8192,
            "pod_cpu_request": 250,
            "pod_mem_request": 128,
            "pod_cpu_limit": 500,
            "pod_mem_limit": 256,
            "crash_loop_failure_rate": 0.7,
            "oom_kill_failure_rate": 0.6,
            "node_failure_rate": 0.3,
            "cascade_failure_probability": 0.5,
            "task_timeout_values": 300,
            "tasks": {
                "pod_recovery": {"max_steps": 15, "success_threshold": 0.9},
                "autoscaling": {"max_steps": 20, "success_threshold": 0.85},
                "incident": {"max_steps": 30, "success_threshold": 0.80}
            }
        }
    return config
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
# Import conditions for task injection
|
| 56 |
+
def get_condition_for_task(task_id: str):
    """Build the failure-condition injector matching a task id.

    Returns None for unknown task ids. Condition modules are imported
    lazily so merely loading the app does not pull them all in.
    """
    if task_id == "pod_recovery":
        from .conditions.crash_loop import CrashLoopCondition
        factory = CrashLoopCondition
    elif task_id == "autoscaling":
        from .conditions.oom_kill import OOMKillCondition
        factory = OOMKillCondition
    elif task_id == "incident":
        from .conditions.cascade_failure import CascadeFailureCondition
        factory = CascadeFailureCondition
    else:
        return None
    return factory(world_instance, config)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
+
def get_objective_for_task(task_id: str) -> str:
    """Map a task id to its human-readable objective string.

    Unknown ids fall back to the generic cluster-health objective.
    """
    default_objective = "Maintain cluster health"
    task_objectives = {
        "pod_recovery": "The frontend deployment is crash-looping. Diagnose and fix the root cause so that all pods reach Running state.",
        "autoscaling": "Traffic has spiked 10×. The api-server deployment is overloaded. Configure autoscaling and ensure p95 latency stays below 500ms.",
        "incident": "A cascading incident has degraded auth-service, api-gateway, and data-processor. Identify the root cause and restore all three services to healthy state without data loss."
    }
    return task_objectives.get(task_id, default_objective)
|
| 78 |
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
# NOTE(review): @app.on_event is deprecated in current FastAPI; consider
# migrating to a lifespan context manager when convenient.
@app.on_event("startup")
async def startup_event():
    """Initialize the world on startup"""
    global world_instance, current_task, current_objective
    # Config must be loaded before the world, since World reads it.
    load_config()
    world_instance = World(config)
    print("COEnv initialized")
|
| 87 |
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
class ResetRequest(BaseModel):
    """Request body for /reset endpoint"""
    # Task id selecting which failure condition and objective to use;
    # unrecognised ids leave the cluster healthy with the default objective.
    task: Optional[str] = Field(default="pod_recovery", description="Task ID to initialize")
|
|
|
|
|
|
|
| 92 |
|
|
|
|
| 93 |
|
| 94 |
+
@app.post("/reset")
async def reset(request: ResetRequest = ResetRequest()):
    """Reset the environment for a task and return the initial observation.

    Rebuilds a healthy cluster, injects the task's failure condition, and
    serialises the initial observation with the task-specific objective.

    Raises:
        HTTPException(500): if the world has not been initialized yet.
    """
    global world_instance, current_task, current_objective

    if world_instance is None:
        raise HTTPException(status_code=500, detail="World not initialized")

    current_task = request.task
    current_objective = get_objective_for_task(request.task)

    # Get condition for the task and inject it during reset.
    condition = get_condition_for_task(request.task)
    world_instance.reset(condition)

    # BUG FIX: World.reset() serialises with the default objective; re-serialise
    # so the client actually receives the task-specific objective text.
    observation = world_instance.get_observation(current_objective)

    return observation
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class StepRequest(BaseModel):
    """Request body for the /step endpoint.

    action: free-form action dict; must carry an "action_type" key naming one
    of the supported verbs (scale, delete_pod, patch, rollout_restart,
    drain_node, set_hpa, describe). Remaining keys are verb-specific.
    """
    action: Dict[str, Any] = Field(..., description="Action to execute")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@app.post("/step")
async def step(request: StepRequest):
    """Apply an action, advance the world one tick, and return the result.

    Returns a dict with the next observation, the graded reward, the done
    flag, and an info dict describing what the action did (or its error).
    The reward is the task grader's score for the post-action state minus a
    small penalty when the action was unknown or raised an error.
    """
    global world_instance, current_task, current_objective

    if world_instance is None:
        raise HTTPException(status_code=500, detail="World not initialized")

    action = request.action
    action_type = action.get("action_type", "")

    info = {}
    penalty = 0.0  # subtracted from the graded reward on invalid/failed actions
    done = False

    try:
        if action_type == "scale":
            deployment = action.get("deployment", "")
            replicas = action.get("replicas", 1)
            world_instance.scale(deployment, replicas)
            info["scaled"] = deployment
            info["replicas"] = replicas

        elif action_type == "delete_pod":
            pod_name = action.get("pod_name", "")
            world_instance.delete_pod(pod_name)
            info["deleted"] = pod_name

        elif action_type == "patch":
            resource_type = action.get("resource_type", "")
            name = action.get("name", "")
            patch = action.get("patch", {})
            world_instance.apply_patch(resource_type, name, patch)
            info["patched"] = f"{resource_type}/{name}"

        elif action_type == "rollout_restart":
            deployment = action.get("deployment", "")
            world_instance.rollout_restart(deployment)
            info["restarted"] = deployment

        elif action_type == "drain_node":
            node_name = action.get("node_name", "")
            world_instance.apply_patch("node", node_name, {"status": "SchedulingDisabled"})
            info["drained"] = node_name

        elif action_type == "set_hpa":
            deployment = action.get("deployment", "")
            min_replicas = action.get("min_replicas", 1)
            max_replicas = action.get("max_replicas", 10)
            cpu_target = action.get("cpu_target_percent", 80)
            hpa_name = f"{deployment}-hpa"
            world_instance.apply_patch("hpa", hpa_name, {
                "min_replicas": min_replicas,
                "max_replicas": max_replicas,
                "cpu_target_percent": cpu_target
            })
            info["hpa_set"] = deployment

        elif action_type == "describe":
            # Investigation action - no state change
            resource_type = action.get("resource_type", "")
            name = action.get("name", "")
            info["described"] = f"{resource_type}/{name}"

        else:
            info["error"] = f"Unknown action type: {action_type}"
            penalty = 0.1  # penalty for invalid action

    except Exception as e:
        # Deliberate best-effort: report the failure in info instead of a 500.
        info["error"] = str(e)
        penalty = 0.1

    # Always tick after an action
    world_instance.tick()

    # BUG FIX: the invalid-action penalty used to be assigned to `reward` and
    # then unconditionally overwritten by calculate_reward(); apply it on top
    # of the graded score instead so bad actions actually cost something.
    reward = calculate_reward(world_instance, current_task) - penalty

    # Episode ends when the step budget is exhausted...
    max_steps = config.get("tasks", {}).get(current_task, {}).get("max_steps", 15)
    if world_instance.step_count >= max_steps:
        done = True

    # ...or when the task-specific success condition is met.
    if check_task_complete(world_instance, current_task):
        done = True

    observation = world_instance.get_observation(current_objective)

    reward_signal = RewardSignal(reward=reward, done=done, info=info)

    return {
        "observation": observation.model_dump(),
        "reward": reward_signal.reward,
        "done": reward_signal.done,
        "info": reward_signal.info
    }
|
| 217 |
|
|
|
|
|
|
|
| 218 |
|
| 219 |
+
def calculate_reward(world: World, task_id: str) -> float:
    """Score the current cluster state for *task_id*, in [0.0, 1.0].

    Unknown task ids (and tasks whose target deployment has no pods) score 0.
    """
    if task_id == "pod_recovery":
        # Fraction of frontend pods that are currently Running.
        frontend = [p for p in world.get_pods() if p.deployment == "frontend"]
        if frontend:
            healthy = sum(1 for p in frontend if p.status == "Running")
            return healthy / len(frontend)
    elif task_id == "autoscaling":
        # Fraction of backend pods Running, capped at 1.0.
        backend = [p for p in world.get_pods() if p.deployment == "backend"]
        if backend:
            healthy = sum(1 for p in backend if p.status == "Running")
            return min(healthy / len(backend), 1.0)
    elif task_id == "incident":
        # A key service counts as healthy when >= 80% of its pods are Running.
        # NOTE(review): this list includes "frontend" while the incident
        # objective text mentions "data-processor" - confirm which is intended.
        pods = world.get_pods()
        key_services = ["auth-service", "api-gateway", "frontend"]
        healthy_count = 0
        for svc in key_services:
            members = [p for p in pods if p.deployment == svc]
            running_count = sum(1 for p in members if p.status == "Running")
            if members and running_count >= len(members) * 0.8:
                healthy_count += 1
        return healthy_count / len(key_services) if key_services else 0.0

    return 0.0
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def check_task_complete(world: World, task_id: str) -> bool:
    """Return True when the success condition for *task_id* is satisfied."""
    if task_id == "pod_recovery":
        # Every frontend pod (and at least one) must be Running.
        frontend = [p for p in world.get_pods() if p.deployment == "frontend"]
        return bool(frontend) and all(p.status == "Running" for p in frontend)

    if task_id == "autoscaling":
        # At least two backend pods exist and at least two of them are Running.
        backend = [p for p in world.get_pods() if p.deployment == "backend"]
        running_count = sum(1 for p in backend if p.status == "Running")
        return len(backend) >= 2 and running_count >= 2

    if task_id == "incident":
        # Every key service that has pods must keep >= 80% of them Running.
        # NOTE(review): list includes "frontend" while the incident objective
        # text mentions "data-processor" - confirm which is intended.
        pods = world.get_pods()
        for svc in ["auth-service", "api-gateway", "frontend"]:
            members = [p for p in pods if p.deployment == svc]
            running_count = sum(1 for p in members if p.status == "Running")
            if members and running_count < len(members) * 0.8:
                return False
        return True

    return False
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
@app.get("/state")
async def get_state():
    """Return full current simulator state.

    Serializes the observation for the currently active objective; requires
    that startup (or a prior /reset) has created the world.
    """
    global world_instance, current_objective

    if world_instance is None:
        raise HTTPException(status_code=500, detail="World not initialized")

    return world_instance.get_observation(current_objective).model_dump()
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
if __name__ == "__main__":
    # Local entry point: serve the FastAPI app on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Conditions - Failure injectors"""

from .crash_loop import CrashLoopCondition
from .oom_kill import OOMKillCondition
from .node_failure import NodeFailureCondition
from .cascade_failure import CascadeFailureCondition

# Public API: the condition submodules plus the injector classes they export.
__all__ = [
    "crash_loop",
    "oom_kill",
    "node_failure",
    "cascade_failure",
    "CrashLoopCondition",
    "OOMKillCondition",
    "NodeFailureCondition",
    "CascadeFailureCondition",
]
|
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CascadeFailureCondition - Simulates multi-service dependency failure
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Any, Optional
|
| 6 |
+
from ..COEnv_environment import World
|
| 7 |
+
import random
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class CascadeFailureCondition:
    """Injects cascading failures across multiple services.

    One root-cause service is driven into OOM failure; afterwards every other
    deployment independently fails (crash loop, OOM, or resource inflation)
    with probability ``failure_probability``.
    """

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, root_cause_service: Optional[str] = None, failure_probability: Optional[float] = None):
        """
        Inject cascading failures starting from a root cause service

        Args:
            root_cause_service: Specific service to start failure (None for random)
            failure_probability: Probability of failure propagating to dependencies (0.0-1.0)
        """
        if failure_probability is None:
            failure_probability = self.config.get("cascade_failure_probability", 0.7)
        else:
            failure_probability = float(failure_probability)

        if root_cause_service is None:
            # Prefer a known critical service as the root cause; fall back to
            # any deployment, and to "frontend" when the cluster is empty.
            critical_services = ["auth-service", "database", "api-gateway"]
            deployments = self.world.get_deployments()
            critical_deployments = [d for d in deployments if d.name in critical_services]
            if critical_deployments:
                root_cause_service = random.choice(critical_deployments).name
            else:
                root_cause_service = random.choice(deployments).name if deployments else "frontend"

        root_deployment = next((d for d in self.world.get_deployments() if d.name == root_cause_service), None)
        if root_deployment:
            # BUG FIX: oom_kill is a sibling module of this file, so the import
            # must be single-dot relative (".oom_kill"), not "..oom_kill".
            from .oom_kill import OOMKillCondition
            oom_condition = OOMKillCondition(self.world, self.config)
            oom_condition.inject(target_deployment=root_cause_service, failure_rate=0.8)

        self._add_cascade_event(f"Root cause failure in {root_cause_service}", "Warning")

        deployments = self.world.get_deployments()
        for deployment in deployments:
            # failure_probability is always a float by this point, so the old
            # "is not None" guard was dead code and has been dropped.
            if deployment.name != root_cause_service and random.random() < failure_probability:
                failure_type = random.choice(["crashloop", "oom", "slow"])

                if failure_type == "crashloop":
                    # BUG FIX: sibling module - single-dot relative import.
                    from .crash_loop import CrashLoopCondition
                    condition = CrashLoopCondition(self.world, self.config)
                    condition.inject(target_deployment=deployment.name, failure_rate=0.6)
                elif failure_type == "oom":
                    # BUG FIX: sibling module - single-dot relative import.
                    from .oom_kill import OOMKillCondition
                    condition = OOMKillCondition(self.world, self.config)
                    condition.inject(target_deployment=deployment.name, failure_rate=0.6)
                else:
                    # "slow": inflate resource requests on one pod to simulate
                    # degradation without a hard crash.
                    pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]
                    for pod in pods[:1]:
                        patch = {
                            "cpu_request": int(pod.cpu_request * 1.5) if pod.cpu_request else 750,
                            "mem_request": int(pod.mem_request * 1.5) if pod.mem_request else 384
                        }
                        self.world.apply_patch("pod", pod.name, patch)

                self._add_cascade_event(f"Cascading failure detected in {deployment.name}", "Warning")

    def _add_cascade_event(self, message: str, event_type: str):
        """Record a cluster-scoped CascadeFailure event on the world."""
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-cascade-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type=event_type,
            reason="CascadeFailure",
            message=message,
            involved_object="cluster"
        )
        self.world.events.append(event)
|
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CrashLoopCondition - Simulates pods stuck in CrashLoopBackOff
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Any, Optional
|
| 6 |
+
from ..COEnv_environment import World
|
| 7 |
+
import random
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class CrashLoopCondition:
    """Injects CrashLoopBackOff failures into the pods of a deployment."""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None):
        """
        Inject crash loop failures into pods

        Args:
            target_deployment: Specific deployment to target (None for random)
            failure_rate: Probability of each pod failing (0.0-1.0)
        """
        if failure_rate is None:
            failure_rate = self.config.get("crash_loop_failure_rate", 0.8)
        else:
            failure_rate = float(failure_rate)

        deployments = self.world.get_deployments()

        if target_deployment is not None:
            target_deps = [d for d in deployments if d.name == target_deployment]
        else:
            target_deps = [random.choice(deployments)] if deployments else []

        for deployment in target_deps:
            pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]

            for pod in pods:
                # failure_rate is always a float here, so the old "is not None"
                # guard was dead code and has been dropped.
                if random.random() < failure_rate:
                    patch = {
                        "status": "CrashLoopBackOff",
                        "restarts": random.randint(5, 20)
                    }
                    self.world.apply_patch("pod", pod.name, patch)
                    self._add_crashloop_event(pod.name)

    def _add_crashloop_event(self, pod_name: str):
        """Record a BackOff warning event on the world."""
        # BUG FIX: was "from .models import ClusterEvent", which resolves to a
        # non-existent conditions.models; models lives in the parent package,
        # matching cascade_failure.py's "from ..models" import.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-crashloop-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="BackOff",
            message=f"Back-off restarting failed container pod/{pod_name}",
            involved_object=pod_name
        )
        self.world.events.append(event)
|
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
NodeFailureCondition - Simulates node outages and scheduling disruption
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Any, Optional
|
| 6 |
+
from ..COEnv_environment import World
|
| 7 |
+
import random
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class NodeFailureCondition:
    """Injects node failures: marks nodes NotReady and evicts their pods."""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, target_node: Optional[str] = None, failure_rate: Optional[float] = None):
        """
        Inject node failures

        Args:
            target_node: Specific node to target (None for random)
            failure_rate: Probability of node failing (0.0-1.0)
        """
        if failure_rate is None:
            failure_rate = self.config.get("node_failure_rate", 0.3)
        else:
            failure_rate = float(failure_rate)

        nodes = self.world.get_nodes()

        if target_node:
            target_nodes = [n for n in nodes if n.name == target_node]
        else:
            # failure_rate is always a float here; sample each node independently.
            target_nodes = [n for n in nodes if random.random() < failure_rate]

        for node in target_nodes:
            # Take the node down and zero its reported resource usage.
            patch = {
                "status": "NotReady",
                "cpu_usage": 0.0,
                "mem_usage": 0.0
            }
            self.world.apply_patch("node", node.name, patch)

            # Evict the node's pods: unassigned and back to Pending.
            pods_on_node = [p for p in self.world.get_pods() if p.node == node.name]
            for pod in pods_on_node:
                patch = {
                    "node": None,
                    "status": "Pending"
                }
                self.world.apply_patch("pod", pod.name, patch)

            self._add_node_failure_event(node.name)

    def _add_node_failure_event(self, node_name: str):
        """Record a NodeNotReady warning event on the world."""
        # BUG FIX: was "from models import ClusterEvent" - an absolute import
        # that fails inside the package; models lives in the parent package,
        # matching cascade_failure.py's "from ..models" import.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-nodefail-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="NodeNotReady",
            message=f"Node {node_name} status is now: NodeNotReady",
            involved_object=node_name
        )
        self.world.events.append(event)
|
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OOMKillCondition - Simulates memory-limit failures causing repeated restarts
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, List, Any, Optional
|
| 6 |
+
from ..COEnv_environment import World
|
| 7 |
+
import random
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class OOMKillCondition:
    """Injects OOMKill failures: high restart counts plus a halved memory limit."""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config

    def inject(self, target_deployment: Optional[str] = None, failure_rate: Optional[float] = None):
        """
        Inject OOMKill failures into pods

        Args:
            target_deployment: Specific deployment to target (None for random)
            failure_rate: Probability of each pod failing (0.0-1.0)
        """
        if failure_rate is None:
            failure_rate = self.config.get("oom_kill_failure_rate", 0.6)
        else:
            # Ensure failure_rate is a float
            failure_rate = float(failure_rate)

        deployments = self.world.get_deployments()

        if target_deployment is not None:
            target_deps = [d for d in deployments if d.name == target_deployment]
        else:
            # Target a random deployment
            target_deps = [random.choice(deployments)] if deployments else []

        for deployment in target_deps:
            # Get pods for this deployment
            pods = [p for p in self.world.get_pods() if p.deployment == deployment.name]

            for pod in pods:
                # failure_rate is always a float here, so the old "is not None"
                # guard was dead code and has been dropped.
                if random.random() < failure_rate:
                    # Simulate OOMKill by setting high memory usage and restart count
                    patch = {
                        "status": "Running",  # OOMKill pods often show as Running but crash
                        "restarts": random.randint(10, 30)  # High restart count from OOM
                    }
                    self.world.apply_patch("pod", pod.name, patch)

                    # Also reduce the pod's memory limit to simulate the condition that caused OOM
                    mem_patch = {
                        "mem_limit": max(64, pod.mem_limit // 2) if pod.mem_limit else 128
                    }
                    self.world.apply_patch("pod", pod.name, mem_patch)

                    # Add event
                    self._add_oom_event(pod.name)

    def _add_oom_event(self, pod_name: str):
        """Record an OOMKilling warning event on the world."""
        # BUG FIX: was "from .models import ClusterEvent", which resolves to a
        # non-existent conditions.models; models lives in the parent package,
        # matching cascade_failure.py's "from ..models" import.
        from ..models import ClusterEvent
        from datetime import datetime

        event = ClusterEvent(
            event_id=f"event-oom-{random.randint(1000, 9999)}",
            timestamp=datetime.now().isoformat(),
            type="Warning",
            reason="OOMKilling",
            message=f"Container {pod_name} exceeded memory limit",
            involved_object=pod_name
        )
        self.world.events.append(event)
|
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Executor - Action execution bridge"""
|
| 2 |
+
|
| 3 |
+
from typing import Dict, Any, Optional
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ExecutionResult:
    """Container for the outcome of one executed action."""

    def __init__(self, observation, reward: float, done: bool, info: Dict[str, Any]):
        # Post-action observation of the world.
        self.observation = observation
        # Graded reward for the resulting state.
        self.reward = reward
        # Whether the episode has terminated.
        self.done = done
        # Free-form metadata describing what the action did (or its error).
        self.info = info
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Executor:
    """Maps validated actions to world method calls and grades the result."""

    def __init__(self, world, grader):
        # world: simulator facade (scale/delete_pod/apply_patch/... plus tick,
        #        step_count, get_full_state, get_pods, get_observation).
        # grader: object exposing grade(state, step, max_steps) -> float.
        self.world = world
        self.grader = grader

    def execute(self, action: Dict[str, Any], task_id: str, max_steps: int) -> ExecutionResult:
        """
        Execute an action and return the result

        Args:
            action: The action to execute
            task_id: Current task ID for grading
            max_steps: Maximum steps for the episode

        Returns:
            ExecutionResult with observation, reward, done, and info
        """
        action_type = action.get("action_type", "")
        info = {}
        penalty = 0.0  # subtracted from the graded reward on invalid/failed actions
        done = False

        try:
            if action_type == "scale":
                deployment = action.get("deployment", "")
                replicas = action.get("replicas", 1)
                self.world.scale(deployment, replicas)
                info["scaled"] = deployment
                info["replicas"] = replicas

            elif action_type == "delete_pod":
                pod_name = action.get("pod_name", "")
                self.world.delete_pod(pod_name)
                info["deleted"] = pod_name

            elif action_type == "patch":
                resource_type = action.get("resource_type", "")
                name = action.get("name", "")
                patch = action.get("patch", {})
                self.world.apply_patch(resource_type, name, patch)
                info["patched"] = f"{resource_type}/{name}"

            elif action_type == "rollout_restart":
                deployment = action.get("deployment", "")
                self.world.rollout_restart(deployment)
                info["restarted"] = deployment

            elif action_type == "drain_node":
                node_name = action.get("node_name", "")
                self.world.apply_patch("node", node_name, {"status": "SchedulingDisabled"})
                info["drained"] = node_name

            elif action_type == "set_hpa":
                deployment = action.get("deployment", "")
                min_replicas = action.get("min_replicas", 1)
                max_replicas = action.get("max_replicas", 10)
                cpu_target = action.get("cpu_target_percent", 80)
                hpa_name = f"{deployment}-hpa"
                self.world.apply_patch("hpa", hpa_name, {
                    "min_replicas": min_replicas,
                    "max_replicas": max_replicas,
                    "cpu_target_percent": cpu_target
                })
                info["hpa_set"] = deployment

            elif action_type == "describe":
                # Investigation action - no state change
                resource_type = action.get("resource_type", "")
                name = action.get("name", "")
                info["described"] = f"{resource_type}/{name}"

            else:
                info["error"] = f"Unknown action type: {action_type}"
                penalty = 0.1

        except Exception as e:
            # Deliberate best-effort: record the failure instead of propagating.
            info["error"] = str(e)
            penalty = 0.1

        # Always advance time after an action
        self.world.tick()

        # BUG FIX: the -0.1 penalty used to be assigned to `reward` and then
        # unconditionally overwritten by the grade; apply it on top instead.
        world_state = self.world.get_full_state()
        reward = self.grader.grade(world_state, self.world.step_count, max_steps) - penalty

        # Episode ends when the step budget is exhausted...
        if self.world.step_count >= max_steps:
            done = True

        # ...or when the task-specific success condition is met.
        if self._check_task_complete(task_id):
            done = True

        observation = self.world.get_observation()

        return ExecutionResult(
            observation=observation,
            reward=reward,
            done=done,
            info=info
        )

    def _check_task_complete(self, task_id: str) -> bool:
        """Check whether the current task's success condition is met."""
        pods = self.world.get_pods()

        if task_id == "pod_recovery":
            frontend_pods = [p for p in pods if p.deployment == "frontend"]
            running = [p for p in frontend_pods if p.status == "Running"]
            return len(frontend_pods) > 0 and len(running) == len(frontend_pods)

        elif task_id == "autoscaling":
            backend_pods = [p for p in pods if p.deployment == "backend"]
            running = [p for p in backend_pods if p.status == "Running"]
            return len(backend_pods) >= 2 and len(running) >= 2

        elif task_id == "incident":
            # NOTE(review): checks "frontend" while the incident objective text
            # mentions "data-processor" - confirm which list is intended.
            key_services = ["auth-service", "api-gateway", "frontend"]
            for svc in key_services:
                svc_pods = [p for p in pods if p.deployment == svc]
                running = [p for p in svc_pods if p.status == "Running"]
                if svc_pods and len(running) < len(svc_pods) * 0.8:
                    return False
            return True

        return False
|
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Graders"""

from .grader_pod_recovery import grade as pod_recovery_grade
from .grader_autoscaling import grade as autoscaling_grade
from .grader_incident import grade as incident_grade

# Public API: the grader submodules plus the aliased grade functions.
__all__ = [
    "grader_pod_recovery",
    "grader_autoscaling",
    "grader_incident",
    "pod_recovery_grade",
    "autoscaling_grade",
    "incident_grade",
]
|
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for Autoscaling Task
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def grade(world_state: Dict[str, Any], step: int, max_steps: int) -> float:
    """Grade the autoscaling task.

    Scoring: 40% replica availability on the backend deployment, 40% backend
    pod health, 20% step efficiency. The result is capped at 1.0.
    """
    backend = next(
        (d for d in world_state.get("deployments", []) if d.get("name") == "backend"),
        None,
    )
    if backend is None:
        # No backend deployment at all -> nothing to score.
        return 0.0

    score = 0.0

    # 40%: available vs desired replicas, capped at a full ratio.
    desired = backend.get("desired_replicas", 0)
    available = backend.get("available_replicas", 0)
    if desired > 0:
        score += min(available / desired, 1.0) * 0.4

    # 40%: fraction of backend pods actually Running.
    backend_pods = [p for p in world_state.get("pods", []) if p.get("deployment") == "backend"]
    if backend_pods:
        running = sum(1 for p in backend_pods if p.get("status") == "Running")
        score += (running / len(backend_pods)) * 0.4

    # 20%: finishing in fewer steps earns a larger bonus.
    if max_steps > 0:
        score += (1.0 - step / max_steps) * 0.2

    return min(score, 1.0)
|
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for Incident Task
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def grade(world_state: Dict[str, Any], step: int, max_steps: int) -> float:
    """Grade the incident task.

    Scoring: 60% healthy key services (>=80% of desired replicas available),
    30% absence of CrashLoopBackOff pods among key services, 10% step
    efficiency. Returns a value in [0.0, 1.0].
    """
    key_services = ["auth-service", "api-gateway", "frontend"]
    score = 0.0

    # 60%: proportion of key services with enough available replicas.
    deployments = world_state.get("deployments", [])
    healthy = 0
    for service_name in key_services:
        dep = next((d for d in deployments if d.get("name") == service_name), None)
        if dep is None:
            continue
        desired = dep.get("desired_replicas", 0)
        # A service counts as healthy at >=80% of desired replicas available.
        if desired > 0 and dep.get("available_replicas", 0) / desired >= 0.8:
            healthy += 1
    score += (healthy / len(key_services)) * 0.6

    # 30%: penalize crash-looping pods among the key services (inverse ratio).
    pods_in_key = [p for p in world_state.get("pods", [])
                   if p.get("deployment") in key_services]
    if pods_in_key:
        crashing = sum(1 for p in pods_in_key if p.get("status") == "CrashLoopBackOff")
        score += (1.0 - crashing / len(pods_in_key)) * 0.3

    # 10%: finishing in fewer steps earns a larger bonus.
    if max_steps > 0:
        score += (1.0 - step / max_steps) * 0.1

    return min(score, 1.0)
|
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader for Pod Recovery Task
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def grade(world_state: Dict[str, Any], step: int, max_steps: int) -> float:
    """Grade the pod recovery task.

    Scoring: 50% fraction of frontend pods Running, a 40% bonus when every
    frontend pod is Running, and 10% step efficiency. Returns a value in
    [0.0, 1.0].
    """
    frontend = [p for p in world_state.get("pods", [])
                if p.get("deployment") == "frontend"]
    running = [p for p in frontend if p.get("status") == "Running"]

    score = 0.0
    if frontend:
        # 50%: partial credit proportional to the running fraction.
        score += (len(running) / len(frontend)) * 0.5
        # 40%: full-recovery bonus.
        if len(running) == len(frontend):
            score += 0.4

    # 10%: efficiency bonus (faster is better).
    if max_steps > 0:
        score += (1.0 - step / max_steps) * 0.1

    return min(score, 1.0)
|
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
KubeSimEnv Models - Pydantic models for OpenEnv compliance
|
| 3 |
+
All typed models are mandatory for OpenEnv spec compliance.
|
| 4 |
+
Every endpoint uses these.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from typing import List, Dict, Any, Optional, Literal
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class NodeStatus(BaseModel):
    """Status of a Kubernetes node in the simulated cluster."""
    name: str
    status: Literal["Ready", "NotReady", "Unknown", "SchedulingDisabled"]
    cpu_capacity: int  # in cores
    mem_capacity: int  # in MB
    cpu_usage: float = Field(ge=0, le=100)  # percentage, validated to 0-100
    mem_usage: float = Field(ge=0, le=100)  # percentage, validated to 0-100
    last_updated: str  # ISO timestamp
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class PodStatus(BaseModel):
    """Status of a Kubernetes pod, including resource requests/limits."""
    name: str
    status: Literal["Pending", "Running", "Succeeded", "Failed", "Unknown", "CrashLoopBackOff"]
    node: Optional[str] = None  # node the pod is scheduled on, if any
    restarts: int = 0  # container restart count
    cpu_request: int = Field(default=0)  # in millicores
    mem_request: int = Field(default=0)  # in MB
    cpu_limit: Optional[int] = Field(default=None)  # in millicores
    mem_limit: Optional[int] = Field(default=None)  # in MB
    deployment: Optional[str] = None  # owning deployment, if any
    last_updated: str  # ISO timestamp
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class DeploymentStatus(BaseModel):
    """Status of a Kubernetes deployment."""
    name: str
    desired_replicas: int  # target replica count
    available_replicas: int  # replicas currently available
    image: str  # container image reference
    last_updated: str  # ISO timestamp
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class ServiceStatus(BaseModel):
    """Status of a Kubernetes service."""
    name: str
    type: Literal["ClusterIP", "NodePort", "LoadBalancer", "ExternalName"]
    ports: List[Dict[str, Any]]  # port specs (free-form dicts)
    selector: Optional[Dict[str, str]] = None  # label selector, if any
    cluster_ip: Optional[str] = None
    last_updated: str  # ISO timestamp
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class ConfigMapStatus(BaseModel):
    """Status of a Kubernetes ConfigMap."""
    name: str
    data: Dict[str, str]  # key/value configuration entries
    last_updated: str  # ISO timestamp
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class HPAStatus(BaseModel):
    """Status of a HorizontalPodAutoscaler."""
    name: str
    min_replicas: int  # lower scaling bound
    max_replicas: int  # upper scaling bound
    current_replicas: int
    cpu_target_percent: int  # CPU utilization target driving the autoscaler
    last_updated: str  # ISO timestamp
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class ClusterEvent(BaseModel):
    """Kubernetes-style event emitted by the simulation."""
    event_id: str  # unique identifier of the event
    timestamp: str  # ISO timestamp
    type: Literal["Normal", "Warning"]
    reason: str  # short machine-readable cause (e.g. "OOMKilled")
    message: str  # human-readable description
    involved_object: str  # name of the resource the event refers to
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class ClusterObservation(BaseModel):
    """Main observation model - a typed snapshot of the whole simulated cluster."""
    nodes: List[NodeStatus]
    pods: List[PodStatus]
    deployments: List[DeploymentStatus]
    services: List[ServiceStatus]
    configmaps: List[ConfigMapStatus]
    hpas: List[HPAStatus]
    events: List[ClusterEvent]
    step: int  # current episode step
    objective: str  # natural-language task objective shown to the agent
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class RewardSignal(BaseModel):
    """Reward signal returned by step()."""
    reward: float  # scalar reward for the step
    done: bool  # True when the episode has terminated
    info: Dict[str, Any] = Field(default_factory=dict)  # auxiliary diagnostics
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# Action Models - These represent the structured action space
|
| 104 |
+
class KubeAction(BaseModel):
    """Base action model; action_type discriminates the concrete subclass."""
    action_type: Literal[
        "scale", "delete_pod", "patch", "rollout_restart",
        "set_hpa", "drain_node", "describe"
    ]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class ScaleAction(KubeAction):
    """Scale a deployment to a specific replica count."""
    deployment: str  # target deployment name
    replicas: int  # desired replica count
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class DeletePodAction(KubeAction):
    """Delete a specific pod by name."""
    pod_name: str
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class PatchAction(KubeAction):
    """Patch a resource with specific field changes."""
    resource_type: Literal["deployment", "pod", "node", "service"]
    name: str  # name of the resource to patch
    patch: Dict[str, Any]  # free-form field updates to apply
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class RolloutRestartAction(KubeAction):
    """Restart a deployment rollout (recreates its pods)."""
    deployment: str
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class SetHPAAction(KubeAction):
    """Set a HorizontalPodAutoscaler for a deployment."""
    deployment: str  # target deployment name
    min_replicas: int  # lower scaling bound
    max_replicas: int  # upper scaling bound
    cpu_target_percent: int  # CPU utilization target
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class DrainNodeAction(KubeAction):
    """Drain a node (evict all pods scheduled on it)."""
    node_name: str
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class DescribeAction(KubeAction):
    """Describe/get details of a resource."""
    resource_type: Literal["deployment", "pod", "node", "service"]
    name: str  # name of the resource to describe
|
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
openenv[core]>=0.2.0
|
| 2 |
-
fastapi>=0.115.0
|
| 3 |
-
uvicorn>=0.24.0
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Tasks"""
|
| 2 |
+
|
| 3 |
+
__all__ = ["task_pod_recovery", "task_autoscaling", "task_incident"]
|
| 4 |
+
|
| 5 |
+
from .task_pod_recovery import PodRecoveryTask
|
| 6 |
+
from .task_autoscaling import AutoscalingTask
|
| 7 |
+
from .task_incident import IncidentTask
|
| 8 |
+
|
| 9 |
+
__all__ += ["PodRecoveryTask", "AutoscalingTask", "IncidentTask"]
|
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Autoscaling Task - Medium difficulty
|
| 3 |
+
Configure HPA to handle traffic spike
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
from ..COEnv_environment import World
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class AutoscalingTask:
    """Autoscaling task (medium difficulty): configure HPA to handle a traffic spike."""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config
        self.task_id = "autoscaling"
        self.description = "Configure HPA to handle traffic spike"
        # BUG FIX: initialize the objective here so get_observation() cannot
        # raise AttributeError when called before reset().
        self.objective = ""

    def reset(self):
        """Reset the world to healthy, then inflate backend resource usage to simulate a spike."""
        self.world.reset_to_healthy()

        backend_pods = [p for p in self.world.get_pods() if p.deployment == "backend"]
        for pod in backend_pods:
            # Triple the requests (with fixed fallbacks for pods that had none)
            # so the backend looks overloaded until autoscaling kicks in.
            patch = {
                "cpu_request": int(pod.cpu_request * 3) if pod.cpu_request else 750,
                "mem_request": int(pod.mem_request * 3) if pod.mem_request else 384,
            }
            self.world.apply_patch("pod", pod.name, patch)

        self.objective = (
            "Backend service is overloaded due to traffic spike. Configure HPA to "
            "automatically scale the backend deployment based on CPU utilization."
        )

    def is_complete(self) -> bool:
        """Task is complete when the backend deployment exists and has at least 2 running pods."""
        backend_deployment = next((d for d in self.world.get_deployments() if d.name == "backend"), None)
        if not backend_deployment:
            return False

        running = [p for p in self.world.get_pods()
                   if p.deployment == "backend" and p.status == "Running"]
        return len(running) >= 2

    def get_observation(self) -> Dict[str, Any]:
        """Return the full world state plus the current task objective."""
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
|
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Incident Task - Hard difficulty
|
| 3 |
+
Handle multi-service cascading incident
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
from ..COEnv_environment import World
|
| 8 |
+
from ..conditions.cascade_failure import CascadeFailureCondition
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class IncidentTask:
    """Incident task (hard difficulty): handle a multi-service cascading failure."""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config
        self.task_id = "incident"
        self.description = "Handle multi-service cascading incident"
        # BUG FIX: initialize the objective here so get_observation() cannot
        # raise AttributeError when called before reset().
        self.objective = ""

    def reset(self):
        """Reset the world to healthy, then inject a cascade failure rooted at auth-service."""
        self.world.reset_to_healthy()

        cascade_condition = CascadeFailureCondition(self.world, self.config)
        cascade_condition.inject(root_cause_service="auth-service", failure_probability=0.6)

        self.objective = (
            "Auth-service OOMKill has caused cascading failures. Identify the root "
            "cause, fix memory limits, restart workloads, and verify downstream recovery."
        )

    def is_complete(self) -> bool:
        """Complete when at least ~2/3 of the key services are back to >=80% of desired replicas."""
        key_services = ["auth-service", "api-gateway", "frontend"]
        healthy_services = 0

        for service_name in key_services:
            deployment = next((d for d in self.world.get_deployments() if d.name == service_name), None)
            if deployment:
                running_pods = [p for p in self.world.get_pods()
                                if p.deployment == service_name and p.status == "Running"]
                # A service counts as healthy at >=80% of its desired replicas running.
                if len(running_pods) >= deployment.desired_replicas * 0.8:
                    healthy_services += 1

        return healthy_services >= len(key_services) * 0.67

    def get_observation(self) -> Dict[str, Any]:
        """Return the full world state plus the current task objective."""
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
|
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pod Recovery Task - Easy difficulty
|
| 3 |
+
Fix crash-looping pods by identifying and patching bad configuration
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Dict, Any
|
| 7 |
+
from ..COEnv_environment import World
|
| 8 |
+
from ..conditions.crash_loop import CrashLoopCondition
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class PodRecoveryTask:
    """Pod recovery task (easy difficulty): fix crash-looping frontend pods."""

    def __init__(self, world: World, config: Dict[str, Any]):
        self.world = world
        self.config = config
        self.task_id = "pod_recovery"
        self.description = "Fix crash-looping pods by identifying and patching bad configuration"
        # BUG FIX: initialize the objective here so get_observation() cannot
        # raise AttributeError when called before reset().
        self.objective = ""

    def reset(self):
        """Reset the world to healthy, then inject a crash loop into the frontend deployment."""
        self.world.reset_to_healthy()

        crash_loop_condition = CrashLoopCondition(self.world, self.config)
        crash_loop_condition.inject(target_deployment="frontend", failure_rate=0.8)

        self.objective = (
            "All frontend pods should be running. Investigate the CrashLoopBackOff "
            "pods and fix the configuration issue."
        )

    def is_complete(self) -> bool:
        """Complete when at least one frontend pod exists and every one of them is Running."""
        frontend = [p for p in self.world.get_pods() if p.deployment == "frontend"]
        if not frontend:
            return False
        return all(p.status == "Running" for p in frontend)

    def get_observation(self) -> Dict[str, Any]:
        """Return the full world state plus the current task objective."""
        observation = self.world.get_full_state()
        observation["objective"] = self.objective
        return observation
|
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
KubeSimEnv Utils - Probability helpers and simulation utilities
|
| 3 |
+
Random failure rate generators, latency simulators, resource usage curves.
|
| 4 |
+
Makes the simulation feel realistic and non-deterministic in the right ways.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import random
|
| 8 |
+
import math
|
| 9 |
+
from typing import Dict, List, Any, Optional
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ProbabilityHelpers:
    """Helpers for generating realistic probabilities and distributions."""

    @staticmethod
    def weighted_random_choice(choices: List[Any], weights: List[float]) -> Any:
        """Make a weighted random choice.

        Falls back to a uniform choice when weights are missing or mismatched,
        or when they sum to zero; returns None for an empty choice list.
        """
        if not choices or not weights or len(choices) != len(weights):
            return random.choice(choices) if choices else None

        total_weight = sum(weights)
        if total_weight == 0:
            return random.choice(choices)

        r = random.random()
        cumulative = 0.0
        for choice, weight in zip(choices, weights):
            cumulative += weight / total_weight
            if r <= cumulative:
                return choice
        return choices[-1]  # guard against float rounding leaving r uncovered

    @staticmethod
    def exponential_backoff(attempt: int, base_delay: float = 1.0, max_delay: float = 60.0) -> float:
        """Calculate exponential backoff delay (base * 2**attempt), capped at max_delay."""
        return min(base_delay * (2 ** attempt), max_delay)

    @staticmethod
    def poisson_arrival_rate(lambda_rate: float, time_window: float) -> int:
        """Approximate a Poisson event count for the time window.

        Uses a normal approximation (mean = variance = lambda * window).
        BUG FIX: the Gaussian tail could previously yield a negative count;
        the result is now clamped to zero, since event counts cannot be negative.
        """
        mean = lambda_rate * time_window
        count = int(mean + random.gauss(0, math.sqrt(mean)))
        return max(0, count)

    @staticmethod
    def failure_probability_over_time(base_rate: float, time_elapsed: float,
                                      max_rate: float = 1.0) -> float:
        """Failure probability that grows logarithmically with elapsed time, capped at max_rate."""
        return min(base_rate * (1 + math.log(1 + time_elapsed)), max_rate)

    @staticmethod
    def random_failure_rate(min_rate: float = 0.1, max_rate: float = 0.9) -> float:
        """Generate a uniform random failure rate within [min_rate, max_rate]."""
        return random.uniform(min_rate, max_rate)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class LatencySimulator:
    """Simulates network and service latency under varying load."""

    def __init__(self, base_latency_ms: float = 50.0):
        self.base_latency_ms = base_latency_ms
        self.load_factor = 1.0  # 1.0 = nominal load

    def set_load(self, load_factor: float):
        """Set the system load factor (1.0 = normal, >1.0 = overloaded); floored at 0.1."""
        self.load_factor = max(0.1, load_factor)

    def get_latency(self) -> float:
        """Return a simulated latency sample in milliseconds (never below 1 ms)."""
        # Extra latency from overload, zero when the system is underloaded.
        overload = max(0, self.base_latency_ms * (self.load_factor - 1.0) * 2)
        jitter = random.gauss(0, self.base_latency_ms * 0.1)
        return max(1.0, self.base_latency_ms + overload + jitter)

    def get_latency_with_spike(self, spike_probability: float = 0.05,
                               spike_multiplier: float = 5.0) -> float:
        """Return a latency sample that is occasionally multiplied by spike_multiplier."""
        sample = self.get_latency()
        if random.random() < spike_probability:
            return sample * spike_multiplier
        return sample
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class ResourceUsageSimulator:
    """Simulates realistic CPU and memory usage patterns."""

    def __init__(self):
        # Random phase offset in [0, 2*pi).
        # NOTE(review): currently unused by the usage methods below — presumably
        # reserved for future pattern shifting; confirm before relying on it.
        self.time_offset = random.uniform(0, 2 * math.pi)

    def get_cpu_usage(self, base_usage: float = 0.3,
                      variation: float = 0.2) -> float:
        """Return CPU usage as a percentage (0-100): daily sinusoid plus Gaussian noise."""
        hour_of_day = (datetime.now().timestamp() / 3600) % 24
        daily_pattern = 0.5 * math.sin(2 * math.pi * hour_of_day / 24) + 0.5
        usage = base_usage + variation * daily_pattern + random.gauss(0, 0.05)
        return min(1.0, max(0.0, usage)) * 100  # clamp before scaling to percent

    def get_memory_usage(self, base_usage: float = 0.4,
                         variation: float = 0.15) -> float:
        """Return memory usage as a percentage (0-100); creeps up over the week like a slow leak.

        NOTE(review): *variation* is accepted but unused (matches the original
        behavior) — confirm whether it was intended to modulate the leak.
        """
        week_progress = min((datetime.now().timestamp() / 86400) % 7, 1.0)
        usage = base_usage + 0.1 * week_progress + random.gauss(0, 0.03)
        return min(1.0, max(0.0, usage)) * 100  # clamp before scaling to percent

    def get_resource_curve(self, resource_type: str,
                           time_elapsed: float) -> float:
        """Return a 0-1 usage value following a curve specific to the resource type."""
        if resource_type == "cpu":
            # Periodic with random bursts.
            return 0.3 + 0.4 * math.sin(time_elapsed / 100) + 0.2 * random.random()
        if resource_type == "memory":
            # Gradual saturation with occasional GC drops.
            filled = 0.2 + 0.6 * (1 - math.exp(-time_elapsed / 1000))
            gc_drop = 0.3 if random.random() < 0.01 else 0
            return max(0, filled - gc_drop)
        if resource_type == "disk":
            # Steady growth toward full.
            return 0.1 + 0.8 * min(time_elapsed / 10000, 1.0)
        return 0.5
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class NetworkSimulator:
    """Simulates network conditions and partitions."""

    def __init__(self):
        self.partition_probability = 0.01  # chance of a partition per check
        self.latency_ms = 10.0             # baseline latency
        self.bandwidth_mbps = 1000.0       # nominal link capacity

    def simulate_partition(self) -> bool:
        """Return True when a network partition is simulated on this check."""
        return random.random() < self.partition_probability

    def get_latency(self) -> float:
        """Return network latency in ms: jittered baseline with occasional 2-10x spikes."""
        sample = self.latency_ms + random.gauss(0, self.latency_ms * 0.2)
        if random.random() < 0.05:  # 5% chance of a spike
            sample *= random.uniform(2, 10)
        return max(1.0, sample)

    def get_bandwidth(self) -> float:
        """Return available bandwidth in Mbps, scaled by random usage and condition factors."""
        usage_factor = random.uniform(0.3, 0.9)
        condition_factor = random.uniform(0.8, 1.2)
        return self.bandwidth_mbps * usage_factor * condition_factor
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def generate_failure_scenario(config: Dict[str, Any]) -> Dict[str, Any]:
    """Generate a random failure scenario, with type-specific rates drawn from config."""
    kind = random.choice(["crashloop", "oom", "node_failure", "cascade"])
    scenario: Dict[str, Any] = {
        "type": kind,
        "severity": random.uniform(0.3, 0.9),
        "duration": random.randint(30, 300),  # seconds
        "affected_components": [],
    }

    # Type-specific tunables, overridable via config.
    if kind == "cascade":
        scenario["probability"] = config.get("cascade_failure_probability", 0.5)
    else:
        config_key, default_rate = {
            "crashloop": ("crash_loop_failure_rate", 0.7),
            "oom": ("oom_kill_failure_rate", 0.6),
            "node_failure": ("node_failure_rate", 0.4),
        }[kind]
        scenario["failure_rate"] = config.get(config_key, default_rate)

    return scenario
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def apply_realistic_noise(value: float, noise_percent: float = 10.0) -> float:
    """Apply Gaussian noise (stddev = noise_percent% of value); result is floored at 0."""
    sigma = value * (noise_percent / 100.0)
    return max(0, value + random.gauss(0, sigma))
|
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Validator - Action validation"""
|
| 2 |
+
|
| 3 |
+
from typing import Dict, Any, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Validator:
|
| 7 |
+
"""Validates actions before execution"""
|
| 8 |
+
|
| 9 |
+
    def __init__(self, world):
        """Bind the validator to the simulated cluster world it checks actions against."""
        self.world = world
|
| 11 |
+
|
| 12 |
+
def validate(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 13 |
+
"""
|
| 14 |
+
Validate an action before execution
|
| 15 |
+
|
| 16 |
+
Returns:
|
| 17 |
+
(is_valid, error_message)
|
| 18 |
+
"""
|
| 19 |
+
action_type = action.get("action_type", "")
|
| 20 |
+
|
| 21 |
+
if action_type == "scale":
|
| 22 |
+
return self._validate_scale(action)
|
| 23 |
+
elif action_type == "delete_pod":
|
| 24 |
+
return self._validate_delete_pod(action)
|
| 25 |
+
elif action_type == "patch":
|
| 26 |
+
return self._validate_patch(action)
|
| 27 |
+
elif action_type == "rollout_restart":
|
| 28 |
+
return self._validate_rollout_restart(action)
|
| 29 |
+
elif action_type == "set_hpa":
|
| 30 |
+
return self._validate_set_hpa(action)
|
| 31 |
+
elif action_type == "drain_node":
|
| 32 |
+
return self._validate_drain_node(action)
|
| 33 |
+
elif action_type == "describe":
|
| 34 |
+
return self._validate_describe(action)
|
| 35 |
+
else:
|
| 36 |
+
return False, f"Unknown action type: {action_type}"
|
| 37 |
+
|
| 38 |
+
def _validate_scale(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 39 |
+
deployment = action.get("deployment", "")
|
| 40 |
+
replicas = action.get("replicas", 0)
|
| 41 |
+
|
| 42 |
+
if not deployment:
|
| 43 |
+
return False, "Deployment name is required"
|
| 44 |
+
|
| 45 |
+
if replicas < 0 or replicas > 100:
|
| 46 |
+
return False, "Replicas must be between 0 and 100"
|
| 47 |
+
|
| 48 |
+
# Check if deployment exists
|
| 49 |
+
deployments = self.world.get_deployments()
|
| 50 |
+
if not any(d.name == deployment for d in deployments):
|
| 51 |
+
return False, f"Deployment '{deployment}' does not exist"
|
| 52 |
+
|
| 53 |
+
return True, None
|
| 54 |
+
|
| 55 |
+
def _validate_delete_pod(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 56 |
+
pod_name = action.get("pod_name", "")
|
| 57 |
+
|
| 58 |
+
if not pod_name:
|
| 59 |
+
return False, "Pod name is required"
|
| 60 |
+
|
| 61 |
+
# Check if pod exists
|
| 62 |
+
pods = self.world.get_pods()
|
| 63 |
+
if not any(p.name == pod_name for p in pods):
|
| 64 |
+
return False, f"Pod '{pod_name}' does not exist"
|
| 65 |
+
|
| 66 |
+
return True, None
|
| 67 |
+
|
| 68 |
+
def _validate_patch(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 69 |
+
resource_type = action.get("resource_type", "")
|
| 70 |
+
name = action.get("name", "")
|
| 71 |
+
|
| 72 |
+
if not resource_type:
|
| 73 |
+
return False, "Resource type is required"
|
| 74 |
+
|
| 75 |
+
if not name:
|
| 76 |
+
return False, "Resource name is required"
|
| 77 |
+
|
| 78 |
+
# Check if resource exists
|
| 79 |
+
if resource_type == "deployment":
|
| 80 |
+
deployments = self.world.get_deployments()
|
| 81 |
+
if not any(d.name == name for d in deployments):
|
| 82 |
+
return False, f"Deployment '{name}' does not exist"
|
| 83 |
+
elif resource_type == "pod":
|
| 84 |
+
pods = self.world.get_pods()
|
| 85 |
+
if not any(p.name == name for p in pods):
|
| 86 |
+
return False, f"Pod '{name}' does not exist"
|
| 87 |
+
elif resource_type == "node":
|
| 88 |
+
nodes = self.world.get_nodes()
|
| 89 |
+
if not any(n.name == name for n in nodes):
|
| 90 |
+
return False, f"Node '{name}' does not exist"
|
| 91 |
+
elif resource_type == "service":
|
| 92 |
+
services = self.world.get_services()
|
| 93 |
+
if not any(s.name == name for s in services):
|
| 94 |
+
return False, f"Service '{name}' does not exist"
|
| 95 |
+
elif resource_type == "configmap":
|
| 96 |
+
configmaps = self.world.get_configmaps()
|
| 97 |
+
if not any(cm.name == name for cm in configmaps):
|
| 98 |
+
return False, f"ConfigMap '{name}' does not exist"
|
| 99 |
+
|
| 100 |
+
return True, None
|
| 101 |
+
|
| 102 |
+
def _validate_rollout_restart(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 103 |
+
deployment = action.get("deployment", "")
|
| 104 |
+
|
| 105 |
+
if not deployment:
|
| 106 |
+
return False, "Deployment name is required"
|
| 107 |
+
|
| 108 |
+
# Check if deployment exists
|
| 109 |
+
deployments = self.world.get_deployments()
|
| 110 |
+
if not any(d.name == deployment for d in deployments):
|
| 111 |
+
return False, f"Deployment '{deployment}' does not exist"
|
| 112 |
+
|
| 113 |
+
return True, None
|
| 114 |
+
|
| 115 |
+
def _validate_set_hpa(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 116 |
+
deployment = action.get("deployment", "")
|
| 117 |
+
min_replicas = action.get("min_replicas", 0)
|
| 118 |
+
max_replicas = action.get("max_replicas", 0)
|
| 119 |
+
|
| 120 |
+
if not deployment:
|
| 121 |
+
return False, "Deployment name is required"
|
| 122 |
+
|
| 123 |
+
if min_replicas < 1 or min_replicas > 50:
|
| 124 |
+
return False, "Min replicas must be between 1 and 50"
|
| 125 |
+
|
| 126 |
+
if max_replicas < 1 or max_replicas > 100:
|
| 127 |
+
return False, "Max replicas must be between 1 and 100"
|
| 128 |
+
|
| 129 |
+
if min_replicas > max_replicas:
|
| 130 |
+
return False, "Min replicas cannot be greater than max replicas"
|
| 131 |
+
|
| 132 |
+
# Check if deployment exists
|
| 133 |
+
deployments = self.world.get_deployments()
|
| 134 |
+
if not any(d.name == deployment for d in deployments):
|
| 135 |
+
return False, f"Deployment '{deployment}' does not exist"
|
| 136 |
+
|
| 137 |
+
return True, None
|
| 138 |
+
|
| 139 |
+
def _validate_drain_node(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 140 |
+
node_name = action.get("node_name", "")
|
| 141 |
+
|
| 142 |
+
if not node_name:
|
| 143 |
+
return False, "Node name is required"
|
| 144 |
+
|
| 145 |
+
# Check if node exists
|
| 146 |
+
nodes = self.world.get_nodes()
|
| 147 |
+
if not any(n.name == node_name for n in nodes):
|
| 148 |
+
return False, f"Node '{node_name}' does not exist"
|
| 149 |
+
|
| 150 |
+
# Check if node is already drained/scheduling disabled
|
| 151 |
+
node = next((n for n in nodes if n.name == node_name), None)
|
| 152 |
+
if node and node.status == "SchedulingDisabled":
|
| 153 |
+
return False, f"Node '{node_name}' is already drained"
|
| 154 |
+
|
| 155 |
+
return True, None
|
| 156 |
+
|
| 157 |
+
def _validate_describe(self, action: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
|
| 158 |
+
resource_type = action.get("resource_type", "")
|
| 159 |
+
name = action.get("name", "")
|
| 160 |
+
|
| 161 |
+
if not resource_type:
|
| 162 |
+
return False, "Resource type is required"
|
| 163 |
+
|
| 164 |
+
if not name:
|
| 165 |
+
return False, "Resource name is required"
|
| 166 |
+
|
| 167 |
+
return True, None
|
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""COEnv Worker - Episode loop manager"""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
from typing import Dict, Any, Optional, List
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
class EpisodeResult:
    """Result of a single episode.

    Converted from a hand-written __init__ boilerplate class to a dataclass:
    fields keep the original constructor order, so positional construction
    stays backward compatible, and the class gains __repr__ and __eq__ for
    debugging and test assertions.
    """
    # Per-step rewards collected during the episode.
    rewards: List[float]
    # Number of steps actually executed.
    steps: int
    # Whether the episode terminated successfully.
    success: bool
    # Free-form diagnostic payload.
    info: Dict[str, Any]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Worker:
|
| 17 |
+
"""Manages the full lifecycle of a single episode"""
|
| 18 |
+
|
| 19 |
+
def __init__(self, world, executor, validator):
|
| 20 |
+
self.world = world
|
| 21 |
+
self.executor = executor
|
| 22 |
+
self.validator = validator
|
| 23 |
+
|
| 24 |
+
def run_episode(self, task_id: str, task_objective: str, max_steps: int,
|
| 25 |
+
get_action_fn=None) -> EpisodeResult:
|
| 26 |
+
"""
|
| 27 |
+
Run a single episode
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
task_id: Task ID to run
|
| 31 |
+
task_objective: Objective string for the task
|
| 32 |
+
max_steps: Maximum steps for the episode
|
| 33 |
+
get_action_fn: Function to get action from agent (if None, uses random)
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
EpisodeResult with rewards, steps, success, and info
|
| 37 |
+
"""
|
| 38 |
+
import random
|
| 39 |
+
|
| 40 |
+
# Reset world with task condition
|
| 41 |
+
observation = self.world.reset(task_objective)
|
| 42 |
+
rewards = []
|
| 43 |
+
steps = 0
|
| 44 |
+
info = {}
|
| 45 |
+
|
| 46 |
+
for step in range(1, max_steps + 1):
|
| 47 |
+
steps = step
|
| 48 |
+
|
| 49 |
+
# Get action from agent (or random for now)
|
| 50 |
+
if get_action_fn:
|
| 51 |
+
action = get_action_fn(observation)
|
| 52 |
+
else:
|
| 53 |
+
# Random action for testing
|
| 54 |
+
action = self._random_action()
|
| 55 |
+
|
| 56 |
+
# Validate action
|
| 57 |
+
is_valid, error_msg = self.validator.validate(action)
|
| 58 |
+
if not is_valid:
|
| 59 |
+
info["error"] = error_msg
|
| 60 |
+
# Still execute but with penalty
|
| 61 |
+
result = self.executor.execute(action, task_id, max_steps)
|
| 62 |
+
result.reward = -0.1 # Penalty for invalid action
|
| 63 |
+
else:
|
| 64 |
+
# Execute valid action
|
| 65 |
+
result = self.executor.execute(action, task_id, max_steps)
|
| 66 |
+
|
| 67 |
+
rewards.append(result.reward)
|
| 68 |
+
|
| 69 |
+
if result.done:
|
| 70 |
+
info["success"] = True
|
| 71 |
+
info["final_reward"] = result.reward
|
| 72 |
+
break
|
| 73 |
+
else:
|
| 74 |
+
info["success"] = False
|
| 75 |
+
info["final_reward"] = rewards[-1] if rewards else 0.0
|
| 76 |
+
|
| 77 |
+
return EpisodeResult(
|
| 78 |
+
rewards=rewards,
|
| 79 |
+
steps=steps,
|
| 80 |
+
success=info.get("success", False),
|
| 81 |
+
info=info
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
def _random_action(self) -> Dict[str, Any]:
|
| 85 |
+
"""Generate a random action for testing"""
|
| 86 |
+
import random
|
| 87 |
+
|
| 88 |
+
action_types = [
|
| 89 |
+
{"action_type": "describe", "resource_type": "deployment", "name": "frontend"},
|
| 90 |
+
{"action_type": "describe", "resource_type": "pod", "name": "frontend-abc123"},
|
| 91 |
+
{"action_type": "scale", "deployment": "frontend", "replicas": random.randint(1, 5)},
|
| 92 |
+
{"action_type": "scale", "deployment": "backend", "replicas": random.randint(1, 5)},
|
| 93 |
+
{"action_type": "delete_pod", "pod_name": "frontend-xyz789"},
|
| 94 |
+
{"action_type": "patch", "resource_type": "deployment", "name": "frontend",
|
| 95 |
+
"patch": {"desired_replicas": random.randint(1, 5)}},
|
| 96 |
+
{"action_type": "rollout_restart", "deployment": "frontend"},
|
| 97 |
+
{"action_type": "set_hpa", "deployment": "backend",
|
| 98 |
+
"min_replicas": 1, "max_replicas": 10, "cpu_target_percent": 80},
|
| 99 |
+
]
|
| 100 |
+
|
| 101 |
+
return random.choice(action_types)
|
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test - Environment (from test_world.py)
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'server'))
|
| 8 |
+
|
| 9 |
+
from COEnv_environment import World
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_world_initialization():
    """Test that the world initializes correctly.

    Builds a minimal config, constructs a World, and checks that all core
    resource collections are populated.
    """
    config = {
        "num_nodes": 2,
        "node_cpu_capacity": 4,
        "node_mem_capacity": 8192,
        "pod_cpu_request": 250,
        "pod_mem_request": 128,
        "pod_cpu_limit": 500,
        "pod_mem_limit": 256,
    }

    world = World(config)
    print("World initialized successfully")

    nodes = world.get_nodes()
    pods = world.get_pods()
    deployments = world.get_deployments()
    services = world.get_services()

    print(f"Nodes: {len(nodes)}")
    print(f"Pods: {len(pods)}")
    print(f"Deployments: {len(deployments)}")
    print(f"Services: {len(services)}")

    # Derive the expected node count from the config rather than repeating
    # the literal 2 — the original hard-coded it, so editing the config
    # would silently desynchronize the assertion.
    assert len(nodes) == config["num_nodes"]
    assert len(pods) > 0
    assert len(deployments) > 0
    assert len(services) > 0

    print("All tests passed!")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
    # Allow running this test module directly, without a test runner.
    test_world_initialization()
|