Spaces:
Paused
Paused
Commit ·
12263fa
1
Parent(s): 9c5fcc9
Add Cloud Arena Mathematical Model RL environment
Browse files- Dockerfile +20 -0
- README.md +16 -5
- app.py +96 -0
- cloud_arena/__init__.py +3 -0
- cloud_arena/environment.py +941 -0
- cloud_arena/evaluation.py +126 -0
- cloud_arena/training.py +130 -0
- cloud_arena/visualization.py +56 -0
- models/.gitkeep +0 -0
- outputs/.gitkeep +0 -0
- requirements.txt +9 -0
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /app

# System deps
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential git && \
    rm -rf /var/lib/apt/lists/*

# Python deps (installed system-wide as root so every user can import them)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# HF Spaces runs the container as UID 1000, not root. Create that user and
# hand it /app so the app can write models/, outputs/ and library caches.
RUN useradd --create-home --uid 1000 user && chown user:user /app
ENV HOME=/home/user

# App code (owned by the runtime user)
COPY --chown=user:user . .

USER user

# HF Spaces expects port 7860
EXPOSE 7860

CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,11 +1,22 @@
|
|
| 1 |
---
|
| 2 |
title: Openenv
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
short_description:
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Openenv
|
| 3 |
+
emoji: ☁️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
short_description: Cloud Arena Mathematical Model RL Training
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Cloud Arena — Mathematical Model RL Training
|
| 12 |
+
|
| 13 |
+
Multi-objective cloud operations RL environment trained with **MaskablePPO**.
|
| 14 |
+
|
| 15 |
+
This is the **Mathematical Model** (MLP + stable-baselines3), NOT the LLM model.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
- 125-dim observation space, 150 discrete actions
|
| 19 |
+
- 6-phase curriculum learning
|
| 20 |
+
- Action masking, fog-of-war, chaos events
|
| 21 |
+
- Boss fight evaluation scenarios
|
| 22 |
+
- Interactive training dashboard
|
app.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cloud Arena — Mathematical Model RL Training on HF Spaces
|
| 3 |
+
This is the MATHEMATICAL model (MaskablePPO + MLP), NOT the LLM model.
|
| 4 |
+
The LLM model (cell5_ppo.py) is a SEPARATE system.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
# Make sure the artifact directories exist before any handler writes to them.
for _artifact_dir in ("./models", "./outputs"):
    os.makedirs(_artifact_dir, exist_ok=True)

# Global state shared by the Gradio handlers
training_state = dict(model=None, callback=None, status="idle")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def run_training(timesteps):
    """Train the model and render the training dashboard (Gradio handler).

    Updates the module-level ``training_state`` with the trained model, the
    training callback and a status string ("training", "done" or "error").

    Args:
        timesteps: Requested number of training timesteps; converted with
            ``int()`` and required to be positive.

    Returns:
        A ``(summary_text, dashboard_image_path)`` tuple. On any failure the
        text is an error message and the image path is ``None``.
    """
    training_state["status"] = "training"
    try:
        # Lazy imports live inside the ``try`` so an import failure is shown in
        # the UI like every other error instead of escaping the handler. Both
        # are resolved up front so a broken install fails before a long run.
        from cloud_arena.training import train_model
        from cloud_arena.visualization import generate_dashboard

        ts = int(timesteps)
        if ts <= 0:
            raise ValueError(f"timesteps must be a positive integer, got {ts}")

        model, callback, _ = train_model(total_timesteps=ts)
        training_state["model"] = model
        training_state["callback"] = callback
        training_state["status"] = "done"

        img_path = generate_dashboard(callback, "outputs/dashboard.png")

        # np.mean([]) is NaN (plus a RuntimeWarning); report 0.0 when no
        # episode finished during training.
        savings = callback.episode_savings
        avg_savings = float(np.mean(savings)) if len(savings) else 0.0

        summary = (
            f"✅ Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {avg_savings:.1f}%"
        )
        return summary, img_path
    except Exception as e:
        # Top-level UI boundary: surface the message rather than crash the app.
        training_state["status"] = "error"
        return f"❌ Error: {e}", None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def run_evaluation():
    """Run the standard evaluation and format the averaged metrics (Gradio handler).

    Returns:
        A multi-line string with win rate, cost score, security score and
        savings, each averaged over the evaluation results, or an error
        message if anything fails.
    """
    try:
        # Imported inside the ``try`` so an import failure is reported in the
        # UI like any other error instead of escaping the handler.
        from cloud_arena.evaluation import evaluate_model

        results = evaluate_model()
        wr = np.mean(results["win"]) * 100
        cost = np.mean(results["cost_score"])
        sec = np.mean(results["security_score"])
        sav = np.mean(results["savings_pct"])
        return (
            f"Win Rate: {wr:.1f}%\n"
            f"Cost Score: {cost:.3f}\n"
            f"Security: {sec:.3f}\n"
            f"Savings: {sav:.1f}%"
        )
    except Exception as e:
        # Top-level UI boundary: surface the message rather than crash the app.
        return f"❌ Error: {e}"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def run_bosses():
    """Run the boss-fight scenarios and format one score line per boss (Gradio handler).

    Returns:
        A multi-line string with one ``name: score%`` line per boss followed
        by the overall mean, or an error message if anything fails.
    """
    try:
        # Imported inside the ``try`` so an import failure is reported in the
        # UI like any other error instead of escaping the handler.
        from cloud_arena.evaluation import run_boss_fights, BOSS_NAMES

        scores = run_boss_fights()
        if not scores:
            # Avoid np.mean([]) -> NaN and a misleading "Overall: nan%" line.
            return "❌ Error: no boss fight scores were returned"

        lines = [f"{BOSS_NAMES[k]}: {v:.1f}%" for k, v in scores.items()]
        overall = np.mean(list(scores.values()))
        lines.append(f"\nOverall: {overall:.1f}%")
        return "\n".join(lines)
    except Exception as e:
        # Top-level UI boundary: surface the message rather than crash the app.
        return f"❌ Error: {e}"
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Three-tab Gradio UI: each tab wires one button to one handler defined above.
with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo:
    gr.Markdown("# ☁️ Cloud Arena — Mathematical Model RL")
    gr.Markdown("MaskablePPO training on a multi-objective cloud ops environment.")

    # Training: timestep budget in, text summary + dashboard image out.
    with gr.Tab("Train"):
        ts_input = gr.Number(value=500000, label="Total Timesteps")
        train_btn = gr.Button("🚀 Start Training", variant="primary")
        train_output = gr.Textbox(label="Status", lines=6)
        train_img = gr.Image(label="Dashboard")
        train_btn.click(
            fn=run_training,
            inputs=[ts_input],
            outputs=[train_output, train_img],
        )

    # Standard evaluation: no inputs, text report out.
    with gr.Tab("Evaluate"):
        eval_btn = gr.Button("📊 Run Evaluation")
        eval_output = gr.Textbox(label="Results", lines=8)
        eval_btn.click(fn=run_evaluation, outputs=eval_output)

    # Boss-fight scenarios: no inputs, per-boss scores out.
    with gr.Tab("Boss Fights"):
        boss_btn = gr.Button("⚔️ Run Boss Fights")
        boss_output = gr.Textbox(label="Boss Scores", lines=8)
        boss_btn.click(fn=run_bosses, outputs=boss_output)

if __name__ == "__main__":
    # Bind on all interfaces on the port the Dockerfile exposes.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
cloud_arena/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from cloud_arena.environment import CloudArenaEnv, ResourceObject, get_action_masks
|
| 2 |
+
|
| 3 |
+
__all__ = ["CloudArenaEnv", "ResourceObject", "get_action_masks"]
|
cloud_arena/environment.py
ADDED
|
@@ -0,0 +1,941 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cloud Arena Environment — Mathematical Model RL
|
| 2 |
+
# Extracted from cloud_arena_final.py
|
| 3 |
+
# This is the MATHEMATICAL model env, NOT the LLM model.
|
| 4 |
+
|
| 5 |
+
import sys, math, random, copy
|
| 6 |
+
from collections import deque
|
| 7 |
+
from typing import Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import gymnasium as gym
|
| 11 |
+
from gymnasium import spaces
|
| 12 |
+
|
| 13 |
+
# ── Seeds ─────────────────────────────────────────────────────────────────────
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# ── Observation layout ────────────────────────────────────────────────────────
MAX_RES_IN_OBS = 8    # fixed obs slots (pad unused with zeros)
N_FEAT_PER_RES = 10   # features per resource slot in obs
N_BLOCK_B = 8         # global security block
N_BLOCK_C = 7         # global cost block
N_BLOCK_D = 6         # environment state block
N_BLOCK_E = 24        # history: 8 actions + 8 rewards + 8 progress

# 80 per-resource features + 8 + 7 + 6 + 24 = 125
OBS_DIM = sum((
    MAX_RES_IN_OBS * N_FEAT_PER_RES,
    N_BLOCK_B,
    N_BLOCK_C,
    N_BLOCK_D,
    N_BLOCK_E,
))

assert OBS_DIM == 125, f"OBS_DIM mismatch: {OBS_DIM}"

# ── Action space ──────────────────────────────────────────────────────────────
N_ACTION_TYPES = 15
MAX_RESOURCES = 10
N_ACTIONS = N_ACTION_TYPES * MAX_RESOURCES  # 150

# Action-type ids 0..14, in this fixed order.
(
    A_NOOP,
    A_ANALYZE,
    A_VERIFY_DEPS,
    A_RESIZE_DOWN,
    A_RESIZE_UP,
    A_STOP,
    A_RESTART,
    A_DELETE,
    A_PATCH,
    A_ENCRYPT,
    A_RESTRICT,
    A_ROTATE_CREDS,
    A_ENABLE_LOG,
    A_ARCHIVE,
    A_OPT_NET,
) = range(N_ACTION_TYPES)

# Action cost penalties (small friction — makes actions non-free)
ACTION_COSTS = {
    A_NOOP: 0.0,
    A_ANALYZE: -0.01,
    A_VERIFY_DEPS: -0.01,
    A_RESIZE_DOWN: -0.02,
    A_RESIZE_UP: -0.02,
    A_STOP: -0.03,
    A_RESTART: -0.02,
    A_DELETE: -0.05,
    A_PATCH: -0.02,
    A_ENCRYPT: -0.02,
    A_RESTRICT: -0.02,
    A_ROTATE_CREDS: -0.02,
    A_ENABLE_LOG: -0.01,
    A_ARCHIVE: -0.03,
    A_OPT_NET: -0.02,
}

# ── Curriculum (all tables are keyed by phase 0..5) ───────────────────────────
# Number of active resources per phase
N_RESOURCES_PHASE = dict(enumerate((4, 5, 6, 7, 8, 10)))

# Phase feature flags: fog from phase 1, random events from 2, chaos from 3
PHASE_FOG = dict(enumerate((False, True, True, True, True, True)))
PHASE_EVENTS = dict(enumerate((False, False, True, True, True, True)))
PHASE_CHAOS = dict(enumerate((False, False, False, True, True, True)))
CHAOS_INIT_PROB = dict(enumerate((0.0, 0.0, 0.0, 0.20, 0.30, 0.35)))

# Win thresholds: cost must drop to this fraction of initial AND security >= sec_thr
WIN_COST_THR = dict(enumerate((0.55, 0.60, 0.60, 0.65, 0.65, 0.70)))
WIN_SEC_THR = dict(enumerate((0.00, 0.60, 0.70, 0.70, 0.75, 0.80)))

MAX_STEPS = 150
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 67 |
+
# RESOURCE OBJECT
|
| 68 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 69 |
+
|
| 70 |
+
class ResourceObject:
    """One simulated cloud resource: cost, health, security and fog-of-war state.

    The environment mutates a resource through ``tick`` (once per step) and
    through the ``do_*`` action methods. Actions that free up cost return the
    saved amount as a float. A deleted resource costs nothing, so those
    actions (and restart) are no-ops on it and report no saving.
    """

    # Criticality label -> numeric weight used in observations and rules
    CRIT = {"LOW": 0.3, "MED": 0.6, "HIGH": 1.0}

    # Steps without ANALYZE after which knowledge expires and fog re-activates
    STALE_STEPS = 15

    def __init__(self, idx: int, criticality: str = "MED",
                 category: str = "compute", rng: Optional[random.Random] = None):
        """Create a resource with randomised cost and security attributes.

        Args:
            idx: Slot index of the resource in the environment.
            criticality: One of the keys of ``CRIT``.
            category: Free-form category label (e.g. "compute", "storage").
            rng: Random source; defaults to ``random.Random(idx)``.

        Raises:
            KeyError: If ``criticality`` is not a key of ``CRIT``.
        """
        if rng is None:
            rng = random.Random(idx)
        self.idx = idx
        self.criticality = self.CRIT[criticality]
        self.category = category

        # ── Cost state ──────────────────────────────────────────────────────
        self.allocated = rng.uniform(0.70, 1.00)   # initially overprovisioned
        self.usage = rng.uniform(0.15, 0.50)       # true usage
        # Keep at least 0.10 of headroom below the allocation
        self.usage = min(self.usage, self.allocated - 0.10)
        self.cost_rate = self.allocated            # cost ∝ allocated
        self.activity_status = 1.0                 # 1=active, 0=idle

        # ── State flags ─────────────────────────────────────────────────────
        self.health = 1
        self.is_stopped = False
        self.is_deleted = False
        self.alert_flag = 0

        # ── Security state (hidden under fog) ───────────────────────────────
        self.risk_score = rng.uniform(0.05, 0.20)
        self.vulnerability = False
        self.encryption = True
        self.over_permission = False
        self.logging_enabled = True
        self.credential_age = rng.uniform(0.0, 0.3)
        self.exposure = rng.uniform(0.0, 0.15)
        self.sensitivity = rng.uniform(0.3, 0.8)

        # ── Fog of war ──────────────────────────────────────────────────────
        self.fog_active = True          # True = attributes hidden until ANALYZE
        self.cost_known = False
        self.deps_known = False
        self.steps_since_analyze = 0
        self.staleness = 0.0

        # ── Dependency ──────────────────────────────────────────────────────
        self.dependency_children: List[int] = []   # indices of resources that depend on this
        self.dependency_parent: Optional[int] = None

        # ── Diagnostics ─────────────────────────────────────────────────────
        self.steps_broken = 0
        self.time_broken = 0.0

    # ── Derived properties ──────────────────────────────────────────────────

    def overprovision_ratio(self) -> float:
        """Return the unused fraction of the allocation, clamped at 0."""
        return max(0.0, (self.allocated - self.usage) / max(self.allocated, 1e-6))

    def get_cost(self) -> float:
        """Return the current cost: 0 if deleted, 5% of ``cost_rate`` if stopped."""
        if self.is_deleted:
            return 0.0
        if self.is_stopped:
            return self.cost_rate * 0.05   # minimal maintenance cost
        return self.cost_rate

    # ── Observation vector (10 dims) ────────────────────────────────────────

    def to_obs(self, fog: bool = False) -> np.ndarray:
        """Return the 10-element float32 feature vector for this resource.

        Args:
            fog: When true and this resource's fog is active, risk and
                exposure read 0.0 and cost reads the placeholder 0.5.
        """
        if fog and self.fog_active:
            risk_obs = 0.0
            cost_obs = 0.5   # placeholder estimate shown while under fog
            exp_obs = 0.0
        else:
            risk_obs = self.risk_score
            cost_obs = self.cost_rate
            exp_obs = self.exposure

        return np.array([
            float(self.health),       # 0
            risk_obs,                 # 1 (hidden under fog)
            self.criticality,         # 2
            cost_obs,                 # 3 (hidden under fog)
            self.activity_status,     # 4
            exp_obs,                  # 5 (hidden under fog)
            self.sensitivity,         # 6
            self.staleness,           # 7 (always visible)
            float(self.alert_flag),   # 8 (always visible)
            self.time_broken,         # 9
        ], dtype=np.float32)

    # ── Per-step tick ───────────────────────────────────────────────────────

    def tick(self, rng: random.Random, phase: int, event_prob: float = 0.0):
        """Advance the resource by one environment step.

        Ages knowledge and credentials, drifts usage while running, tracks
        time spent broken, and (phase 2+) may roll a random security event.
        Deleted resources are left untouched.

        Args:
            rng: Random source for usage drift and events.
            phase: Current curriculum phase.
            event_prob: Per-step probability of a random security event.
        """
        if self.is_deleted:
            return

        # Staleness: knowledge expires after STALE_STEPS without ANALYZE
        self.steps_since_analyze += 1
        self.staleness = min(self.steps_since_analyze / self.STALE_STEPS, 1.0)
        if self.steps_since_analyze >= self.STALE_STEPS:
            self.fog_active = True

        # Usage drift (only when running)
        if not self.is_stopped and self.health:
            self.usage = float(np.clip(
                self.usage + rng.uniform(-0.03, 0.03), 0.10, self.allocated))

        # Credential aging
        self.credential_age = min(self.credential_age + 0.01, 1.0)

        # Broken resource tracking
        if not self.health:
            self.steps_broken += 1
            self.time_broken = min(self.steps_broken / MAX_STEPS, 1.0)
            self.risk_score = min(self.risk_score + 0.015, 1.0)
            if self.criticality >= 1.0:
                self.alert_flag = 1   # high-criticality broken = visible alert

        # Random security events (Phase 2+), healthy resources only
        if phase >= 2 and rng.random() < event_prob and self.health:
            ev = rng.choice(["vuln", "expose", "iam", "log_off"])
            if ev == "vuln":
                self.vulnerability = True
                self.risk_score = min(self.risk_score + 0.25, 1.0)
            elif ev == "expose":
                self.exposure = min(self.exposure + 0.35, 1.0)
                self.risk_score = min(self.risk_score + 0.20, 1.0)
            elif ev == "iam":
                self.over_permission = True
                self.risk_score = min(self.risk_score + 0.15, 1.0)
            elif ev == "log_off":
                self.logging_enabled = False
                self.risk_score = min(self.risk_score + 0.05, 1.0)

    # ── Actions ─────────────────────────────────────────────────────────────

    def do_analyze(self):
        """Lift the fog on this resource and reset its staleness clock."""
        self.fog_active = False
        self.cost_known = True
        self.steps_since_analyze = 0
        self.staleness = 0.0

    def do_verify_deps(self):
        """Mark this resource's dependencies as known."""
        self.deps_known = True

    def do_resize_down(self) -> float:
        """Shrink the allocation towards usage; return the cost saved.

        The new allocation is ``usage + 0.10`` with a floor of 0.25. Nothing
        changes (and 0.0 is returned) unless that saves more than 0.02, or if
        the resource is deleted.
        """
        if self.is_deleted:
            return 0.0   # already costs nothing
        new_alloc = max(self.usage + 0.10, 0.25)
        if new_alloc < self.allocated - 0.02:
            saved = self.allocated - new_alloc
            self.allocated = new_alloc
            self.cost_rate = new_alloc
            return saved
        return 0.0

    def do_resize_up(self):
        """Grow the allocation by 0.20, capped at 1.0."""
        self.allocated = min(self.allocated + 0.20, 1.0)
        self.cost_rate = self.allocated

    def do_stop(self) -> float:
        """Stop the resource; return the cost saved (0.0 if stopped or deleted)."""
        if self.is_deleted or self.is_stopped:
            return 0.0
        self.is_stopped = True
        self.activity_status = 0.0
        return self.cost_rate * 0.95   # 95% cost eliminated

    def do_restart(self):
        """Bring the resource back up and healthy; no-op on a deleted resource."""
        if self.is_deleted:
            return   # a deleted resource must not come back as healthy
        self.is_stopped = False
        self.activity_status = 1.0
        self.health = 1

    def do_delete(self) -> float:
        """Delete the resource; return the cost saved (0.0 if already deleted)."""
        if self.is_deleted:
            return 0.0   # do not count the same saving twice
        saved = self.cost_rate
        self.is_deleted = True
        self.health = 0
        return saved

    def do_patch(self):
        """Clear the vulnerability flag and lower risk by 0.30."""
        self.vulnerability = False
        self.risk_score = max(self.risk_score - 0.30, 0.0)

    def do_encrypt(self):
        """Enable encryption and lower risk by 0.15."""
        self.encryption = True
        self.risk_score = max(self.risk_score - 0.15, 0.0)

    def do_restrict(self):
        """Lower exposure by 0.40 and risk by 0.20."""
        self.exposure = max(self.exposure - 0.40, 0.0)
        self.risk_score = max(self.risk_score - 0.20, 0.0)

    def do_rotate_creds(self):
        """Reset credential age, clear over-permission, lower risk by 0.10."""
        self.credential_age = 0.0
        self.over_permission = False
        self.risk_score = max(self.risk_score - 0.10, 0.0)

    def do_enable_logging(self):
        """Enable logging and lower risk by 0.05."""
        self.logging_enabled = True
        self.risk_score = max(self.risk_score - 0.05, 0.0)

    def do_archive(self) -> float:
        """Archive (stop) the resource; return the reported saving.

        NOTE(review): the reported saving is 70% of ``cost_rate`` while
        ``get_cost`` charges any stopped resource only 5% — confirm which
        figure is intended.
        """
        if self.is_deleted or self.is_stopped:
            return 0.0
        self.is_stopped = True
        self.activity_status = 0.0
        return self.cost_rate * 0.70

    def do_opt_network(self):
        """Lower exposure by 0.15 and risk by 0.08."""
        self.exposure = max(self.exposure - 0.15, 0.0)
        self.risk_score = max(self.risk_score - 0.08, 0.0)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 277 |
+
# ENVIRONMENT
|
| 278 |
+
# ══════════════════════════════════════════════════════════════════════════════
|
| 279 |
+
|
| 280 |
+
class CloudArenaEnv(gym.Env):
|
| 281 |
+
"""
|
| 282 |
+
Cloud Arena: multi-objective cloud operations RL environment.
|
| 283 |
+
Observation: 125-dim flat float32.
|
| 284 |
+
Action space: Discrete(150) = 15 types × 10 resource slots.
|
| 285 |
+
"""
|
| 286 |
+
metadata = {"render_modes": []}
|
| 287 |
+
|
| 288 |
+
def __init__(self,
             curriculum_ref: List[int] = None,
             global_step_ref: List[int] = None):
    """Declare the spaces and initialise placeholder episode state.

    Args:
        curriculum_ref: List whose first item is the current curriculum
            phase (read through ``curriculum_level``). A falsy value
            (``None`` or an empty list) is replaced by a private ``[0]``.
        global_step_ref: List whose first item is a step counter that
            ``step`` increments; same fallback as ``curriculum_ref``.
    """
    super().__init__()
    # Presumably passed as lists so the trainer can share and mutate them
    # from outside — verify against the caller.
    self._curriculum_ref = curriculum_ref if curriculum_ref else [0]
    self._global_step_ref = global_step_ref if global_step_ref else [0]

    self.action_space = spaces.Discrete(N_ACTIONS)
    self.observation_space = spaces.Box(
        low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32)

    # Episode state; real values are assigned in reset()
    self.resources: List[ResourceObject] = []
    self.n_active = 0
    self.step_count = 0
    self.chaos_active = False
    self.chaos_steps = 0
    self.veto_count = 0
    self.cascade_count = 0
    self.initial_total_cost = 1.0
    self.prev_total_cost = 1.0
    self.prev_risk_agg = 0.0

    # Rolling 8-entry histories of actions, rewards and progress
    self._action_hist, self._reward_hist, self._progress_hist = (
        deque([0.0] * 8, maxlen=8) for _ in range(3))
|
| 313 |
+
|
| 314 |
+
# ── Properties ────────────────────────────────────────────────────────────
|
| 315 |
+
|
| 316 |
+
@property
def curriculum_level(self) -> int:
    """Current curriculum phase: the first item of the curriculum reference list."""
    level = self._curriculum_ref[0]
    return level
|
| 319 |
+
|
| 320 |
+
# ── Reset ─────────────────────────────────────────────────────────────────
|
| 321 |
+
|
| 322 |
+
def reset(self, seed=None, options=None):
    """Start a new episode and return ``(observation, info)``.

    Args:
        seed: Optional episode seed. When given, the episode layout is
            generated from ``random.Random(seed)``. When omitted, a fresh
            seed is drawn from the environment's ``np_random`` generator,
            so consecutive unseeded episodes differ. Seed the first
            ``reset`` call to make a whole run reproducible.
        options: Optional dict; ``options["scenario"]`` > 0 selects a boss
            scenario, anything else a normal episode for the current phase.

    Returns:
        The initial observation and an empty info dict.
    """
    super().reset(seed=seed)
    if seed is not None:
        rng = random.Random(seed)
    else:
        # The seed used to be GLOBAL_SEED + the previous episode's length.
        # That repeated the same layout for the first episode and after any
        # two episodes of equal length, e.g. every run to the step limit.
        rng = random.Random(int(self.np_random.integers(0, 2**31 - 1)))

    self.step_count = 0
    self.chaos_active = False
    self.chaos_steps = 0
    self.veto_count = 0
    self.cascade_count = 0

    phase = self.curriculum_level
    scenario = options.get("scenario", 0) if options else 0

    if scenario > 0:
        self._setup_boss_scenario(scenario, rng)
    else:
        self._setup_normal_episode(phase, rng)

    # Baselines for cost/risk comparisons; floor avoids division by zero
    self.initial_total_cost = max(sum(r.get_cost() for r in self.resources), 1e-6)
    self.prev_total_cost = self.initial_total_cost
    self.prev_risk_agg = self._risk_aggregate()

    self._action_hist = deque([0.0] * 8, maxlen=8)
    self._reward_hist = deque([0.0] * 8, maxlen=8)
    self._progress_hist = deque([0.0] * 8, maxlen=8)

    return self._build_obs(), {}
|
| 349 |
+
|
| 350 |
+
def _setup_normal_episode(self, phase: int, rng: random.Random):
    """Populate ``self.resources`` for a standard episode of the given phase.

    Args:
        phase: Curriculum phase; selects resource count, fog and chaos.
        rng: Random source for all per-resource attributes.
    """
    self.n_active = N_RESOURCES_PHASE[phase]
    n = self.n_active

    # Criticality layout: slot 0 is the single HIGH resource, the other
    # slots below n // 2 are MED, everything from n // 2 on is LOW.
    crits = ["HIGH" if i == 0 else ("MED" if i < n // 2 else "LOW")
             for i in range(n)]

    cats = ["compute", "compute", "storage", "database",
            "compute", "storage", "compute", "database",
            "compute", "storage"][:n]

    fog_enabled = PHASE_FOG[phase]
    self.resources = []
    for i, crit in enumerate(crits):
        res = ResourceObject(i, crit, cats[i], rng)

        # Without fog (phase 0) everything is revealed upfront
        if not fog_enabled:
            res.fog_active = False
            res.cost_known = True
            res.deps_known = True

        if phase == 0:
            # Phase 0 starts from a clean security state
            res.risk_score = rng.uniform(0.02, 0.08)
            res.vulnerability = False
            res.encryption = True
            res.over_permission = False
            res.logging_enabled = True
            res.exposure = rng.uniform(0.0, 0.05)
        else:
            # Phase 1+ starts with real security problems so the agent
            # cannot win on cost alone
            res.vulnerability = rng.random() < 0.40
            res.encryption = rng.random() > 0.30   # 30% unencrypted
            res.over_permission = rng.random() < 0.30
            res.logging_enabled = rng.random() > 0.20
            res.exposure = rng.uniform(0.10, 0.40)
            res.risk_score = rng.uniform(0.30, 0.60)

        self.resources.append(res)

    # Single dependency edge: resource 1 depends on resource 0 (the HIGH one)
    if n >= 2:
        self.resources[0].dependency_children = [1]
        self.resources[1].dependency_parent = 0

    # Chaos initialisation (Phase 3+), rolled once per episode
    if not PHASE_CHAOS[phase]:
        return
    if rng.random() >= CHAOS_INIT_PROB[phase]:
        return

    self.chaos_active = True
    # Break the first two non-critical resources
    non_critical = [res for res in self.resources if res.criticality < 1.0]
    for victim in non_critical[:2]:
        victim.health = 0
        victim.risk_score = min(victim.risk_score + 0.40, 1.0)
        victim.alert_flag = 0   # hidden unless HIGH criticality
|
| 414 |
+
|
| 415 |
+
def _setup_boss_scenario(self, scenario: int, rng: random.Random):
    """Build a normal episode at phase >= 3, then apply a boss-fight twist.

    Args:
        scenario: 1 = cost crisis, 2 = security breach, 3 = infrastructure
            failure, 4 = traffic surge, 5 = final boss. Any other value
            leaves the normal episode unmodified.
        rng: Random source for the episode and the scenario tweaks.
    """
    # Boss fights run at phase 3+ difficulty
    self._setup_normal_episode(max(self.curriculum_level, 3), rng)

    if scenario == 1:
        # Cost Crisis: more allocation, less usage
        for res in self.resources:
            res.allocated = min(res.allocated + rng.uniform(0.10, 0.25), 1.0)
            res.cost_rate = res.allocated
            res.usage = max(res.usage - 0.10, 0.10)
        return

    if scenario == 2:
        # Security Breach: forced fog plus a poor security posture
        for res in self.resources:
            res.fog_active = True   # agent must analyze
            res.cost_known = False
            res.vulnerability = rng.random() < 0.60
            res.encryption = rng.random() < 0.30
            res.over_permission = rng.random() < 0.50
            res.logging_enabled = rng.random() < 0.40
            res.exposure = rng.uniform(0.30, 0.80)
            res.risk_score = rng.uniform(0.40, 0.90)
        return

    if scenario == 3:
        # Infrastructure Failure: the first three resources start broken
        self.chaos_active = True
        for res in self.resources[:3]:
            res.health = 0
            res.risk_score = min(res.risk_score + 0.50, 1.0)
        return

    if scenario == 4:
        # Traffic Surge: usage pushed up close to the allocation
        for res in self.resources:
            res.usage = min(res.allocated - 0.05, rng.uniform(0.75, 0.95))
            res.risk_score = min(res.risk_score + 0.10, 0.50)
        return

    if scenario == 5:
        # Final Boss: cost, security and outage combined
        self.chaos_active = True
        for res in self.resources:
            res.allocated = min(res.allocated + 0.15, 1.0)
            res.cost_rate = res.allocated
            res.vulnerability = rng.random() < 0.50
            res.encryption = rng.random() < 0.40
            res.exposure = rng.uniform(0.20, 0.70)
            res.risk_score = rng.uniform(0.30, 0.80)
        # The first two resources start broken
        for res in self.resources[:2]:
            res.health = 0
|
| 459 |
+
|
| 460 |
+
# ── Step ──────────────────────────────────────────────────────────────────
|
| 461 |
+
|
| 462 |
+
def step(self, action: int):
    """Advance the simulation by one step and score the chosen action.

    ``action`` is a flat index: the action type is
    ``action // MAX_RESOURCES`` and the target resource is
    ``action % MAX_RESOURCES``.  Each call ticks every resource, may inject
    a chaos event, applies the action, computes the shaped reward and then
    checks the win / timeout conditions.

    Returns:
        The 5-tuple ``(obs, reward, terminated, truncated, info)``.
        ``terminated`` is set on a win; ``truncated`` once ``step_count``
        reaches ``MAX_STEPS``.
    """
    action = int(action)
    self.step_count += 1
    self._global_step_ref[0] += 1

    atype = action // MAX_RESOURCES
    ridx = action % MAX_RESOURCES

    phase = self.curriculum_level

    # ── Tick all resources ────────────────────────────────────────────────
    # A fresh RNG seeded with the shared global step counter drives both the
    # resource ticks and the chaos roll below.
    event_prob = 0.04 if PHASE_EVENTS[phase] else 0.0
    rng = random.Random(self._global_step_ref[0])
    for r in self.resources:
        r.tick(rng, phase, event_prob)

    # ── Chaos events ──────────────────────────────────────────────────────
    # With 3% probability one healthy, non-deleted, non-maximally-critical
    # resource is knocked out.
    if PHASE_CHAOS[phase] and rng.random() < 0.03:
        healthy = [r for r in self.resources if r.health and not r.is_deleted
                   and r.criticality < 1.0]
        if healthy:
            victim = rng.choice(healthy)
            victim.health = 0
            victim.risk_score = min(victim.risk_score + 0.40, 1.0)
            self.chaos_active = True

    if self.chaos_active:
        self.chaos_steps += 1
        if self.chaos_steps > 20:
            self.chaos_active = False  # chaos resolves after ~20 steps
            # Restart the timer: without this reset the counter stays above
            # 20, so any later chaos event in the same episode would be
            # cancelled on the very step it was triggered.
            self.chaos_steps = 0

    # ── Snapshot pre-action state ─────────────────────────────────────────
    cost_before = sum(r.get_cost() for r in self.resources)
    risk_before = self._risk_aggregate()

    # ── Apply action ──────────────────────────────────────────────────────
    # Per-resource deltas are not needed here; the reward works on the
    # system-wide totals captured before/after.
    _cost_delta, _sec_delta, veto = self._apply_action(atype, ridx)
    if veto:
        self.veto_count += 1

    # ── Post-action state ─────────────────────────────────────────────────
    cost_now = sum(r.get_cost() for r in self.resources)
    risk_now = self._risk_aggregate()

    # ── Compute reward ────────────────────────────────────────────────────
    reward = self._compute_reward(
        atype, ridx, veto, cost_before, cost_now, risk_before, risk_now)

    # ── Check win/done ────────────────────────────────────────────────────
    win = self._check_win(cost_now, risk_now, phase)
    terminated = win
    truncated = (self.step_count >= MAX_STEPS)

    if terminated or truncated:
        # Terminal steps get the episode bonus/penalty and a wider clip range.
        reward += self._terminal_reward(win, cost_now, risk_now, phase)
        reward = float(np.clip(reward, -30.0, 60.0))
    else:
        reward = float(np.clip(reward, -2.0, 5.0))

    # ── Update history ────────────────────────────────────────────────────
    self._action_hist.append(atype / N_ACTION_TYPES)
    self._reward_hist.append(np.clip(reward / 5.0, -1.0, 1.0))
    self._progress_hist.append(max(0.0, (self.initial_total_cost - cost_now)
                                   / max(self.initial_total_cost, 1e-6)))
    self.prev_total_cost = cost_now
    self.prev_risk_agg = risk_now

    info = {
        "win": int(win),
        "cost_score": float(np.clip(1.0 - cost_now / max(self.initial_total_cost, 1e-6), 0, 1)),
        "security_score": float(np.clip(1.0 - risk_now, 0, 1)),
        "reliability_score": self._reliability_score(),
        "savings_pct": float(np.clip(
            (self.initial_total_cost - cost_now)
            / max(self.initial_total_cost, 1e-6) * 100, 0, 100)),
        "veto_rate": self.veto_count / max(self.step_count, 1),
        "cascade_count": self.cascade_count,
        "risk": risk_now,
        "chaos_active": self.chaos_active,
    }

    return self._build_obs(), reward, terminated, truncated, info
|
| 544 |
+
|
| 545 |
+
# ── Action application ────────────────────────────────────────────────────
|
| 546 |
+
|
| 547 |
+
def _apply_action(self, atype: int, ridx: int) -> Tuple[float, float, bool]:
    """Apply one decoded action to its target resource.

    Args:
        atype: Action type id (one of the ``A_*`` constants).
        ridx: Index of the target resource in ``self.resources``.

    Returns:
        ``(cost_delta, security_delta, was_vetoed)``.  The deltas are
        ``before - after`` for the target resource's cost and risk score
        (a deleted resource counts as ``0.0`` afterwards).  ``was_vetoed``
        is ``True`` when the action's precondition failed and nothing was
        changed.
    """
    if atype == A_NOOP:
        return 0.0, 0.0, False  # NOOP is never a veto

    # Validate resource index
    if ridx >= len(self.resources):
        return 0.0, 0.0, True

    r = self.resources[ridx]

    if r.is_deleted:
        return 0.0, 0.0, True

    cost_before = r.get_cost()
    risk_before = r.risk_score
    veto = False

    if atype == A_ANALYZE:
        r.do_analyze()

    elif atype == A_VERIFY_DEPS:
        r.do_verify_deps()

    elif atype == A_RESIZE_DOWN:
        if r.overprovision_ratio() > 0.08 and not r.is_stopped:
            r.do_resize_down()
        else:
            veto = True

    elif atype == A_RESIZE_UP:
        # A stopped resource cannot be resized: this mirrors the RESIZE_DOWN
        # rule above and the RESIZE_UP rule used by action_masks().
        if r.usage > r.allocated - 0.12 and not r.is_stopped:
            r.do_resize_up()
        else:
            veto = True

    elif atype == A_STOP:
        can_stop = (not r.is_stopped and
                    (r.activity_status < 0.35 or r.criticality <= 0.3) and
                    r.criticality < 1.0)
        if can_stop:
            r.do_stop()
        else:
            veto = True

    elif atype == A_RESTART:
        if r.is_stopped:
            r.do_restart()
        else:
            veto = True

    elif atype == A_DELETE:
        can_delete = (r.deps_known and r.criticality < 1.0 and not r.is_stopped)
        if can_delete:
            # Deleting a parent of a live, high-criticality child is refused.
            has_crit_child = any(
                (ci < len(self.resources) and
                 not self.resources[ci].is_deleted and
                 self.resources[ci].criticality >= 0.6)
                for ci in r.dependency_children)
            if has_crit_child:
                veto = True
            else:
                r.do_delete()
                # Cascade: every surviving child loses health and gains risk.
                for ci in r.dependency_children:
                    if ci < len(self.resources) and not self.resources[ci].is_deleted:
                        child = self.resources[ci]
                        child.health = 0
                        child.risk_score = min(child.risk_score + 0.3, 1.0)
                        self.cascade_count += 1
        else:
            veto = True

    elif atype == A_PATCH:
        if r.vulnerability:
            r.do_patch()
        else:
            veto = True

    elif atype == A_ENCRYPT:
        if not r.encryption:
            r.do_encrypt()
        else:
            veto = True

    elif atype == A_RESTRICT:
        if r.exposure > 0.15:
            r.do_restrict()
        else:
            veto = True

    elif atype == A_ROTATE_CREDS:
        if r.credential_age > 0.40:
            r.do_rotate_creds()
        else:
            veto = True

    elif atype == A_ENABLE_LOG:
        if not r.logging_enabled:
            r.do_enable_logging()
        else:
            veto = True

    elif atype == A_ARCHIVE:
        if r.category == "storage" and r.activity_status < 0.35:
            r.do_archive()
        else:
            veto = True

    elif atype == A_OPT_NET:
        if r.exposure > 0.08:
            r.do_opt_network()
        else:
            veto = True

    cost_after = r.get_cost() if not r.is_deleted else 0.0
    risk_after = r.risk_score if not r.is_deleted else 0.0

    return (cost_before - cost_after), (risk_before - risk_after), veto
|
| 666 |
+
|
| 667 |
+
# ── Reward ────────────────────────────────────────────────────────────────
|
| 668 |
+
|
| 669 |
+
def _compute_reward(self, atype, ridx, veto,
                    cost_before, cost_now, risk_before, risk_now) -> float:
    """Return the (unclipped) per-step reward.

    The reward is the sum of eight terms: dense cost, dense security,
    stability, a before/after delta term, NOOP shaping, the per-action
    cost, a veto penalty and a neglect penalty for known high-risk
    resources.  ``ridx`` is accepted for signature symmetry but not used.
    """
    phase = self.curriculum_level
    baseline_cost = max(self.initial_total_cost, 1e-6)

    cost_weight = 0.25
    security_weight = 0.35 if phase >= 1 else 0.0
    stability_weight = 0.25

    # 1. Dense cost: proportional to the current cost relative to baseline.
    r_cost = -cost_weight * (cost_now / baseline_cost)

    # 2. Dense security: proportional to aggregate risk (off in phase 0).
    r_sec = -security_weight * risk_now

    # 3. Stability: fraction of resources that are unhealthy and not deleted.
    broken = 0
    for res in self.resources:
        if not res.health and not res.is_deleted:
            broken += 1
    r_stab = -stability_weight * (broken / max(len(self.resources), 1))

    # 4. Delta term: rewards improvement over this step and penalises
    #    regression; the combined value is clipped to [-1, 2].
    cost_improvement = (cost_before - cost_now) / baseline_cost
    risk_improvement = risk_before - risk_now
    r_delta = float(np.clip(3.0 * cost_improvement + 4.0 * risk_improvement,
                            -1.0, 2.0))

    # 5. NOOP shaping: idling is fine during chaos or when the system is
    #    healthy, and increasingly penalised as risk grows.
    r_noop = 0.0
    if atype == A_NOOP:
        if self.chaos_active:
            r_noop = 0.10
        elif risk_now < 0.10 and cost_now < self.initial_total_cost * 0.60:
            r_noop = 0.05
        elif risk_now < 0.25:
            r_noop = 0.01
        elif risk_now < 0.50:
            r_noop = -0.05
        else:
            r_noop = -0.15

    # 6. Fixed cost of the chosen action type.
    r_action = ACTION_COSTS.get(atype, -0.02)

    # 7. Penalty for attempting an action whose precondition failed.
    r_veto = -0.10 if veto else 0.0

    # 8. Neglect: from phase 1 on, each visible (non-fogged), non-deleted
    #    resource with risk above 0.60 costs a little, scaled by how long
    #    it has been broken and by its criticality; capped at -0.20.
    r_neglect = 0.0
    if phase >= 1:
        for res in self.resources:
            if res.fog_active or res.is_deleted or not res.risk_score > 0.60:
                continue
            neglect_scale = min(res.steps_broken / MAX_STEPS, 1.0)
            r_neglect -= 0.02 * (1.0 + neglect_scale) * res.criticality
        r_neglect = max(r_neglect, -0.20)

    total = r_cost + r_sec + r_stab + r_delta + r_noop + r_action + r_veto + r_neglect
    return float(total)
|
| 730 |
+
|
| 731 |
+
def _terminal_reward(self, win: bool, cost_now: float,
                     risk_now: float, phase: int) -> float:
    """Return the extra reward added on the last step of an episode.

    A win earns a flat bonus plus a speed bonus that shrinks with the
    number of steps used; a timeout earns partial credit for cost savings
    minus a fixed penalty.  Either way the final risk level and up to
    three cascades are penalised.  ``phase`` is accepted but not used.
    """
    if win:
        # 15 flat + up to 10 for finishing early.
        total = 15.0 + 10.0 * (1.0 - self.step_count / MAX_STEPS)
    else:
        # Partial credit for savings achieved, then the timeout penalty.
        saved_fraction = ((self.initial_total_cost - cost_now)
                          / max(self.initial_total_cost, 1e-6))
        total = 3.0 * max(saved_fraction, 0.0) - 5.0

    # End-state security penalty.
    total -= 10.0 * risk_now

    cascades = self.cascade_count
    if cascades > 0:
        total -= 5.0 * min(cascades, 3)
    return total
|
| 747 |
+
|
| 748 |
+
# ── Win condition ─────────────────────────────────────────────────────────
|
| 749 |
+
|
| 750 |
+
def _check_win(self, cost_now: float, risk_now: float, phase: int) -> bool:
    """Return whether the episode currently satisfies the win condition.

    All three must hold: cost relative to the initial total is below the
    phase's cost threshold, the security score (``1 - risk``) reaches the
    phase's security threshold, and no resource with criticality >= 1.0 is
    unhealthy (deleted resources are ignored).
    """
    relative_cost = cost_now / max(self.initial_total_cost, 1e-6)
    cost_ok = relative_cost < WIN_COST_THR[phase]

    security_ok = (1.0 - risk_now) >= WIN_SEC_THR[phase]

    critical_ok = True
    for res in self.resources:
        if res.criticality >= 1.0 and not res.health and not res.is_deleted:
            critical_ok = False
            break

    return cost_ok and security_ok and critical_ok
|
| 763 |
+
|
| 764 |
+
# ── Observation ───────────────────────────────────────────────────────────
|
| 765 |
+
|
| 766 |
+
def _build_obs(self) -> np.ndarray:
    """Assemble the flat float32 observation vector.

    The vector is the concatenation of five blocks: per-resource features
    (zero-padded to ``MAX_RES_IN_OBS`` slots), global security statistics,
    global cost statistics, environment state and the action / reward /
    progress histories.  Its length must equal ``OBS_DIM``.
    """
    level = self.curriculum_level
    fog = PHASE_FOG[level]
    everything = self.resources

    def tally(pool, flag) -> int:
        # Number of resources in ``pool`` for which ``flag`` is truthy.
        return sum(1 for res in pool if flag(res))

    # Block A: one fixed-width feature slice per resource slot.
    per_resource = np.zeros(MAX_RES_IN_OBS * N_FEAT_PER_RES, dtype=np.float32)
    for slot, res in enumerate(everything[:MAX_RES_IN_OBS]):
        start = slot * N_FEAT_PER_RES
        per_resource[start:start + N_FEAT_PER_RES] = res.to_obs(fog)

    # Block B: security fractions over the non-deleted resources.
    live = [res for res in everything if not res.is_deleted]
    n_live = max(len(live), 1)
    risk_agg = self._risk_aggregate()
    n_vuln = tally(live, lambda res: res.vulnerability)
    n_exposed = tally(live, lambda res: res.exposure > 0.3)
    n_unenc = tally(live, lambda res: not res.encryption)
    n_no_log = tally(live, lambda res: not res.logging_enabled)
    n_overperm = tally(live, lambda res: res.over_permission)
    security = np.array([
        risk_agg,
        n_vuln / n_live,
        n_exposed / n_live,
        n_unenc / n_live,
        n_no_log / n_live,
        n_overperm / n_live,
        min(sum(res.credential_age for res in live) / n_live, 1.0),
        float(self.chaos_active),
    ], dtype=np.float32)

    # Block C: cost picture relative to the episode's initial total cost.
    total_cost = sum(res.get_cost() for res in everything)
    n_idle = tally(live, lambda res: res.activity_status < 0.3)
    n_overprov = tally(live, lambda res: res.overprovision_ratio() > 0.2)
    n_stopped = tally(everything, lambda res: res.is_stopped)
    n_deleted = tally(everything, lambda res: res.is_deleted)
    baseline_cost = max(self.initial_total_cost, 1e-6)
    n_all = max(len(everything), 1)
    cost = np.array([
        total_cost / baseline_cost,
        n_idle / n_live,
        n_overprov / n_live,
        n_stopped / n_all,
        n_deleted / n_all,
        (self.initial_total_cost - total_cost) / baseline_cost,
        float(self._check_win(total_cost, risk_agg, level)),
    ], dtype=np.float32)

    # Block D: episode / environment state.
    n_broken = tally(live, lambda res: not res.health)
    state = np.array([
        self.step_count / MAX_STEPS,
        level / 5.0,
        float(self.chaos_active),
        n_broken / n_live,
        self.veto_count / max(self.step_count, 1),
        self.cascade_count / max(n_live, 1),
    ], dtype=np.float32)

    # Block E: recent action, reward and progress histories, in that order.
    history = np.array(
        [*self._action_hist, *self._reward_hist, *self._progress_hist],
        dtype=np.float32)

    obs = np.concatenate([per_resource, security, cost, state, history])
    assert obs.shape == (OBS_DIM,), f"Obs shape {obs.shape} != {OBS_DIM}"
    return obs
|
| 832 |
+
|
| 833 |
+
# ── Action masks ──────────────────────────────────────────────────────────
|
| 834 |
+
|
| 835 |
+
def action_masks(self) -> np.ndarray:
    """Return the boolean validity mask over the flat action space.

    Index ``atype * MAX_RESOURCES + ridx`` is ``True`` when action type
    ``atype`` may be applied to resource ``ridx``.  NOOP is always valid.
    ANALYZE and VERIFY_DEPS are valid for every non-deleted resource.
    All other actions are masked while a resource's fog is active and
    otherwise follow the same preconditions that ``_apply_action``
    enforces.

    Returns:
        A ``bool`` array of length ``N_ACTIONS``.
    """
    mask = np.zeros(N_ACTIONS, dtype=bool)

    # NOOP (action 0) — always valid
    mask[A_NOOP * MAX_RESOURCES] = True

    # Slots beyond the active resource list stay masked.
    for ridx, r in enumerate(self.resources[:MAX_RESOURCES]):
        if r.is_deleted:
            continue

        def aid(atype: int, _ridx: int = ridx) -> int:
            # Flat action id for (atype, this resource).
            return atype * MAX_RESOURCES + _ridx

        # ANALYZE / VERIFY_DEPS — always valid for a live resource
        mask[aid(A_ANALYZE)] = True
        mask[aid(A_VERIFY_DEPS)] = True

        # While fog is active nothing else may be executed on this
        # resource; the remaining entries stay False.
        if r.fog_active:
            continue

        # RESIZE_DOWN — valid if overprovisioned and running
        mask[aid(A_RESIZE_DOWN)] = (r.overprovision_ratio() > 0.08
                                    and not r.is_stopped)

        # RESIZE_UP — valid if near capacity and running
        mask[aid(A_RESIZE_UP)] = (r.usage > r.allocated - 0.12
                                  and not r.is_stopped)

        # STOP — valid if idle or LOW criticality and currently running
        mask[aid(A_STOP)] = (not r.is_stopped
                             and r.criticality < 1.0
                             and (r.activity_status < 0.35 or r.criticality <= 0.3))

        # RESTART — valid if stopped
        mask[aid(A_RESTART)] = r.is_stopped

        # DELETE — valid if deps known, not critical, running, and no
        # live high-criticality children.  The ``not r.is_stopped`` term
        # matches _apply_action, which vetoes deleting a stopped resource;
        # without it the mask offered an action that was always rejected.
        has_crit_child = any(
            (ci < len(self.resources) and
             not self.resources[ci].is_deleted and
             self.resources[ci].criticality >= 0.6)
            for ci in r.dependency_children)
        mask[aid(A_DELETE)] = (r.deps_known and r.criticality < 1.0
                               and not r.is_stopped
                               and not has_crit_child)

        # Security fixes and remaining optimisations
        mask[aid(A_PATCH)] = r.vulnerability
        mask[aid(A_ENCRYPT)] = not r.encryption
        mask[aid(A_RESTRICT)] = r.exposure > 0.15
        mask[aid(A_ROTATE_CREDS)] = r.credential_age > 0.40
        mask[aid(A_ENABLE_LOG)] = not r.logging_enabled
        mask[aid(A_ARCHIVE)] = (r.category == "storage"
                                and r.activity_status < 0.35)
        mask[aid(A_OPT_NET)] = r.exposure > 0.08

    # Collapse guard: always at least 3 valid actions.
    # NOTE(review): every live resource already contributes ANALYZE and
    # VERIFY_DEPS, so this only triggers when no live resource is in range;
    # the ANALYZE slots it enables then point at deleted resources, which
    # _apply_action vetoes.  Kept as-is to preserve existing behaviour.
    if mask.sum() < 3:
        mask[A_NOOP * MAX_RESOURCES] = True
        if len(self.resources) > 0:
            mask[A_ANALYZE * MAX_RESOURCES] = True
        if len(self.resources) > 1:
            mask[A_ANALYZE * MAX_RESOURCES + 1] = True

    return mask
|
| 910 |
+
|
| 911 |
+
# ── Helpers ───────────────────────────────────────────────────────────────
|
| 912 |
+
|
| 913 |
+
def _risk_aggregate(self) -> float:
|
| 914 |
+
active = [r for r in self.resources if not r.is_deleted]
|
| 915 |
+
if not active:
|
| 916 |
+
return 0.0
|
| 917 |
+
weighted = sum(r.risk_score * r.criticality for r in active)
|
| 918 |
+
total_w = sum(r.criticality for r in active)
|
| 919 |
+
return weighted / max(total_w, 1e-6)
|
| 920 |
+
|
| 921 |
+
def _reliability_score(self) -> float:
|
| 922 |
+
active = [r for r in self.resources if not r.is_deleted]
|
| 923 |
+
if not active:
|
| 924 |
+
return 0.0
|
| 925 |
+
broken_w = sum(r.criticality for r in active if not r.health)
|
| 926 |
+
total_w = sum(r.criticality for r in active)
|
| 927 |
+
return max(0.0, 1.0 - broken_w / max(total_w, 1e-6))
|
| 928 |
+
|
| 929 |
+
def render(self):
    """No-op: this environment has no visual rendering."""
    return None
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
# ── Gymnasium wrapper ─────────────────────────────────────────────────────────
|
| 933 |
+
|
| 934 |
+
from sb3_contrib.common.wrappers import ActionMasker
|
| 935 |
+
|
| 936 |
+
def get_action_masks(env) -> np.ndarray:
    """Return the action mask of the innermost environment.

    Follows the chain of ``.env`` attributes until an object without one
    is reached, then calls its ``action_masks()`` method.
    """
    _missing = object()
    current = env
    while True:
        wrapped = getattr(current, "env", _missing)
        if wrapped is _missing:
            return current.action_masks()
        current = wrapped
|
cloud_arena/evaluation.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cloud Arena Evaluation — Mathematical Model
|
| 2 |
+
# Extracted from cloud_arena_final.py (Cells 4-5)
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
from typing import List
|
| 8 |
+
|
| 9 |
+
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
|
| 10 |
+
from sb3_contrib import MaskablePPO
|
| 11 |
+
from sb3_contrib.common.wrappers import ActionMasker
|
| 12 |
+
|
| 13 |
+
from cloud_arena.environment import (
|
| 14 |
+
CloudArenaEnv, get_action_masks, MAX_RESOURCES, MAX_STEPS, A_NOOP,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _get_inner(vec_env):
|
| 19 |
+
inner = vec_env.envs[0]
|
| 20 |
+
while hasattr(inner, "env"):
|
| 21 |
+
inner = inner.env
|
| 22 |
+
return inner
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def evaluate_model(model_path="./models/cloud_arena_final",
                   vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                   level=0, n_eval=30):
    """Run ``n_eval`` deterministic episodes and collect end-of-episode metrics.

    Args:
        model_path: Path of the saved MaskablePPO model.
        vecnorm_path: Path of the saved VecNormalize statistics.
        level: Curriculum level the evaluation environment is created with.
        n_eval: Number of episodes to play.

    Returns:
        Dict mapping each metric name to a list with one entry per episode.
        ``"steps"`` is the episode length; every other metric is read from
        the final step's ``info`` dict (``0`` when the key is absent).
    """
    metric_keys = ["win", "cost_score", "security_score",
                   "reliability_score", "savings_pct", "veto_rate",
                   "cascade_count", "steps"]
    results = {k: [] for k in metric_keys}

    def make_eval_env():
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[500000])
        return ActionMasker(env, get_action_masks)

    raw = DummyVecEnv([make_eval_env])
    eval_env = VecNormalize.load(vecnorm_path, raw)
    # Freeze the normalisation statistics and report raw rewards.
    eval_env.training = False
    eval_env.norm_reward = False

    try:
        model = MaskablePPO.load(model_path, env=eval_env)
        # The wrapped environment object is the same for the whole run, so
        # unwrap it once instead of on every step.
        inner = _get_inner(eval_env)

        for _ in range(n_eval):
            obs = eval_env.reset()
            done = False
            steps = 0
            info = {}
            while not done:
                masks = [inner.action_masks()]
                act, _ = model.predict(obs, deterministic=True, action_masks=masks)
                obs, rew, done_arr, info_arr = eval_env.step(act)
                done = bool(done_arr[0])
                steps += 1
                info = info_arr[0] if info_arr else {}
            for k in results:
                results[k].append(info.get(k, 0) if k != "steps" else steps)
    finally:
        # Release the vectorised env even if loading or prediction fails,
        # as run_boss_fights does for its environments.
        eval_env.close()

    return results
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# Display name of each boss-fight scenario, keyed by scenario id.
BOSS_NAMES = dict([
    (1, "Cost Crisis"),
    (2, "Security Breach"),
    (3, "Infrastructure Failure"),
    (4, "Traffic Surge"),
    (5, "Final Boss"),
])
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def run_boss_fights(model_path="./models/cloud_arena_final",
                    vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                    level=0, n_runs=10):
    """Play every boss scenario ``n_runs`` times and score the agent.

    Each run uses a fresh environment reset with seeds ``100 .. 100+n_runs-1``
    and ``options={"scenario": s_id}``.  Scenario 3 is scored from the share
    of chaos steps answered with NOOP (40%) and mean reliability (60%);
    all other scenarios from mean win (40%), cost score (30%) and security
    score (30%).  Scores are scaled to 0-100.

    Args:
        model_path: Path of the saved MaskablePPO model.
        vecnorm_path: Path of the saved VecNormalize statistics.
        level: Curriculum level the environments are created with.
        n_runs: Number of seeded runs per scenario.

    Returns:
        Dict mapping scenario id to its score.
    """
    model = MaskablePPO.load(model_path)
    boss_scores = {}

    def _init():
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[0])
        return ActionMasker(env, get_action_masks)

    for s_id in BOSS_NAMES:
        runs = []
        for seed in range(100, 100 + n_runs):
            raw = DummyVecEnv([_init])
            vec = VecNormalize.load(vecnorm_path, raw)
            try:
                vec.training = False
                vec.norm_reward = False

                # Reset the inner env directly so the scenario option reaches
                # it, then normalise the raw observation by hand.
                inner = _get_inner(vec)
                raw_obs, _ = inner.reset(seed=seed, options={"scenario": s_id})
                obs = vec.normalize_obs(np.array([raw_obs]))

                done = False
                steps = 0
                noops_chaos = 0
                chaos_steps_total = 0
                info_arr = []

                while not done:
                    masks = [inner.action_masks()]
                    act, _ = model.predict(obs, deterministic=True, action_masks=masks)
                    a_type = int(act[0]) // MAX_RESOURCES
                    # Count how often the agent idles while chaos is active.
                    if inner.chaos_active:
                        chaos_steps_total += 1
                        if a_type == A_NOOP:
                            noops_chaos += 1
                    obs, _, done_arr, info_arr = vec.step(act)
                    done = bool(done_arr[0])
                    steps += 1

                # Copy so the env's own info dict is not mutated.
                info = dict(info_arr[0]) if info_arr else {}
                info.update({"steps": steps, "noops_chaos": noops_chaos, "chaos_steps": chaos_steps_total})
                runs.append(info)
            finally:
                # Always release the env, even if prediction/stepping raised.
                vec.close()

        wins = [r.get("win", 0) for r in runs]
        costs = [r.get("cost_score", 0) for r in runs]
        secs = [r.get("security_score", 0) for r in runs]
        rels = [r.get("reliability_score", 0) for r in runs]

        if s_id == 3:
            noop_r = [r["noops_chaos"] / max(r["chaos_steps"], 1) for r in runs]
            score = (0.4 * np.mean(noop_r) + 0.6 * np.mean(rels)) * 100
        else:
            score = (0.4 * np.mean(wins) + 0.3 * np.mean(costs) + 0.3 * np.mean(secs)) * 100

        boss_scores[s_id] = score

    return boss_scores
|
cloud_arena/training.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cloud Arena Training — Mathematical Model (MaskablePPO)
|
| 2 |
+
# Extracted from cloud_arena_final.py (Cell 3)
|
| 3 |
+
|
| 4 |
+
import os, sys, math
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization
|
| 9 |
+
from stable_baselines3.common.callbacks import BaseCallback
|
| 10 |
+
from stable_baselines3.common.monitor import Monitor
|
| 11 |
+
from sb3_contrib import MaskablePPO
|
| 12 |
+
from sb3_contrib.common.wrappers import ActionMasker
|
| 13 |
+
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
|
| 14 |
+
|
| 15 |
+
from cloud_arena.environment import (
|
| 16 |
+
CloudArenaEnv, get_action_masks, GLOBAL_SEED,
|
| 17 |
+
N_ACTION_TYPES, MAX_RESOURCES, N_ACTIONS, MAX_STEPS,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Default training budget, in environment steps.
TOTAL_TIMESTEPS = 500_000


def cosine_lr(progress_remaining: float, init_lr: float = 3e-4, min_lr: float = 5e-5):
    """Cosine-annealed learning rate.

    Returns ``init_lr`` when ``progress_remaining`` is 1.0 and decays along
    a half cosine to ``min_lr`` when it reaches 0.0.
    """
    elapsed = 1.0 - progress_remaining
    cosine_term = 1.0 + math.cos(math.pi * elapsed)
    span = init_lr - min_lr
    return min_lr + span * 0.5 * cosine_term
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class SafeMaskableEvalCallback(MaskableEvalCallback):
    """MaskableEvalCallback that syncs normalisation stats on every step.

    When the model is trained on a VecNormalize env, the training env's
    normalisation is copied to the eval env before the parent callback's
    step logic runs.
    """

    def _on_step(self) -> bool:
        uses_vec_normalize = self.model.get_vec_normalize_env() is not None
        if uses_vec_normalize:
            sync_envs_normalization(self.training_env, self.eval_env)
        return super()._on_step()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class CloudArenaCallback(BaseCallback):
    """Training callback: curriculum promotion, metrics and progress output.

    Tracks an exponential moving average of the win rate and promotes the
    shared curriculum level once enough episodes have been played in the
    current phase and the EMA reaches that phase's threshold.  Also records
    per-episode metrics and how often each action type is chosen.
    """

    EMA_ALPHA = 0.02            # smoothing factor of the win-rate EMA
    MIN_EPS_PER_PHASE = 800     # episodes required before a promotion
    PHASE_THRESHOLDS = {0: 0.65, 1: 0.62, 2: 0.58, 3: 0.55, 4: 0.52}
    PROGRESS_EVERY = 500        # print progress every N timesteps

    def __init__(self, curriculum_ref, verbose=0):
        """Store the shared one-element curriculum list and reset all stats."""
        super().__init__(verbose)
        self._curriculum_ref = curriculum_ref
        self.ema_win_rate = 0.0
        self.current_level = 0
        self._phase_eps = 0
        self.episode_rewards = []
        self.episode_wins = []
        self.episode_savings = []
        self.episode_security = []
        self.episode_veto_rates = []
        self.curriculum_log = [(0, 0)]
        self.action_freq = np.zeros(N_ACTION_TYPES)

    def _on_step(self) -> bool:
        """Update action statistics and handle any episode that just ended."""
        if self.num_timesteps % self.PROGRESS_EVERY == 0:
            self._print_progress()

        actions = self.locals.get("actions")
        if actions is not None:
            for a in actions:
                atype = int(a) // MAX_RESOURCES
                if atype < N_ACTION_TYPES:
                    self.action_freq[atype] += 1

        # Handle every sub-environment that finished an episode this step
        # (previously only index 0 was inspected).
        dones = self.locals.get("dones", [False])
        infos = self.locals.get("infos", [{}])
        for env_idx, done in enumerate(dones):
            if done:
                info = infos[env_idx] if env_idx < len(infos) else {}
                self._on_episode_end(info, env_idx)
        return True

    def _episode_return(self, outer_info, info, env_idx=0):
        """Return the finished episode's total reward.

        Prefers the ``"episode"`` record's ``"r"`` entry when one is present
        in the info dict; otherwise falls back to the reward of the final
        step, which is what was recorded before.
        """
        episode = outer_info.get("episode") or info.get("episode")
        if episode is not None and "r" in episode:
            return float(episode["r"])
        return float(self.locals.get("rewards", [0])[env_idx])

    def _on_episode_end(self, info, env_idx=0):
        """Record episode metrics and promote the curriculum if warranted."""
        outer_info = info
        if "final_info" in info:
            info = info["final_info"]
        win = int(info.get("win", 0))
        self.ema_win_rate = (1 - self.EMA_ALPHA) * self.ema_win_rate + self.EMA_ALPHA * win
        self.episode_rewards.append(self._episode_return(outer_info, info, env_idx))
        self.episode_wins.append(win)
        self.episode_savings.append(info.get("savings_pct", 0))
        self.episode_security.append(info.get("security_score", 0))
        self.episode_veto_rates.append(info.get("veto_rate", 0))
        self._phase_eps += 1
        thr = self.PHASE_THRESHOLDS.get(self.current_level, 0.50)
        if self.current_level < 5 and self._phase_eps >= self.MIN_EPS_PER_PHASE and self.ema_win_rate >= thr:
            self._try_promote()

    def _try_promote(self):
        """Move to the next phase and restart the per-phase statistics."""
        self.current_level += 1
        # Publish the new level through the shared list.
        self._curriculum_ref[0] = self.current_level
        self._phase_eps = 0
        self.ema_win_rate = 0.0
        self.curriculum_log.append((self.num_timesteps, self.current_level))
        print(f"\n✄ PROMOTED -> Phase {self.current_level}")

    def _print_progress(self):
        """Rewrite the single-line progress indicator on stdout."""
        pct = min(100.0, self.num_timesteps / TOTAL_TIMESTEPS * 100)
        sys.stdout.write(f"\rProgress: {pct:.1f}% | Phase: {self.current_level} | EMA Win: {self.ema_win_rate*100:.1f}%")
        sys.stdout.flush()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def train_model(total_timesteps=TOTAL_TIMESTEPS, save_dir="./models"):
    """Train a MaskablePPO agent and save it with its normalisation stats.

    Args:
        total_timesteps: Number of environment steps to train for.
        save_dir: Directory that receives the best model, the final model
            (``cloud_arena_final``) and ``cloud_arena_vecnorm.pkl``.

    Returns:
        ``(model, arena_callback, train_env)``.
    """
    for directory in (save_dir, "./logs/", "./eval_logs/"):
        os.makedirs(directory, exist_ok=True)

    torch.manual_seed(GLOBAL_SEED)

    # One-element lists shared between the environments and the callback.
    curriculum_ref = [0]
    global_step_ref = [0]

    def make_env():
        monitored = Monitor(CloudArenaEnv(curriculum_ref, global_step_ref))
        return ActionMasker(monitored, get_action_masks)

    train_env = VecNormalize(
        DummyVecEnv([make_env]),
        norm_obs=True, norm_reward=True, clip_obs=10.0)

    # The eval env reuses the training observation statistics and reports
    # un-normalised rewards.
    eval_env = VecNormalize(
        DummyVecEnv([make_env]),
        norm_obs=True, norm_reward=False, training=False)
    eval_env.obs_rms = train_env.obs_rms

    model = MaskablePPO(
        "MlpPolicy", train_env,
        learning_rate=cosine_lr, ent_coef=0.01, verbose=0)
    arena_cb = CloudArenaCallback(curriculum_ref)
    eval_cb = SafeMaskableEvalCallback(
        eval_env, best_model_save_path=save_dir, eval_freq=10000)

    print("Starting Pipeline...")
    model.learn(total_timesteps=total_timesteps, callback=[arena_cb, eval_cb])

    final_model_path = os.path.join(save_dir, "cloud_arena_final")
    vecnorm_path = os.path.join(save_dir, "cloud_arena_vecnorm.pkl")
    model.save(final_model_path)
    train_env.save(vecnorm_path)
    print("\n✅ Model and VecNormalize stats saved.")

    return model, arena_cb, train_env
|
cloud_arena/visualization.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Cloud Arena Visualization — Mathematical Model
|
| 2 |
+
|
| 3 |
+
import matplotlib
|
| 4 |
+
matplotlib.use('Agg')
|
| 5 |
+
import matplotlib.pyplot as plt
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Dashboard colour palette (hex RGB strings).
REF_BG = "#0e1117"      # figure and axes background
REF_CYAN = "#00d4ff"    # reward ("Learning Curve") lines
REF_AMBER = "#ffa500"   # savings ("Cost Optimization %") lines
REF_NEON = "#39ff14"    # security ("Security Score") lines
TEXT_COLOR = "#e6e6e6"  # tick labels and panel titles
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def smooth(y, box_pts=50):
    """Return a moving average of ``y`` over a window of ``box_pts`` samples.

    Args:
        y: One-dimensional sequence of numbers (list or ``numpy.ndarray``).
        box_pts: Length of the uniform (box) averaging window.

    Returns:
        A ``numpy.ndarray``. When ``y`` has fewer than ``box_pts`` samples it
        is returned unsmoothed. Otherwise only fully overlapping windows are
        kept, so the result has ``len(y) - box_pts + 1`` values.
    """
    # Convert up front so both branches return an ndarray; an ndarray input
    # is passed through without copying.
    values = np.asarray(y)
    if len(values) < box_pts:
        return values
    kernel = np.ones(box_pts) / box_pts
    # mode='valid' drops the partially overlapping edges, which is why the
    # output is shorter than the input.
    return np.convolve(values, kernel, mode='valid')
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def generate_dashboard(callback, output_path="outputs/training_dashboard.png"):
    """Render a three-panel training dashboard and save it as an image.

    Each panel draws the raw series faintly with its moving average (see
    ``smooth``) on top: rewards, savings (y-axis fixed to 0-100) and
    security (y-axis fixed to 0-1).

    Args:
        callback: Object exposing ``episode_rewards``, ``episode_savings``
            and ``episode_security`` sequences.
        output_path: Destination image path. Its parent directory is created
            when it does not exist yet.

    Returns:
        ``output_path``, unchanged.
    """
    import os  # local import so this block does not rely on a module-level one

    # (series, line colour, panel title, fixed y-limits or None)
    panels = (
        (np.array(callback.episode_rewards), REF_CYAN, "Learning Curve", None),
        (np.array(callback.episode_savings), REF_AMBER, "Cost Optimization %", (0, 100)),
        (np.array(callback.episode_security), REF_NEON, "Security Score", (0, 1)),
    )

    fig, axes = plt.subplots(1, 3, figsize=(22, 6), facecolor=REF_BG)
    try:
        for ax, (series, color, title, ylim) in zip(axes, panels):
            ax.set_facecolor(REF_BG)
            ax.grid(True, alpha=0.05, color='white')
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['left'].set_color('#333333')
            ax.spines['bottom'].set_color('#333333')
            ax.tick_params(colors=TEXT_COLOR, labelsize=10)

            ax.plot(series, color=color, alpha=0.15)

            # The smoothed series can be shorter than the raw one. Plot each
            # average at the index of the last sample in its window so the
            # curve lines up with the raw data instead of being shifted left.
            smoothed = smooth(series)
            first_x = len(series) - len(smoothed)
            ax.plot(np.arange(first_x, len(series)), smoothed, color=color, lw=3)

            ax.set_title(title, color=TEXT_COLOR, fontsize=14, fontweight='bold')
            if ylim is not None:
                ax.set_ylim(*ylim)

        fig.tight_layout()

        # Saving fails if the target directory is missing.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(output_path, dpi=200, bbox_inches='tight', facecolor=REF_BG)
    finally:
        # Close this specific figure, even if saving failed, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

    return output_path
|
models/.gitkeep
ADDED
|
File without changes
|
outputs/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── Mathematical Model RL Dependencies ONLY ──
|
| 2 |
+
# DO NOT add transformers/peft/trl here — those belong to the LLM model
|
| 3 |
+
gymnasium>=0.29.0
|
| 4 |
+
stable-baselines3>=2.3.0
|
| 5 |
+
sb3-contrib>=2.3.0
|
| 6 |
+
numpy>=1.24.0
|
| 7 |
+
torch>=2.0.0
|
| 8 |
+
matplotlib>=3.7.0
|
| 9 |
+
gradio>=4.0.0
|