kavin57447 committed on
Commit
12263fa
·
1 Parent(s): 9c5fcc9

Add Cloud Arena Mathematical Model RL environment

Browse files
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# System deps (installed as root, before dropping privileges)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential git && \
    rm -rf /var/lib/apt/lists/*

# HF Spaces runs Docker containers as UID 1000. app.py creates ./models and
# ./outputs at start-up, so the working directory must be writable by that
# user; a root-owned /app would make those writes fail.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Python deps (copied first so the layer is cached across code-only changes)
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# App code
COPY --chown=user . .

# HF Spaces expects port 7860
EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,11 +1,22 @@
1
  ---
2
  title: Openenv
3
- emoji: 👁
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: docker
7
  pinned: false
8
- short_description: this the working environment
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Openenv
3
+ emoji: ☁️
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ short_description: Cloud Arena Mathematical Model RL Training
9
  ---
10
 
11
+ # Cloud Arena Mathematical Model RL Training
12
+
13
+ Multi-objective cloud operations RL environment trained with **MaskablePPO**.
14
+
15
+ This is the **Mathematical Model** (MLP + stable-baselines3), NOT the LLM model.
16
+
17
+ ## Features
18
+ - 125-dim observation space, 150 discrete actions
19
+ - 6-phase curriculum learning
20
+ - Action masking, fog-of-war, chaos events
21
+ - Boss fight evaluation scenarios
22
+ - Interactive training dashboard
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cloud Arena — Mathematical Model RL Training on HF Spaces
3
+ This is the MATHEMATICAL model (MaskablePPO + MLP), NOT the LLM model.
4
+ The LLM model (cell5_ppo.py) is a SEPARATE system.
5
+ """
6
+
7
+ import os
8
+ import gradio as gr
9
+ import numpy as np
10
+
11
# Create the artefact directories at import time. "outputs" receives the
# dashboard image written by run_training; "models" is presumably where the
# training code saves checkpoints (not visible here — confirm).
os.makedirs("./models", exist_ok=True)
os.makedirs("./outputs", exist_ok=True)

# Global state
# Module-level dict written by run_training: the last trained model, its
# callback, and a status string ("idle", "training", "done" or "error").
training_state = {"model": None, "callback": None, "status": "idle"}
16
+
17
+
18
def run_training(timesteps):
    """Train the model for ``timesteps`` steps and summarise the run.

    Gradio click handler for the "Train" tab. Updates the module-level
    ``training_state`` with the trained model, its callback and a status
    string, then renders the training dashboard to ``outputs/dashboard.png``.

    Args:
        timesteps: Value of the "Total Timesteps" number field; it is
            converted with ``int()`` and must be strictly positive.

    Returns:
        A ``(summary_text, image_path)`` tuple. On any failure the text is an
        error message prefixed with "❌ Error:" and the image path is ``None``.
    """
    import logging

    training_state["status"] = "training"
    try:
        # Project imports live inside the try so that a broken install is
        # reported through the same error path as a failed training run.
        from cloud_arena.training import train_model
        from cloud_arena.visualization import generate_dashboard

        if timesteps is None:
            raise ValueError("Total Timesteps must be provided")
        ts = int(timesteps)
        if ts <= 0:
            raise ValueError(f"Total Timesteps must be a positive integer, got {ts}")

        model, callback, _ = train_model(total_timesteps=ts)
        training_state["model"] = model
        training_state["callback"] = callback
        training_state["status"] = "done"

        img_path = generate_dashboard(callback, "outputs/dashboard.png")

        # np.mean of an empty sequence is nan (plus a RuntimeWarning); report
        # 0.0 when no episode finished during the run.
        savings = callback.episode_savings
        avg_savings = float(np.mean(savings)) if len(savings) else 0.0

        summary = (
            f"✅ Training Complete\n"
            f"Episodes: {len(callback.episode_rewards)}\n"
            f"Final Phase: {callback.current_level}\n"
            f"EMA Win Rate: {callback.ema_win_rate*100:.1f}%\n"
            f"Avg Savings: {avg_savings:.1f}%"
        )
        return summary, img_path
    except Exception as e:
        # UI boundary: keep the app alive, but record the traceback in the
        # logs because the returned message only carries str(e).
        logging.getLogger(__name__).exception("Training failed")
        training_state["status"] = "error"
        return f"❌ Error: {e}", None
42
+
43
+
44
def run_evaluation():
    """Run the evaluation suite and format the mean metrics as text.

    Gradio click handler for the "Evaluate" tab.

    Returns:
        A four-line report (win rate, cost score, security, savings), or an
        error message prefixed with "❌ Error:" if evaluation raises.
    """
    from cloud_arena.evaluation import evaluate_model

    try:
        metrics = evaluate_model()
        # Each entry is averaged over whatever sequence evaluate_model returns.
        win_rate_pct = np.mean(metrics["win"]) * 100
        mean_cost = np.mean(metrics["cost_score"])
        mean_security = np.mean(metrics["security_score"])
        mean_savings = np.mean(metrics["savings_pct"])
        report = [
            f"Win Rate: {win_rate_pct:.1f}%",
            f"Cost Score: {mean_cost:.3f}",
            f"Security: {mean_security:.3f}",
            f"Savings: {mean_savings:.1f}%",
        ]
        return "\n".join(report)
    except Exception as e:
        return f"❌ Error: {e}"
60
+
61
+
62
def run_bosses():
    """Run every boss-fight scenario and format the per-boss scores.

    Gradio click handler for the "Boss Fights" tab.

    Returns:
        One "<boss name>: <score>%" line per scenario, a blank line, and an
        "Overall" line with the mean score; or an error message prefixed with
        "❌ Error:" if the run raises.
    """
    from cloud_arena.evaluation import run_boss_fights, BOSS_NAMES

    try:
        boss_scores = run_boss_fights()
        report = []
        for boss_id, score in boss_scores.items():
            report.append(f"{BOSS_NAMES[boss_id]}: {score:.1f}%")
        mean_score = np.mean(list(boss_scores.values()))
        # The leading newline leaves a blank line before the overall score.
        report.append(f"\nOverall: {mean_score:.1f}%")
        return "\n".join(report)
    except Exception as e:
        return f"❌ Error: {e}"
72
+
73
+
74
# Gradio UI: three tabs, each wiring one button to one of the handlers above.
with gr.Blocks(title="Cloud Arena RL", theme=gr.themes.Base()) as demo:
    gr.Markdown("# ☁️ Cloud Arena — Mathematical Model RL")
    gr.Markdown("MaskablePPO training on a multi-objective cloud ops environment.")

    # Train: the timestep count goes in, a status summary and the dashboard
    # image come out.
    with gr.Tab("Train"):
        ts_input = gr.Number(value=500000, label="Total Timesteps")
        train_btn = gr.Button("🚀 Start Training", variant="primary")
        train_output = gr.Textbox(label="Status", lines=6)
        train_img = gr.Image(label="Dashboard")
        train_btn.click(run_training, inputs=ts_input, outputs=[train_output, train_img])

    # Evaluate: no inputs, a text report comes out.
    with gr.Tab("Evaluate"):
        eval_btn = gr.Button("📊 Run Evaluation")
        eval_output = gr.Textbox(label="Results", lines=8)
        eval_btn.click(run_evaluation, outputs=eval_output)

    # Boss fights: no inputs, per-scenario scores come out.
    with gr.Tab("Boss Fights"):
        boss_btn = gr.Button("⚔️ Run Boss Fights")
        boss_output = gr.Textbox(label="Boss Scores", lines=8)
        boss_btn.click(run_bosses, outputs=boss_output)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860, the port the Dockerfile exposes.
    demo.launch(server_name="0.0.0.0", server_port=7860)
cloud_arena/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from cloud_arena.environment import CloudArenaEnv, ResourceObject, get_action_masks
2
+
3
+ __all__ = ["CloudArenaEnv", "ResourceObject", "get_action_masks"]
cloud_arena/environment.py ADDED
@@ -0,0 +1,941 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cloud Arena Environment — Mathematical Model RL
2
+ # Extracted from cloud_arena_final.py
3
+ # This is the MATHEMATICAL model env, NOT the LLM model.
4
+
5
+ import sys, math, random, copy
6
+ from collections import deque
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ import numpy as np
10
+ import gymnasium as gym
11
+ from gymnasium import spaces
12
+
13
# ── Seeds ─────────────────────────────────────────────────────────────────────
# NOTE: importing this module reseeds the global numpy and stdlib RNGs.
GLOBAL_SEED = 42
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# ── Observation layout (must sum to OBS_DIM) ──────────────────────────────────
# NOTE(review): only 8 resource slots are reserved here, while MAX_RESOURCES
# and phase 5 use 10 resources — presumably slots 8-9 are not observed; confirm
# against the observation builder.
MAX_RES_IN_OBS = 8    # fixed obs slots (pad unused with zeros)
N_FEAT_PER_RES = 10   # features per resource slot in obs
N_BLOCK_B = 8         # global security block
N_BLOCK_C = 7         # global cost block
N_BLOCK_D = 6         # environment state block
N_BLOCK_E = 24        # history: 8 actions + 8 rewards + 8 progress
OBS_DIM = MAX_RES_IN_OBS * N_FEAT_PER_RES + N_BLOCK_B + N_BLOCK_C + N_BLOCK_D + N_BLOCK_E
# = 80 + 8 + 7 + 6 + 24 = 125

assert OBS_DIM == 125, f"OBS_DIM mismatch: {OBS_DIM}"

# ── Action space ──────────────────────────────────────────────────────────────
# A flat action index encodes (action type, resource slot) as
# type * MAX_RESOURCES + slot.
N_ACTION_TYPES = 15
MAX_RESOURCES = 10
N_ACTIONS = N_ACTION_TYPES * MAX_RESOURCES  # 150

A_NOOP=0; A_ANALYZE=1; A_VERIFY_DEPS=2; A_RESIZE_DOWN=3; A_RESIZE_UP=4
A_STOP=5; A_RESTART=6; A_DELETE=7; A_PATCH=8; A_ENCRYPT=9
A_RESTRICT=10; A_ROTATE_CREDS=11; A_ENABLE_LOG=12; A_ARCHIVE=13; A_OPT_NET=14

# Action cost penalties (small friction — makes actions non-free)
ACTION_COSTS = {
    A_NOOP: 0.0, A_ANALYZE: -0.01, A_VERIFY_DEPS: -0.01,
    A_RESIZE_DOWN: -0.02, A_RESIZE_UP: -0.02,
    A_STOP: -0.03, A_RESTART: -0.02, A_DELETE: -0.05,
    A_PATCH: -0.02, A_ENCRYPT: -0.02, A_RESTRICT: -0.02,
    A_ROTATE_CREDS: -0.02, A_ENABLE_LOG: -0.01,
    A_ARCHIVE: -0.03, A_OPT_NET: -0.02,
}

# ── Curriculum ────────────────────────────────────────────────────────────────
# All tables below are keyed by curriculum phase 0-5.
# n_resources active per phase
N_RESOURCES_PHASE = {0: 4, 1: 5, 2: 6, 3: 7, 4: 8, 5: 10}

# Phase feature flags
PHASE_FOG = {0: False, 1: True, 2: True, 3: True, 4: True, 5: True}
PHASE_EVENTS = {0: False, 1: False, 2: True, 3: True, 4: True, 5: True}
PHASE_CHAOS = {0: False, 1: False, 2: False, 3: True, 4: True, 5: True}
# Probability that an episode starts with chaos already active.
CHAOS_INIT_PROB = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.20, 4: 0.30, 5: 0.35}

# Win thresholds: cost must drop to this fraction of initial AND security >= sec_thr
WIN_COST_THR = {0: 0.55, 1: 0.60, 2: 0.60, 3: 0.65, 4: 0.65, 5: 0.70}
WIN_SEC_THR = {0: 0.00, 1: 0.60, 2: 0.70, 3: 0.70, 4: 0.75, 5: 0.80}

# Episodes are truncated after this many steps.
MAX_STEPS = 150
64
+
65
+
66
+ # ══════════════════════════════════════════════════════════════════════════════
67
+ # RESOURCE OBJECT
68
+ # ══════════════════════════════════════════════════════════════════════════════
69
+
70
class ResourceObject:
    """One simulated cloud resource tracked by the Cloud Arena environment.

    Stores the resource's cost/allocation state, health flags, security
    posture, fog-of-war bookkeeping and dependency links. It exposes a
    per-step ``tick`` and one ``do_*`` mutator per agent action.
    """

    # Criticality label → numeric weight stored on the instance.
    CRIT = {"LOW": 0.3, "MED": 0.6, "HIGH": 1.0}

    def __init__(self, idx: int, criticality: str = "MED",
                 category: str = "compute", rng: random.Random = None):
        """Create a resource with randomised cost and security attributes.

        Args:
            idx: Position of the resource in the environment's resource list.
            criticality: One of the keys of ``CRIT``.
            category: Free-form category label (e.g. "compute", "storage").
            rng: Random source; defaults to ``random.Random(idx)``.
        """
        source = rng or random.Random(idx)
        self.idx = idx
        self.criticality = self.CRIT[criticality]
        self.category = category

        # Cost state: starts overprovisioned, usage kept at least 0.10 below
        # the allocation, cost proportional to allocation.
        self.allocated = source.uniform(0.70, 1.00)
        drawn_usage = source.uniform(0.15, 0.50)
        self.usage = min(drawn_usage, self.allocated - 0.10)
        self.cost_rate = self.allocated
        self.activity_status = 1.0  # 1 = active, 0 = idle

        # State flags
        self.health = 1
        self.is_stopped = False
        self.is_deleted = False
        self.alert_flag = 0

        # Security state (hidden from the observation while fog is active)
        self.risk_score = source.uniform(0.05, 0.20)
        self.vulnerability = False
        self.encryption = True
        self.over_permission = False
        self.logging_enabled = True
        self.credential_age = source.uniform(0.0, 0.3)
        self.exposure = source.uniform(0.0, 0.15)
        self.sensitivity = source.uniform(0.3, 0.8)

        # Fog of war: attributes stay hidden until the resource is analyzed,
        # and become hidden again after STALE_STEPS ticks without analysis.
        self.fog_active = True
        self.cost_known = False
        self.deps_known = False
        self.steps_since_analyze = 0
        self.staleness = 0.0
        self.STALE_STEPS = 15

        # Dependency links (indices into the environment's resource list)
        self.dependency_children: List[int] = []
        self.dependency_parent: Optional[int] = None

        # Diagnostics
        self.steps_broken = 0
        self.time_broken = 0.0

    # ── Internal helpers ──────────────────────────────────────────────────────

    def _raise_risk(self, amount: float) -> None:
        """Increase ``risk_score`` by ``amount``, capped at 1.0."""
        self.risk_score = min(self.risk_score + amount, 1.0)

    def _lower_risk(self, amount: float) -> None:
        """Decrease ``risk_score`` by ``amount``, floored at 0.0."""
        self.risk_score = max(self.risk_score - amount, 0.0)

    # ── Derived properties ────────────────────────────────────────────────────

    def overprovision_ratio(self) -> float:
        """Return the unused fraction of the allocation, never negative."""
        headroom = self.allocated - self.usage
        return max(0.0, headroom / max(self.allocated, 1e-6))

    def get_cost(self) -> float:
        """Return the current cost: 0 if deleted, 5% of the rate if stopped."""
        if self.is_deleted:
            return 0.0
        # A stopped resource still incurs a minimal maintenance cost.
        return self.cost_rate * 0.05 if self.is_stopped else self.cost_rate

    # ── Observation vector (10 dims) ──────────────────────────────────────────

    def to_obs(self, fog: bool = False) -> np.ndarray:
        """Return the 10-element float32 observation for this resource.

        When ``fog`` is requested and the resource is still fogged, risk and
        exposure read as 0.0 and cost reads as a fixed 0.5 estimate.
        """
        hidden = fog and self.fog_active
        risk_view = 0.0 if hidden else self.risk_score
        cost_view = 0.5 if hidden else self.cost_rate
        exposure_view = 0.0 if hidden else self.exposure

        features = [
            float(self.health),       # 0
            risk_view,                # 1 (hidden under fog)
            self.criticality,         # 2
            cost_view,                # 3 (hidden under fog)
            self.activity_status,     # 4
            exposure_view,            # 5 (hidden under fog)
            self.sensitivity,         # 6
            self.staleness,           # 7
            float(self.alert_flag),   # 8
            self.time_broken,         # 9
        ]
        return np.array(features, dtype=np.float32)

    # ── Per-step tick ─────────────────────────────────────────────────────────

    def tick(self, rng: random.Random, phase: int, event_prob: float = 0.0):
        """Advance the resource by one environment step.

        Ages knowledge and credentials, drifts usage while running, tracks
        time spent broken and, from phase 2 on, may apply one random security
        event with probability ``event_prob``. Deleted resources are skipped.
        """
        if self.is_deleted:
            return

        # Staleness: knowledge expires after STALE_STEPS ticks without analysis.
        self.steps_since_analyze += 1
        self.staleness = min(self.steps_since_analyze / self.STALE_STEPS, 1.0)
        if self.steps_since_analyze >= self.STALE_STEPS:
            self.fog_active = True

        # Usage drift (only when running), kept within [0.10, allocated].
        if not self.is_stopped and self.health:
            drifted = self.usage + rng.uniform(-0.03, 0.03)
            self.usage = float(min(max(drifted, 0.10), self.allocated))

        # Credential aging
        self.credential_age = min(self.credential_age + 0.01, 1.0)

        # Broken resource tracking
        if not self.health:
            self.steps_broken += 1
            self.time_broken = min(self.steps_broken / MAX_STEPS, 1.0)
            self._raise_risk(0.015)
            if self.criticality >= 1.0:
                self.alert_flag = 1  # broken high-criticality resource raises an alert

        # Random security events (Phase 2+); the RNG is only consulted when
        # the phase allows events.
        if phase >= 2 and rng.random() < event_prob and self.health:
            event = rng.choice(["vuln", "expose", "iam", "log_off"])
            if event == "vuln":
                self.vulnerability = True
                self._raise_risk(0.25)
            elif event == "expose":
                self.exposure = min(self.exposure + 0.35, 1.0)
                self._raise_risk(0.20)
            elif event == "iam":
                self.over_permission = True
                self._raise_risk(0.15)
            elif event == "log_off":
                self.logging_enabled = False
                self._raise_risk(0.05)

    # ── Actions ───────────────────────────────────────────────────────────────

    def do_analyze(self):
        """Lift the fog, mark cost as known and reset the staleness clock."""
        self.steps_since_analyze = 0
        self.staleness = 0.0
        self.cost_known = True
        self.fog_active = False

    def do_verify_deps(self):
        """Mark the dependency links as known."""
        self.deps_known = True

    def do_resize_down(self) -> float:
        """Shrink the allocation towards usage; return the cost saved (>= 0)."""
        target = max(self.usage + 0.10, 0.25)
        # Skip resizes that would save 0.02 or less.
        if target >= self.allocated - 0.02:
            return 0.0
        saved = self.allocated - target
        self.allocated = target
        self.cost_rate = target
        return saved

    def do_resize_up(self):
        """Grow the allocation by 0.20, capped at 1.0, and update the cost."""
        self.allocated = min(self.allocated + 0.20, 1.0)
        self.cost_rate = self.allocated

    def do_stop(self) -> float:
        """Stop the resource; return the cost eliminated (95% of the rate)."""
        if self.is_stopped:
            return 0.0
        self.is_stopped = True
        self.activity_status = 0.0
        return self.cost_rate * 0.95

    def do_restart(self):
        """Bring the resource back to running, active and healthy."""
        self.is_stopped = False
        self.activity_status = 1.0
        self.health = 1

    def do_delete(self) -> float:
        """Delete the resource; return its full cost rate as the saving."""
        saved = self.cost_rate
        self.is_deleted = True
        self.health = 0
        return saved

    def do_patch(self):
        """Clear the vulnerability flag and lower risk by 0.30."""
        self.vulnerability = False
        self._lower_risk(0.30)

    def do_encrypt(self):
        """Enable encryption and lower risk by 0.15."""
        self.encryption = True
        self._lower_risk(0.15)

    def do_restrict(self):
        """Cut exposure by 0.40 and lower risk by 0.20."""
        self.exposure = max(self.exposure - 0.40, 0.0)
        self._lower_risk(0.20)

    def do_rotate_creds(self):
        """Reset credential age, clear over-permission, lower risk by 0.10."""
        self.credential_age = 0.0
        self.over_permission = False
        self._lower_risk(0.10)

    def do_enable_logging(self):
        """Enable logging and lower risk by 0.05."""
        self.logging_enabled = True
        self._lower_risk(0.05)

    def do_archive(self) -> float:
        """Archive (stop) the resource; return 70% of the rate as the saving."""
        if self.is_stopped:
            return 0.0
        self.is_stopped = True
        self.activity_status = 0.0
        return self.cost_rate * 0.70

    def do_opt_network(self):
        """Cut exposure by 0.15 and lower risk by 0.08."""
        self.exposure = max(self.exposure - 0.15, 0.0)
        self._lower_risk(0.08)
274
+
275
+
276
+ # ══════════════════════════════════════════════════════════════════════════════
277
+ # ENVIRONMENT
278
+ # ══════════════════════════════════════════════════════════════════════════════
279
+
280
+ class CloudArenaEnv(gym.Env):
281
+ """
282
+ Cloud Arena: multi-objective cloud operations RL environment.
283
+ Observation: 125-dim flat float32.
284
+ Action space: Discrete(150) = 15 types × 10 resource slots.
285
+ """
286
+ metadata = {"render_modes": []}
287
+
288
def __init__(self,
             curriculum_ref: List[int] = None,
             global_step_ref: List[int] = None):
    """Build the spaces and zero the per-episode state.

    Args:
        curriculum_ref: One-element list holding the current curriculum
            phase. It is read by reference, so a caller can change the phase
            by mutating the list. Defaults to ``[0]``.
        global_step_ref: One-element list holding a step counter that
            ``step`` increments. Defaults to ``[0]``.
    """
    super().__init__()

    # Shared mutable references; a missing or empty list falls back to [0].
    self._curriculum_ref = curriculum_ref or [0]
    self._global_step_ref = global_step_ref or [0]

    self.action_space = spaces.Discrete(N_ACTIONS)
    self.observation_space = spaces.Box(
        low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32)

    # Episode state (properly initialised in reset)
    self.resources: List[ResourceObject] = []
    self.n_active = 0
    self.step_count = 0

    self.chaos_active = False
    self.chaos_steps = 0
    self.veto_count = 0
    self.cascade_count = 0

    self.initial_total_cost = 1.0
    self.prev_total_cost = 1.0
    self.prev_risk_agg = 0.0

    # Rolling 8-step histories of actions, rewards and cost progress.
    def zero_history():
        return deque([0.0] * 8, maxlen=8)

    self._action_hist = zero_history()
    self._reward_hist = zero_history()
    self._progress_hist = zero_history()
313
+
314
+ # ── Properties ────────────────────────────────────────────────────────────
315
+
316
@property
def curriculum_level(self) -> int:
    """Current curriculum phase, read from the shared one-element list."""
    return self._curriculum_ref[0]
319
+
320
+ # ── Reset ─────────────────────────────────────────────────────────────────
321
+
322
def reset(self, seed=None, options=None):
    """Start a new episode and return ``(observation, info)``.

    Args:
        seed: Optional seed. When given it seeds both the Gymnasium RNG (via
            ``super().reset``) and the episode's resource generator, so the
            same seed reproduces the same starting layout.
        options: Optional dict. ``options["scenario"]`` > 0 selects a boss
            scenario; anything else gives a normal episode for the current
            curriculum phase.

    Returns:
        The initial observation and an empty info dict.
    """
    super().reset(seed=seed)

    # Unseeded resets used to derive the seed from the previous episode's
    # length (GLOBAL_SEED + step_count). That allowed at most MAX_STEPS
    # distinct layouts and replayed the same one after every timed-out
    # episode. The global step counter only ever grows, so each episode now
    # gets a distinct seed that is still deterministic.
    if seed is not None:
        episode_seed = seed
    else:
        episode_seed = GLOBAL_SEED + self._global_step_ref[0]
    rng = random.Random(episode_seed)

    self.step_count = 0
    self.chaos_active = False
    self.chaos_steps = 0
    self.veto_count = 0
    self.cascade_count = 0

    phase = self.curriculum_level
    scenario = options.get("scenario", 0) if options else 0

    if scenario > 0:
        self._setup_boss_scenario(scenario, rng)
    else:
        self._setup_normal_episode(phase, rng)

    # Baselines are taken after setup so boss-scenario tweaks are included.
    # The floor avoids division by zero in later ratio computations.
    self.initial_total_cost = max(sum(r.get_cost() for r in self.resources), 1e-6)
    self.prev_total_cost = self.initial_total_cost
    self.prev_risk_agg = self._risk_aggregate()

    self._action_hist = deque([0.0] * 8, maxlen=8)
    self._reward_hist = deque([0.0] * 8, maxlen=8)
    self._progress_hist = deque([0.0] * 8, maxlen=8)

    return self._build_obs(), {}
349
+
350
def _setup_normal_episode(self, phase: int, rng: random.Random):
    """Populate ``self.resources`` for a standard episode of ``phase``.

    Resource count and the fog/chaos settings come from the phase tables.
    Phase 0 starts fully observable with a clean security state; later
    phases start with randomised security problems.
    """
    count = N_RESOURCES_PHASE[phase]
    self.n_active = count

    # Slot 0 is HIGH, slots before the midpoint are MED, the rest are LOW.
    midpoint = count // 2
    tiers = ["HIGH" if slot == 0 else ("MED" if slot < midpoint else "LOW")
             for slot in range(count)]

    kinds = ["compute", "compute", "storage", "database",
             "compute", "storage", "compute", "database",
             "compute", "storage"][:count]

    self.resources = []
    for slot, tier in enumerate(tiers):
        res = ResourceObject(slot, tier, kinds[slot], rng)

        # Phase 0: full observability — reveal everything upfront
        if not PHASE_FOG[phase]:
            res.fog_active = False
            res.cost_known = True
            res.deps_known = True

        if phase == 0:
            # Phase 0: no security issues to start (clean state)
            res.risk_score = rng.uniform(0.02, 0.08)
            res.vulnerability = False
            res.encryption = True
            res.over_permission = False
            res.logging_enabled = True
            res.exposure = rng.uniform(0.0, 0.05)
        else:
            # Phase 1+: seed real security problems so the agent cannot win
            # on cost alone.
            res.vulnerability = rng.random() < 0.40
            res.encryption = rng.random() > 0.30  # 30% unencrypted
            res.over_permission = rng.random() < 0.30
            res.logging_enabled = rng.random() > 0.20
            res.exposure = rng.uniform(0.10, 0.40)
            res.risk_score = rng.uniform(0.30, 0.60)

        self.resources.append(res)

    # Simple dependency: resource 0 (HIGH) is the parent of resource 1.
    if count >= 2:
        root, child = self.resources[0], self.resources[1]
        root.dependency_children = [1]
        child.dependency_parent = 0

    # Chaos initialisation: the RNG is only consulted for phases with chaos.
    if PHASE_CHAOS[phase] and rng.random() < CHAOS_INIT_PROB[phase]:
        self.chaos_active = True
        # Break the first (up to) two non-critical resources.
        non_critical = [res for res in self.resources if res.criticality < 1.0]
        for victim in non_critical[:2]:
            victim.health = 0
            victim.risk_score = min(victim.risk_score + 0.40, 1.0)
            victim.alert_flag = 0  # no alert for non-HIGH resources
414
+
415
def _setup_boss_scenario(self, scenario: int, rng: random.Random):
    """Set up a boss fight: a normal episode plus scenario-specific stress.

    The base episode is generated at phase 3 difficulty or the current
    phase, whichever is higher. Scenario numbers other than 1-5 leave the
    base episode unchanged.
    """
    base_phase = max(self.curriculum_level, 3)
    self._setup_normal_episode(base_phase, rng)
    fleet = self.resources

    if scenario == 1:
        # Cost Crisis: more allocation, less usage.
        for res in fleet:
            res.allocated = min(res.allocated + rng.uniform(0.10, 0.25), 1.0)
            res.cost_rate = res.allocated
            res.usage = max(res.usage - 0.10, 0.10)
        return

    if scenario == 2:
        # Security Breach: everything fogged and heavily compromised.
        for res in fleet:
            res.fog_active = True  # force fog — agent must analyze
            res.cost_known = False
            res.vulnerability = rng.random() < 0.60
            res.encryption = rng.random() < 0.30
            res.over_permission = rng.random() < 0.50
            res.logging_enabled = rng.random() < 0.40
            res.exposure = rng.uniform(0.30, 0.80)
            res.risk_score = rng.uniform(0.40, 0.90)
        return

    if scenario == 3:
        # Infrastructure Failure (NOOP Test): first three resources broken.
        self.chaos_active = True
        for res in fleet[:3]:
            res.health = 0
            res.risk_score = min(res.risk_score + 0.50, 1.0)
        return

    if scenario == 4:
        # Traffic Surge (underprovisioned): usage pushed close to allocation.
        for res in fleet:
            res.usage = min(res.allocated - 0.05, rng.uniform(0.75, 0.95))
            res.risk_score = min(res.risk_score + 0.10, 0.50)
        return

    if scenario == 5:
        # Final Boss: cost, security and reliability problems at once.
        self.chaos_active = True
        for res in fleet:
            res.allocated = min(res.allocated + 0.15, 1.0)
            res.cost_rate = res.allocated
            res.vulnerability = rng.random() < 0.50
            res.encryption = rng.random() < 0.40
            res.exposure = rng.uniform(0.20, 0.70)
            res.risk_score = rng.uniform(0.30, 0.80)
        # The first two resources also start broken.
        for res in fleet[:2]:
            res.health = 0
459
+
460
+ # ── Step ──────────────────────────────────────────────────────────────────
461
+
462
def step(self, action: int):
    """Advance the environment by one step.

    Order of operations: tick every resource, possibly trigger a chaos
    event, apply the agent's action, compute the reward, then check for a
    win or for truncation at ``MAX_STEPS``.

    Args:
        action: Flat action index, decoded as
            ``type = action // MAX_RESOURCES`` and
            ``slot = action % MAX_RESOURCES``.

    Returns:
        ``(observation, reward, terminated, truncated, info)``. ``terminated``
        is True on a win. ``info`` carries win/cost/security/reliability
        scores and veto, cascade and chaos diagnostics.
    """
    action = int(action)
    self.step_count += 1
    self._global_step_ref[0] += 1

    atype = action // MAX_RESOURCES
    ridx = action % MAX_RESOURCES

    phase = self.curriculum_level

    # ── Tick all resources ────────────────────────────────────────────────
    event_prob = 0.04 if PHASE_EVENTS[phase] else 0.0
    # Per-step RNG seeded from the global step counter.
    rng = random.Random(self._global_step_ref[0])
    for r in self.resources:
        r.tick(rng, phase, event_prob)

    # ── Chaos events (Phase 3+) ───────────────────────────────────────────
    if PHASE_CHAOS[phase] and rng.random() < 0.03:
        healthy = [r for r in self.resources if r.health and not r.is_deleted
                   and r.criticality < 1.0]
        if healthy:
            victim = rng.choice(healthy)
            victim.health = 0
            victim.risk_score = min(victim.risk_score + 0.40, 1.0)
            self.chaos_active = True

    if self.chaos_active:
        self.chaos_steps += 1
        if self.chaos_steps > 20:
            self.chaos_active = False  # chaos resolves after ~20 steps
            # Restart the timer. Without this, every later chaos event in
            # the episode was cancelled in the same step it started.
            self.chaos_steps = 0

    # ── Snapshot pre-action state ─────────────────────────────────────────
    cost_before = sum(r.get_cost() for r in self.resources)
    risk_before = self._risk_aggregate()

    # ── Apply action ──────────────────────────────────────────────────────
    # Only the veto flag is needed here; the reward uses the global
    # before/after totals rather than the per-resource deltas.
    _, _, veto = self._apply_action(atype, ridx)
    if veto:
        self.veto_count += 1

    # ── Post-action state ─────────────────────────────────────────────────
    cost_now = sum(r.get_cost() for r in self.resources)
    risk_now = self._risk_aggregate()

    # ── Compute reward ────────────────────────────────────────────────────
    reward = self._compute_reward(
        atype, ridx, veto, cost_before, cost_now, risk_before, risk_now)

    # ── Check win/done ────────────────────────────────────────────────────
    win = self._check_win(cost_now, risk_now, phase)
    terminated = win
    truncated = (self.step_count >= MAX_STEPS)

    # Terminal steps get a wider clipping range to fit the terminal bonus.
    if terminated or truncated:
        reward += self._terminal_reward(win, cost_now, risk_now, phase)
        reward = float(np.clip(reward, -30.0, 60.0))
    else:
        reward = float(np.clip(reward, -2.0, 5.0))

    # ── Update history ────────────────────────────────────────────────────
    self._action_hist.append(atype / N_ACTION_TYPES)
    self._reward_hist.append(np.clip(reward / 5.0, -1.0, 1.0))
    self._progress_hist.append(max(0.0, (self.initial_total_cost - cost_now)
                                   / max(self.initial_total_cost, 1e-6)))
    self.prev_total_cost = cost_now
    self.prev_risk_agg = risk_now

    info = {
        "win": int(win),
        "cost_score": float(np.clip(1.0 - cost_now / max(self.initial_total_cost, 1e-6), 0, 1)),
        "security_score": float(np.clip(1.0 - risk_now, 0, 1)),
        "reliability_score": self._reliability_score(),
        "savings_pct": float(np.clip(
            (self.initial_total_cost - cost_now)
            / max(self.initial_total_cost, 1e-6) * 100, 0, 100)),
        "veto_rate": self.veto_count / max(self.step_count, 1),
        "cascade_count": self.cascade_count,
        "risk": risk_now,
        "chaos_active": self.chaos_active,
    }

    return self._build_obs(), reward, terminated, truncated, info
544
+
545
+ # ── Action application ────────────────────────────────────────────────────
546
+
547
def _apply_action(self, atype: int, ridx: int) -> Tuple[float, float, bool]:
    """Apply one decoded action; return ``(cost_delta, security_delta, was_vetoed)``.

    Deltas are "before minus after" for the targeted resource, so positive
    values mean a saving or a risk reduction. An action is vetoed (and has
    no effect) when its precondition does not hold, when the slot is out of
    range, or when the resource is already deleted.
    """
    # NOOP touches nothing and is never a veto.
    if atype == A_NOOP:
        return 0.0, 0.0, False

    # Invalid slot or already-deleted resource: vetoed.
    if ridx >= len(self.resources):
        return 0.0, 0.0, True
    target = self.resources[ridx]
    if target.is_deleted:
        return 0.0, 0.0, True

    cost_prev = target.get_cost()
    risk_prev = target.risk_score
    allowed = True

    if atype == A_ANALYZE:
        target.do_analyze()

    elif atype == A_VERIFY_DEPS:
        target.do_verify_deps()

    elif atype == A_RESIZE_DOWN:
        allowed = target.overprovision_ratio() > 0.08 and not target.is_stopped
        if allowed:
            target.do_resize_down()

    elif atype == A_RESIZE_UP:
        allowed = target.usage > target.allocated - 0.12
        if allowed:
            target.do_resize_up()

    elif atype == A_STOP:
        allowed = (not target.is_stopped
                   and (target.activity_status < 0.35 or target.criticality <= 0.3)
                   and target.criticality < 1.0)
        if allowed:
            target.do_stop()

    elif atype == A_RESTART:
        allowed = target.is_stopped
        if allowed:
            target.do_restart()

    elif atype == A_DELETE:
        allowed = (target.deps_known and target.criticality < 1.0
                   and not target.is_stopped)
        if allowed:
            # Children that still exist.
            live_children = [
                self.resources[ci] for ci in target.dependency_children
                if ci < len(self.resources) and not self.resources[ci].is_deleted
            ]
            if any(child.criticality >= 0.6 for child in live_children):
                # Refuse to delete the parent of a MED/HIGH child.
                allowed = False
            else:
                target.do_delete()
                # Cascade: each surviving child breaks.
                for child in live_children:
                    child.health = 0
                    child.risk_score = min(child.risk_score + 0.3, 1.0)
                    self.cascade_count += 1

    elif atype == A_PATCH:
        allowed = target.vulnerability
        if allowed:
            target.do_patch()

    elif atype == A_ENCRYPT:
        allowed = not target.encryption
        if allowed:
            target.do_encrypt()

    elif atype == A_RESTRICT:
        allowed = target.exposure > 0.15
        if allowed:
            target.do_restrict()

    elif atype == A_ROTATE_CREDS:
        allowed = target.credential_age > 0.40
        if allowed:
            target.do_rotate_creds()

    elif atype == A_ENABLE_LOG:
        allowed = not target.logging_enabled
        if allowed:
            target.do_enable_logging()

    elif atype == A_ARCHIVE:
        allowed = target.category == "storage" and target.activity_status < 0.35
        if allowed:
            target.do_archive()

    elif atype == A_OPT_NET:
        allowed = target.exposure > 0.08
        if allowed:
            target.do_opt_network()

    # A deleted resource contributes neither cost nor risk afterwards.
    cost_post = target.get_cost() if not target.is_deleted else 0.0
    risk_post = target.risk_score if not target.is_deleted else 0.0

    return (cost_prev - cost_post), (risk_prev - risk_post), not allowed
666
+
667
+ # ── Reward ────────────────────────────────────────────────────────────────
668
+
669
def _compute_reward(self, atype, ridx, veto,
                    cost_before, cost_now, risk_before, risk_now) -> float:
    """Return the unclipped per-step reward.

    The reward is the sum of eight channels: dense cost, dense security,
    stability, the action's before/after improvement, NOOP shaping, action
    friction, the veto penalty and (phase 1+) a neglect penalty. ``ridx``
    is accepted but not used.
    """
    phase = self.curriculum_level
    baseline_cost = max(self.initial_total_cost, 1e-6)

    cost_weight = 0.25
    sec_weight = 0.35 if phase >= 1 else 0.0  # security ignored in phase 0
    stab_weight = 0.25

    # 1. Dense cost channel: remaining cost as a fraction of the baseline.
    r_cost = -cost_weight * (cost_now / baseline_cost)

    # 2. Dense security channel.
    r_sec = -sec_weight * risk_now

    # 3. Stability: share of resources that are broken but not deleted.
    broken = [res for res in self.resources if not res.health and not res.is_deleted]
    r_stab = -stab_weight * (len(broken) / max(len(self.resources), 1))

    # 4. Delta reward: the change produced by this step's action.
    cost_gain = (cost_before - cost_now) / baseline_cost
    risk_gain = risk_before - risk_now
    r_delta = float(np.clip(3.0 * cost_gain + 4.0 * risk_gain, -1.0, 2.0))

    # 5. NOOP shaping: rewarded during chaos or when the system is healthy,
    #    penalised when risk is being ignored.
    r_noop = 0.0
    if atype == A_NOOP:
        if self.chaos_active:
            r_noop = +0.10
        elif risk_now < 0.10 and cost_now < self.initial_total_cost * 0.60:
            r_noop = +0.05
        elif risk_now < 0.25:
            r_noop = +0.01
        elif risk_now < 0.50:
            r_noop = -0.05
        else:
            r_noop = -0.15

    # 6. Action cost penalty (unknown types get the default friction).
    r_action = ACTION_COSTS.get(atype, -0.02)

    # 7. Veto penalty.
    r_veto = -0.10 if veto else 0.0

    # 8. Temporal neglect (phase 1+): penalty for known (un-fogged) resources
    #    left at high risk, capped at -0.20 in total.
    r_neglect = 0.0
    if phase >= 1:
        for res in self.resources:
            if res.fog_active or res.is_deleted or not res.risk_score > 0.60:
                continue
            neglect_scale = min(res.steps_broken / MAX_STEPS, 1.0)
            r_neglect -= 0.02 * (1.0 + neglect_scale) * res.criticality
        r_neglect = max(r_neglect, -0.20)

    total = r_cost + r_sec + r_stab + r_delta + r_noop + r_action + r_veto + r_neglect
    return float(total)
730
+
731
+ def _terminal_reward(self, win: bool, cost_now: float,
732
+ risk_now: float, phase: int) -> float:
733
+ r = 0.0
734
+ if win:
735
+ speed_bonus = 10.0 * (1.0 - self.step_count / MAX_STEPS)
736
+ r += 15.0 + speed_bonus
737
+ else:
738
+ # Partial credit
739
+ cost_reduction = (self.initial_total_cost - cost_now) / max(self.initial_total_cost, 1e-6)
740
+ r += 3.0 * max(cost_reduction, 0.0)
741
+ r -= 5.0 # timeout penalty
742
+
743
+ r -= 10.0 * risk_now # end-state security penalty
744
+ if self.cascade_count > 0:
745
+ r -= 5.0 * min(self.cascade_count, 3)
746
+ return r
747
+
748
+ # ── Win condition ─────────────────────────────────────────────────────────
749
+
750
def _check_win(self, cost_now: float, risk_now: float, phase: int) -> bool:
    """Return whether the current state satisfies the phase win condition.

    A win needs all three: cost ratio below ``WIN_COST_THR[phase]``,
    security score (``1 - risk_now``) at or above ``WIN_SEC_THR[phase]``,
    and no live resource with criticality >= 1.0 that is unhealthy.
    """
    baseline = max(self.initial_total_cost, 1e-6)
    cost_ok = (cost_now / baseline) < WIN_COST_THR[phase]
    security_ok = (1.0 - risk_now) >= WIN_SEC_THR[phase]

    # Every resource must be either non-critical, healthy, or deleted.
    critical_ok = all(
        not (res.criticality >= 1.0 and not res.health and not res.is_deleted)
        for res in self.resources)

    return cost_ok and security_ok and critical_ok
763
+
764
+ # ── Observation ───────────────────────────────────────────────────────────
765
+
766
def _build_obs(self) -> np.ndarray:
    """Assemble the flat float32 observation vector.

    Layout (concatenated in this order):
        A. per-resource features, zero-padded to ``MAX_RES_IN_OBS`` slots
        B. global security summary (8 values)
        C. global cost summary (7 values)
        D. environment state (6 values)
        E. action / reward / progress history buffers

    Returns:
        Array of shape ``(OBS_DIM,)``; an assertion guards the shape.
    """
    level = self.curriculum_level
    fog_level = PHASE_FOG[level]
    everything = self.resources
    live = [res for res in everything if not res.is_deleted]
    live_n = max(len(live), 1)          # avoids division by zero
    slots_n = max(len(everything), 1)
    baseline = max(self.initial_total_cost, 1e-6)

    def live_share(flags):
        """Fraction of live resources whose flag is truthy."""
        return sum(1 for flag in flags if flag) / live_n

    # Block A: one fixed-width feature slice per resource slot.
    per_resource = np.zeros(MAX_RES_IN_OBS * N_FEAT_PER_RES, dtype=np.float32)
    for slot, res in enumerate(everything[:MAX_RES_IN_OBS]):
        start = slot * N_FEAT_PER_RES
        per_resource[start:start + N_FEAT_PER_RES] = res.to_obs(fog_level)

    # Block B: global security (8 dims).
    risk_agg = self._risk_aggregate()
    mean_cred_age = sum(res.credential_age for res in live) / live_n
    security = np.array([
        risk_agg,
        live_share(res.vulnerability for res in live),
        live_share(res.exposure > 0.3 for res in live),
        live_share(not res.encryption for res in live),
        live_share(not res.logging_enabled for res in live),
        live_share(res.over_permission for res in live),
        min(mean_cred_age, 1.0),
        float(self.chaos_active),
    ], dtype=np.float32)

    # Block C: global cost (7 dims).
    # NOTE(review): total cost sums get_cost() over ALL resources, deleted
    # ones included -- confirm get_cost() returns 0 for deleted resources.
    total_cost = sum(res.get_cost() for res in everything)
    stopped_n = sum(1 for res in everything if res.is_stopped)
    deleted_n = sum(1 for res in everything if res.is_deleted)
    cost = np.array([
        total_cost / baseline,
        live_share(res.activity_status < 0.3 for res in live),
        live_share(res.overprovision_ratio() > 0.2 for res in live),
        stopped_n / slots_n,
        deleted_n / slots_n,
        (self.initial_total_cost - total_cost) / baseline,
        float(self._check_win(total_cost, risk_agg, self.curriculum_level)),
    ], dtype=np.float32)

    # Block D: environment state (6 dims).
    state = np.array([
        self.step_count / MAX_STEPS,
        self.curriculum_level / 5.0,
        float(self.chaos_active),
        live_share(not res.health for res in live),
        self.veto_count / max(self.step_count, 1),
        self.cascade_count / live_n,
    ], dtype=np.float32)

    # Block E: rolling history buffers, flattened back to back.
    history = np.array(
        [*self._action_hist, *self._reward_hist, *self._progress_hist],
        dtype=np.float32)

    obs = np.concatenate([per_resource, security, cost, state, history])
    assert obs.shape == (OBS_DIM,), f"Obs shape {obs.shape} != {OBS_DIM}"
    return obs
832
+
833
+ # ── Action masks ──────────────────────────────────────────────────────────
834
+
835
def action_masks(self) -> np.ndarray:
    """Return the boolean validity mask over the flat action space.

    Flat action ids are laid out as ``action_type * MAX_RESOURCES + ridx``.
    NOOP (slot of resource 0) is always valid. For every live resource,
    ANALYZE and VERIFY_DEPS are valid; all remaining actions are only
    considered once the resource's fog is lifted. No curriculum-phase gate
    is applied to the security actions in this method.

    Returns:
        Bool array of length ``N_ACTIONS``.
    """
    def slot(action_type, resource_idx):
        # Action-type-major, resource-minor flat index.
        return action_type * MAX_RESOURCES + resource_idx

    valid = np.zeros(N_ACTIONS, dtype=bool)
    valid[slot(A_NOOP, 0)] = True

    pool = self.resources
    # Slots beyond the active resource list stay masked.
    for ridx in range(min(MAX_RESOURCES, len(pool))):
        res = pool[ridx]
        if res.is_deleted:
            continue

        # Information-gathering actions are always allowed.
        valid[slot(A_ANALYZE, ridx)] = True
        valid[slot(A_VERIFY_DEPS, ridx)] = True

        # Anti-cheat: a fogged resource cannot be acted on blindly.
        if res.fog_active:
            continue

        # Sizing: shrink when over-provisioned, grow when near capacity;
        # both require the resource to be running.
        valid[slot(A_RESIZE_DOWN, ridx)] = (res.overprovision_ratio() > 0.08
                                           and not res.is_stopped)
        valid[slot(A_RESIZE_UP, ridx)] = (res.usage > res.allocated - 0.12
                                         and not res.is_stopped)

        # Lifecycle: never stop fully critical resources; restart only
        # what is stopped.
        valid[slot(A_STOP, ridx)] = (
            not res.is_stopped
            and res.criticality < 1.0
            and (res.activity_status < 0.35 or res.criticality <= 0.3))
        valid[slot(A_RESTART, ridx)] = res.is_stopped

        # Delete requires known deps, non-critical target, and no live
        # child with criticality >= 0.6.
        blocks_delete = False
        for child_idx in res.dependency_children:
            if not child_idx < len(pool):
                continue
            child = pool[child_idx]
            if not child.is_deleted and child.criticality >= 0.6:
                blocks_delete = True
                break
        valid[slot(A_DELETE, ridx)] = (res.deps_known
                                       and res.criticality < 1.0
                                       and not blocks_delete)

        # Security and hygiene fixes: valid only when there is something
        # to fix.
        fixes = (
            (A_PATCH, res.vulnerability),
            (A_ENCRYPT, not res.encryption),
            (A_RESTRICT, res.exposure > 0.15),
            (A_ROTATE_CREDS, res.credential_age > 0.40),
            (A_ENABLE_LOG, not res.logging_enabled),
            (A_ARCHIVE, res.category == "storage"
                        and res.activity_status < 0.35),
            (A_OPT_NET, res.exposure > 0.08),
        )
        for action_type, allowed in fixes:
            valid[slot(action_type, ridx)] = allowed

    # Collapse guard: keep at least a few selectable actions.
    if valid.sum() < 3:
        valid[slot(A_NOOP, 0)] = True
        if len(pool) > 0:
            valid[slot(A_ANALYZE, 0)] = True
        if len(pool) > 1:
            valid[slot(A_ANALYZE, 1)] = True

    return valid
910
+
911
+ # ── Helpers ───────────────────────────────────────────────────────────────
912
+
913
+ def _risk_aggregate(self) -> float:
914
+ active = [r for r in self.resources if not r.is_deleted]
915
+ if not active:
916
+ return 0.0
917
+ weighted = sum(r.risk_score * r.criticality for r in active)
918
+ total_w = sum(r.criticality for r in active)
919
+ return weighted / max(total_w, 1e-6)
920
+
921
+ def _reliability_score(self) -> float:
922
+ active = [r for r in self.resources if not r.is_deleted]
923
+ if not active:
924
+ return 0.0
925
+ broken_w = sum(r.criticality for r in active if not r.health)
926
+ total_w = sum(r.criticality for r in active)
927
+ return max(0.0, 1.0 - broken_w / max(total_w, 1e-6))
928
+
929
def render(self):
    """Rendering is intentionally a no-op for this environment."""
    return None
930
+
931
+
932
+ # ── Gymnasium wrapper ─────────────────────────────────────────────────────────
933
+
934
+ from sb3_contrib.common.wrappers import ActionMasker
935
+
936
def get_action_masks(env) -> np.ndarray:
    """Return the action mask of the innermost environment.

    Follows the ``.env`` attribute chain until an object without ``.env``
    is reached, then calls its ``action_masks()`` method.
    """
    _missing = object()
    core = env
    while True:
        wrapped = getattr(core, "env", _missing)
        if wrapped is _missing:
            break
        core = wrapped
    return core.action_masks()
cloud_arena/evaluation.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cloud Arena Evaluation — Mathematical Model
2
+ # Extracted from cloud_arena_final.py (Cells 4-5)
3
+
4
+ import os
5
+ import numpy as np
6
+ import torch
7
+ from typing import List
8
+
9
+ from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
10
+ from sb3_contrib import MaskablePPO
11
+ from sb3_contrib.common.wrappers import ActionMasker
12
+
13
+ from cloud_arena.environment import (
14
+ CloudArenaEnv, get_action_masks, MAX_RESOURCES, MAX_STEPS, A_NOOP,
15
+ )
16
+
17
+
18
+ def _get_inner(vec_env):
19
+ inner = vec_env.envs[0]
20
+ while hasattr(inner, "env"):
21
+ inner = inner.env
22
+ return inner
23
+
24
+
25
def evaluate_model(model_path="./models/cloud_arena_final",
                   vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                   level=0, n_eval=30):
    """Run deterministic evaluation episodes and collect final-step metrics.

    Args:
        model_path: Path of the saved MaskablePPO model.
        vecnorm_path: Path of the saved VecNormalize statistics.
        level: Curriculum level the evaluation environment is pinned to.
        n_eval: Number of episodes to run.

    Returns:
        Dict mapping each metric name to a list with one entry per episode.
        Every metric except ``"steps"`` is read from the info dict of the
        episode's last step (0 when the key is absent); ``"steps"`` is the
        number of steps taken.
    """
    results = {k: [] for k in ["win", "cost_score", "security_score",
                               "reliability_score", "savings_pct", "veto_rate",
                               "cascade_count", "steps"]}

    def make_eval_env():
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[500000])
        return ActionMasker(env, get_action_masks)

    raw = DummyVecEnv([make_eval_env])
    eval_env = raw
    try:
        # NOTE: VecNormalize.load unpickles the file; only load trusted stats.
        eval_env = VecNormalize.load(vecnorm_path, raw)
        # Freeze running statistics and report unnormalised rewards.
        eval_env.training = False
        eval_env.norm_reward = False

        model = MaskablePPO.load(model_path, env=eval_env)
        # The innermost env object is stable across resets; look it up once.
        inner = _get_inner(eval_env)

        for _ in range(n_eval):
            obs = eval_env.reset()
            done = False
            steps = 0
            info_arr = []
            while not done:
                masks = [inner.action_masks()]
                act, _ = model.predict(obs, deterministic=True,
                                       action_masks=masks)
                obs, _rew, done_arr, info_arr = eval_env.step(act)
                done = bool(done_arr[0])
                steps += 1
            info = info_arr[0] if info_arr else {}
            for k in results:
                results[k].append(info.get(k, 0) if k != "steps" else steps)
    finally:
        # Release the env even when loading or an episode fails.
        eval_env.close()

    return results
58
+
59
+
60
# Boss-fight scenario id -> display name. Scenario ids start at 1.
BOSS_NAMES = dict(enumerate(
    (
        "Cost Crisis",
        "Security Breach",
        "Infrastructure Failure",
        "Traffic Surge",
        "Final Boss",
    ),
    start=1,
))
67
+
68
+
69
def run_boss_fights(model_path="./models/cloud_arena_final",
                    vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                    level=0, n_runs=10):
    """Score the model on every boss scenario in ``BOSS_NAMES``.

    Each scenario is played ``n_runs`` times with seeds 100, 101, ... using a
    fresh environment per run.

    Args:
        model_path: Path of the saved MaskablePPO model.
        vecnorm_path: Path of the saved VecNormalize statistics.
        level: Curriculum level the environments are pinned to.
        n_runs: Runs (seeds) per scenario.

    Returns:
        Dict mapping scenario id to a score on a 0-100 scale. Scenario 3 is
        scored as 40% NOOP-during-chaos ratio + 60% reliability; all others
        as 40% win rate + 30% cost score + 30% security score.
    """
    model = MaskablePPO.load(model_path)
    boss_scores = {}

    # Loop-invariant env factory: depends only on `level`.
    def _init():
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[0])
        return ActionMasker(env, get_action_masks)

    for s_id in BOSS_NAMES:
        runs = []
        for seed in range(100, 100 + n_runs):
            raw = DummyVecEnv([_init])
            vec = raw
            try:
                # NOTE: VecNormalize.load unpickles the file; only load
                # trusted stats.
                vec = VecNormalize.load(vecnorm_path, raw)
                vec.training = False
                vec.norm_reward = False

                # Reset the innermost env directly so the scenario option
                # reaches it, then normalise the raw observation by hand.
                inner = _get_inner(vec)
                raw_obs, _ = inner.reset(seed=seed, options={"scenario": s_id})
                obs = vec.normalize_obs(np.array([raw_obs]))

                done = False
                steps = 0
                noops_chaos = 0
                chaos_steps_total = 0
                info_arr = []

                while not done:
                    masks = [inner.action_masks()]
                    act, _ = model.predict(obs, deterministic=True,
                                           action_masks=masks)
                    a_type = int(act[0]) // MAX_RESOURCES
                    # Track how often the agent idles while chaos is active.
                    if inner.chaos_active:
                        chaos_steps_total += 1
                        if a_type == A_NOOP:
                            noops_chaos += 1
                    obs, _, done_arr, info_arr = vec.step(act)
                    done = bool(done_arr[0])
                    steps += 1

                info = info_arr[0] if info_arr else {}
                info.update({"steps": steps, "noops_chaos": noops_chaos,
                             "chaos_steps": chaos_steps_total})
                runs.append(info)
            finally:
                # Always release the env, even if a run raises.
                vec.close()

        wins = [r.get("win", 0) for r in runs]
        costs = [r.get("cost_score", 0) for r in runs]
        secs = [r.get("security_score", 0) for r in runs]
        rels = [r.get("reliability_score", 0) for r in runs]

        if s_id == 3:
            noop_r = [r["noops_chaos"] / max(r["chaos_steps"], 1) for r in runs]
            score = (0.4 * np.mean(noop_r) + 0.6 * np.mean(rels)) * 100
        else:
            score = (0.4 * np.mean(wins) + 0.3 * np.mean(costs)
                     + 0.3 * np.mean(secs)) * 100

        boss_scores[s_id] = score

    return boss_scores
cloud_arena/training.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cloud Arena Training — Mathematical Model (MaskablePPO)
2
+ # Extracted from cloud_arena_final.py (Cell 3)
3
+
4
+ import os, sys, math
5
+ import numpy as np
6
+ import torch
7
+
8
+ from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize, sync_envs_normalization
9
+ from stable_baselines3.common.callbacks import BaseCallback
10
+ from stable_baselines3.common.monitor import Monitor
11
+ from sb3_contrib import MaskablePPO
12
+ from sb3_contrib.common.wrappers import ActionMasker
13
+ from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
14
+
15
+ from cloud_arena.environment import (
16
+ CloudArenaEnv, get_action_masks, GLOBAL_SEED,
17
+ N_ACTION_TYPES, MAX_RESOURCES, N_ACTIONS, MAX_STEPS,
18
+ )
19
+
20
+
21
# Default training budget, in environment steps.
TOTAL_TIMESTEPS = 500_000


def cosine_lr(progress_remaining: float, init_lr: float = 3e-4, min_lr: float = 5e-5):
    """Cosine-annealed learning rate.

    Args:
        progress_remaining: 1.0 at the start of training, 0.0 at the end.
        init_lr: Learning rate returned when ``progress_remaining`` is 1.0.
        min_lr: Learning rate returned when ``progress_remaining`` is 0.0.

    Returns:
        The learning rate for the given training progress.
    """
    elapsed = 1.0 - progress_remaining
    # Weight falls from 1 to 0 along half a cosine period.
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * elapsed))
    return min_lr + (init_lr - min_lr) * cosine_weight
25
+
26
+
27
class SafeMaskableEvalCallback(MaskableEvalCallback):
    """Eval callback that syncs VecNormalize stats before each evaluation."""

    def _on_step(self) -> bool:
        """Sync normalisation stats when an evaluation is due, then delegate.

        The sync deep-copies the running statistics, so it is done only on
        steps where the parent callback will actually evaluate (every
        ``eval_freq`` calls) instead of on every training step.
        """
        eval_due = self.eval_freq > 0 and self.n_calls % self.eval_freq == 0
        if eval_due and self.model.get_vec_normalize_env() is not None:
            sync_envs_normalization(self.training_env, self.eval_env)
        return super()._on_step()
32
+
33
+
34
class CloudArenaCallback(BaseCallback):
    """Training callback: episode metrics, curriculum promotion, progress.

    Tracks an exponential moving average of the win rate and promotes the
    shared curriculum level once enough episodes were played in the current
    phase and the average reaches the phase threshold.
    """

    EMA_ALPHA = 0.02            # smoothing factor of the win-rate EMA
    MIN_EPS_PER_PHASE = 800     # episodes required before a promotion
    # Win-rate EMA needed to leave each phase (0.50 for unlisted phases).
    PHASE_THRESHOLDS = {0: 0.65, 1: 0.62, 2: 0.58, 3: 0.55, 4: 0.52}
    PROGRESS_EVERY = 500        # print progress every N timesteps

    def __init__(self, curriculum_ref, verbose=0):
        """Store the shared one-element curriculum list and reset metrics.

        Args:
            curriculum_ref: Mutable list whose element 0 holds the current
                curriculum level; it is updated in place on promotion.
            verbose: Verbosity forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)
        self._curriculum_ref = curriculum_ref
        self.ema_win_rate = 0.0
        self.current_level = 0
        self._phase_eps = 0
        self.episode_rewards = []
        self.episode_wins = []
        self.episode_savings = []
        self.episode_security = []
        self.episode_veto_rates = []
        self.curriculum_log = [(0, 0)]      # (timestep, level) pairs
        self.action_freq = np.zeros(N_ACTION_TYPES)

    def _on_step(self) -> bool:
        """Update action statistics and handle an episode end, if any."""
        if self.num_timesteps % self.PROGRESS_EVERY == 0:
            self._print_progress()
        actions = self.locals.get("actions")
        if actions is not None:
            for a in actions:
                atype = int(a) // MAX_RESOURCES
                if atype < N_ACTION_TYPES:
                    self.action_freq[atype] += 1
        # Only the first sub-environment is tracked.
        dones = self.locals.get("dones", [False])
        if dones[0]:
            info = self.locals.get("infos", [{}])[0]
            self._on_episode_end(info)
        return True

    def _episode_return(self, monitor_stats) -> float:
        """Return the episode return, preferring Monitor's statistics.

        Falls back to the last step's reward when the info dict carries no
        ``"episode"`` entry (i.e. the env is not wrapped in ``Monitor``).
        """
        if monitor_stats is not None and "r" in monitor_stats:
            return float(monitor_stats["r"])
        return float(self.locals.get("rewards", [0])[0])

    def _on_episode_end(self, info):
        """Record episode metrics and promote the curriculum when earned."""
        monitor_stats = info.get("episode")
        if "final_info" in info:
            info = info["final_info"]
            if monitor_stats is None:
                monitor_stats = info.get("episode")
        win = int(info.get("win", 0))
        self.ema_win_rate = (1 - self.EMA_ALPHA) * self.ema_win_rate + self.EMA_ALPHA * win
        # Store the whole-episode return, not just the final step's reward.
        self.episode_rewards.append(self._episode_return(monitor_stats))
        self.episode_wins.append(win)
        self.episode_savings.append(info.get("savings_pct", 0))
        self.episode_security.append(info.get("security_score", 0))
        self.episode_veto_rates.append(info.get("veto_rate", 0))
        self._phase_eps += 1
        thr = self.PHASE_THRESHOLDS.get(self.current_level, 0.50)
        if (self.current_level < 5
                and self._phase_eps >= self.MIN_EPS_PER_PHASE
                and self.ema_win_rate >= thr):
            self._try_promote()

    def _try_promote(self):
        """Advance one curriculum level and reset the per-phase counters."""
        self.current_level += 1
        self._curriculum_ref[0] = self.current_level
        self._phase_eps = 0
        self.ema_win_rate = 0.0
        self.curriculum_log.append((self.num_timesteps, self.current_level))
        print(f"\n✄ PROMOTED -> Phase {self.current_level}")

    def _print_progress(self):
        """Write a single-line progress indicator to stdout."""
        # Use the budget of the running learn() call when it is available,
        # so the percentage is right for non-default total_timesteps.
        budget = self.locals.get("total_timesteps", TOTAL_TIMESTEPS)
        pct = min(100.0, self.num_timesteps / max(budget, 1) * 100)
        sys.stdout.write(f"\rProgress: {pct:.1f}% | Phase: {self.current_level} | EMA Win: {self.ema_win_rate*100:.1f}%")
        sys.stdout.flush()
96
+
97
+
98
def train_model(total_timesteps=TOTAL_TIMESTEPS, save_dir="./models"):
    """Train MaskablePPO on the Cloud Arena env and save model + statistics.

    Args:
        total_timesteps: Training budget in environment steps.
        save_dir: Directory receiving the final model, the VecNormalize
            statistics and the best model found by the eval callback.

    Returns:
        Tuple ``(model, arena_callback, train_env)``. The training env is
        left open because it is returned to the caller.
    """
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs("./logs/", exist_ok=True)
    os.makedirs("./eval_logs/", exist_ok=True)

    torch.manual_seed(GLOBAL_SEED)
    # One-element lists act as mutable references shared with the envs.
    curriculum_ref = [0]
    global_step_ref = [0]

    def make_env():
        env = CloudArenaEnv(curriculum_ref, global_step_ref)
        env = Monitor(env)
        return ActionMasker(env, get_action_masks)

    train_env = DummyVecEnv([make_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=False, training=False)
    # Evaluate with the same observation statistics the policy trains on.
    eval_env.obs_rms = train_env.obs_rms

    model = MaskablePPO("MlpPolicy", train_env, learning_rate=cosine_lr, ent_coef=0.01, verbose=0)
    arena_cb = CloudArenaCallback(curriculum_ref)
    eval_cb = SafeMaskableEvalCallback(eval_env, best_model_save_path=save_dir, eval_freq=10000)

    print("Starting Pipeline...")
    try:
        model.learn(total_timesteps=total_timesteps, callback=[arena_cb, eval_cb])
    finally:
        # The eval env is only needed during learn(); release it afterwards.
        eval_env.close()

    model.save(os.path.join(save_dir, "cloud_arena_final"))
    train_env.save(os.path.join(save_dir, "cloud_arena_vecnorm.pkl"))
    print("\n✅ Model and VecNormalize stats saved.")

    return model, arena_cb, train_env
cloud_arena/visualization.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cloud Arena Visualization — Mathematical Model
2
+
3
+ import matplotlib
4
+ matplotlib.use('Agg')
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+
8
+
9
# Dashboard colour palette (hex RGB).
REF_BG = "#0e1117"       # figure / axes background
REF_CYAN = "#00d4ff"     # learning-curve line
REF_AMBER = "#ffa500"    # cost-optimisation line
REF_NEON = "#39ff14"     # security-score line
TEXT_COLOR = "#e6e6e6"   # titles and tick labels
14
+
15
+
16
def smooth(y, box_pts=50):
    """Moving-average smoothing with a box window of ``box_pts`` samples.

    Inputs shorter than the window are returned unchanged. Otherwise the
    result has ``len(y) - box_pts + 1`` samples ('valid' convolution).
    """
    too_short = len(y) < box_pts
    if too_short:
        return y
    kernel = np.full(box_pts, 1.0 / box_pts)
    return np.convolve(y, kernel, mode='valid')
21
+
22
+
23
def generate_dashboard(callback, output_path="outputs/training_dashboard.png"):
    """Render the three-panel training dashboard and save it as an image.

    Panels: learning curve (episode rewards), cost optimisation percentage
    and security score; each shows the raw series plus a smoothed overlay.

    Args:
        callback: Object exposing ``episode_rewards``, ``episode_savings``
            and ``episode_security`` sequences.
        output_path: Destination image path; its directory is created when
            missing.

    Returns:
        ``output_path``.
    """
    import os

    rewards = np.array(callback.episode_rewards)
    savings = np.array(callback.episode_savings)
    security = np.array(callback.episode_security)

    # savefig does not create directories; make sure the target exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(22, 6), facecolor=REF_BG)
    try:
        for ax in [ax1, ax2, ax3]:
            ax.set_facecolor(REF_BG)
            ax.grid(True, alpha=0.05, color='white')
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['left'].set_color('#333333')
            ax.spines['bottom'].set_color('#333333')
            ax.tick_params(colors=TEXT_COLOR, labelsize=10)

        ax1.plot(rewards, color=REF_CYAN, alpha=0.15)
        ax1.plot(smooth(rewards), color=REF_CYAN, lw=3)
        ax1.set_title("Learning Curve", color=TEXT_COLOR, fontsize=14, fontweight='bold')

        ax2.plot(savings, color=REF_AMBER, alpha=0.15)
        ax2.plot(smooth(savings), color=REF_AMBER, lw=3)
        ax2.set_title("Cost Optimization %", color=TEXT_COLOR, fontsize=14, fontweight='bold')
        ax2.set_ylim(0, 100)

        ax3.plot(security, color=REF_NEON, alpha=0.15)
        ax3.plot(smooth(security), color=REF_NEON, lw=3)
        ax3.set_title("Security Score", color=TEXT_COLOR, fontsize=14, fontweight='bold')
        ax3.set_ylim(0, 1)

        fig.tight_layout()
        fig.savefig(output_path, dpi=200, bbox_inches='tight', facecolor=REF_BG)
    finally:
        # Close this specific figure, even if plotting or saving failed.
        plt.close(fig)
    return output_path
models/.gitkeep ADDED
File without changes
outputs/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # ── Mathematical Model RL Dependencies ONLY ──
2
+ # DO NOT add transformers/peft/trl here — those belong to the LLM model
3
+ gymnasium>=0.29.0
4
+ stable-baselines3>=2.3.0
5
+ sb3-contrib>=2.3.0
6
+ numpy>=1.24.0
7
+ torch>=2.0.0
8
+ matplotlib>=3.7.0
9
+ gradio>=4.0.0