Spaces:

saravanatanjiro
/

cloudarena

Sleeping

App Files Files Community

cloudarena / cloud_arena_final.py

saravanatanjiro

Removed training loops below render function

184ea7c about 2 months ago

raw

history blame contribute delete

42.9 kB

	# -*- coding: utf-8 -*-
	"""cloud_arena_final.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1hA7HmoWI2aPAMZv1t-tQpsI0qECnSI_a
	"""

	# ══════════════════════════════════════════════════════════════════════════════
	# CELL 1 — INSTALL
	# ══════════════════════════════════════════════════════════════════════════════
	# !pip install stable-baselines3[extra] sb3-contrib gymnasium torch matplotlib numpy --quiet

	# ══════════════════════════════════════════════════════════════════════════════
	# CELL 2 — ENVIRONMENT (PATCHED ANTI-CHEAT)
	# ══════════════════════════════════════════════════════════════════════════════

	import sys, math, random, copy
	from collections import deque
	from typing import Dict, List, Optional, Tuple

	import numpy as np
	import gymnasium as gym
	from gymnasium import spaces

	# ── Seeds ─────────────────────────────────────────────────────────────────────
	# Both the stdlib and the NumPy global generators are seeded at import time.
	GLOBAL_SEED = 42
	random.seed(GLOBAL_SEED)
	np.random.seed(GLOBAL_SEED)

	# ── Observation layout ────────────────────────────────────────────────────────
	# The flat observation is five consecutive blocks; OBS_DIM is their total size.
	MAX_RES_IN_OBS = 8    # block A: number of per-resource slots (unused slots stay zero)
	N_FEAT_PER_RES = 10   # block A: features written per resource slot
	N_BLOCK_B = 8         # global security block
	N_BLOCK_C = 7         # global cost block
	N_BLOCK_D = 6         # environment state block
	N_BLOCK_E = 24        # history block: 8 actions + 8 rewards + 8 progress values
	OBS_DIM = sum((
	    MAX_RES_IN_OBS * N_FEAT_PER_RES,   # 80
	    N_BLOCK_B,                         # + 8
	    N_BLOCK_C,                         # + 7
	    N_BLOCK_D,                         # + 6
	    N_BLOCK_E,                         # + 24 = 125
	))

	# Fail at import time if the layout constants above drift apart.
	assert OBS_DIM == 125, f"OBS_DIM mismatch: {OBS_DIM}"

	# ── Action space ──────────────────────────────────────────────────────────────
	# A flat action id encodes (action type, resource slot):
	#   action = action_type * MAX_RESOURCES + resource_index
	N_ACTION_TYPES = 15
	MAX_RESOURCES = 10
	N_ACTIONS = MAX_RESOURCES * N_ACTION_TYPES   # 150

	# Action type ids, numbered 0..14 in declaration order.
	(
	    A_NOOP,
	    A_ANALYZE,
	    A_VERIFY_DEPS,
	    A_RESIZE_DOWN,
	    A_RESIZE_UP,
	    A_STOP,
	    A_RESTART,
	    A_DELETE,
	    A_PATCH,
	    A_ENCRYPT,
	    A_RESTRICT,
	    A_ROTATE_CREDS,
	    A_ENABLE_LOG,
	    A_ARCHIVE,
	    A_OPT_NET,
	) = range(N_ACTION_TYPES)

	# Per-action reward friction (small negative values so that acting is never free).
	ACTION_COSTS = {
	    A_NOOP: 0.0,
	    A_ANALYZE: -0.01,
	    A_VERIFY_DEPS: -0.01,
	    A_RESIZE_DOWN: -0.02,
	    A_RESIZE_UP: -0.02,
	    A_STOP: -0.03,
	    A_RESTART: -0.02,
	    A_DELETE: -0.05,
	    A_PATCH: -0.02,
	    A_ENCRYPT: -0.02,
	    A_RESTRICT: -0.02,
	    A_ROTATE_CREDS: -0.02,
	    A_ENABLE_LOG: -0.01,
	    A_ARCHIVE: -0.03,
	    A_OPT_NET: -0.02,
	}

	# ── Curriculum ────────────────────────────────────────────────────────────────
	# Every table below is keyed by curriculum phase (0..5).

	# Number of resources created per phase.
	N_RESOURCES_PHASE = {
	    0: 4,
	    1: 5,
	    2: 6,
	    3: 7,
	    4: 8,
	    5: 10,
	}

	# Feature flags per phase: fog of war, random security events, chaos failures.
	PHASE_FOG = {
	    0: False,
	    1: True,
	    2: True,
	    3: True,
	    4: True,
	    5: True,
	}
	PHASE_EVENTS = {
	    0: False,
	    1: False,
	    2: True,
	    3: True,
	    4: True,
	    5: True,
	}
	PHASE_CHAOS = {
	    0: False,
	    1: False,
	    2: False,
	    3: True,
	    4: True,
	    5: True,
	}

	# Probability that an episode starts with chaos already active.
	CHAOS_INIT_PROB = {
	    0: 0.0,
	    1: 0.0,
	    2: 0.0,
	    3: 0.20,
	    4: 0.30,
	    5: 0.35,
	}

	# Win thresholds: total cost must fall below WIN_COST_THR * initial cost AND
	# the security score (1 - aggregate risk) must reach WIN_SEC_THR.
	WIN_COST_THR = {
	    0: 0.55,
	    1: 0.60,
	    2: 0.60,
	    3: 0.65,
	    4: 0.65,
	    5: 0.70,
	}
	WIN_SEC_THR = {
	    0: 0.00,
	    1: 0.60,
	    2: 0.70,
	    3: 0.70,
	    4: 0.75,
	    5: 0.80,
	}

	# Episodes are truncated after this many steps.
	MAX_STEPS = 150


	# ══════════════════════════════════════════════════════════════════════════════
	# RESOURCE OBJECT
	# ══════════════════════════════════════════════════════════════════════════════

	class ResourceObject:
	    """A single simulated cloud resource.

	    Holds the resource's cost state (allocation vs. usage), lifecycle flags
	    (health / stopped / deleted), security attributes, a "fog of war" flag
	    that hides some attributes from the observation until the resource is
	    analyzed, and parent/child dependency links.

	    The ``do_*`` methods mutate the state unconditionally; whether an action
	    is *permitted* is decided by the caller.
	    """

	    # Criticality label -> numeric weight stored in ``self.criticality``.
	    CRIT = {"LOW": 0.3, "MED": 0.6, "HIGH": 1.0}

	    def __init__(self, idx: int, criticality: str = "MED",
	                 category: str = "compute",
	                 rng: Optional[random.Random] = None):
	        """Create a resource with a randomised starting state.

	        Args:
	            idx: Index of the resource; also seeds the fallback RNG.
	            criticality: One of the keys of ``CRIT``.
	            category: Category label (for example ``"compute"``).
	            rng: Source of randomness. ``random.Random(idx)`` is used when
	                omitted.

	        Raises:
	            KeyError: If ``criticality`` is not a key of ``CRIT``.
	        """
	        if rng is None:
	            rng = random.Random(idx)
	        self.idx = idx
	        self.criticality = self.CRIT[criticality]
	        self.category = category

	        # ── Cost state ──────────────────────────────────────────────────
	        self.allocated = rng.uniform(0.70, 1.00)   # starts overprovisioned
	        self.usage = rng.uniform(0.15, 0.50)
	        # Keep at least 0.10 of headroom between usage and allocation.
	        self.usage = min(self.usage, self.allocated - 0.10)
	        self.cost_rate = self.allocated            # cost tracks allocation
	        self.activity_status = 1.0                 # 1 = active, 0 = idle

	        # ── State flags ─────────────────────────────────────────────────
	        self.health = 1
	        self.is_stopped = False
	        self.is_deleted = False
	        self.alert_flag = 0

	        # ── Security state (hidden in the observation under fog) ────────
	        self.risk_score = rng.uniform(0.05, 0.20)
	        self.vulnerability = False
	        self.encryption = True
	        self.over_permission = False
	        self.logging_enabled = True
	        self.credential_age = rng.uniform(0.0, 0.3)
	        self.exposure = rng.uniform(0.0, 0.15)
	        self.sensitivity = rng.uniform(0.3, 0.8)

	        # ── Fog of war ──────────────────────────────────────────────────
	        self.fog_active = True          # True = hidden until analyzed
	        self.cost_known = False
	        self.deps_known = False
	        self.steps_since_analyze = 0
	        self.staleness = 0.0
	        self.STALE_STEPS = 15           # ticks after which fog re-activates

	        # ── Dependencies ────────────────────────────────────────────────
	        self.dependency_children: List[int] = []   # indices depending on this one
	        self.dependency_parent: Optional[int] = None

	        # ── Diagnostics ─────────────────────────────────────────────────
	        self.steps_broken = 0
	        self.time_broken = 0.0

	    # ── Derived properties ──────────────────────────────────────────────

	    def overprovision_ratio(self) -> float:
	        """Return the unused fraction of the allocation, never below 0."""
	        headroom = self.allocated - self.usage
	        return max(0.0, headroom / max(self.allocated, 1e-6))

	    def get_cost(self) -> float:
	        """Return the current cost: 0 if deleted, 5% of the rate if stopped."""
	        if self.is_deleted:
	            return 0.0
	        if self.is_stopped:
	            return self.cost_rate * 0.05   # minimal maintenance cost
	        return self.cost_rate

	    # ── Observation vector (10 dims) ────────────────────────────────────

	    def to_obs(self, fog: bool = False) -> np.ndarray:
	        """Return the 10-feature float32 observation of this resource.

	        When ``fog`` is true and the resource is still fogged, the risk and
	        exposure features read 0.0 and the cost feature reads a fixed 0.5.

	        Feature order: health, risk, criticality, cost, activity status,
	        exposure, sensitivity, staleness, alert flag, time broken.
	        """
	        hidden = fog and self.fog_active
	        risk_obs = 0.0 if hidden else self.risk_score
	        cost_obs = 0.5 if hidden else self.cost_rate
	        exp_obs = 0.0 if hidden else self.exposure

	        return np.array([
	            float(self.health),       # 0
	            risk_obs,                 # 1 (masked under fog)
	            self.criticality,         # 2
	            cost_obs,                 # 3 (masked under fog)
	            self.activity_status,     # 4
	            exp_obs,                  # 5 (masked under fog)
	            self.sensitivity,         # 6
	            self.staleness,           # 7
	            float(self.alert_flag),   # 8
	            self.time_broken,         # 9
	        ], dtype=np.float32)

	    # ── Per-step tick ───────────────────────────────────────────────────

	    def tick(self, rng: random.Random, phase: int, event_prob: float = 0.0):
	        """Advance the resource by one time step.

	        Does nothing for a deleted resource. Otherwise, in this order: ages
	        the analysis (re-activating fog after ``STALE_STEPS`` ticks), drifts
	        usage while the resource is running and healthy, ages credentials,
	        accumulates downtime and risk while unhealthy, and, from phase 2 on,
	        may apply one random security event with probability ``event_prob``.

	        Args:
	            rng: Source of randomness for usage drift and security events.
	            phase: Current curriculum phase.
	            event_prob: Per-tick probability of a security event.
	        """
	        if self.is_deleted:
	            return

	        # Staleness: knowledge gained by ANALYZE expires over time.
	        self.steps_since_analyze += 1
	        self.staleness = min(self.steps_since_analyze / self.STALE_STEPS, 1.0)
	        if self.steps_since_analyze >= self.STALE_STEPS:
	            self.fog_active = True

	        # Usage drift (only while running and healthy).
	        if not self.is_stopped and self.health:
	            drifted = self.usage + rng.uniform(-0.03, 0.03)
	            self.usage = float(np.clip(drifted, 0.10, self.allocated))

	        # Credential aging.
	        self.credential_age = min(self.credential_age + 0.01, 1.0)

	        # Downtime tracking while unhealthy.
	        if not self.health:
	            self.steps_broken += 1
	            self.time_broken = min(self.steps_broken / MAX_STEPS, 1.0)
	            self.risk_score = min(self.risk_score + 0.015, 1.0)
	            if self.criticality >= 1.0:
	                self.alert_flag = 1   # a broken HIGH resource raises an alert

	        # Random security events. The RNG is only consulted from phase 2 on,
	        # and the health check comes last, so the draw order stays fixed.
	        if phase >= 2 and rng.random() < event_prob and self.health:
	            ev = rng.choice(["vuln", "expose", "iam", "log_off"])
	            if ev == "vuln":
	                self.vulnerability = True
	                self.risk_score = min(self.risk_score + 0.25, 1.0)
	            elif ev == "expose":
	                self.exposure = min(self.exposure + 0.35, 1.0)
	                self.risk_score = min(self.risk_score + 0.20, 1.0)
	            elif ev == "iam":
	                self.over_permission = True
	                self.risk_score = min(self.risk_score + 0.15, 1.0)
	            elif ev == "log_off":
	                self.logging_enabled = False
	                self.risk_score = min(self.risk_score + 0.05, 1.0)

	    # ── Actions ─────────────────────────────────────────────────────────

	    def do_analyze(self):
	        """Lift the fog, mark the cost as known and reset staleness."""
	        self.fog_active = False
	        self.cost_known = True
	        self.steps_since_analyze = 0
	        self.staleness = 0.0

	    def do_verify_deps(self):
	        """Mark the dependency links of this resource as known."""
	        self.deps_known = True

	    def do_resize_down(self) -> float:
	        """Shrink the allocation towards usage; return the cost saved.

	        The target is ``usage + 0.10`` with a floor of 0.25. Nothing changes
	        (and 0.0 is returned) unless that saves more than 0.02.
	        """
	        new_alloc = max(self.usage + 0.10, 0.25)
	        if new_alloc >= self.allocated - 0.02:
	            return 0.0
	        saved = self.allocated - new_alloc
	        self.allocated = new_alloc
	        self.cost_rate = new_alloc
	        return saved

	    def do_resize_up(self):
	        """Grow the allocation by 0.20, capped at 1.0."""
	        self.allocated = min(self.allocated + 0.20, 1.0)
	        self.cost_rate = self.allocated

	    def do_stop(self) -> float:
	        """Stop a running resource; return the cost eliminated (95%)."""
	        if self.is_stopped:
	            return 0.0
	        self.is_stopped = True
	        self.activity_status = 0.0
	        return self.cost_rate * 0.95

	    def do_restart(self):
	        """Bring the resource back to a running, healthy state.

	        A deleted resource is left untouched so it cannot be resurrected.
	        The alert flag is cleared because it signals a currently broken
	        resource; the downtime counters are left as accumulated.
	        """
	        if self.is_deleted:
	            return
	        self.is_stopped = False
	        self.activity_status = 1.0
	        self.health = 1
	        self.alert_flag = 0

	    def do_delete(self) -> float:
	        """Delete the resource; return the cost it was incurring.

	        The value is what ``get_cost()`` reported just before deletion, so a
	        stopped resource reports only its maintenance cost and an already
	        deleted one reports 0.0.
	        """
	        if self.is_deleted:
	            return 0.0
	        saved = self.get_cost()
	        self.is_deleted = True
	        self.health = 0
	        return saved

	    def do_patch(self):
	        """Clear the vulnerability flag and lower risk by 0.30."""
	        self.vulnerability = False
	        self.risk_score = max(self.risk_score - 0.30, 0.0)

	    def do_encrypt(self):
	        """Enable encryption and lower risk by 0.15."""
	        self.encryption = True
	        self.risk_score = max(self.risk_score - 0.15, 0.0)

	    def do_restrict(self):
	        """Lower exposure by 0.40 and risk by 0.20."""
	        self.exposure = max(self.exposure - 0.40, 0.0)
	        self.risk_score = max(self.risk_score - 0.20, 0.0)

	    def do_rotate_creds(self):
	        """Reset credential age, clear over-permission, lower risk by 0.10."""
	        self.credential_age = 0.0
	        self.over_permission = False
	        self.risk_score = max(self.risk_score - 0.10, 0.0)

	    def do_enable_logging(self):
	        """Enable logging and lower risk by 0.05."""
	        self.logging_enabled = True
	        self.risk_score = max(self.risk_score - 0.05, 0.0)

	    def do_archive(self) -> float:
	        """Archive (stop) a running resource; return 70% of its cost rate.

	        NOTE(review): the resource ends up in the same stopped state as after
	        ``do_stop``, so ``get_cost()`` drops to 5% of the rate even though
	        only 70% is reported here — confirm which figure is intended.
	        """
	        if self.is_stopped:
	            return 0.0
	        self.is_stopped = True
	        self.activity_status = 0.0
	        return self.cost_rate * 0.70

	    def do_opt_network(self):
	        """Lower exposure by 0.15 and risk by 0.08."""
	        self.exposure = max(self.exposure - 0.15, 0.0)
	        self.risk_score = max(self.risk_score - 0.08, 0.0)


	# ══════════════════════════════════════════════════════════════════════════════
	# ENVIRONMENT
	# ══════════════════════════════════════════════════════════════════════════════

	class CloudArenaEnv(gym.Env):
	    """Cloud Arena: multi-objective cloud operations RL environment.

	    Observation: flat float32 vector of ``OBS_DIM`` (125) values made of
	    per-resource features (block A) followed by global security (B), global
	    cost (C), environment state (D) and history (E) blocks.

	    Action space: ``Discrete(N_ACTIONS)`` (150) = 15 action types x 10
	    resource slots, encoded as ``action_type * MAX_RESOURCES + resource_idx``.

	    The curriculum phase and a global step counter are read from one-element
	    lists so that the caller can share and mutate them from outside.

	    NOTE(review): block A only exposes the first ``MAX_RES_IN_OBS`` (8)
	    resources, while phase 5 creates 10 and actions can target all of them.
	    """

	    metadata = {"render_modes": []}

	    # Action type -> name of the ResourceObject method that performs it.
	    # DELETE is absent because it also cascades to dependent resources.
	    _HANDLERS = {
	        A_ANALYZE: "do_analyze",
	        A_VERIFY_DEPS: "do_verify_deps",
	        A_RESIZE_DOWN: "do_resize_down",
	        A_RESIZE_UP: "do_resize_up",
	        A_STOP: "do_stop",
	        A_RESTART: "do_restart",
	        A_PATCH: "do_patch",
	        A_ENCRYPT: "do_encrypt",
	        A_RESTRICT: "do_restrict",
	        A_ROTATE_CREDS: "do_rotate_creds",
	        A_ENABLE_LOG: "do_enable_logging",
	        A_ARCHIVE: "do_archive",
	        A_OPT_NET: "do_opt_network",
	    }

	    def __init__(self,
	                 curriculum_ref: Optional[List[int]] = None,
	                 global_step_ref: Optional[List[int]] = None):
	        """Create the environment.

	        Args:
	            curriculum_ref: One-element list holding the curriculum phase
	                (0..5). Defaults to a private ``[0]``.
	            global_step_ref: One-element list holding a step counter that is
	                incremented on every ``step``. Defaults to a private ``[0]``.
	        """
	        super().__init__()
	        self._curriculum_ref = curriculum_ref or [0]
	        self._global_step_ref = global_step_ref or [0]

	        self.observation_space = spaces.Box(
	            low=-np.inf, high=np.inf, shape=(OBS_DIM,), dtype=np.float32)
	        self.action_space = spaces.Discrete(N_ACTIONS)

	        # Number of resets performed; drives the fallback episode seed.
	        self._episode_count = 0

	        # Episode state (properly initialised in reset).
	        self.resources: List[ResourceObject] = []
	        self.n_active = 0
	        self.step_count = 0
	        self.chaos_active = False
	        self.chaos_steps = 0
	        self.veto_count = 0
	        self.cascade_count = 0
	        self.initial_total_cost = 1.0
	        self.prev_total_cost = 1.0
	        self.prev_risk_agg = 0.0
	        self._action_hist = deque([0.0] * 8, maxlen=8)
	        self._reward_hist = deque([0.0] * 8, maxlen=8)
	        self._progress_hist = deque([0.0] * 8, maxlen=8)

	    # ── Properties ──────────────────────────────────────────────────────

	    @property
	    def curriculum_level(self) -> int:
	        """Current curriculum phase, read from the shared reference."""
	        return self._curriculum_ref[0]

	    # ── Reset ───────────────────────────────────────────────────────────

	    def reset(self, seed=None, options=None):
	        """Start a new episode.

	        Args:
	            seed: Optional seed. When given, the episode layout depends only
	                on it. When omitted, the seed is ``GLOBAL_SEED`` plus the
	                number of earlier resets, so consecutive episodes differ but
	                a fresh environment stays reproducible.
	            options: Optional dict; ``options["scenario"] > 0`` selects a
	                boss scenario (1..5) instead of a normal episode.

	        Returns:
	            ``(observation, info)`` where ``info`` is an empty dict.
	        """
	        super().reset(seed=seed)
	        # The fallback must not depend on the previous episode's length:
	        # every episode that ran to MAX_STEPS would otherwise get the same
	        # seed and replay the same layout.
	        if seed is not None:
	            rng = random.Random(seed)
	        else:
	            rng = random.Random(GLOBAL_SEED + self._episode_count)
	        self._episode_count += 1

	        self.step_count = 0
	        self.chaos_active = False
	        self.chaos_steps = 0
	        self.veto_count = 0
	        self.cascade_count = 0

	        phase = self.curriculum_level
	        scenario = options.get("scenario", 0) if options else 0

	        if scenario > 0:
	            self._setup_boss_scenario(scenario, rng)
	        else:
	            self._setup_normal_episode(phase, rng)

	        self.initial_total_cost = max(
	            sum(r.get_cost() for r in self.resources), 1e-6)
	        self.prev_total_cost = self.initial_total_cost
	        self.prev_risk_agg = self._risk_aggregate()

	        self._action_hist = deque([0.0] * 8, maxlen=8)
	        self._reward_hist = deque([0.0] * 8, maxlen=8)
	        self._progress_hist = deque([0.0] * 8, maxlen=8)

	        return self._build_obs(), {}

	    def _setup_normal_episode(self, phase: int, rng: random.Random):
	        """Create the resources of a standard episode for ``phase``."""
	        self.n_active = N_RESOURCES_PHASE[phase]
	        n = self.n_active

	        # Criticality layout: slot 0 is HIGH, the remaining slots below
	        # n // 2 are MED, everything else is LOW.
	        crits = []
	        for i in range(n):
	            if i == 0:
	                crits.append("HIGH")
	            elif i < n // 2:
	                crits.append("MED")
	            else:
	                crits.append("LOW")

	        cats = ["compute", "compute", "storage", "database",
	                "compute", "storage", "compute", "database",
	                "compute", "storage"][:n]

	        self.resources = []
	        for i in range(n):
	            r = ResourceObject(i, crits[i], cats[i], rng)

	            # Phases without fog start fully revealed.
	            if not PHASE_FOG[phase]:
	                r.fog_active = False
	                r.cost_known = True
	                r.deps_known = True

	            if phase == 0:
	                # Phase 0: clean security state.
	                r.risk_score = rng.uniform(0.02, 0.08)
	                r.vulnerability = False
	                r.encryption = True
	                r.over_permission = False
	                r.logging_enabled = True
	                r.exposure = rng.uniform(0.0, 0.05)
	            else:
	                # Phase 1+: start with real security problems so that the
	                # security objective cannot be met by doing nothing.
	                r.vulnerability = rng.random() < 0.40
	                r.encryption = rng.random() > 0.30      # ~30% unencrypted
	                r.over_permission = rng.random() < 0.30
	                r.logging_enabled = rng.random() > 0.20
	                r.exposure = rng.uniform(0.10, 0.40)
	                r.risk_score = rng.uniform(0.30, 0.60)

	            self.resources.append(r)

	        # Single dependency edge: resource 1 depends on resource 0 (HIGH).
	        if n >= 2:
	            self.resources[0].dependency_children = [1]
	            self.resources[1].dependency_parent = 0

	        # Phase 3+: the episode may start with chaos already under way.
	        if PHASE_CHAOS[phase] and rng.random() < CHAOS_INIT_PROB[phase]:
	            self.chaos_active = True
	            # Break the first two non-critical resources.
	            victims = [r for r in self.resources if r.criticality < 1.0][:2]
	            for v in victims:
	                v.health = 0
	                v.risk_score = min(v.risk_score + 0.40, 1.0)
	                v.alert_flag = 0   # no alert for non-HIGH resources

	    def _setup_boss_scenario(self, scenario: int, rng: random.Random):
	        """Create a normal episode, then apply a stressful preset on top.

	        Scenarios: 1 cost crisis, 2 security breach, 3 infrastructure
	        failure, 4 traffic surge, 5 all of them combined. Resources are
	        generated at phase 3 difficulty or the current phase if higher.
	        """
	        phase = max(self.curriculum_level, 3)
	        self._setup_normal_episode(phase, rng)

	        if scenario == 1:      # Cost crisis
	            for r in self.resources:
	                r.allocated = min(r.allocated + rng.uniform(0.10, 0.25), 1.0)
	                r.cost_rate = r.allocated
	                r.usage = max(r.usage - 0.10, 0.10)

	        elif scenario == 2:    # Security breach
	            for r in self.resources:
	                r.fog_active = True      # force fog: agent must analyze
	                r.cost_known = False
	                r.vulnerability = (rng.random() < 0.60)
	                r.encryption = (rng.random() < 0.30)
	                r.over_permission = (rng.random() < 0.50)
	                r.logging_enabled = (rng.random() < 0.40)
	                r.exposure = rng.uniform(0.30, 0.80)
	                r.risk_score = rng.uniform(0.40, 0.90)

	        elif scenario == 3:    # Infrastructure failure
	            self.chaos_active = True
	            for r in self.resources[:3]:
	                r.health = 0
	                r.risk_score = min(r.risk_score + 0.50, 1.0)

	        elif scenario == 4:    # Traffic surge (under-provisioned)
	            for r in self.resources:
	                r.usage = min(r.allocated - 0.05, rng.uniform(0.75, 0.95))
	                r.risk_score = min(r.risk_score + 0.10, 0.50)

	        elif scenario == 5:    # Final boss: everything at once
	            self.chaos_active = True
	            for i, r in enumerate(self.resources):
	                r.allocated = min(r.allocated + 0.15, 1.0)
	                r.cost_rate = r.allocated
	                r.vulnerability = (rng.random() < 0.50)
	                r.encryption = (rng.random() < 0.40)
	                r.exposure = rng.uniform(0.20, 0.70)
	                r.risk_score = rng.uniform(0.30, 0.80)
	                if i < 2:
	                    r.health = 0

	    # ── Step ────────────────────────────────────────────────────────────

	    def step(self, action: int):
	        """Advance the simulation by one step and apply ``action``.

	        Order of operations: tick every resource, maybe trigger a chaos
	        failure, apply the action, compute the reward, then check for a win
	        (termination) or the step limit (truncation).

	        Returns:
	            ``(observation, reward, terminated, truncated, info)``.

	        Raises:
	            ValueError: If ``action`` is outside ``[0, N_ACTIONS)``.
	        """
	        action = int(action)
	        if not 0 <= action < N_ACTIONS:
	            raise ValueError(
	                f"action must be in [0, {N_ACTIONS}), got {action}")

	        self.step_count += 1
	        self._global_step_ref[0] += 1

	        atype, ridx = divmod(action, MAX_RESOURCES)
	        phase = self.curriculum_level

	        # ── Tick all resources ──────────────────────────────────────────
	        event_prob = 0.04 if PHASE_EVENTS[phase] else 0.0
	        rng = random.Random(self._global_step_ref[0])
	        for r in self.resources:
	            r.tick(rng, phase, event_prob)

	        # ── Chaos events (Phase 3+) ─────────────────────────────────────
	        if PHASE_CHAOS[phase] and rng.random() < 0.03:
	            healthy = [r for r in self.resources
	                       if r.health and not r.is_deleted
	                       and r.criticality < 1.0]
	            if healthy:
	                victim = rng.choice(healthy)
	                victim.health = 0
	                victim.risk_score = min(victim.risk_score + 0.40, 1.0)
	                self.chaos_active = True
	                # Every new failure opens a fresh chaos window; without
	                # this a later event would expire in the step it started.
	                self.chaos_steps = 0

	        if self.chaos_active:
	            self.chaos_steps += 1
	            if self.chaos_steps > 20:
	                # Chaos resolves after about 20 steps.
	                self.chaos_active = False
	                self.chaos_steps = 0

	        # ── Snapshot pre-action state ───────────────────────────────────
	        cost_before = sum(r.get_cost() for r in self.resources)
	        risk_before = self._risk_aggregate()

	        # ── Apply action ────────────────────────────────────────────────
	        _, _, veto = self._apply_action(atype, ridx)
	        if veto:
	            self.veto_count += 1

	        # ── Post-action state ───────────────────────────────────────────
	        cost_now = sum(r.get_cost() for r in self.resources)
	        risk_now = self._risk_aggregate()

	        # ── Reward ──────────────────────────────────────────────────────
	        reward = self._compute_reward(
	            atype, ridx, veto, cost_before, cost_now, risk_before, risk_now)

	        # ── Win / done ──────────────────────────────────────────────────
	        win = self._check_win(cost_now, risk_now, phase)
	        terminated = win
	        truncated = (self.step_count >= MAX_STEPS)

	        if terminated or truncated:
	            reward += self._terminal_reward(win, cost_now, risk_now, phase)
	            reward = float(np.clip(reward, -30.0, 60.0))
	        else:
	            reward = float(np.clip(reward, -2.0, 5.0))

	        # ── History ─────────────────────────────────────────────────────
	        cost_scale = max(self.initial_total_cost, 1e-6)
	        self._action_hist.append(atype / N_ACTION_TYPES)
	        self._reward_hist.append(np.clip(reward / 5.0, -1.0, 1.0))
	        self._progress_hist.append(
	            max(0.0, (self.initial_total_cost - cost_now) / cost_scale))
	        self.prev_total_cost = cost_now
	        self.prev_risk_agg = risk_now

	        info = {
	            "win": int(win),
	            "cost_score": float(np.clip(1.0 - cost_now / cost_scale, 0, 1)),
	            "security_score": float(np.clip(1.0 - risk_now, 0, 1)),
	            "reliability_score": self._reliability_score(),
	            "savings_pct": float(np.clip(
	                (self.initial_total_cost - cost_now) / cost_scale * 100,
	                0, 100)),
	            "veto_rate": self.veto_count / max(self.step_count, 1),
	            "cascade_count": self.cascade_count,
	            "risk": risk_now,
	            "chaos_active": self.chaos_active,
	        }

	        return self._build_obs(), reward, terminated, truncated, info

	    # ── Action preconditions and application ────────────────────────────

	    def _has_critical_child(self, r: ResourceObject) -> bool:
	        """Return True if a live child of ``r`` has criticality >= 0.6."""
	        return any(
	            ci < len(self.resources)
	            and not self.resources[ci].is_deleted
	            and self.resources[ci].criticality >= 0.6
	            for ci in r.dependency_children)

	    def _action_allowed(self, atype: int, r: ResourceObject) -> bool:
	        """Return True if ``atype`` is currently permitted on live ``r``.

	        Single source of truth for both ``_apply_action`` (veto decision) and
	        ``action_masks``, so the two can never disagree. Fog is not checked
	        here; it is applied by ``action_masks`` only.
	        """
	        if atype in (A_ANALYZE, A_VERIFY_DEPS):
	            return True
	        if atype == A_RESIZE_DOWN:
	            return bool(r.overprovision_ratio() > 0.08 and not r.is_stopped)
	        if atype == A_RESIZE_UP:
	            return bool(r.usage > r.allocated - 0.12 and not r.is_stopped)
	        if atype == A_STOP:
	            return bool(not r.is_stopped
	                        and r.criticality < 1.0
	                        and (r.activity_status < 0.35
	                             or r.criticality <= 0.3))
	        if atype == A_RESTART:
	            # Restart also repairs an unhealthy resource. Requiring a prior
	            # STOP would make MED/HIGH resources unrepairable, since only
	            # LOW-criticality resources can be stopped.
	            return bool(r.is_stopped or not r.health)
	        if atype == A_DELETE:
	            return bool(r.deps_known
	                        and r.criticality < 1.0
	                        and not r.is_stopped
	                        and not self._has_critical_child(r))
	        if atype == A_PATCH:
	            return bool(r.vulnerability)
	        if atype == A_ENCRYPT:
	            return not r.encryption
	        if atype == A_RESTRICT:
	            return bool(r.exposure > 0.15)
	        if atype == A_ROTATE_CREDS:
	            return bool(r.credential_age > 0.40)
	        if atype == A_ENABLE_LOG:
	            return not r.logging_enabled
	        if atype == A_ARCHIVE:
	            # NOTE(review): activity_status only drops below 0.35 once the
	            # resource is stopped, at which point do_archive is a no-op.
	            return bool(r.category == "storage"
	                        and r.activity_status < 0.35)
	        if atype == A_OPT_NET:
	            return bool(r.exposure > 0.08)
	        return False

	    def _delete_with_cascade(self, r: ResourceObject) -> None:
	        """Delete ``r`` and break every live resource that depends on it."""
	        r.do_delete()
	        for ci in r.dependency_children:
	            if ci < len(self.resources) and not self.resources[ci].is_deleted:
	                child = self.resources[ci]
	                child.health = 0
	                child.risk_score = min(child.risk_score + 0.3, 1.0)
	                self.cascade_count += 1

	    def _apply_action(self, atype: int, ridx: int) -> Tuple[float, float, bool]:
	        """Apply one action; return ``(cost_delta, security_delta, vetoed)``.

	        Deltas are measured on the targeted resource only and are positive
	        when cost or risk went down. NOOP is never vetoed. An action is
	        vetoed (and changes nothing) when the slot is empty, the resource is
	        deleted, or the action's precondition does not hold.
	        """
	        if atype == A_NOOP:
	            return 0.0, 0.0, False

	        if ridx >= len(self.resources):
	            return 0.0, 0.0, True

	        r = self.resources[ridx]
	        if r.is_deleted or not self._action_allowed(atype, r):
	            return 0.0, 0.0, True

	        cost_before = r.get_cost()
	        risk_before = r.risk_score

	        if atype == A_DELETE:
	            self._delete_with_cascade(r)
	        else:
	            getattr(r, self._HANDLERS[atype])()

	        cost_after = r.get_cost()                       # 0.0 once deleted
	        risk_after = 0.0 if r.is_deleted else r.risk_score
	        return (cost_before - cost_after), (risk_before - risk_after), False

	    # ── Reward ──────────────────────────────────────────────────────────

	    def _compute_reward(self, atype, ridx, veto,
	                        cost_before, cost_now, risk_before, risk_now) -> float:
	        """Return the unclipped per-step reward.

	        Sum of: dense cost, security and stability penalties; a delta term
	        rewarding the improvement caused by this step; NOOP shaping; the
	        action's friction cost; a veto penalty; and a neglect penalty for
	        revealed high-risk resources (phase 1+). ``ridx`` is unused.
	        """
	        phase = self.curriculum_level
	        w_cost = 0.25
	        w_sec = 0.35 if phase >= 1 else 0.0
	        w_stab = 0.25
	        cost_scale = max(self.initial_total_cost, 1e-6)

	        # 1. Dense cost channel.
	        r_cost = -w_cost * (cost_now / cost_scale)

	        # 2. Dense security channel.
	        r_sec = -w_sec * risk_now

	        # 3. Stability: fraction of resources that are broken (not deleted).
	        n_broken = sum(1 for r in self.resources
	                       if not r.health and not r.is_deleted)
	        r_stab = -w_stab * (n_broken / max(len(self.resources), 1))

	        # 4. Delta reward: the strongest signal, driven by what this step
	        #    changed in total cost and aggregate risk.
	        cost_improvement = (cost_before - cost_now) / cost_scale
	        risk_improvement = risk_before - risk_now
	        r_delta = 3.0 * cost_improvement
	        r_delta += 4.0 * risk_improvement
	        r_delta = float(np.clip(r_delta, -1.0, 2.0))

	        # 5. NOOP shaping.
	        if atype == A_NOOP:
	            if self.chaos_active:
	                r_noop = +0.10    # hands off during chaos
	            elif risk_now < 0.10 and cost_now < self.initial_total_cost * 0.60:
	                r_noop = +0.05    # system is genuinely healthy
	            elif risk_now < 0.25:
	                r_noop = +0.01    # acceptable
	            elif risk_now < 0.50:
	                r_noop = -0.05    # negligence
	            else:
	                r_noop = -0.15    # gross negligence
	        else:
	            r_noop = 0.0

	        # 6. Action friction.
	        r_action = ACTION_COSTS.get(atype, -0.02)

	        # 7. Veto penalty.
	        r_veto = -0.10 if veto else 0.0

	        # 8. Neglect (phase 1+): penalty for revealed, high-risk resources,
	        #    scaled up by how long the resource has been broken.
	        r_neglect = 0.0
	        if phase >= 1:
	            for r in self.resources:
	                if (not r.fog_active and not r.is_deleted
	                        and r.risk_score > 0.60):
	                    neglect_scale = min(r.steps_broken / MAX_STEPS, 1.0)
	                    r_neglect -= 0.02 * (1.0 + neglect_scale) * r.criticality
	            r_neglect = max(r_neglect, -0.20)

	        total = (r_cost + r_sec + r_stab + r_delta
	                 + r_noop + r_action + r_veto + r_neglect)
	        return float(total)

	    def _terminal_reward(self, win: bool, cost_now: float,
	                         risk_now: float, phase: int) -> float:
	        """Return the extra reward added on the final step of an episode.

	        A win earns 15 plus a speed bonus of up to 10. Otherwise partial
	        credit for cost reduction is offset by a timeout penalty of 5. In
	        both cases remaining risk and cascades (at most 3) are penalised.
	        ``phase`` is unused.
	        """
	        r = 0.0
	        if win:
	            speed_bonus = 10.0 * (1.0 - self.step_count / MAX_STEPS)
	            r += 15.0 + speed_bonus
	        else:
	            cost_reduction = ((self.initial_total_cost - cost_now)
	                              / max(self.initial_total_cost, 1e-6))
	            r += 3.0 * max(cost_reduction, 0.0)   # partial credit
	            r -= 5.0                              # timeout penalty

	        r -= 10.0 * risk_now                      # end-state security penalty
	        if self.cascade_count > 0:
	            r -= 5.0 * min(self.cascade_count, 3)
	        return r

	    # ── Win condition ───────────────────────────────────────────────────

	    def _check_win(self, cost_now: float, risk_now: float, phase: int) -> bool:
	        """Return True when cost and security targets are met.

	        Requires cost below ``WIN_COST_THR[phase]`` of the initial cost, a
	        security score of at least ``WIN_SEC_THR[phase]`` and no broken
	        HIGH-criticality resource.
	        """
	        cost_ratio = cost_now / max(self.initial_total_cost, 1e-6)
	        cost_win = cost_ratio < WIN_COST_THR[phase]

	        sec_score = 1.0 - risk_now
	        sec_win = sec_score >= WIN_SEC_THR[phase]

	        no_crit_broken = not any(
	            r.criticality >= 1.0 and not r.health and not r.is_deleted
	            for r in self.resources)

	        return cost_win and sec_win and no_crit_broken

	    # ── Observation ─────────────────────────────────────────────────────

	    def _build_obs(self) -> np.ndarray:
	        """Assemble the flat ``OBS_DIM`` observation from blocks A..E."""
	        phase = self.curriculum_level
	        fog = PHASE_FOG[phase]

	        # Block A: per-resource features, zero-padded to MAX_RES_IN_OBS.
	        block_a = np.zeros(MAX_RES_IN_OBS * N_FEAT_PER_RES, dtype=np.float32)
	        for i, r in enumerate(self.resources[:MAX_RES_IN_OBS]):
	            block_a[i * N_FEAT_PER_RES: (i + 1) * N_FEAT_PER_RES] = r.to_obs(fog)

	        # Block B: global security (8 dims), computed over live resources.
	        active = [r for r in self.resources if not r.is_deleted]
	        n_a = max(len(active), 1)
	        risk_agg = self._risk_aggregate()
	        n_vuln = sum(1 for r in active if r.vulnerability)
	        n_exposed = sum(1 for r in active if r.exposure > 0.3)
	        n_unenc = sum(1 for r in active if not r.encryption)
	        n_no_log = sum(1 for r in active if not r.logging_enabled)
	        n_overperm = sum(1 for r in active if r.over_permission)
	        block_b = np.array([
	            risk_agg,
	            n_vuln / n_a,
	            n_exposed / n_a,
	            n_unenc / n_a,
	            n_no_log / n_a,
	            n_overperm / n_a,
	            min(sum(r.credential_age for r in active) / n_a, 1.0),
	            float(self.chaos_active),
	        ], dtype=np.float32)

	        # Block C: global cost (7 dims).
	        total_cost = sum(r.get_cost() for r in self.resources)
	        cost_scale = max(self.initial_total_cost, 1e-6)
	        n_all = max(len(self.resources), 1)
	        n_idle = sum(1 for r in active if r.activity_status < 0.3)
	        n_overprov = sum(1 for r in active if r.overprovision_ratio() > 0.2)
	        n_stopped = sum(1 for r in self.resources if r.is_stopped)
	        n_deleted = sum(1 for r in self.resources if r.is_deleted)
	        block_c = np.array([
	            total_cost / cost_scale,
	            n_idle / n_a,
	            n_overprov / n_a,
	            n_stopped / n_all,
	            n_deleted / n_all,
	            (self.initial_total_cost - total_cost) / cost_scale,
	            float(self._check_win(total_cost, risk_agg, self.curriculum_level)),
	        ], dtype=np.float32)

	        # Block D: environment state (6 dims).
	        n_broken = sum(1 for r in active if not r.health)
	        block_d = np.array([
	            self.step_count / MAX_STEPS,
	            self.curriculum_level / 5.0,
	            float(self.chaos_active),
	            n_broken / n_a,
	            self.veto_count / max(self.step_count, 1),
	            self.cascade_count / max(n_a, 1),
	        ], dtype=np.float32)

	        # Block E: history of the last 8 actions, rewards and progress values.
	        block_e = np.array(
	            list(self._action_hist)
	            + list(self._reward_hist)
	            + list(self._progress_hist),
	            dtype=np.float32)

	        obs = np.concatenate([block_a, block_b, block_c, block_d, block_e])
	        assert obs.shape == (OBS_DIM,), f"Obs shape {obs.shape} != {OBS_DIM}"
	        return obs

	    # ── Action masks ────────────────────────────────────────────────────

	    def action_masks(self) -> np.ndarray:
	        """Return a boolean mask of length ``N_ACTIONS`` of valid actions.

	        NOOP (action 0) is always valid. Empty slots and deleted resources
	        allow nothing. ANALYZE and VERIFY_DEPS are always valid on a live
	        resource; every other action additionally requires that the fog has
	        been lifted and that the action's precondition holds, exactly as
	        enforced by ``_apply_action``.

	        When no live resource exists only NOOP is valid; no other action is
	        force-enabled, because any of them would be vetoed.
	        """
	        mask = np.zeros(N_ACTIONS, dtype=bool)
	        mask[A_NOOP * MAX_RESOURCES] = True

	        always_visible = (A_ANALYZE, A_VERIFY_DEPS)
	        for ridx, r in enumerate(self.resources[:MAX_RESOURCES]):
	            if r.is_deleted:
	                continue
	            for atype in range(1, N_ACTION_TYPES):
	                # Under fog the agent may only gather information.
	                if r.fog_active and atype not in always_visible:
	                    continue
	                mask[atype * MAX_RESOURCES + ridx] = self._action_allowed(atype, r)

	        return mask

	    # ── Helpers ─────────────────────────────────────────────────────────

	    def _risk_aggregate(self) -> float:
	        """Return the criticality-weighted mean risk of live resources."""
	        active = [r for r in self.resources if not r.is_deleted]
	        if not active:
	            return 0.0
	        weighted = sum(r.risk_score * r.criticality for r in active)
	        total_w = sum(r.criticality for r in active)
	        return weighted / max(total_w, 1e-6)

	    def _reliability_score(self) -> float:
	        """Return 1 minus the criticality-weighted share of broken resources.

	        Returns 0.0 when no live resource is left.
	        """
	        active = [r for r in self.resources if not r.is_deleted]
	        if not active:
	            return 0.0
	        broken_w = sum(r.criticality for r in active if not r.health)
	        total_w = sum(r.criticality for r in active)
	        return max(0.0, 1.0 - broken_w / max(total_w, 1e-6))

	    def render(self):
	        """Rendering is not supported; this is a no-op."""
	        return None