# Cloud Arena Evaluation — Mathematical Model
# Extracted from cloud_arena_final.py (Cells 4-5)
import os
from typing import List

import numpy as np
import torch
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker

from cloud_arena.environment import (
    CloudArenaEnv,
    get_action_masks,
    MAX_RESOURCES,
    MAX_STEPS,
    A_NOOP,
)


def _get_inner(vec_env):
    """Return the innermost environment behind the first sub-env of *vec_env*.

    Starts at ``vec_env.envs[0]`` and follows ``.env`` attributes until an
    object without one is reached.
    """
    inner = vec_env.envs[0]
    while hasattr(inner, "env"):
        inner = inner.env
    return inner


def evaluate_model(model_path="./models/cloud_arena_final",
                   vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                   level=0, n_eval=30):
    """Run *n_eval* deterministic episodes and collect final-step metrics.

    Args:
        model_path: Path handed to ``MaskablePPO.load``.
        vecnorm_path: Path handed to ``VecNormalize.load``.
        level: Value placed in the env's ``curriculum_ref`` list.
        n_eval: Number of episodes to run.

    Returns:
        Dict mapping each metric name to a list with one entry per episode.
        Every metric except ``"steps"`` is read from the last ``info`` dict
        of the episode (defaulting to 0 when the key is absent); ``"steps"``
        is the number of environment steps taken in that episode.
    """
    results = {k: [] for k in ["win", "cost_score", "security_score",
                               "reliability_score", "savings_pct", "veto_rate",
                               "cascade_count", "steps"]}

    def make_eval_env():
        # NOTE(review): global_step_ref=[500000] presumably puts the env in a
        # late-training regime -- confirm against CloudArenaEnv.
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[500000])
        return ActionMasker(env, get_action_masks)

    raw = DummyVecEnv([make_eval_env])
    eval_env = VecNormalize.load(vecnorm_path, raw)
    # Freeze normalisation statistics and report un-normalised rewards.
    eval_env.training = False
    eval_env.norm_reward = False

    try:
        model = MaskablePPO.load(model_path, env=eval_env)

        for ep in range(n_eval):
            obs = eval_env.reset()
            done = False
            steps = 0
            while not done:
                masks = [_get_inner(eval_env).action_masks()]
                act, _ = model.predict(obs, deterministic=True,
                                       action_masks=masks)
                obs, rew, done_arr, info_arr = eval_env.step(act)
                done = bool(done_arr[0])
                steps += 1

            # Metrics come from the info dict of the terminating step.
            info = info_arr[0] if info_arr else {}
            for k in results:
                results[k].append(info.get(k, 0) if k != "steps" else steps)
    finally:
        # Release the environment even if loading or a rollout fails.
        eval_env.close()

    return results


# Scenario id (passed to the env as options={"scenario": id}) -> display name.
BOSS_NAMES = {
    1: "Cost Crisis",
    2: "Security Breach",
    3: "Infrastructure Failure",
    4: "Traffic Surge",
    5: "Final Boss",
}


def run_boss_fights(model_path="./models/cloud_arena_final",
                    vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                    level=0, n_runs=10):
    """Score the saved policy on every scenario listed in ``BOSS_NAMES``.

    For each scenario id, ``n_runs`` episodes are played with seeds
    ``100 .. 100 + n_runs - 1`` and deterministic actions.

    Scoring (scaled to 0-100):
        * scenario 3: ``0.4 * mean(no-op ratio during chaos steps)
          + 0.6 * mean(reliability_score)``
        * all others: ``0.4 * mean(win) + 0.3 * mean(cost_score)
          + 0.3 * mean(security_score)``

    Args:
        model_path: Path handed to ``MaskablePPO.load``.
        vecnorm_path: Path handed to ``VecNormalize.load``.
        level: Value placed in the env's ``curriculum_ref`` list.
        n_runs: Episodes per scenario. With ``n_runs < 1`` the means are
            taken over empty lists and the scores are NaN.

    Returns:
        Dict mapping scenario id to its score.
    """
    model = MaskablePPO.load(model_path)
    boss_scores = {}

    def _init():
        # Loop-invariant factory: depends only on `level`.
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[0])
        return ActionMasker(env, get_action_masks)

    for s_id in BOSS_NAMES:
        runs = []
        for seed in range(100, 100 + n_runs):
            raw = DummyVecEnv([_init])
            vec = VecNormalize.load(vecnorm_path, raw)
            vec.training = False
            vec.norm_reward = False
            try:
                inner = _get_inner(vec)
                # Reset the innermost env directly so the seed and scenario
                # can be supplied, then normalise the raw observation by hand
                # because this reset bypasses the VecNormalize wrapper.
                raw_obs, _ = inner.reset(seed=seed,
                                         options={"scenario": s_id})
                obs = vec.normalize_obs(np.array([raw_obs]))

                done = False
                steps = 0
                noops_chaos = 0
                chaos_steps_total = 0
                while not done:
                    masks = [inner.action_masks()]
                    act, _ = model.predict(obs, deterministic=True,
                                           action_masks=masks)
                    # The flat action index encodes the action type in its
                    # quotient by MAX_RESOURCES.
                    a_type = int(act[0]) // MAX_RESOURCES
                    if inner.chaos_active:
                        chaos_steps_total += 1
                        if a_type == A_NOOP:
                            noops_chaos += 1
                    obs, _, done_arr, info_arr = vec.step(act)
                    done = bool(done_arr[0])
                    steps += 1

                info = info_arr[0] if info_arr else {}
                info.update({"steps": steps,
                             "noops_chaos": noops_chaos,
                             "chaos_steps": chaos_steps_total})
                runs.append(info)
            finally:
                # Always release the per-run environment.
                vec.close()

        wins = [r.get("win", 0) for r in runs]
        costs = [r.get("cost_score", 0) for r in runs]
        secs = [r.get("security_score", 0) for r in runs]
        rels = [r.get("reliability_score", 0) for r in runs]

        if s_id == 3:
            # max(..., 1) avoids dividing by zero when no chaos step occurred.
            noop_r = [r["noops_chaos"] / max(r["chaos_steps"], 1)
                      for r in runs]
            score = (0.4 * np.mean(noop_r) + 0.6 * np.mean(rels)) * 100
        else:
            score = (0.4 * np.mean(wins) + 0.3 * np.mean(costs)
                     + 0.3 * np.mean(secs)) * 100
        boss_scores[s_id] = score

    return boss_scores


def evaluate_llm_grpo(model, tokenizer, n_eval=20, steps_per_episode=15,
                      seed=123):
    """
    Evaluate LLM policy quality on the FinOps environment using the same
    ACTION parser logic as training.

    Each step formats the environment's internal state into a prompt,
    greedily generates up to 80 new tokens, parses an action from the
    response with ``extract_action_and_reasoning`` and applies it.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Callable tokenizer exposing ``.decode`` and
            ``.pad_token_id``.
        n_eval: Number of episodes to run.
        steps_per_episode: Hard cap on steps per episode.
        seed: Seed applied to ``random``, ``numpy`` and, when available,
            CUDA RNGs.

    Returns:
        Dict with keys ``episodes``, ``win_rate``, ``avg_savings_pct``,
        ``avg_episode_len``, ``safety_violation_rate``,
        ``action_distribution`` (fraction of steps per action, keyed by the
        action's string form) and ``avg_reward_components`` (per-step mean
        of each reward component reported in ``info``).
    """
    import random
    import torch
    from cloud_arena.llm_environment import SB3Adapter
    from cloud_arena.llm_training import (
        extract_action_and_reasoning,
        format_prompt,
    )

    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    env = SB3Adapter()
    metrics = {
        "episodes": n_eval,
        "win_rate": 0.0,
        "avg_savings_pct": 0.0,
        "avg_episode_len": 0.0,
        "safety_violation_rate": 0.0,
        "action_distribution": {str(i): 0 for i in range(5)},
        "avg_reward_components": {},
    }

    wins = 0
    total_savings = 0.0
    total_steps = 0
    total_safety_violations = 0
    reward_components_sum = {}
    total_component_steps = 0
    action_counts = metrics["action_distribution"]

    for _ in range(n_eval):
        _, _ = env.reset()
        done = False
        step_count = 0
        last_info = {}

        while not done and step_count < steps_per_episode:
            state_dict = env.core._get_internal_state()
            prompt = format_prompt(state_dict)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                               max_length=512)
            input_ids = inputs["input_ids"].to(model.device)
            attn_mask = inputs["attention_mask"].to(model.device)

            with torch.no_grad():
                out = model.generate(
                    input_ids=input_ids,
                    attention_mask=attn_mask,
                    max_new_tokens=80,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )

            # Decode only the newly generated tokens, not the echoed prompt.
            response = tokenizer.decode(out[0][input_ids.shape[1]:],
                                        skip_special_tokens=True)
            action, _ = extract_action_and_reasoning(response)

            # Count with .get so an action outside the pre-seeded "0".."4"
            # keys is recorded instead of raising KeyError.
            action_key = str(action)
            action_counts[action_key] = action_counts.get(action_key, 0) + 1

            _, _, terminated, truncated, info = env.step(action)
            done = bool(terminated or truncated)
            step_count += 1
            last_info = info

            total_safety_violations += int(info.get("safety_violation", 0))
            rc = info.get("reward_components", {})
            for k, v in rc.items():
                reward_components_sum[k] = (
                    reward_components_sum.get(k, 0.0) + float(v)
                )
            total_component_steps += 1

        wins += int(last_info.get("win", False))
        total_savings += float(last_info.get("savings_pct", 0.0))
        total_steps += step_count

    # max(..., 1) guards keep every ratio defined when nothing was run.
    total_actions = max(sum(action_counts.values()), 1)
    metrics["action_distribution"] = {
        k: round(v / total_actions, 4) for k, v in action_counts.items()
    }
    metrics["win_rate"] = round(wins / max(n_eval, 1), 4)
    metrics["avg_savings_pct"] = round(total_savings / max(n_eval, 1), 3)
    metrics["avg_episode_len"] = round(total_steps / max(n_eval, 1), 3)
    metrics["safety_violation_rate"] = round(
        total_safety_violations / max(total_steps, 1), 4
    )
    metrics["avg_reward_components"] = {
        k: round(v / max(total_component_steps, 1), 4)
        for k, v in reward_components_sum.items()
    }
    return metrics