Spaces:
Paused
Paused
| # Cloud Arena Evaluation — Mathematical Model | |
| # Extracted from cloud_arena_final.py (Cells 4-5) | |
| import os | |
| import numpy as np | |
| import torch | |
| from typing import List | |
| from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize | |
| from sb3_contrib import MaskablePPO | |
| from sb3_contrib.common.wrappers import ActionMasker | |
| from cloud_arena.environment import ( | |
| CloudArenaEnv, get_action_masks, MAX_RESOURCES, MAX_STEPS, A_NOOP, | |
| ) | |
| def _get_inner(vec_env): | |
| inner = vec_env.envs[0] | |
| while hasattr(inner, "env"): | |
| inner = inner.env | |
| return inner | |
def evaluate_model(model_path="./models/cloud_arena_final",
                   vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                   level=0, n_eval=30):
    """Roll out a saved MaskablePPO policy and collect per-episode metrics.

    Builds a single-env ``DummyVecEnv`` around an action-masked
    ``CloudArenaEnv``, restores the saved ``VecNormalize`` statistics in
    evaluation mode (no statistics updates, rewards left un-normalized) and
    plays ``n_eval`` deterministic episodes.

    Args:
        model_path: Path passed to ``MaskablePPO.load``.
        vecnorm_path: Path passed to ``VecNormalize.load``.
        level: Value placed in the env's ``curriculum_ref`` list.
        n_eval: Number of episodes to run.

    Returns:
        Dict mapping each metric name to a list with one entry per episode.
        ``"steps"`` is the number of steps taken in the episode; every other
        metric is read from the info dict of the episode's final step and
        defaults to 0 when the key is absent.
    """
    metric_keys = ("win", "cost_score", "security_score",
                   "reliability_score", "savings_pct", "veto_rate",
                   "cascade_count", "steps")
    results = {key: [] for key in metric_keys}

    def make_eval_env():
        # NOTE(review): global_step_ref=[500000] presumably pins the env to a
        # late-training regime — confirm against CloudArenaEnv.
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[500000])
        return ActionMasker(env, get_action_masks)

    eval_env = DummyVecEnv([make_eval_env])
    try:
        eval_env = VecNormalize.load(vecnorm_path, eval_env)
        eval_env.training = False
        eval_env.norm_reward = False
        model = MaskablePPO.load(model_path, env=eval_env)

        # The wrapped env object is the same for every episode, so resolve
        # it once instead of re-walking the wrapper chain on every step.
        inner = _get_inner(eval_env)

        for _ in range(n_eval):
            obs = eval_env.reset()
            done = False
            steps = 0
            while not done:
                masks = [inner.action_masks()]
                act, _ = model.predict(obs, deterministic=True, action_masks=masks)
                obs, _, done_arr, info_arr = eval_env.step(act)
                done = bool(done_arr[0])
                steps += 1
            info = info_arr[0] if info_arr else {}
            for key, values in results.items():
                values.append(steps if key == "steps" else info.get(key, 0))
    finally:
        # Release the environment even if loading or a rollout raises;
        # closing the outermost wrapper also closes the env underneath it.
        eval_env.close()
    return results
# Scenario id -> display name for the boss-fight scenarios.
# run_boss_fights iterates this mapping and passes each id to the
# environment as the "scenario" reset option.
BOSS_NAMES = {
    1: "Cost Crisis",
    2: "Security Breach",
    3: "Infrastructure Failure",
    4: "Traffic Surge",
    5: "Final Boss",
}
def run_boss_fights(model_path="./models/cloud_arena_final",
                    vecnorm_path="./models/cloud_arena_vecnorm.pkl",
                    level=0, n_runs=10):
    """Score a saved MaskablePPO policy on every scenario in ``BOSS_NAMES``.

    For each scenario id, ``n_runs`` deterministic episodes are played with
    seeds ``100 .. 100 + n_runs - 1``. Each episode uses a fresh env wrapped
    with the saved ``VecNormalize`` statistics in evaluation mode.

    Scoring (each scaled to 0-100 by the final ``* 100``):
      * scenario 3: ``0.4 * mean(no-op share of chaos steps)
        + 0.6 * mean(reliability_score)``
      * all others: ``0.4 * mean(win) + 0.3 * mean(cost_score)
        + 0.3 * mean(security_score)``

    Per-run values come from the info dict of the episode's final step and
    default to 0 when a key is missing.

    Args:
        model_path: Path passed to ``MaskablePPO.load``.
        vecnorm_path: Path passed to ``VecNormalize.load``.
        level: Value placed in the env's ``curriculum_ref`` list.
        n_runs: Episodes per scenario. With ``n_runs <= 0`` no episodes run
            and the means are taken over empty lists.

    Returns:
        Dict mapping scenario id to its score.
    """
    model = MaskablePPO.load(model_path)

    def _init():
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[0])
        return ActionMasker(env, get_action_masks)

    boss_scores = {}
    for s_id in BOSS_NAMES:
        runs = []
        for seed in range(100, 100 + n_runs):
            vec = DummyVecEnv([_init])
            try:
                vec = VecNormalize.load(vecnorm_path, vec)
                vec.training = False
                vec.norm_reward = False
                inner = _get_inner(vec)

                # Reset the innermost env directly so the seed and scenario
                # option reach it, then normalize the raw observation by hand
                # because this reset does not go through VecNormalize.
                raw_obs, _ = inner.reset(seed=seed, options={"scenario": s_id})
                obs = vec.normalize_obs(np.array([raw_obs]))

                done = False
                steps = 0
                noops_chaos = 0
                chaos_steps_total = 0
                while not done:
                    masks = [inner.action_masks()]
                    act, _ = model.predict(obs, deterministic=True, action_masks=masks)
                    # The flat action index is split by MAX_RESOURCES to
                    # recover the action type.
                    a_type = int(act[0]) // MAX_RESOURCES
                    # Chaos state is sampled before the step is applied.
                    if inner.chaos_active:
                        chaos_steps_total += 1
                        if a_type == A_NOOP:
                            noops_chaos += 1
                    obs, _, done_arr, info_arr = vec.step(act)
                    done = bool(done_arr[0])
                    steps += 1

                info = info_arr[0] if info_arr else {}
                info.update({"steps": steps, "noops_chaos": noops_chaos, "chaos_steps": chaos_steps_total})
                runs.append(info)
            finally:
                # Always release the env, including when a rollout raises.
                vec.close()

        wins = [r.get("win", 0) for r in runs]
        costs = [r.get("cost_score", 0) for r in runs]
        secs = [r.get("security_score", 0) for r in runs]
        rels = [r.get("reliability_score", 0) for r in runs]
        if s_id == 3:
            # max(..., 1) avoids dividing by zero when chaos never triggered.
            noop_r = [r["noops_chaos"] / max(r["chaos_steps"], 1) for r in runs]
            score = (0.4 * np.mean(noop_r) + 0.6 * np.mean(rels)) * 100
        else:
            score = (0.4 * np.mean(wins) + 0.3 * np.mean(costs) + 0.3 * np.mean(secs)) * 100
        boss_scores[s_id] = score
    return boss_scores
def evaluate_llm_grpo(model, tokenizer, n_eval=20, steps_per_episode=15, seed=123):
    """
    Evaluate LLM policy quality on the FinOps environment using the same
    ACTION parser logic as training.

    Each step formats the environment's internal state into a prompt,
    greedily generates up to 80 new tokens, parses an action from the
    completion and applies it. An episode ends when the environment reports
    terminated/truncated or after ``steps_per_episode`` steps.

    Args:
        model: Generative model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer matching ``model``.
        n_eval: Number of episodes to run.
        steps_per_episode: Hard cap on steps per episode.
        seed: Seed applied to ``random``, NumPy and torch RNGs.

    Returns:
        Dict with:
          * ``episodes``: ``n_eval``.
          * ``win_rate``: share of episodes whose last info has a truthy
            ``"win"``.
          * ``avg_savings_pct``: mean of the last info's ``"savings_pct"``.
          * ``avg_episode_len``: mean steps per episode.
          * ``safety_violation_rate``: summed ``"safety_violation"`` over all
            steps, divided by total steps.
          * ``action_distribution``: share of steps per action (string keys;
            ``"0"``..``"4"`` are always present).
          * ``avg_reward_components``: per-step mean of every key found in
            ``info["reward_components"]``.
    """
    import random
    import torch
    from cloud_arena.llm_environment import SB3Adapter
    from cloud_arena.llm_training import extract_action_and_reasoning, format_prompt

    random.seed(seed)
    np.random.seed(seed)
    # Seed the CPU generator too; seeding CUDA alone leaves CPU-only runs
    # unseeded.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    env = SB3Adapter()

    action_counts = {str(i): 0 for i in range(5)}
    wins = 0
    total_savings = 0.0
    total_steps = 0
    total_safety_violations = 0
    reward_components_sum = {}
    total_component_steps = 0

    for _ in range(n_eval):
        env.reset()
        done = False
        step_count = 0
        last_info = {}
        while not done and step_count < steps_per_episode:
            state_dict = env.core._get_internal_state()
            prompt = format_prompt(state_dict)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            input_ids = inputs["input_ids"].to(model.device)
            attn_mask = inputs["attention_mask"].to(model.device)
            with torch.no_grad():
                out = model.generate(
                    input_ids=input_ids,
                    attention_mask=attn_mask,
                    max_new_tokens=80,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
            # Decode only the newly generated tokens, not the echoed prompt.
            response = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)
            action, _ = extract_action_and_reasoning(response)

            # Count actions outside the pre-seeded "0".."4" keys instead of
            # aborting the whole evaluation with a KeyError.
            action_key = str(action)
            action_counts[action_key] = action_counts.get(action_key, 0) + 1

            _, _, terminated, truncated, info = env.step(action)
            done = bool(terminated or truncated)
            step_count += 1
            last_info = info
            total_safety_violations += int(info.get("safety_violation", 0))
            for name, value in info.get("reward_components", {}).items():
                reward_components_sum[name] = reward_components_sum.get(name, 0.0) + float(value)
            total_component_steps += 1

        wins += int(last_info.get("win", False))
        total_savings += float(last_info.get("savings_pct", 0.0))
        total_steps += step_count

    # The max(..., 1) guards keep every ratio defined when nothing ran.
    total_actions = max(sum(action_counts.values()), 1)
    episodes = max(n_eval, 1)
    return {
        "episodes": n_eval,
        "win_rate": round(wins / episodes, 4),
        "avg_savings_pct": round(total_savings / episodes, 3),
        "avg_episode_len": round(total_steps / episodes, 3),
        "safety_violation_rate": round(total_safety_violations / max(total_steps, 1), 4),
        "action_distribution": {
            key: round(count / total_actions, 4) for key, count in action_counts.items()
        },
        "avg_reward_components": {
            name: round(total / max(total_component_steps, 1), 4)
            for name, total in reward_components_sum.items()
        },
    }