# Openenv / cloud_arena/evaluation.py
# saravanatanjiro's picture
# Migrate LLM pipeline to custom GRPO with robust rewards
# dfc5996
# Cloud Arena Evaluation — Mathematical Model
# Extracted from cloud_arena_final.py (Cells 4-5)
import os
import numpy as np
import torch
from typing import List
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from sb3_contrib import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
from cloud_arena.environment import (
CloudArenaEnv, get_action_masks, MAX_RESOURCES, MAX_STEPS, A_NOOP,
)
def _get_inner(vec_env):
inner = vec_env.envs[0]
while hasattr(inner, "env"):
inner = inner.env
return inner
def evaluate_model(model_path: str = "./models/cloud_arena_final",
                   vecnorm_path: str = "./models/cloud_arena_vecnorm.pkl",
                   level: int = 0, n_eval: int = 30) -> dict:
    """Roll out a saved MaskablePPO policy and collect per-episode metrics.

    Args:
        model_path: Path passed to ``MaskablePPO.load``.
        vecnorm_path: Path of the saved ``VecNormalize`` statistics.
        level: Value placed in the env's ``curriculum_ref``.
        n_eval: Number of evaluation episodes to run.

    Returns:
        A dict mapping each metric name ("win", "cost_score",
        "security_score", "reliability_score", "savings_pct", "veto_rate",
        "cascade_count", "steps") to a list with one entry per episode.
        "steps" is the episode length counted here; every other metric is
        read from the info dict of the episode's last step and defaults
        to 0 when the key is absent.
    """
    metric_keys = ["win", "cost_score", "security_score",
                   "reliability_score", "savings_pct", "veto_rate",
                   "cascade_count", "steps"]
    results = {key: [] for key in metric_keys}

    def make_eval_env():
        # NOTE(review): global_step_ref=[500000] presumably pins the env to a
        # late-training regime -- confirm against CloudArenaEnv.
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[500000])
        return ActionMasker(env, get_action_masks)

    eval_env = DummyVecEnv([make_eval_env])
    try:
        # Rebinding keeps a single handle to close: closing the VecNormalize
        # wrapper also closes the DummyVecEnv underneath it.
        eval_env = VecNormalize.load(vecnorm_path, eval_env)
        eval_env.training = False      # freeze the running normalization stats
        eval_env.norm_reward = False   # keep rewards un-normalized
        model = MaskablePPO.load(model_path, env=eval_env)

        # The wrapped env object does not change between episodes, so it is
        # resolved once (run_boss_fights does the same).
        inner = _get_inner(eval_env)

        for _episode in range(n_eval):
            obs = eval_env.reset()
            done = False
            steps = 0
            while not done:
                masks = [inner.action_masks()]
                act, _state = model.predict(obs, deterministic=True, action_masks=masks)
                obs, _reward, done_arr, info_arr = eval_env.step(act)
                done = bool(done_arr[0])
                steps += 1
            # Metrics are taken from the info dict of the terminal step.
            info = info_arr[0] if info_arr else {}
            for key in results:
                results[key].append(steps if key == "steps" else info.get(key, 0))
    finally:
        # Release the environment even if loading or a rollout fails.
        eval_env.close()
    return results
# Display names of the boss-fight scenarios, keyed by the consecutive integer
# scenario ids 1..5 (the ids are what run_boss_fights passes as the
# "scenario" reset option).
BOSS_NAMES = dict(enumerate(
    (
        "Cost Crisis",
        "Security Breach",
        "Infrastructure Failure",
        "Traffic Surge",
        "Final Boss",
    ),
    start=1,
))
def run_boss_fights(model_path: str = "./models/cloud_arena_final",
                    vecnorm_path: str = "./models/cloud_arena_vecnorm.pkl",
                    level: int = 0, n_runs: int = 10) -> dict:
    """Score a saved MaskablePPO policy on every scenario in ``BOSS_NAMES``.

    For each scenario id, ``n_runs`` episodes are played with seeds
    ``100 .. 100 + n_runs - 1``. Each episode uses a fresh environment that
    is reset directly on the innermost env so the seed and the
    ``{"scenario": id}`` option can be passed through.

    Scoring (scaled to 0-100 when the underlying metrics are in [0, 1]):
        * scenario 3: ``0.4 * mean(NOOP share of chaos steps)
          + 0.6 * mean(reliability_score)``
        * all others: ``0.4 * mean(win) + 0.3 * mean(cost_score)
          + 0.3 * mean(security_score)``

    Args:
        model_path: Path passed to ``MaskablePPO.load``.
        vecnorm_path: Path of the saved ``VecNormalize`` statistics.
        level: Value placed in the env's ``curriculum_ref``.
        n_runs: Number of seeded episodes per scenario.

    Returns:
        A dict mapping scenario id to its score.
    """
    model = MaskablePPO.load(model_path)

    def _init():
        # Depends only on ``level``, so one factory serves every run.
        env = CloudArenaEnv(curriculum_ref=[level], global_step_ref=[0])
        return ActionMasker(env, get_action_masks)

    boss_scores = {}
    for s_id in BOSS_NAMES:
        runs = []
        for seed in range(100, 100 + n_runs):
            vec = DummyVecEnv([_init])
            try:
                # Rebinding keeps a single handle to close: closing the
                # VecNormalize wrapper also closes the env underneath it.
                vec = VecNormalize.load(vecnorm_path, vec)
                vec.training = False
                vec.norm_reward = False
                inner = _get_inner(vec)

                # Reset the innermost env directly (bypassing the vec env) so
                # the scenario option reaches it, then normalize by hand.
                raw_obs, _ = inner.reset(seed=seed, options={"scenario": s_id})
                obs = vec.normalize_obs(np.array([raw_obs]))

                done = False
                steps = 0
                noops_chaos = 0
                chaos_steps_total = 0
                while not done:
                    masks = [inner.action_masks()]
                    act, _ = model.predict(obs, deterministic=True, action_masks=masks)
                    # Integer-divide the flat action index by MAX_RESOURCES to
                    # get the action type compared against A_NOOP.
                    a_type = int(act[0]) // MAX_RESOURCES
                    # chaos_active is sampled before the step is applied.
                    if inner.chaos_active:
                        chaos_steps_total += 1
                        if a_type == A_NOOP:
                            noops_chaos += 1
                    obs, _, done_arr, info_arr = vec.step(act)
                    done = bool(done_arr[0])
                    steps += 1

                info = info_arr[0] if info_arr else {}
                info.update({"steps": steps, "noops_chaos": noops_chaos, "chaos_steps": chaos_steps_total})
                runs.append(info)
            finally:
                # Release the environment even when an episode raises.
                vec.close()

        wins = [r.get("win", 0) for r in runs]
        costs = [r.get("cost_score", 0) for r in runs]
        secs = [r.get("security_score", 0) for r in runs]
        rels = [r.get("reliability_score", 0) for r in runs]
        if s_id == 3:
            # max(..., 1) avoids dividing by zero when chaos never triggered.
            noop_r = [r["noops_chaos"] / max(r["chaos_steps"], 1) for r in runs]
            score = (0.4 * np.mean(noop_r) + 0.6 * np.mean(rels)) * 100
        else:
            score = (0.4 * np.mean(wins) + 0.3 * np.mean(costs) + 0.3 * np.mean(secs)) * 100
        boss_scores[s_id] = score
    return boss_scores
def evaluate_llm_grpo(model, tokenizer, n_eval=20, steps_per_episode=15, seed=123):
    """
    Evaluate LLM policy quality on the FinOps environment using the same
    ACTION parser logic as training.

    Each step formats the environment's internal state into a prompt, greedily
    generates up to 80 new tokens, parses an action from the completion with
    ``extract_action_and_reasoning`` and applies it to the environment.

    Args:
        model: Generation model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        n_eval: Number of episodes to run.
        steps_per_episode: Maximum number of steps per episode.
        seed: Seed applied to the Python, NumPy and torch RNGs.

    Returns:
        A dict with keys "episodes", "win_rate", "avg_savings_pct",
        "avg_episode_len", "safety_violation_rate" (violations per step),
        "action_distribution" (share of each action, keyed by its string
        form) and "avg_reward_components" (per-step mean of each component
        reported under ``info["reward_components"]``).
    """
    import random
    import torch
    from cloud_arena.llm_environment import SB3Adapter
    from cloud_arena.llm_training import extract_action_and_reasoning, format_prompt

    random.seed(seed)
    np.random.seed(seed)
    # Seed the torch CPU generator too; previously only CUDA was seeded.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    env = SB3Adapter()

    # Pre-seed actions "0".."4" so they are always reported, even when unused.
    action_counts = {str(i): 0 for i in range(5)}
    wins = 0
    total_savings = 0.0
    total_steps = 0
    total_safety_violations = 0
    reward_components_sum = {}
    total_component_steps = 0

    for _ in range(n_eval):
        env.reset()
        done = False
        step_count = 0
        last_info = {}
        while not done and step_count < steps_per_episode:
            state_dict = env.core._get_internal_state()
            prompt = format_prompt(state_dict)
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            input_ids = inputs["input_ids"].to(model.device)
            attn_mask = inputs["attention_mask"].to(model.device)
            with torch.no_grad():
                out = model.generate(
                    input_ids=input_ids,
                    attention_mask=attn_mask,
                    max_new_tokens=80,
                    do_sample=False,
                    pad_token_id=tokenizer.pad_token_id,
                )
            # Decode only the newly generated tokens, not the echoed prompt.
            response = tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True)
            action, _ = extract_action_and_reasoning(response)

            # Tolerate an action outside the pre-seeded keys instead of
            # aborting the whole evaluation with a KeyError.
            action_key = str(action)
            action_counts[action_key] = action_counts.get(action_key, 0) + 1

            _, _, terminated, truncated, info = env.step(action)
            done = bool(terminated or truncated)
            step_count += 1
            last_info = info

            total_safety_violations += int(info.get("safety_violation", 0))
            for name, value in info.get("reward_components", {}).items():
                reward_components_sum[name] = reward_components_sum.get(name, 0.0) + float(value)
            total_component_steps += 1

        # Episode-level outcomes come from the info dict of the last step taken.
        wins += int(last_info.get("win", False))
        total_savings += float(last_info.get("savings_pct", 0.0))
        total_steps += step_count

    # max(..., 1) keeps every ratio defined when nothing was run.
    episodes = max(n_eval, 1)
    total_actions = max(sum(action_counts.values()), 1)
    component_steps = max(total_component_steps, 1)

    return {
        "episodes": n_eval,
        "win_rate": round(wins / episodes, 4),
        "avg_savings_pct": round(total_savings / episodes, 3),
        "avg_episode_len": round(total_steps / episodes, 3),
        "safety_violation_rate": round(total_safety_violations / max(total_steps, 1), 4),
        "action_distribution": {
            key: round(count / total_actions, 4) for key, count in action_counts.items()
        },
        "avg_reward_components": {
            name: round(total / component_steps, 4)
            for name, total in reward_components_sum.items()
        },
    }