"""
scripts/eval.py
---------------
Standalone evaluation runner for the LifeStack environment.

Runs N episodes with a random-action baseline (no model / GPU required) and
prints a summary table plus aggregate statistics.

Usage:
    python scripts/eval.py
    python scripts/eval.py --episodes 20
    python scripts/eval.py --episodes 20 --domain flight_crisis --verbose
"""
import argparse
import random
import sys
import os
# Allow running from repo root without installing the package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from core.lifestack_env import LifeStackEnv, LifeStackAction
from agent.conflict_generator import TaskGenerator
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# All action_types understood by the env's tool dispatch; _random_action
# draws one of these uniformly at random for every step.
_ACTION_TYPES = ["execute", "inspect", "plan", "wait", "communicate", "spend", "delegate"]
# Known route IDs across the two TaskGenerator domains, used for targeted
# "execute" actions so we occasionally hit real routes. _random_action falls
# back to this list when the task exposes no viable routes of its own.
_KNOWN_ROUTE_IDS = [
"rebook_premium", "wait_lounge", # flight_crisis
"revert_commit", "hotfix", # code_merge_crisis
]
def _random_action(task) -> LifeStackAction:
    """Build one randomly chosen ``LifeStackAction`` for the baseline policy.

    The action type is drawn uniformly from ``_ACTION_TYPES``. Only two types
    receive a target:

    * ``"execute"`` targets a route id, taken from ``task.viable_routes`` when
      the task has any and from ``_KNOWN_ROUTE_IDS`` otherwise.
    * ``"inspect"`` targets a key of ``task.hidden_state``, or the literal
      ``"lounge_capacity"`` when the task has no hidden state.

    Args:
        task: The current task, or a falsy value when none is available. Only
            its ``viable_routes`` and ``hidden_state`` attributes are read.

    Returns:
        A ``LifeStackAction`` with ``actions_taken=1`` and the reasoning text
        ``"random baseline"``.
    """
    kind = random.choice(_ACTION_TYPES)

    # Resolve the target first; every other action type stays untargeted.
    if kind == "execute":
        if task and task.viable_routes:
            candidates = [route.id for route in task.viable_routes]
        else:
            candidates = _KNOWN_ROUTE_IDS
        chosen_target = random.choice(candidates)
    elif kind == "inspect":
        if task and task.hidden_state:
            chosen_target = random.choice(list(task.hidden_state.keys()))
        else:
            chosen_target = "lounge_capacity"
    else:
        chosen_target = None

    # A single small nudge on a "<domain>.<sub_key>" path keeps the episode
    # non-trivial. The two halves of the path are drawn independently.
    nudges: dict = {}
    if kind in ("execute", "plan", "communicate"):
        area = random.choice(
            ["career", "finances", "relationships", "physical_health", "mental_wellbeing", "time"]
        )
        field = random.choice(
            ["workload", "stress_level", "liquidity", "sleep_quality", "energy", "free_hours_per_week"]
        )
        nudges[f"{area}.{field}"] = random.uniform(-10.0, 10.0)

    # "wait" is the only action type that consumes no resources.
    if kind == "wait":
        costs: dict = {}
    else:
        costs = {
            "time": random.uniform(0.0, 2.0),
            "money": random.uniform(0.0, 50.0),
            "energy": random.uniform(0.0, 10.0),
        }

    return LifeStackAction(
        action_type=kind,
        target=chosen_target,
        metric_changes=nudges,
        resource_cost=costs,
        actions_taken=1,
        reasoning="random baseline",
    )
def _row(ep_id: int, total_reward: float, steps: int, domain: str, success: bool) -> str:
"""Format one summary table row."""
success_str = "β" if success else "β"
return (
f" {ep_id:>4} "
f"{total_reward:>12.4f} "
f"{steps:>6} "
f"{domain:<20} "
f"{success_str:>7}"
)
# ---------------------------------------------------------------------------
# Core evaluation loop
# ---------------------------------------------------------------------------
def run_eval(n_episodes: int, domain: str | None, verbose: bool) -> None:
    """Run random-baseline episodes and print a per-episode table plus totals.

    For each episode a task is generated, the environment is reset with it,
    and actions from ``_random_action`` are stepped until the observation
    reports ``done``. An episode counts as successful when any step's
    ``obs.metadata`` carries a truthy ``"success"`` entry.

    Args:
        n_episodes: Number of episodes to run. Values below 1 run nothing and
            the aggregate statistics are reported as zero.
        domain: Forwarded unchanged as ``TaskGenerator.generate(domain=...)``;
            ``None`` applies no filter on this side.
        verbose: When true, also print one line per environment step.

    Returns:
        None. All results are written to stdout.
    """
    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    # Box-drawing rule character for the table separators. Kept as an escape
    # so a re-encoding of this source file cannot garble it.
    rule = "\u2500"
    header = (
        f"\n {'EP':>4} {'TOTAL REWARD':>12} {'STEPS':>6} {'DOMAIN':<20} {'SUCCESS':>7}\n"
        f" {rule * 4} {rule * 12} {rule * 6} {rule * 20} {rule * 7}"
    )
    print(header)

    for ep in range(1, n_episodes + 1):
        # Generate task (optionally filtered by domain).
        task = generator.generate(domain=domain)
        obs = env.reset(task=task, episode_id=str(ep))
        total_reward = 0.0
        steps = 0
        success = False

        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # A missing (None) reward is counted as zero.
            reward = obs.reward or 0.0
            total_reward += reward
            steps += 1
            if verbose:
                print(
                    f" step={steps:>3} reward={reward:+.3f} "
                    f"action={action.action_type:<12} "
                    f"target={str(action.target):<20} "
                    f"done={obs.done}"
                )
            # Success is sticky: once any step reports it, the episode keeps it.
            if obs.metadata.get("success"):
                success = True

        task_domain = task.domain if task else "unknown"
        results.append(
            {
                "episode": ep,
                "total_reward": total_reward,
                "steps": steps,
                "domain": task_domain,
                "success": success,
            }
        )
        print(_row(ep, total_reward, steps, task_domain, success))

    # -----------------------------------------------------------------------
    # Aggregate stats
    # -----------------------------------------------------------------------
    n = len(results)
    # Guard every mean against n == 0 so an empty run reports zeros.
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    mean_steps = sum(r["steps"] for r in results) / n if n else 0.0
    print(
        f"\n {rule * 60}\n"
        f" Episodes : {n}\n"
        f" Mean Reward : {mean_reward:.4f}\n"
        f" Success Rate : {success_rate:.1%}\n"
        f" Mean Steps : {mean_steps:.1f}\n"
    )
# Alias used by train_trl.py; both names are bound to the same function object.
run_evaluation = run_eval
# ---------------------------------------------------------------------------
# Holdout evaluation: fixed task seeds not used during training
# ---------------------------------------------------------------------------
def run_holdout_eval(n_episodes: int = 10, verbose: bool = False) -> dict:
    """Run evaluation on a fixed holdout set for generalization measurement.

    Task configurations are read from ``data/holdout_tasks.json`` next to the
    ``scripts`` directory. When that file does not exist, ``n_episodes``
    fallback configurations with seeds ``9000, 9001, ...`` are used instead.
    At most ``n_episodes`` configurations are evaluated.

    Args:
        n_episodes: Maximum number of holdout configurations to run.
        verbose: When true, also print one line per environment step.

    Returns:
        A dict with ``"mean_reward"``, ``"success_rate"`` and ``"results"``;
        the last is a list of per-task dicts with ``"id"``, ``"total_reward"``,
        ``"steps"`` and ``"success"``.

    Raises:
        KeyError: If a configuration entry has no ``"id"`` key.
    """
    import json as _json

    holdout_path = os.path.join(os.path.dirname(__file__), "..", "data", "holdout_tasks.json")
    try:
        # Explicit encoding so the file reads the same on every platform.
        with open(holdout_path, encoding="utf-8") as fh:
            holdout_configs = _json.load(fh)
    except FileNotFoundError:
        print(f"[holdout] No holdout file at {holdout_path}; falling back to random tasks.")
        holdout_configs = [{"id": f"fallback_{i}", "seed": 9000 + i} for i in range(n_episodes)]

    # Only this slice is evaluated, so the banner reports its length rather
    # than the size of the whole file.
    selected = holdout_configs[:n_episodes]

    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    # Box-drawing rule character, kept as an escape so a re-encoding of this
    # source file cannot garble it.
    rule = "\u2500"
    print(f"\n {rule * 60}")
    print(f" HOLDOUT EVALUATION ({len(selected)} fixed tasks)")
    print(f" {rule * 60}")

    for cfg in selected:
        seed = cfg.get("seed", 9000)
        domain = cfg.get("domain", "flight_crisis")
        # NOTE(review): the seed is only handed to env.reset(); the task is
        # generated before that and _random_action draws from the global
        # ``random`` module. Whether runs are reproducible therefore depends
        # on TaskGenerator and LifeStackEnv; confirm.
        task = generator.generate(domain=domain)
        obs = env.reset(task=task, seed=seed, episode_id=cfg["id"])
        total_reward = 0.0
        steps = 0
        success = False

        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # A missing (None) reward counts as zero. The normalised value is
            # also what gets printed, because formatting None with "+.3f"
            # raises TypeError.
            reward = obs.reward or 0.0
            total_reward += reward
            steps += 1
            if verbose:
                print(f" step={steps:>3} reward={reward:+.3f} action={action.action_type}")
            # Success is sticky: once any step reports it, the episode keeps it.
            if obs.metadata.get("success"):
                success = True

        results.append({"id": cfg["id"], "total_reward": total_reward, "steps": steps, "success": success})
        # Distinct check / cross marks so the two outcomes can be told apart.
        mark = "\u2713" if success else "\u2717"
        print(f" {cfg['id']:<20} reward={total_reward:>8.4f} steps={steps:>4} {mark}")

    n = len(results)
    # Guard both means against n == 0 so an empty run reports zeros.
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    print(f"\n Holdout Mean Reward : {mean_reward:.4f}")
    print(f" Holdout Success Rate : {success_rate:.1%}\n")
    return {"mean_reward": mean_reward, "success_rate": success_rate, "results": results}
# ---------------------------------------------------------------------------
# CLI entry-point
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="LifeStack environment evaluation runner (random baseline)."
)
parser.add_argument(
"--episodes",
type=int,
default=10,
help="Number of episodes to run (default: 10).",
)
parser.add_argument(
"--domain",
type=str,
default=None,
help=(
"Optional domain filter passed to TaskGenerator.generate(). "
"Supported: 'flight_crisis', 'code_merge_crisis'. "
"Omit to cycle randomly."
),
)
parser.add_argument(
"--verbose",
action="store_true",
default=False,
help="Print per-step details for every episode.",
)
return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: parse the CLI options, echo the effective
    # configuration, then run the random-baseline evaluation.
    args = _parse_args()
    print(
        # The em dash is written as an escape so a re-encoding of this source
        # file cannot garble the banner.
        f"LifeStack Eval \u2014 episodes={args.episodes} "
        f"domain={args.domain or 'any'} "
        f"verbose={args.verbose}"
    )
    run_eval(n_episodes=args.episodes, domain=args.domain, verbose=args.verbose)