"""
scripts/eval.py
---------------
Standalone evaluation runner for the LifeStack environment.

Runs N episodes with a random-action baseline (no model / GPU required) and
prints a summary table plus aggregate statistics.

Usage:
    python scripts/eval.py
    python scripts/eval.py --episodes 20
    python scripts/eval.py --episodes 20 --domain flight_crisis --verbose
"""
|
|
import argparse
import os
import random
import sys

# Put the parent of this script's directory at the front of sys.path before
# the project imports below run, so that `core` and `agent` can be resolved
# when this file is executed directly as a script.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(__file__), ".."),
)

from core.lifestack_env import LifeStackEnv, LifeStackAction  # noqa: E402
from agent.conflict_generator import TaskGenerator  # noqa: E402


# Action kinds the random baseline picks from (uniformly) on every step.
_ACTION_TYPES = [
    "execute",
    "inspect",
    "plan",
    "wait",
    "communicate",
    "spend",
    "delegate",
]

# Fallback route ids used as "execute" targets when the current task does not
# expose any viable routes of its own.
_KNOWN_ROUTE_IDS = [
    "rebook_premium",
    "wait_lounge",
    "revert_commit",
    "hotfix",
]
|
|
|
|
def _random_action(task) -> LifeStackAction:
    """Build one randomly parameterised action for the random baseline.

    The action kind is drawn from ``_ACTION_TYPES``.  Depending on that kind:

    * ``execute`` targets a route id, taken from ``task.viable_routes`` when
      the task has any and from ``_KNOWN_ROUTE_IDS`` otherwise;
    * ``inspect`` targets a key of ``task.hidden_state`` when available and
      ``"lounge_capacity"`` otherwise;
    * every other kind has no target.

    ``execute``, ``plan`` and ``communicate`` additionally carry a single
    random metric delta in [-10, 10]; every kind except ``wait`` carries a
    random time/money/energy cost.

    Args:
        task: The current task, or a falsy value when none is available.

    Returns:
        A ``LifeStackAction`` with ``actions_taken=1`` and the reasoning
        string ``"random baseline"``.
    """
    kind = random.choice(_ACTION_TYPES)

    # --- target selection -------------------------------------------------
    chosen_target = None
    if kind == "execute":
        if task and task.viable_routes:
            candidates = [route.id for route in task.viable_routes]
        else:
            candidates = _KNOWN_ROUTE_IDS
        chosen_target = random.choice(candidates)
    elif kind == "inspect":
        chosen_target = "lounge_capacity"
        if task and task.hidden_state:
            chosen_target = random.choice(list(task.hidden_state.keys()))

    # --- metric delta (only for kinds that act on the life metrics) --------
    deltas: dict = {}
    if kind in {"execute", "plan", "communicate"}:
        area = random.choice(
            ["career", "finances", "relationships", "physical_health", "mental_wellbeing", "time"]
        )
        field = random.choice(
            ["workload", "stress_level", "liquidity", "sleep_quality", "energy", "free_hours_per_week"]
        )
        magnitude = random.uniform(-10.0, 10.0)
        deltas[f"{area}.{field}"] = magnitude

    # --- resource cost (waiting is free) ------------------------------------
    if kind == "wait":
        costs: dict = {}
    else:
        costs = {
            "time": random.uniform(0.0, 2.0),
            "money": random.uniform(0.0, 50.0),
            "energy": random.uniform(0.0, 10.0),
        }

    return LifeStackAction(
        action_type=kind,
        target=chosen_target,
        metric_changes=deltas,
        resource_cost=costs,
        actions_taken=1,
        reasoning="random baseline",
    )
|
|
|
|
def _row(ep_id: int, total_reward: float, steps: int, domain: str, success: bool) -> str:
    """Format one row of the per-episode summary table.

    Args:
        ep_id: Episode number, right-aligned in 4 columns.
        total_reward: Episode return, right-aligned in 12 columns with four
            decimal places.
        steps: Number of steps taken, right-aligned in 6 columns.
        domain: Task domain label, left-aligned in 20 columns.
        success: Whether the episode was reported as successful.

    Returns:
        The formatted row; the last column is a check mark for success and a
        cross mark for failure.
    """
    # The two outcomes must render differently.  The glyphs are written as
    # escapes so they cannot be damaged if this file is re-encoded.
    success_str = "\u2713" if success else "\u2717"
    return (
        f" {ep_id:>4} "
        f"{total_reward:>12.4f} "
        f"{steps:>6} "
        f"{domain:<20} "
        f"{success_str:>7}"
    )
|
|
def run_eval(n_episodes: int, domain: str | None, verbose: bool) -> None:
    """Run the random-action baseline and print a results table plus totals.

    For each episode a task is generated, the environment is reset with it,
    and random actions are stepped until the observation reports ``done``.
    One table row is printed per episode, followed by the episode count, mean
    reward, success rate and mean step count.

    Args:
        n_episodes: Number of episodes to play.  Values below 1 play nothing
            and report zeroed aggregates.
        domain: Passed through to ``TaskGenerator.generate(domain=...)``;
            ``None`` leaves the choice to the generator.
        verbose: When true, also print one line per environment step.
    """
    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    # Horizontal rule glyph, written as an escape so it cannot be damaged if
    # this file is re-encoded.
    rule = "\u2500"

    header = (
        f"\n {'EP':>4} {'TOTAL REWARD':>12} {'STEPS':>6} {'DOMAIN':<20} {'SUCCESS':>7}\n"
        f" {rule * 4} {rule * 12} {rule * 6} {rule * 20} {rule * 7}"
    )
    print(header)

    for ep in range(1, n_episodes + 1):
        task = generator.generate(domain=domain)
        obs = env.reset(task=task, episode_id=str(ep))

        total_reward = 0.0
        steps = 0
        success = False

        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # A step may come back without a reward; count that as zero.
            reward = obs.reward or 0.0
            total_reward += reward
            steps += 1

            if verbose:
                print(
                    f" step={steps:>3} reward={reward:+.3f} "
                    f"action={action.action_type:<12} "
                    f"target={str(action.target):<20} "
                    f"done={obs.done}"
                )

            # Latch success as soon as any step's metadata reports it.
            if obs.metadata.get("success"):
                success = True

        task_domain = task.domain if task else "unknown"
        results.append(
            {
                "episode": ep,
                "total_reward": total_reward,
                "steps": steps,
                "domain": task_domain,
                "success": success,
            }
        )

        print(_row(ep, total_reward, steps, task_domain, success))

    # Aggregates; every division is guarded so an empty run reports zeros.
    n = len(results)
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    mean_steps = sum(r["steps"] for r in results) / n if n else 0.0

    print(
        f"\n {rule * 60}\n"
        f" Episodes : {n}\n"
        f" Mean Reward : {mean_reward:.4f}\n"
        f" Success Rate : {success_rate:.1%}\n"
        f" Mean Steps : {mean_steps:.1f}\n"
    )


# Second name bound to the same function as run_eval (presumably kept for
# callers that import the longer spelling).
run_evaluation = run_eval
|
|
def run_holdout_eval(n_episodes: int = 10, verbose: bool = False) -> dict:
    """Run the random baseline on the holdout task list and report aggregates.

    Task configs are read from ``../data/holdout_tasks.json`` relative to this
    file.  The file is consumed as a JSON list of objects, each of which may
    carry ``id``, ``seed`` (default 9000) and ``domain`` (default
    ``"flight_crisis"``).  When the file does not exist, ``n_episodes``
    synthetic configs with seeds 9000, 9001, ... are used instead.

    Args:
        n_episodes: Maximum number of holdout configs to run; only the first
            ``n_episodes`` entries are played.  Negative values run nothing.
        verbose: When true, print one line per environment step.

    Returns:
        A dict with ``mean_reward``, ``success_rate`` and ``results`` (one
        dict per episode holding ``id``, ``total_reward``, ``steps`` and
        ``success``).
    """
    import json as _json

    holdout_path = os.path.join(os.path.dirname(__file__), "..", "data", "holdout_tasks.json")
    try:
        with open(holdout_path, encoding="utf-8") as fh:
            holdout_configs = _json.load(fh)
    except FileNotFoundError:
        print(f"[holdout] No holdout file at {holdout_path}; falling back to random tasks.")
        holdout_configs = [{"id": f"fallback_{i}", "seed": 9000 + i} for i in range(n_episodes)]

    # Select up front so the banner reports the number of tasks actually run.
    selected = holdout_configs[: max(n_episodes, 0)]

    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    # Glyphs are written as escapes so they cannot be damaged if this file is
    # re-encoded.
    rule = "\u2500" * 60
    print(f"\n {rule}")
    print(f" HOLDOUT EVALUATION ({len(selected)} fixed tasks)")
    print(f" {rule}")

    for index, cfg in enumerate(selected):
        # Every field is optional, so a config without an id gets a
        # positional one instead of raising KeyError.
        task_id = cfg.get("id", f"holdout_{index}")
        seed = cfg.get("seed", 9000)
        domain = cfg.get("domain", "flight_crisis")

        total_reward = 0.0
        steps = 0
        success = False

        # Seed the global RNG per config so the baseline's action choices
        # (and anything else drawing from `random`) repeat between runs, then
        # restore the caller's RNG state so this has no outside effect.
        rng_state = random.getstate()
        random.seed(seed)
        try:
            task = generator.generate(domain=domain)
            obs = env.reset(task=task, seed=seed, episode_id=task_id)

            while not obs.done:
                action = _random_action(env.state.current_task)
                obs = env.step(action)
                # A step may come back without a reward; coerce once and use
                # the coerced value for both the total and the log line.
                reward = obs.reward or 0.0
                total_reward += reward
                steps += 1
                if verbose:
                    print(f" step={steps:>3} reward={reward:+.3f} action={action.action_type}")
                # Latch success as soon as any step's metadata reports it.
                if obs.metadata.get("success"):
                    success = True
        finally:
            random.setstate(rng_state)

        results.append({"id": task_id, "total_reward": total_reward, "steps": steps, "success": success})
        mark = "\u2713" if success else "\u2717"
        print(f" {task_id!s:<20} reward={total_reward:>8.4f} steps={steps:>4} {mark}")

    # Aggregates; guarded so an empty run reports zeros instead of dividing by 0.
    n = len(results)
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    print(f"\n Holdout Mean Reward : {mean_reward:.4f}")
    print(f" Holdout Success Rate : {success_rate:.1%}\n")
    return {"mean_reward": mean_reward, "success_rate": success_rate, "results": results}
|
|
def _parse_args() -> argparse.Namespace:
    """Read the evaluation runner's command-line flags from ``sys.argv``.

    Returns:
        A namespace with ``episodes`` (int, default 10), ``domain`` (str, or
        ``None`` when omitted) and ``verbose`` (bool, default ``False``).
    """
    cli = argparse.ArgumentParser(
        description="LifeStack environment evaluation runner (random baseline)."
    )

    # Each entry is (flag, keyword arguments for add_argument).
    flag_specs = (
        (
            "--episodes",
            {
                "type": int,
                "default": 10,
                "help": "Number of episodes to run (default: 10).",
            },
        ),
        (
            "--domain",
            {
                "type": str,
                "default": None,
                "help": (
                    "Optional domain filter passed to TaskGenerator.generate(). "
                    "Supported: 'flight_crisis', 'code_merge_crisis'. "
                    "Omit to cycle randomly."
                ),
            },
        ),
        (
            "--verbose",
            {
                "action": "store_true",
                "default": False,
                "help": "Print per-step details for every episode.",
            },
        ),
    )
    for flag, spec in flag_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the CLI flags, echo the run configuration,
    # then hand off to the evaluation loop.
    args = _parse_args()
    # The dash is written as an escape so it cannot be damaged if this file
    # is re-encoded.
    print(
        f"LifeStack Eval \u2014 episodes={args.episodes} "
        f"domain={args.domain or 'any'} "
        f"verbose={args.verbose}"
    )
    run_eval(n_episodes=args.episodes, domain=args.domain, verbose=args.verbose)
|
|