meta-r2 / scripts /eval.py
github-actions[bot]
Deploy Space snapshot
ddbc1ba
"""
scripts/eval.py
---------------
Standalone evaluation runner for the LifeStack environment.
Runs N episodes with a random-action baseline (no model / GPU required) and
prints a summary table plus aggregate statistics.
Usage:
python scripts/eval.py
python scripts/eval.py --episodes 20
python scripts/eval.py --episodes 20 --domain flight_crisis --verbose
"""
import argparse
import random
import sys
import os
# Allow running from repo root without installing the package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from core.lifestack_env import LifeStackEnv, LifeStackAction
from agent.conflict_generator import TaskGenerator
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# All action_types understood by the env's tool dispatch.
_ACTION_TYPES = ["execute", "inspect", "plan", "wait", "communicate", "spend", "delegate"]
# Known route IDs across the two TaskGenerator domains β€” used for targeted
# "execute" actions so we occasionally hit real routes.
_KNOWN_ROUTE_IDS = [
"rebook_premium", "wait_lounge", # flight_crisis
"revert_commit", "hotfix", # code_merge_crisis
]
def _random_action(task) -> LifeStackAction:
    """Build one randomly sampled LifeStackAction for the random baseline.

    The action type is drawn uniformly from ``_ACTION_TYPES``. Depending on
    the type drawn:

    * ``"execute"`` targets a route id taken from ``task.viable_routes`` when
      the task has any, otherwise one of ``_KNOWN_ROUTE_IDS``.
    * ``"inspect"`` targets a key of ``task.hidden_state`` when present,
      otherwise the literal ``"lounge_capacity"``.
    * every other type has no target (``None``).

    ``task`` may be ``None`` or falsy, in which case the fallbacks are used.
    """
    kind = random.choice(_ACTION_TYPES)

    # Choose a target only for the two action types that take one.
    chosen_target = None
    if kind == "execute":
        if task and task.viable_routes:
            candidates = [route.id for route in task.viable_routes]
        else:
            candidates = _KNOWN_ROUTE_IDS
        chosen_target = random.choice(candidates)
    elif kind == "inspect":
        if task and task.hidden_state:
            chosen_target = random.choice(list(task.hidden_state.keys()))
        else:
            chosen_target = "lounge_capacity"

    # A single small metric nudge keeps the episode from being trivial.
    # The "<domain>.<sub_key>" pair is sampled independently, so not every
    # combination necessarily names a metric the env knows about.
    nudges: dict = {}
    if kind in ("execute", "plan", "communicate"):
        area = random.choice(
            ["career", "finances", "relationships", "physical_health", "mental_wellbeing", "time"]
        )
        field = random.choice(
            ["workload", "stress_level", "liquidity", "sleep_quality", "energy", "free_hours_per_week"]
        )
        nudges[f"{area}.{field}"] = random.uniform(-10.0, 10.0)

    # Waiting is free; everything else costs some time, money and energy.
    if kind == "wait":
        costs: dict = {}
    else:
        costs = {
            "time": random.uniform(0.0, 2.0),
            "money": random.uniform(0.0, 50.0),
            "energy": random.uniform(0.0, 10.0),
        }

    return LifeStackAction(
        action_type=kind,
        target=chosen_target,
        metric_changes=nudges,
        resource_cost=costs,
        actions_taken=1,
        reasoning="random baseline",
    )
def _row(ep_id: int, total_reward: float, steps: int, domain: str, success: bool) -> str:
"""Format one summary table row."""
success_str = "βœ“" if success else "βœ—"
return (
f" {ep_id:>4} "
f"{total_reward:>12.4f} "
f"{steps:>6} "
f"{domain:<20} "
f"{success_str:>7}"
)
# ---------------------------------------------------------------------------
# Core evaluation loop
# ---------------------------------------------------------------------------
def run_eval(n_episodes: int, domain: str | None, verbose: bool) -> None:
    """Run random-baseline episodes and print a per-episode table plus totals.

    For each episode a task is generated, the environment is reset with it,
    and random actions are stepped until the observation reports ``done``.
    An episode counts as successful when the final observation's
    ``metadata["success"]`` is truthy.

    Args:
        n_episodes: Number of episodes to run; zero or negative runs none.
        domain: Forwarded to ``TaskGenerator.generate(domain=...)``.
            ``None`` leaves the choice to the generator.
        verbose: When true, print one line per environment step.

    Returns:
        None. All results are written to stdout.
    """
    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    # Box-drawing rule character. The widths below must match the column
    # widths used in the header and in _row (4 / 12 / 6 / 20 / 7).
    rule = "─"
    header = (
        f"\n {'EP':>4} {'TOTAL REWARD':>12} {'STEPS':>6} {'DOMAIN':<20} {'SUCCESS':>7}\n"
        f" {rule * 4} {rule * 12} {rule * 6} {rule * 20} {rule * 7}"
    )
    print(header)

    for ep in range(1, n_episodes + 1):
        # Generate task (optionally filtered by domain).
        task = generator.generate(domain=domain)
        obs = env.reset(task=task, episode_id=str(ep))

        total_reward = 0.0
        steps = 0
        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # A missing (None) reward counts as zero.
            reward = obs.reward or 0.0
            total_reward += reward
            steps += 1
            if verbose:
                print(
                    f" step={steps:>3} reward={reward:+.3f} "
                    f"action={action.action_type:<12} "
                    f"target={str(action.target):<20} "
                    f"done={obs.done}"
                )

        # Success is judged from the terminal observation only.
        success = bool(obs.metadata.get("success"))
        task_domain = task.domain if task else "unknown"
        results.append(
            {
                "episode": ep,
                "total_reward": total_reward,
                "steps": steps,
                "domain": task_domain,
                "success": success,
            }
        )
        print(_row(ep, total_reward, steps, task_domain, success))

    # -----------------------------------------------------------------------
    # Aggregate stats
    # -----------------------------------------------------------------------
    n = len(results)
    # Guard every mean against n == 0 (no episodes requested).
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    mean_steps = sum(r["steps"] for r in results) / n if n else 0.0
    print(
        f"\n {rule * 60}\n"
        f" Episodes : {n}\n"
        f" Mean Reward : {mean_reward:.4f}\n"
        f" Success Rate : {success_rate:.1%}\n"
        f" Mean Steps : {mean_steps:.1f}\n"
    )
# Alternate public name for run_eval; per the original note, train_trl.py
# refers to the function as `run_evaluation`, so keep this binding in place.
run_evaluation = run_eval
# ---------------------------------------------------------------------------
# Holdout evaluation — fixed task seeds not used during training
# ---------------------------------------------------------------------------
def run_holdout_eval(n_episodes: int = 10, verbose: bool = False) -> dict:
    """Run the random baseline on the holdout task list and summarise it.

    Task configs are read from ``data/holdout_tasks.json`` one directory
    above this script. Each config may carry ``id``, ``seed`` (default 9000)
    and ``domain`` (default ``"flight_crisis"``). When the file does not
    exist, ``n_episodes`` fallback configs with seeds 9000, 9001, ... are
    generated instead.

    Args:
        n_episodes: Maximum number of holdout configs to run; zero or
            negative runs none.
        verbose: When true, print one line per environment step.

    Returns:
        A dict with ``mean_reward`` (float), ``success_rate`` (float in
        [0, 1]) and ``results`` (one dict per episode with ``id``,
        ``total_reward``, ``steps`` and ``success``).
    """
    import json as _json

    holdout_path = os.path.join(os.path.dirname(__file__), "..", "data", "holdout_tasks.json")
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(holdout_path, encoding="utf-8") as fh:
            holdout_configs = _json.load(fh)
    except FileNotFoundError:
        print(f"[holdout] No holdout file at {holdout_path}; falling back to random tasks.")
        holdout_configs = [{"id": f"fallback_{i}", "seed": 9000 + i} for i in range(n_episodes)]

    # Clamp so a negative n_episodes selects nothing instead of slicing
    # "all but the last k" configs.
    selected = holdout_configs[: max(n_episodes, 0)]

    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    rule = "─" * 60
    print(f"\n {rule}")
    # Report the number of tasks actually run, not the size of the file.
    print(f" HOLDOUT EVALUATION ({len(selected)} fixed tasks)")
    print(f" {rule}")

    for index, cfg in enumerate(selected):
        seed = cfg.get("seed", 9000)
        domain = cfg.get("domain", "flight_crisis")
        # "id" is optional like the other keys; fall back to a positional id.
        task_id = cfg.get("id", f"holdout_{index}")

        # NOTE(review): the seed is only handed to env.reset(). Task
        # generation receives no seed and _random_action draws from the
        # global `random` module, which is not seeded here -- confirm that
        # this gives the reproducibility the holdout set is meant to have.
        task = generator.generate(domain=domain)
        obs = env.reset(task=task, seed=seed, episode_id=task_id)

        total_reward = 0.0
        steps = 0
        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # A missing (None) reward counts as zero; use the coalesced
            # value for printing too, since None cannot be formatted as a float.
            reward = obs.reward or 0.0
            total_reward += reward
            steps += 1
            if verbose:
                print(f" step={steps:>3} reward={reward:+.3f} action={action.action_type}")

        # Success is judged from the terminal observation only.
        success = bool(obs.metadata.get("success"))
        results.append({"id": task_id, "total_reward": total_reward, "steps": steps, "success": success})
        print(f" {task_id:<20} reward={total_reward:>8.4f} steps={steps:>4} {'✓' if success else '✗'}")

    n = len(results)
    # Guard both means against n == 0 (no configs selected).
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    print(f"\n Holdout Mean Reward : {mean_reward:.4f}")
    print(f" Holdout Success Rate : {success_rate:.1%}\n")
    return {"mean_reward": mean_reward, "success_rate": success_rate, "results": results}
# ---------------------------------------------------------------------------
# CLI entry-point
# ---------------------------------------------------------------------------
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="LifeStack environment evaluation runner (random baseline)."
)
parser.add_argument(
"--episodes",
type=int,
default=10,
help="Number of episodes to run (default: 10).",
)
parser.add_argument(
"--domain",
type=str,
default=None,
help=(
"Optional domain filter passed to TaskGenerator.generate(). "
"Supported: 'flight_crisis', 'code_merge_crisis'. "
"Omit to cycle randomly."
),
)
parser.add_argument(
"--verbose",
action="store_true",
default=False,
help="Print per-step details for every episode.",
)
return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: parse the CLI flags, echo the effective
    # configuration, then run the random-baseline evaluation.
    args = _parse_args()
    print(
        f"LifeStack Eval — episodes={args.episodes} "
        f"domain={args.domain or 'any'} "
        f"verbose={args.verbose}"
    )
    run_eval(n_episodes=args.episodes, domain=args.domain, verbose=args.verbose)