Spaces:

jdsb06
/

meta-r2

Sleeping

File size: 9,374 Bytes

ddbc1ba

"""
scripts/eval.py
---------------
Standalone evaluation runner for the LifeStack environment.

Runs N episodes with a random-action baseline (no model / GPU required) and
prints a summary table plus aggregate statistics.

Usage:
    python scripts/eval.py
    python scripts/eval.py --episodes 20
    python scripts/eval.py --episodes 20 --domain flight_crisis --verbose
"""

import argparse
import random
import sys
import os

# Allow running from repo root without installing the package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from core.lifestack_env import LifeStackEnv, LifeStackAction
from agent.conflict_generator import TaskGenerator

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# All action_types understood by the env's tool dispatch.
_ACTION_TYPES = ["execute", "inspect", "plan", "wait", "communicate", "spend", "delegate"]

# Known route IDs across the two TaskGenerator domains — used for targeted
# "execute" actions so we occasionally hit real routes.
_KNOWN_ROUTE_IDS = [
    "rebook_premium", "wait_lounge",        # flight_crisis
    "revert_commit", "hotfix",              # code_merge_crisis
]


def _random_action(task) -> LifeStackAction:
    """Build one randomly parameterised LifeStackAction for the baseline.

    Args:
        task: The current task, or a falsy value. Only its ``viable_routes``
            (objects with an ``id``) and ``hidden_state`` (a mapping) are read.

    Returns:
        A LifeStackAction whose type is drawn uniformly from ``_ACTION_TYPES``.
    """
    kind = random.choice(_ACTION_TYPES)

    # Only "execute" and "inspect" carry a target; everything else gets None.
    if kind == "execute":
        # Prefer the task's own routes; otherwise use the static fallback pool.
        if task and task.viable_routes:
            candidates = [route.id for route in task.viable_routes]
        else:
            candidates = _KNOWN_ROUTE_IDS
        target = random.choice(candidates)
    elif kind == "inspect":
        # Probe one of the task's hidden-state keys, or a default key.
        if task and task.hidden_state:
            target = random.choice(list(task.hidden_state.keys()))
        else:
            target = "lounge_capacity"
    else:
        target = None

    # A single small metric nudge, only for the action types listed here.
    nudges: dict = {}
    if kind in ("execute", "plan", "communicate"):
        area = random.choice(
            ["career", "finances", "relationships", "physical_health", "mental_wellbeing", "time"]
        )
        field = random.choice(
            ["workload", "stress_level", "liquidity", "sleep_quality", "energy", "free_hours_per_week"]
        )
        delta = random.uniform(-10.0, 10.0)
        nudges[f"{area}.{field}"] = delta

    # Every action except "wait" is charged a random resource cost.
    if kind == "wait":
        costs: dict = {}
    else:
        costs = {
            "time": random.uniform(0.0, 2.0),
            "money": random.uniform(0.0, 50.0),
            "energy": random.uniform(0.0, 10.0),
        }

    return LifeStackAction(
        action_type=kind,
        target=target,
        metric_changes=nudges,
        resource_cost=costs,
        actions_taken=1,
        reasoning="random baseline",
    )


def _row(ep_id: int, total_reward: float, steps: int, domain: str, success: bool) -> str:
    """Format one summary table row."""
    success_str = "✓" if success else "✗"
    return (
        f"  {ep_id:>4}  "
        f"{total_reward:>12.4f}  "
        f"{steps:>6}  "
        f"{domain:<20}  "
        f"{success_str:>7}"
    )


# ---------------------------------------------------------------------------
# Core evaluation loop
# ---------------------------------------------------------------------------

def run_eval(n_episodes: int, domain: str | None, verbose: bool) -> None:
    """Run the random-action baseline and print a per-episode table plus totals.

    Args:
        n_episodes: Number of episodes to run.
        domain: Value forwarded as ``domain=`` to ``TaskGenerator.generate``;
            ``None`` is forwarded unchanged.
        verbose: When true, print one line per environment step.

    Returns:
        None. All results are written to stdout.
    """
    task_source = TaskGenerator()
    environment = LifeStackEnv()

    records = []

    # (label, alignment+width) for each table column; the rule under each
    # label is as wide as the column.
    columns = [
        ("EP", ">", 4),
        ("TOTAL REWARD", ">", 12),
        ("STEPS", ">", 6),
        ("DOMAIN", "<", 20),
        ("SUCCESS", ">", 7),
    ]
    title_line = "  ".join(format(label, f"{align}{width}") for label, align, width in columns)
    rule_line = "  ".join("─" * width for _label, _align, width in columns)
    print("\n  " + title_line + "\n  " + rule_line)

    for episode_no in range(1, n_episodes + 1):
        # A fresh task per episode, optionally restricted to one domain.
        task = task_source.generate(domain=domain)
        obs = environment.reset(task=task, episode_id=str(episode_no))

        reward_sum = 0.0
        step_count = 0
        solved = False

        while not obs.done:
            action = _random_action(environment.state.current_task)
            obs = environment.step(action)
            # A missing (None) reward counts as zero.
            step_reward = obs.reward or 0.0
            reward_sum += step_reward
            step_count += 1

            if verbose:
                print(
                    f"    step={step_count:>3}  reward={step_reward:+.3f}  "
                    f"action={action.action_type:<12}  "
                    f"target={str(action.target):<20}  "
                    f"done={obs.done}"
                )

            # Success is sticky: once any step reports it, the episode counts.
            if obs.metadata.get("success"):
                solved = True

        task_domain = task.domain if task else "unknown"
        records.append(
            {
                "episode": episode_no,
                "total_reward": reward_sum,
                "steps": step_count,
                "domain": task_domain,
                "success": solved,
            }
        )

        print(_row(episode_no, reward_sum, step_count, task_domain, solved))

    # -----------------------------------------------------------------------
    # Aggregate stats
    # -----------------------------------------------------------------------
    n = len(records)
    if n:
        mean_reward = sum(rec["total_reward"] for rec in records) / n
        success_rate = sum(1 for rec in records if rec["success"]) / n
        mean_steps = sum(rec["steps"] for rec in records) / n
    else:
        # No episodes ran: report zeros instead of dividing by zero.
        mean_reward = 0.0
        success_rate = 0.0
        mean_steps = 0.0

    summary_lines = [
        "",
        f"  {'─'*60}",
        f"  Episodes     : {n}",
        f"  Mean Reward  : {mean_reward:.4f}",
        f"  Success Rate : {success_rate:.1%}",
        f"  Mean Steps   : {mean_steps:.1f}",
        "",
    ]
    print("\n".join(summary_lines))


# Alias kept so that train_trl.py can refer to run_eval under the name
# ``run_evaluation``; both names are bound to the same function object.
run_evaluation = run_eval


# ---------------------------------------------------------------------------
# Holdout evaluation — fixed task seeds not used during training
# ---------------------------------------------------------------------------

def run_holdout_eval(n_episodes: int = 10, verbose: bool = False) -> dict:
    """Run the random baseline on a fixed holdout set and report aggregates.

    Task configs are loaded from ``../data/holdout_tasks.json`` relative to
    this script. The file is expected to hold a JSON list of objects; the keys
    read from each object are ``"id"``, ``"seed"`` (default 9000) and
    ``"domain"`` (default ``"flight_crisis"``). If the file does not exist,
    ``n_episodes`` fallback configs with seeds ``9000 + i`` are used instead.

    Args:
        n_episodes: Maximum number of holdout configs to evaluate.
        verbose: When true, print one line per environment step.

    Returns:
        A dict with ``"mean_reward"``, ``"success_rate"`` and ``"results"``
        (one dict per episode with ``id``, ``total_reward``, ``steps`` and
        ``success``). Both aggregates are 0.0 when no episode ran.
    """
    import json as _json

    holdout_path = os.path.join(os.path.dirname(__file__), "..", "data", "holdout_tasks.json")
    try:
        # Explicit encoding so the JSON is read the same way on every platform.
        with open(holdout_path, encoding="utf-8") as fh:
            holdout_configs = _json.load(fh)
    except FileNotFoundError:
        print(f"[holdout] No holdout file at {holdout_path}; falling back to random tasks.")
        holdout_configs = [{"id": f"fallback_{i}", "seed": 9000 + i} for i in range(n_episodes)]

    # Slice once so the banner reports the number of tasks actually evaluated,
    # not the size of the whole holdout file.
    selected_configs = holdout_configs[:n_episodes]

    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    print(f"\n  {'─'*60}")
    print(f"  HOLDOUT EVALUATION ({len(selected_configs)} fixed tasks)")
    print(f"  {'─'*60}")

    for index, cfg in enumerate(selected_configs):
        # "id" gets a positional default, matching the lenient handling of
        # "seed" and "domain", so one incomplete entry does not abort the run.
        task_id = cfg.get("id", f"holdout_{index}")
        seed = cfg.get("seed", 9000)
        domain = cfg.get("domain", "flight_crisis")
        # NOTE(review): the seed is only forwarded to env.reset(); task
        # generation and the random actions below are not seeded here, so
        # runs may not be fully reproducible — confirm this is intended.
        task = generator.generate(domain=domain)

        obs = env.reset(task=task, seed=seed, episode_id=task_id)
        total_reward = 0.0
        steps = 0
        success = False

        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # A missing (None) reward counts as zero; the guarded value is also
            # what gets formatted below, since None cannot take ":+.3f".
            step_reward = obs.reward or 0.0
            total_reward += step_reward
            steps += 1
            if verbose:
                print(f"    step={steps:>3}  reward={step_reward:+.3f}  action={action.action_type}")
            if obs.metadata.get("success"):
                success = True

        results.append({"id": task_id, "total_reward": total_reward, "steps": steps, "success": success})
        print(f"  {task_id:<20}  reward={total_reward:>8.4f}  steps={steps:>4}  {'✓' if success else '✗'}")

    n = len(results)
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    print(f"\n  Holdout Mean Reward  : {mean_reward:.4f}")
    print(f"  Holdout Success Rate : {success_rate:.1%}\n")
    return {"mean_reward": mean_reward, "success_rate": success_rate, "results": results}


# ---------------------------------------------------------------------------
# CLI entry-point
# ---------------------------------------------------------------------------

def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="LifeStack environment evaluation runner (random baseline)."
    )
    parser.add_argument(
        "--episodes",
        type=int,
        default=10,
        help="Number of episodes to run (default: 10).",
    )
    parser.add_argument(
        "--domain",
        type=str,
        default=None,
        help=(
            "Optional domain filter passed to TaskGenerator.generate(). "
            "Supported: 'flight_crisis', 'code_merge_crisis'. "
            "Omit to cycle randomly."
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Print per-step details for every episode.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: echo the effective settings, then run the baseline.
    args = _parse_args()
    domain_label = args.domain or "any"
    banner = (
        f"LifeStack Eval — episodes={args.episodes}  "
        f"domain={domain_label}  "
        f"verbose={args.verbose}"
    )
    print(banner)
    run_eval(
        n_episodes=args.episodes,
        domain=args.domain,
        verbose=args.verbose,
    )