Spaces:

jdsb06
/

meta-r2

Sleeping

github-actions[bot]

Deploy Space snapshot

ddbc1ba about 1 month ago

9.37 kB

	"""
	scripts/eval.py
	---------------
	Standalone evaluation runner for the LifeStack environment.

	Runs N episodes with a random-action baseline (no model / GPU required) and
	prints a summary table plus aggregate statistics.

	Usage:
	python scripts/eval.py
	python scripts/eval.py --episodes 20
	python scripts/eval.py --episodes 20 --domain flight_crisis --verbose
	"""

	import argparse
	import random
	import sys
	import os

	# Allow running from repo root without installing the package.
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

	from core.lifestack_env import LifeStackEnv, LifeStackAction
	from agent.conflict_generator import TaskGenerator

	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------

	# Every action_type value the random baseline may emit; these are the
	# types understood by the env's tool dispatch.
	_ACTION_TYPES = [
	    "execute",
	    "inspect",
	    "plan",
	    "wait",
	    "communicate",
	    "spend",
	    "delegate",
	]

	# Route IDs known across the two TaskGenerator domains. Used as the fallback
	# pool for "execute" targets so the baseline occasionally hits a real route.
	_KNOWN_ROUTE_IDS = [
	    # flight_crisis
	    "rebook_premium",
	    "wait_lounge",
	    # code_merge_crisis
	    "revert_commit",
	    "hotfix",
	]


	def _random_action(task) -> LifeStackAction:
	    """Build one randomly chosen LifeStackAction for the random baseline.

	    Args:
	        task: The current task, or a falsy value. Its ``viable_routes``
	            (objects with an ``id``) and ``hidden_state`` (a mapping) are
	            used to pick targets when they are present and non-empty.

	    Returns:
	        A LifeStackAction with a random action type, an optional target,
	        at most one random metric change, and a random resource cost
	        (empty for "wait").
	    """
	    kind = random.choice(_ACTION_TYPES)

	    # Only "execute" and "inspect" carry a target; everything else keeps None.
	    chosen_target = None
	    if kind == "execute":
	        # Prefer the task's own routes; otherwise use the known route IDs.
	        if task and task.viable_routes:
	            candidates = [route.id for route in task.viable_routes]
	        else:
	            candidates = _KNOWN_ROUTE_IDS
	        chosen_target = random.choice(candidates)
	    elif kind == "inspect":
	        # Inspect a random hidden-state key, or a default when none exist.
	        if not (task and task.hidden_state):
	            chosen_target = "lounge_capacity"
	        else:
	            chosen_target = random.choice(list(task.hidden_state.keys()))

	    # A single small metric nudge, only for the three "active" action types.
	    # NOTE(review): area and field are drawn independently, so pairs such as
	    # "career.sleep_quality" can occur — confirm the env tolerates them.
	    nudges: dict = {}
	    if kind in ("execute", "plan", "communicate"):
	        area = random.choice(
	            ["career", "finances", "relationships", "physical_health", "mental_wellbeing", "time"]
	        )
	        field = random.choice(
	            ["workload", "stress_level", "liquidity", "sleep_quality", "energy", "free_hours_per_week"]
	        )
	        nudges[f"{area}.{field}"] = random.uniform(-10.0, 10.0)

	    # Waiting is the only action type that costs nothing.
	    cost: dict = {}
	    if kind != "wait":
	        time_cost = random.uniform(0.0, 2.0)
	        money_cost = random.uniform(0.0, 50.0)
	        energy_cost = random.uniform(0.0, 10.0)
	        cost = {"time": time_cost, "money": money_cost, "energy": energy_cost}

	    return LifeStackAction(
	        action_type=kind,
	        target=chosen_target,
	        metric_changes=nudges,
	        resource_cost=cost,
	        actions_taken=1,
	        reasoning="random baseline",
	    )


	def _row(ep_id: int, total_reward: float, steps: int, domain: str, success: bool) -> str:
	    """Render one episode as a single line of the summary table.

	    Columns, in order: episode id (right-aligned, width 4), total reward
	    (width 12, four decimals), step count (width 6), domain (left-aligned,
	    width 20) and a check/cross success mark (right-aligned, width 7).
	    """
	    mark = "✗"
	    if success:
	        mark = "✓"
	    cells = (
	        f" {ep_id:>4} ",
	        f"{total_reward:>12.4f} ",
	        f"{steps:>6} ",
	        f"{domain:<20} ",
	        f"{mark:>7}",
	    )
	    return "".join(cells)


	# ---------------------------------------------------------------------------
	# Core evaluation loop
	# ---------------------------------------------------------------------------

	def run_eval(n_episodes: int, domain: str | None, verbose: bool) -> None:
	    """Run random-baseline episodes and print a table plus aggregate stats.

	    Args:
	        n_episodes: Number of episodes to run; episodes are numbered from 1.
	        domain: Optional domain forwarded to ``TaskGenerator.generate``;
	            ``None`` is passed through unchanged.
	        verbose: When true, print one line per environment step.

	    Returns:
	        None. All results are written to stdout.
	    """
	    generator = TaskGenerator()
	    env = LifeStackEnv()

	    results = []

	    # Column widths here mirror the ones used by _row().
	    header = (
	        f"\n {'EP':>4} {'TOTAL REWARD':>12} {'STEPS':>6} {'DOMAIN':<20} {'SUCCESS':>7}\n"
	        f" {'─'*4} {'─'*12} {'─'*6} {'─'*20} {'─'*7}"
	    )
	    print(header)

	    for ep in range(1, n_episodes + 1):
	        # Generate task (optionally filtered by domain).
	        task = generator.generate(domain=domain)

	        obs = env.reset(task=task, episode_id=str(ep))

	        total_reward = 0.0
	        steps = 0
	        success = False

	        while not obs.done:
	            action = _random_action(env.state.current_task)
	            obs = env.step(action)
	            # A missing reward (None) counts as zero.
	            reward = obs.reward or 0.0
	            total_reward += reward
	            steps += 1

	            if verbose:
	                print(
	                    f" step={steps:>3} reward={reward:+.3f} "
	                    f"action={action.action_type:<12} "
	                    f"target={str(action.target):<20} "
	                    f"done={obs.done}"
	                )

	            # Checked after every step, so a success flag on any step
	            # (including the terminal one) marks the episode successful.
	            if obs.metadata.get("success"):
	                success = True

	        task_domain = task.domain if task else "unknown"
	        results.append(
	            {
	                "episode": ep,
	                "total_reward": total_reward,
	                "steps": steps,
	                "domain": task_domain,
	                "success": success,
	            }
	        )

	        print(_row(ep, total_reward, steps, task_domain, success))

	    # -----------------------------------------------------------------------
	    # Aggregate stats
	    # -----------------------------------------------------------------------
	    n = len(results)
	    # Guard every mean against n == 0 (e.g. n_episodes <= 0).
	    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
	    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
	    mean_steps = sum(r["steps"] for r in results) / n if n else 0.0

	    print(
	        f"\n {'─'*60}\n"
	        f" Episodes : {n}\n"
	        f" Mean Reward : {mean_reward:.4f}\n"
	        f" Success Rate : {success_rate:.1%}\n"
	        f" Mean Steps : {mean_steps:.1f}\n"
	    )


	# Alias of run_eval under the name run_evaluation, which is the name used
	# by train_trl.py.
	run_evaluation = run_eval


	# ---------------------------------------------------------------------------
	# Holdout evaluation — fixed task seeds not used during training
	# ---------------------------------------------------------------------------

	def run_holdout_eval(n_episodes: int = 10, verbose: bool = False) -> dict:
	    """Run the random baseline on a fixed holdout set and report the results.

	    Holdout configs are read from ``../data/holdout_tasks.json`` relative to
	    this file. If that file does not exist, ``n_episodes`` fallback configs
	    with seeds starting at 9000 are used instead.

	    Args:
	        n_episodes: Maximum number of holdout configs to evaluate.
	        verbose: When true, print one line per environment step.

	    Returns:
	        A dict with ``mean_reward``, ``success_rate`` and ``results`` (one
	        dict per episode with ``id``, ``total_reward``, ``steps``,
	        ``success``).

	    Raises:
	        KeyError: If a holdout config has no ``"id"`` entry.
	    """
	    import json as _json

	    holdout_path = os.path.join(os.path.dirname(__file__), "..", "data", "holdout_tasks.json")
	    try:
	        with open(holdout_path, encoding="utf-8") as fh:
	            holdout_configs = _json.load(fh)
	    except FileNotFoundError:
	        print(f"[holdout] No holdout file at {holdout_path}; falling back to random tasks.")
	        holdout_configs = [{"id": f"fallback_{i}", "seed": 9000 + i} for i in range(n_episodes)]

	    # Only the first n_episodes configs are evaluated; report that count in
	    # the banner rather than the size of the whole file.
	    selected = holdout_configs[:n_episodes]

	    generator = TaskGenerator()
	    env = LifeStackEnv()
	    results = []

	    print(f"\n {'─'*60}")
	    print(f" HOLDOUT EVALUATION ({len(selected)} fixed tasks)")
	    print(f" {'─'*60}")

	    for cfg in selected:
	        seed = cfg.get("seed", 9000)
	        domain = cfg.get("domain", "flight_crisis")
	        # NOTE(review): the seed is only handed to env.reset(); whether the
	        # generated task itself is reproducible depends on TaskGenerator —
	        # confirm.
	        task = generator.generate(domain=domain)

	        obs = env.reset(task=task, seed=seed, episode_id=cfg["id"])
	        total_reward = 0.0
	        steps = 0
	        success = False

	        while not obs.done:
	            action = _random_action(env.state.current_task)
	            obs = env.step(action)
	            # A missing reward (None) counts as zero; format the coalesced
	            # value below so verbose mode cannot fail on None.
	            reward = obs.reward or 0.0
	            total_reward += reward
	            steps += 1
	            if verbose:
	                print(f" step={steps:>3} reward={reward:+.3f} action={action.action_type}")
	            # Checked after every step, so a success flag on any step
	            # (including the terminal one) marks the episode successful.
	            if obs.metadata.get("success"):
	                success = True

	        results.append({"id": cfg["id"], "total_reward": total_reward, "steps": steps, "success": success})
	        print(f" {cfg['id']:<20} reward={total_reward:>8.4f} steps={steps:>4} {'✓' if success else '✗'}")

	    n = len(results)
	    # Guard both means against an empty result list.
	    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
	    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
	    print(f"\n Holdout Mean Reward : {mean_reward:.4f}")
	    print(f" Holdout Success Rate : {success_rate:.1%}\n")
	    return {"mean_reward": mean_reward, "success_rate": success_rate, "results": results}


	# ---------------------------------------------------------------------------
	# CLI entry-point
	# ---------------------------------------------------------------------------

	def _parse_args() -> argparse.Namespace:
	parser = argparse.ArgumentParser(
	description="LifeStack environment evaluation runner (random baseline)."
	)
	parser.add_argument(
	"--episodes",
	type=int,
	default=10,
	help="Number of episodes to run (default: 10).",
	)
	parser.add_argument(
	"--domain",
	type=str,
	default=None,
	help=(
	"Optional domain filter passed to TaskGenerator.generate(). "
	"Supported: 'flight_crisis', 'code_merge_crisis'. "
	"Omit to cycle randomly."
	),
	)
	parser.add_argument(
	"--verbose",
	action="store_true",
	default=False,
	help="Print per-step details for every episode.",
	)
	return parser.parse_args()


	if __name__ == "__main__":
	    # Script entry point: echo the effective settings, then run the
	    # random-baseline evaluation with them.
	    cli = _parse_args()
	    domain_label = cli.domain or "any"
	    banner = (
	        f"LifeStack Eval — episodes={cli.episodes} "
	        f"domain={domain_label} "
	        f"verbose={cli.verbose}"
	    )
	    print(banner)
	    run_eval(n_episodes=cli.episodes, domain=cli.domain, verbose=cli.verbose)