File size: 9,374 Bytes
ddbc1ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
"""
scripts/eval.py
---------------
Standalone evaluation runner for the LifeStack environment.

Runs N episodes with a random-action baseline (no model / GPU required) and
prints a summary table plus aggregate statistics.

Usage:
    python scripts/eval.py
    python scripts/eval.py --episodes 20
    python scripts/eval.py --episodes 20 --domain flight_crisis --verbose
"""

import argparse
import random
import sys
import os

# Allow running from repo root without installing the package.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from core.lifestack_env import LifeStackEnv, LifeStackAction
from agent.conflict_generator import TaskGenerator

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# All action_types understood by the env's tool dispatch.
# _random_action() samples uniformly from this list once per step.
_ACTION_TYPES = ["execute", "inspect", "plan", "wait", "communicate", "spend", "delegate"]

# Known route IDs across the two TaskGenerator domains — used for targeted
# "execute" actions so we occasionally hit real routes.
# Only consulted as a fallback when the current task exposes no viable_routes.
_KNOWN_ROUTE_IDS = [
    "rebook_premium", "wait_lounge",        # flight_crisis
    "revert_commit", "hotfix",              # code_merge_crisis
]


def _random_action(task) -> LifeStackAction:
    """Build one randomly parameterised LifeStackAction for the baseline policy.

    The action type is drawn uniformly from ``_ACTION_TYPES``.  Depending on
    the type drawn:

    * ``execute`` targets a route id taken from ``task.viable_routes`` when the
      task has any, otherwise one of ``_KNOWN_ROUTE_IDS``.
    * ``inspect`` targets a key of ``task.hidden_state`` when present,
      otherwise the literal ``"lounge_capacity"``.
    * every other type leaves ``target`` as ``None``.

    ``execute`` / ``plan`` / ``communicate`` additionally carry a single random
    metric change in [-10, 10]; every type except ``wait`` carries random
    time / money / energy costs.

    NOTE(review): the metric domain and sub-key are sampled independently, so
    combinations such as ``finances.sleep_quality`` can occur — confirm the
    env tolerates metric paths that do not exist.
    """
    kind = random.choice(_ACTION_TYPES)

    # Resolve the target first so the RNG is consumed in a fixed order.
    if kind == "execute":
        if task and task.viable_routes:
            candidates = [route.id for route in task.viable_routes]
        else:
            candidates = _KNOWN_ROUTE_IDS
        chosen_target = random.choice(candidates)
    elif kind == "inspect":
        if task and task.hidden_state:
            chosen_target = random.choice(list(task.hidden_state.keys()))
        else:
            chosen_target = "lounge_capacity"
    else:
        chosen_target = None

    # One small random nudge on a "<domain>.<sub_key>" metric path.
    nudges: dict = {}
    if kind in ("execute", "plan", "communicate"):
        area = random.choice(
            ["career", "finances", "relationships", "physical_health", "mental_wellbeing", "time"]
        )
        field = random.choice(
            ["workload", "stress_level", "liquidity", "sleep_quality", "energy", "free_hours_per_week"]
        )
        nudges[f"{area}.{field}"] = random.uniform(-10.0, 10.0)

    # "wait" is free; anything else draws a cost per resource, in this order.
    if kind == "wait":
        costs: dict = {}
    else:
        cost_caps = (("time", 2.0), ("money", 50.0), ("energy", 10.0))
        costs = {resource: random.uniform(0.0, cap) for resource, cap in cost_caps}

    return LifeStackAction(
        action_type=kind,
        target=chosen_target,
        metric_changes=nudges,
        resource_cost=costs,
        actions_taken=1,
        reasoning="random baseline",
    )


def _row(ep_id: int, total_reward: float, steps: int, domain: str, success: bool) -> str:
    """Format one row of the per-episode summary table.

    Column widths match the header printed by ``run_eval``: episode id (4,
    right-aligned), total reward (12, four decimals), step count (6), domain
    (20, left-aligned) and a success marker (7, right-aligned).

    Args:
        ep_id: Episode number shown in the first column.
        total_reward: Cumulative reward for the episode.
        steps: Number of environment steps taken.
        domain: Task domain label.
        success: Whether the episode was flagged successful.

    Returns:
        The formatted row, without a trailing newline.
    """
    # The markers were stored as mis-decoded UTF-8 byte sequences; spell them
    # as escapes (U+2713 CHECK MARK / U+2717 BALLOT X) so a later encoding
    # round trip of this file cannot garble them again.
    success_str = "\u2713" if success else "\u2717"
    return (
        f"  {ep_id:>4}  "
        f"{total_reward:>12.4f}  "
        f"{steps:>6}  "
        f"{domain:<20}  "
        f"{success_str:>7}"
    )


# ---------------------------------------------------------------------------
# Core evaluation loop
# ---------------------------------------------------------------------------

def run_eval(n_episodes: int, domain: str | None, verbose: bool) -> None:
    """Run ``n_episodes`` random-baseline episodes and print a results table.

    Each episode generates a task (``domain`` is forwarded to
    ``TaskGenerator.generate``), resets the environment with it, and steps
    with ``_random_action`` until the observation reports ``done``.  One table
    row is printed per episode, followed by mean reward, success rate and mean
    step count across all episodes.

    Args:
        n_episodes: Number of episodes to run.
        domain: Domain passed to the task generator, or ``None``.
        verbose: When true, also print one line per environment step.
    """
    task_source = TaskGenerator()
    env = LifeStackEnv()
    episode_log = []

    # Table header followed by a rule sized to each column.
    print(
        f"\n  {'EP':>4}  {'TOTAL REWARD':>12}  {'STEPS':>6}  {'DOMAIN':<20}  {'SUCCESS':>7}\n"
        f"  {'─'*4}  {'─'*12}  {'─'*6}  {'─'*20}  {'─'*7}"
    )

    for ep in range(1, n_episodes + 1):
        task = task_source.generate(domain=domain)
        obs = env.reset(task=task, episode_id=str(ep))

        episode_return, step_count, solved = 0.0, 0, False

        while True:
            if obs.done:
                break

            action = _random_action(env.state.current_task)
            obs = env.step(action)

            # A missing (None) reward counts as zero.
            step_reward = obs.reward or 0.0
            episode_return += step_reward
            step_count += 1

            if verbose:
                print(
                    f"    step={step_count:>3}  reward={step_reward:+.3f}  "
                    f"action={action.action_type:<12}  "
                    f"target={str(action.target):<20}  "
                    f"done={obs.done}"
                )

            # Success is sticky: once any step reports it, the episode counts.
            if obs.metadata.get("success"):
                solved = True

        domain_label = task.domain if task else "unknown"
        episode_log.append(
            dict(
                episode=ep,
                total_reward=episode_return,
                steps=step_count,
                domain=domain_label,
                success=solved,
            )
        )
        print(_row(ep, episode_return, step_count, domain_label, solved))

    # Aggregate statistics; every figure is 0.0 when no episode was run.
    count = len(episode_log)
    if count:
        mean_reward = sum(entry["total_reward"] for entry in episode_log) / count
        success_rate = sum(1 for entry in episode_log if entry["success"]) / count
        mean_steps = sum(entry["steps"] for entry in episode_log) / count
    else:
        mean_reward = 0.0
        success_rate = 0.0
        mean_steps = 0.0

    print(
        f"\n  {'─'*60}\n"
        f"  Episodes     : {count}\n"
        f"  Mean Reward  : {mean_reward:.4f}\n"
        f"  Success Rate : {success_rate:.1%}\n"
        f"  Mean Steps   : {mean_steps:.1f}\n"
    )


# Alias used by train_trl.py
# Both names are bound to the same function object, so either can be called.
run_evaluation = run_eval


# ---------------------------------------------------------------------------
# Holdout evaluation — fixed task seeds not used during training
# ---------------------------------------------------------------------------

def run_holdout_eval(n_episodes: int = 10, verbose: bool = False) -> dict:
    """Run the random baseline on a fixed holdout set for generalization measurement.

    Task configs are read from ``data/holdout_tasks.json`` (resolved relative
    to this file).  When that file does not exist, ``n_episodes`` fallback
    configs with seeds 9000, 9001, ... are synthesised instead.  At most
    ``n_episodes`` configs are evaluated.

    NOTE(review): each config's seed is forwarded only to ``env.reset()``;
    task generation and ``_random_action`` are not seeded here, so results may
    differ between runs — confirm whether that is intended.

    Args:
        n_episodes: Maximum number of holdout configs to evaluate.
        verbose: When true, print one line per environment step.

    Returns:
        A dict with ``mean_reward``, ``success_rate`` and ``results`` (one
        dict per episode holding ``id``, ``total_reward``, ``steps`` and
        ``success``).
    """
    import json as _json

    holdout_path = os.path.join(os.path.dirname(__file__), "..", "data", "holdout_tasks.json")
    try:
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(holdout_path, encoding="utf-8") as fh:
            holdout_configs = _json.load(fh)
    except FileNotFoundError:
        print(f"[holdout] No holdout file at {holdout_path}; falling back to random tasks.")
        holdout_configs = [{"id": f"fallback_{i}", "seed": 9000 + i} for i in range(n_episodes)]

    # Slice once so the banner reports the number of tasks actually evaluated.
    selected = holdout_configs[:n_episodes]

    generator = TaskGenerator()
    env = LifeStackEnv()
    results = []

    print(f"\n  {'─'*60}")
    print(f"  HOLDOUT EVALUATION ({len(selected)} fixed tasks)")
    print(f"  {'─'*60}")

    for index, cfg in enumerate(selected):
        # "id" gets a default like "seed" and "domain", so one incomplete
        # config entry does not abort the whole run with a KeyError.
        task_id = cfg.get("id", f"holdout_{index}")
        seed = cfg.get("seed", 9000)
        domain = cfg.get("domain", "flight_crisis")
        task = generator.generate(domain=domain)

        obs = env.reset(task=task, seed=seed, episode_id=task_id)
        total_reward = 0.0
        steps = 0
        success = False

        while not obs.done:
            action = _random_action(env.state.current_task)
            obs = env.step(action)
            # Normalise a missing reward once; formatting None with "+.3f"
            # in the verbose line below would raise TypeError.
            reward = obs.reward or 0.0
            total_reward += reward
            steps += 1
            if verbose:
                print(f"    step={steps:>3}  reward={reward:+.3f}  action={action.action_type}")
            if obs.metadata.get("success"):
                success = True

        results.append({"id": task_id, "total_reward": total_reward, "steps": steps, "success": success})
        # U+2713 CHECK MARK / U+2717 BALLOT X, written as escapes because the
        # literal glyphs had been stored as mis-decoded bytes.
        marker = "\u2713" if success else "\u2717"
        print(f"  {task_id:<20}  reward={total_reward:>8.4f}  steps={steps:>4}  {marker}")

    n = len(results)
    mean_reward = sum(r["total_reward"] for r in results) / n if n else 0.0
    success_rate = sum(1 for r in results if r["success"]) / n if n else 0.0
    print(f"\n  Holdout Mean Reward  : {mean_reward:.4f}")
    print(f"  Holdout Success Rate : {success_rate:.1%}\n")
    return {"mean_reward": mean_reward, "success_rate": success_rate, "results": results}


# ---------------------------------------------------------------------------
# CLI entry-point
# ---------------------------------------------------------------------------

def _parse_args() -> argparse.Namespace:
    """Parse the command line of the evaluation script.

    Recognised options: ``--episodes`` (int, default 10), ``--domain``
    (str, default ``None``) and the ``--verbose`` flag (default ``False``).

    Returns:
        The namespace produced by ``argparse`` from ``sys.argv``.
    """
    domain_help = (
        "Optional domain filter passed to TaskGenerator.generate(). "
        "Supported: 'flight_crisis', 'code_merge_crisis'. "
        "Omit to cycle randomly."
    )

    # (flag, add_argument keyword options), registered in declaration order.
    option_table = (
        ("--episodes", {"type": int, "default": 10, "help": "Number of episodes to run (default: 10)."}),
        ("--domain", {"type": str, "default": None, "help": domain_help}),
        (
            "--verbose",
            {"action": "store_true", "default": False, "help": "Print per-step details for every episode."},
        ),
    )

    cli = argparse.ArgumentParser(
        description="LifeStack environment evaluation runner (random baseline)."
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: parse CLI options, echo the effective configuration,
    # then run the random-baseline evaluation.
    args = _parse_args()
    # The banner separator is an em dash (U+2014); it is written as an escape
    # because the literal character had been stored as mis-decoded bytes.
    print(
        f"LifeStack Eval \u2014 episodes={args.episodes}  "
        f"domain={args.domain or 'any'}  "
        f"verbose={args.verbose}"
    )
    run_eval(n_episodes=args.episodes, domain=args.domain, verbose=args.verbose)