""" Baseline eval: Qwen2.5-1.5B-Instruct (no LoRA) on the same 50 episodes as evaluate_and_plot. Usage (repo root or scripts/): python scripts/eval_baseline.py python scripts/eval_baseline.py --output ./baseline_results.json Colab (one cell after deps + repo mount): !python scripts/eval_baseline.py """ from __future__ import annotations import argparse import json import os import random import sys from collections import defaultdict from datetime import datetime, timezone from typing import Any import numpy as np import torch SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) REPO_ROOT = os.path.dirname(SCRIPT_DIR) sys.path.insert(0, REPO_ROOT) sys.path.insert(0, SCRIPT_DIR) from agent.conflict_generator import TaskGenerator, generate_conflict from core.life_state import DependencyGraph, LifeMetrics, ResourceBudget from intake.simperson import SimPerson from scripts.train_trl import ( ALL_DOMAINS, build_prompt_for_task, reward_task_success_fn, ) BASE_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct" N_EPISODES = 50 def _resolve_device_for_hf() -> torch.dtype: if torch.cuda.is_available(): return torch.float16 if torch.backends.mps.is_available(): return torch.float16 return torch.float32 def _model_device(model: Any) -> torch.device: d = getattr(model, "device", None) if d is not None: return d return next(model.parameters()).device def load_base_model_qwen( model_name: str = BASE_MODEL_ID, ) -> tuple[Any, Any, str]: """Load base instruct model only (no PEFT), preferring Unsloth 4-bit when available.""" try: from unsloth import FastLanguageModel model, tokenizer = FastLanguageModel.from_pretrained( model_name=model_name, max_seq_length=2048, load_in_4bit=True, ) FastLanguageModel.for_inference(model) model.eval() return model, tokenizer, f"unsloth+4bit:{model_name}" except Exception as e: print(f" Unsloth load failed ({e}), using transformers + AutoModelForCausalLM") from transformers import AutoModelForCausalLM, AutoTokenizer dtype = _resolve_device_for_hf() tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=dtype, device_map="auto", ) model.eval() return model, tokenizer, f"transformers:{model_name}" def run_baseline_eval( model_name: str = BASE_MODEL_ID, n_episodes: int = N_EPISODES, output_path: str = "baseline_results.json", ) -> dict[str, Any]: print("\n" + "=" * 50) print(" BASELINE EVALUATION (no LoRA)") print("=" * 50) model, tokenizer, load_tag = load_base_model_qwen(model_name) device = _model_device(model) print(f" Loaded: {load_tag} | device={device}") graph = DependencyGraph() rewards: list[float] = [] episode_rows: list[dict[str, Any]] = [] by_domain: dict[str, list[float]] = defaultdict(list) generator = TaskGenerator() for ep in range(n_episodes): difficulty = min(5, 1 + ep // 10) domain = ALL_DOMAINS[ep % len(ALL_DOMAINS)] ep_seed = ep * 137 random.seed(ep_seed) task = generator.generate(domain=domain, difficulty=difficulty) random.seed() metrics = LifeMetrics() conflict = generate_conflict(difficulty) metrics = graph.cascade(metrics, {**task.mutable_world, **conflict.primary_disruption}) budget_dict = task.constraints.get("budget", {}) budget = ResourceBudget( time_hours=budget_dict.get("time", 20.0), money_dollars=budget_dict.get("money", 500.0), energy_units=budget_dict.get("energy", 100.0), ) person = SimPerson(name="Eval") prompt = build_prompt_for_task(task, person, metrics, budget, seed=ep_seed, step=0) inputs = tokenizer(prompt, return_tensors="pt").to(device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=128, temperature=0.3, do_sample=True, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, ) completion = tokenizer.decode( outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True, ) r = float(reward_task_success_fn([completion], [prompt])[0]) rewards.append(r) by_domain[domain].append(r) episode_rows.append( { "episode": ep, "domain": domain, "difficulty": difficulty, "seed": ep_seed, "reward": r, } ) if (ep + 1) % 10 == 0: print( f" Episode {ep + 1}/{n_episodes} | Reward: {r:.3f} | " f"Running mean: {float(np.mean(rewards)):.3f}" ) mean_r = float(np.mean(rewards)) per_domain: dict[str, Any] = {} for d in ALL_DOMAINS: rs = by_domain.get(d, []) per_domain[d] = { "n": len(rs), "mean": float(np.mean(rs)) if rs else 0.0, "rewards": [float(x) for x in rs], } print("\n" + "-" * 50) print(f" Mean reward (all {n_episodes} episodes): {mean_r:.4f}") print(" Per-domain mean (same schedule as evaluate_and_plot):") for d in ALL_DOMAINS: p = per_domain[d] if p["n"]: print(f" {d:20s} n={p['n']} mean={p['mean']:.4f}") print("-" * 50) payload: dict[str, Any] = { "schema": "lifestack_baseline_eval_v1", "created_utc": datetime.now(timezone.utc).isoformat(), "model": model_name, "load_method": load_tag, "n_episodes": n_episodes, "mean_reward": mean_r, "per_domain": per_domain, "all_domains_order": list(ALL_DOMAINS), "episodes": episode_rows, } out_dir = os.path.dirname(os.path.abspath(output_path)) if out_dir: os.makedirs(out_dir, exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: json.dump(payload, f, indent=2) print(f" Wrote {output_path}") return payload def main() -> None: parser = argparse.ArgumentParser( description="50-episode baseline eval for Qwen2.5-1.5B-Instruct (no LoRA)." ) parser.add_argument( "--model", type=str, default=BASE_MODEL_ID, help="HF model id (default: Qwen/Qwen2.5-1.5B-Instruct)", ) parser.add_argument( "--episodes", type=int, default=N_EPISODES, help="Number of eval episodes (default: 50, matches evaluate_and_plot)", ) parser.add_argument( "--output", type=str, default="baseline_results.json", help="Where to write results JSON", ) args = parser.parse_args() run_baseline_eval( model_name=args.model, n_episodes=args.episodes, output_path=args.output, ) if __name__ == "__main__": main()