""" Compare base vs trained LifeStack policy on identical crisis prompts. Usage: python scripts/compare_baseline.py python scripts/compare_baseline.py --trained-model ./lifestack_model """ import argparse import json import os import random import sys from datetime import datetime from typing import Any import torch SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) REPO_ROOT = os.path.dirname(SCRIPT_DIR) sys.path.insert(0, REPO_ROOT) sys.path.insert(0, SCRIPT_DIR) from agent.conflict_generator import TaskGenerator, generate_conflict from core.life_state import DependencyGraph, LifeMetrics, ResourceBudget from intake.simperson import SimPerson from scripts.train_trl import build_prompt_for_task, get_lifestack_evaluation def _load_base_model(): """Load base Qwen2.5-1.5B-Instruct (no training adapter).""" try: from unsloth import FastLanguageModel model, tokenizer = FastLanguageModel.from_pretrained( model_name="unsloth/Qwen2.5-1.5B-Instruct", max_seq_length=1024, load_in_4bit=True, ) FastLanguageModel.for_inference(model) return model, tokenizer, "unsloth/base-qwen2.5-1.5b-instruct" except Exception: from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "Qwen/Qwen2.5-1.5B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto", ) model.eval() return model, tokenizer, model_name def _load_trained_model(model_dir: str): """Load trained LifeStack model from local adapter/full checkpoint directory.""" try: from unsloth import FastLanguageModel model, tokenizer = FastLanguageModel.from_pretrained( model_name=model_dir, max_seq_length=1024, load_in_4bit=True, ) FastLanguageModel.for_inference(model) return model, tokenizer except Exception: from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_dir) base = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2.5-1.5B-Instruct", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto", ) model = PeftModel.from_pretrained(base, model_dir) model.eval() return model, tokenizer def _device_for(model) -> torch.device: try: return next(model.parameters()).device except Exception: return torch.device("cuda" if torch.cuda.is_available() else "cpu") def _generate_completion(model, tokenizer, prompt: str, temperature: float = 0.3) -> str: device = _device_for(model) inputs = tokenizer(prompt, return_tensors="pt").to(device) pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=128, temperature=temperature, do_sample=True, top_p=0.9, pad_token_id=pad_token_id, eos_token_id=tokenizer.eos_token_id, ) return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True).strip() def _build_eval_cases() -> list[dict[str, Any]]: """Create 5 deterministic prompts spanning different crisis domains.""" domains = [ ("career", 3, 101), ("finances", 4, 202), ("relationships", 3, 303), ("transport_crisis", 4, 404), ("code_merge_crisis", 5, 505), ] generator = TaskGenerator() graph = DependencyGraph() person = SimPerson(name="Comparator") cases: list[dict[str, Any]] = [] for domain, difficulty, seed in domains: random.seed(seed) task = generator.generate(domain=domain, difficulty=difficulty) conflict = generate_conflict(difficulty) random.seed() metrics = LifeMetrics() metrics = graph.cascade(metrics, {**task.mutable_world, **conflict.primary_disruption}) budget_dict = task.constraints.get("budget", {}) budget = ResourceBudget( time_hours=budget_dict.get("time", 20.0), money_dollars=budget_dict.get("money", 500.0), energy_units=budget_dict.get("energy", 100.0), ) prompt = build_prompt_for_task(task, person, metrics, budget, seed=seed, step=0) crisis_text = task.domain_metadata.get("story", task.goal) cases.append( { "case_id": f"{domain}_d{difficulty}", "domain": domain, "difficulty": difficulty, "seed": seed, "crisis": crisis_text, "prompt": prompt, } ) return cases def _print_case(case: dict[str, Any]) -> None: print("=" * 110) print(f"[{case['case_id']}] domain={case['domain']} difficulty={case['difficulty']}") print(f"crisis: {case['crisis']}") print(f"base_reward={case['base_reward']:.3f} | trained_reward={case['trained_reward']:.3f} | delta={case['delta']:+.3f}") print("- BASE RESPONSE -") print(case["base_response"] or "") print("- TRAINED RESPONSE -") print(case["trained_response"] or "") def run_compare(trained_model_dir: str, output_path: str) -> dict[str, Any]: cases = _build_eval_cases() print("Loading base model...") base_model, base_tokenizer, base_name = _load_base_model() for case in cases: completion = _generate_completion(base_model, base_tokenizer, case["prompt"]) eval_data = get_lifestack_evaluation(completion, case["prompt"]) case["base_model"] = base_name case["base_response"] = completion case["base_reward"] = float(eval_data.get("reward", -0.5)) del base_model torch.cuda.empty_cache() print("Loading trained model...") trained_model, trained_tokenizer = _load_trained_model(trained_model_dir) for case in cases: completion = _generate_completion(trained_model, trained_tokenizer, case["prompt"]) eval_data = get_lifestack_evaluation(completion, case["prompt"]) case["trained_model"] = trained_model_dir case["trained_response"] = completion case["trained_reward"] = float(eval_data.get("reward", -0.5)) case["delta"] = round(case["trained_reward"] - case["base_reward"], 4) _print_case(case) del trained_model torch.cuda.empty_cache() avg_base = sum(c["base_reward"] for c in cases) / len(cases) avg_trained = sum(c["trained_reward"] for c in cases) / len(cases) avg_delta = avg_trained - avg_base payload = { "timestamp_utc": datetime.utcnow().isoformat() + "Z", "summary": { "n_cases": len(cases), "avg_base_reward": round(avg_base, 4), "avg_trained_reward": round(avg_trained, 4), "avg_reward_delta": round(avg_delta, 4), "base_model": cases[0]["base_model"] if cases else "", "trained_model": trained_model_dir, }, "cases": cases, } output_dir = os.path.dirname(output_path) if output_dir: os.makedirs(output_dir, exist_ok=True) with open(output_path, "w", encoding="utf-8") as f: json.dump(payload, f, indent=2) print("=" * 110) print( f"SUMMARY: avg_base={avg_base:.3f} | avg_trained={avg_trained:.3f} | " f"avg_delta={avg_delta:+.3f}" ) print(f"Saved comparison JSON: {output_path}") return payload def main(): parser = argparse.ArgumentParser(description="Compare baseline Qwen vs trained LifeStack model.") parser.add_argument("--trained-model", type=str, default="./lifestack_model") parser.add_argument("--output", type=str, default="./data/before_after_comparison.json") args = parser.parse_args() run_compare(trained_model_dir=args.trained_model, output_path=args.output) if __name__ == "__main__": main()