{ "schema": "lifestack_baseline_eval_v1", "note": "Base model eval (no LoRA). 50 episodes, same schedule as evaluate_and_plot. Per-episode rows omitted; re-run: python scripts/eval_baseline.py --output baseline_results.json", "model": "Qwen/Qwen2.5-1.5B-Instruct", "load_method": "transformers:Qwen/Qwen2.5-1.5B-Instruct", "environment": "linux GPU server, HF fallback (Unsloth unavailable due to TRL import mismatch)", "n_episodes": 50, "mean_reward": -0.07, "per_domain": { "career": { "n": 7, "mean": -0.1429 }, "finances": { "n": 7, "mean": 0.0 }, "relationships": { "n": 6, "mean": 0.0 }, "physical_health": { "n": 6, "mean": -0.1667 }, "mental_wellbeing": { "n": 6, "mean": -0.25 }, "time": { "n": 6, "mean": 0.0 }, "transport_crisis": { "n": 6, "mean": 0.0 }, "code_merge_crisis": { "n": 6, "mean": 0.0 } }, "all_domains_order": [ "career", "finances", "relationships", "physical_health", "mental_wellbeing", "time", "transport_crisis", "code_merge_crisis" ] }