"""
run_episode.py — LifeStack Full Episode Runner

Orchestrates a complete episode:
  1. Generate a Task (with correct horizon from task.horizon) and a ConflictEvent
  2. Initialize environment, agent, person, and memory
  3. Loop up to task.horizon steps:
     agent decides → action applied → reward computed → memory updated
  4. Print a rich episode summary at the end
"""

import copy
import os
import random
import sys
from typing import Optional

# Make the project root (the parent of this file's directory) importable so
# the project-local imports below resolve when this file is run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from core.life_state import LifeMetrics, ResourceBudget  # noqa: E402
from core.lifestack_env import LifeStackEnv, LifeStackAction  # noqa: E402
from agent.agent import LifeStackAgent  # noqa: E402
from intake.simperson import SimPerson  # noqa: E402
from agent.conflict_generator import (  # noqa: E402
    generate_conflict,
    escalate_conflict,
    adaptive_escalate,
    TaskGenerator,
)
from core.action_space import apply_action, validate_action  # noqa: E402
from agent.memory import LifeStackMemory  # noqa: E402
from core.reward import compute_reward  # noqa: E402

_TASK_GENERATOR = TaskGenerator()

# Prefixes of the status messages read from obs.metadata["info"].
_DRIFT_PREFIX = "DRIFT:"
_ESCALATION_PREFIX = "ESCALATION:"
_ESCALATION_SEPARATOR = " -> "


def _build_person_pool() -> list:
    """Return a fresh list of the SimPerson profiles an episode can draw from."""
    return [
        SimPerson(name="Alex (Executive)", openness=0.4, conscientiousness=0.9,
                  extraversion=0.7, agreeableness=0.25, neuroticism=0.8),
        SimPerson(name="Chloe (Creative)", openness=0.9, conscientiousness=0.2,
                  extraversion=0.5, agreeableness=0.70, neuroticism=0.15),
        SimPerson(name="Sam (Introvert)", openness=0.5, conscientiousness=0.6,
                  extraversion=0.1, agreeableness=0.65, neuroticism=0.9),
        SimPerson(name="Maya (Family)", openness=0.5, conscientiousness=0.7,
                  extraversion=0.5, agreeableness=0.95, neuroticism=0.3),
        SimPerson(name="Leo (Student)", openness=0.85, conscientiousness=0.8,
                  extraversion=0.4, agreeableness=0.4, neuroticism=0.55),
    ]


def _scale_metric_changes(metric_changes: dict, target_domain: str,
                          uptake_score: float) -> dict:
    """Normalise metric paths to 'domain.submetric' and scale each delta.

    Paths without a '.' get ``target_domain`` prepended (the LLM sometimes
    omits it). Deltas that cannot be converted to float are skipped.
    """
    scaled = {}
    for path, delta in metric_changes.items():
        if "." not in path:
            path = f"{target_domain}.{path}"
        try:
            scaled[path] = float(delta) * uptake_score
        except (TypeError, ValueError):
            # float() raises TypeError for None/list/dict and ValueError for
            # non-numeric strings; either way the entry is unusable.
            continue
    return scaled


def _parse_escalation(message: str) -> tuple:
    """Split an 'ESCALATION:<reason> -> <title>' message into (reason, title).

    ``title`` is None when the separator is missing, so a malformed message
    no longer raises IndexError.
    """
    parts = message[len(_ESCALATION_PREFIX):].split(_ESCALATION_SEPARATOR)
    title = parts[1] if len(parts) > 1 else None
    return parts[0], title


def _summarize_metric_deltas(initial_flat: dict, final_flat: dict) -> str:
    """Describe every metric that moved by at least 1.0 over the episode."""
    diffs = []
    for key, end_value in final_flat.items():
        delta = end_value - initial_flat.get(key, 0.0)
        if abs(delta) >= 1.0:
            name = key.split('.')[-1]
            sign = "+" if delta > 0 else ""
            diffs.append(f"{name}:{sign}{delta:.1f}")
    return ", ".join(diffs) if diffs else "no_change"


def run_episode(
    difficulty: Optional[int] = None,
    verbose: bool = True,
    memory: Optional["LifeStackMemory"] = None,
    agent: Optional["LifeStackAgent"] = None,
    agent_history: Optional[list] = None,
    model_path: Optional[str] = None,
) -> dict:
    """
    Runs one full LifeStack episode.

    Args:
        difficulty:    Conflict/task difficulty. When falsy, the task
                       difficulty is drawn from random.randint(1, 3) and
                       ``None``/0 is forwarded to generate_conflict as given.
        verbose:       When True, prints the episode header, each step and
                       the final summary, and renders the environment.
        memory:        Optional shared LifeStackMemory instance (avoids
                       re-loading the sentence-transformer model on every
                       episode). A new one is created when omitted.
        agent:         Optional shared LifeStackAgent instance (avoids
                       re-creating the Groq client on every episode). A new
                       one is created with ``model_path`` when omitted.
        agent_history: Optional list of (conflict_id, reward) tuples from
                       prior episodes; forwarded to env.reset().
        model_path:    Passed as ``local_model_path`` when this function has
                       to construct the agent itself.

    Returns:
        Summary dict with keys: person, initial_conflict_id, total_reward,
        steps, conflicts_seen, critical_metrics, thriving_count, step_log,
        memory_stats.
    """
    # --------------------------------------------------
    # 1. SETUP
    # --------------------------------------------------
    if agent is None:
        agent = LifeStackAgent(local_model_path=model_path)
    if memory is None:
        memory = LifeStackMemory()
    if agent_history is None:
        agent_history = []

    # Pick a SimPerson from a diverse pool.
    person = random.choice(_build_person_pool())

    # Generate a Task object so task.horizon is respected.
    # Domain follows difficulty: easy conflicts → flight_crisis,
    # harder → code_merge_crisis.
    domain = "flight_crisis" if (difficulty or 2) <= 3 else "code_merge_crisis"
    task = _TASK_GENERATOR.generate(
        domain=domain,
        difficulty=difficulty or random.randint(1, 3),
    )

    # Starting conflict (legacy ConflictEvent for disruption/budget).
    conflict = generate_conflict(difficulty)
    initial_conflict_id = conflict.id

    # Create the env with the task and pass task= to reset() as well, so the
    # step limit comes from task.horizon rather than a hardcoded value.
    env = LifeStackEnv(task=task)
    obs = env.reset(
        task=task,
        conflict=conflict,
        budget=conflict.resource_budget,
        person=person,
        agent_history=agent_history,
    )
    done = obs.done

    # --------------------------------------------------
    # 2. EPISODE LOOP
    # --------------------------------------------------
    total_reward = 0.0
    step_log = []
    conflicts_seen = [conflict.title]
    route_taken = []
    initial_metrics_flat = env.state.current_metrics.flatten()

    if verbose:
        print("\n" + "◆" * 60)
        print(f" LIFESTACK EPISODE — {conflict.title}")
        print(f" Person : {person.name}")
        print(f" Hint : {person.get_personality_hint()}")
        print(f" Story : {conflict.story}")
        print("◆" * 60)
        env.render()

    while not done:
        step = obs.step

        # Inject few-shot context from long-term memory into the agent call.
        few_shot = memory.build_few_shot_prompt(
            conflict.title, env.state.current_metrics.flatten()
        )

        # Agent decision.
        action = agent.get_action(
            env.state.current_metrics,
            env.state.budget,
            conflict,
            person,
            few_shot_context=few_shot,
        )
        primary = action.primary

        # Validate resource cost; an unaffordable action is replaced by a
        # free, small stress reduction (the action type itself is kept).
        is_valid, reason = validate_action(action, env.state.budget)
        if not is_valid:
            if verbose:
                print(f"\n ⚠️ Step {step+1}: Action unaffordable ({reason}). Forcing rest.")
            primary.metric_changes = {"mental_wellbeing.stress_level": -3.0}
            primary.resource_cost = {}

        # Scale metric changes by personality uptake.
        current_stress = env.state.current_metrics.mental_wellbeing.stress_level
        uptake_score = person.respond_to_action(
            primary.action_type,
            primary.resource_cost,
            current_stress,
        )
        scaled_changes = _scale_metric_changes(
            primary.metric_changes, primary.target_domain, uptake_score
        )

        # Apply the action, with its scaled changes, through the environment.
        env_action = LifeStackAction.from_agent_action(action)
        env_action.metric_changes = scaled_changes
        obs = env.step(env_action)

        step_reward = obs.reward or 0.0
        done = obs.done
        total_reward += step_reward

        # Store in transient agent memory.
        agent.store_decision(action, step_reward)
        route_taken.append(f"{primary.action_type}({primary.target_domain})")

        # Log the step.
        penalties = obs.metadata.get("breakdown", {}).get("penalties_fired", [])
        step_log.append({
            "step": step + 1,
            "action": primary.action_type,
            "domain": primary.target_domain,
            "description": primary.description,
            "reward": round(step_reward, 3),
            "penalties": penalties,
        })

        if verbose:
            print(f"\n{'─'*60}")
            print(f" STEP {step+1} → {primary.action_type.upper()} on {primary.target_domain}")
            print(f" \"{primary.description}\"")
            if action.communication:
                print(f" 💬 [{action.communication.recipient}] ({action.communication.tone}): {action.communication.content}")
            print(f" Reward: {step_reward:.3f} | Penalties: {penalties or 'none'}")

        # Drift/escalation info from metadata.info. Escalations are recorded
        # regardless of verbosity so the returned summary is identical for
        # verbose and quiet runs; only the printing is conditional.
        for msg in obs.metadata.get("info", []):
            if msg.startswith(_DRIFT_PREFIX):
                if verbose:
                    print(f"\n[DRIFT] {msg[len(_DRIFT_PREFIX):]}")
            elif msg.startswith(_ESCALATION_PREFIX):
                escalation_reason, new_title = _parse_escalation(msg)
                if new_title is not None:
                    conflicts_seen.append(new_title)
                if verbose:
                    print(f"\n🔥 ADAPTIVE ESCALATION: {escalation_reason}")
                    if new_title is not None:
                        print(f" New conflict: {new_title}")

        if verbose:
            env.render()

    # --------------------------------------------------
    # 3. EPISODE SUMMARY
    # --------------------------------------------------
    final_flat = env.state.current_metrics.flatten()
    critical = [k for k, v in final_flat.items() if v < 20]
    improved = [k for k, v in final_flat.items() if v > 70]
    metrics_diff_str = _summarize_metric_deltas(initial_metrics_flat, final_flat)

    # Store the full trajectory in long-term memory (ChromaDB).
    memory.store_trajectory(
        conflict_title=conflict.title,
        route_taken=" -> ".join(route_taken),
        total_reward=total_reward,
        metrics_diff_str=metrics_diff_str,
        reasoning=f"Resolved with {env.state.step_count} steps. End critical: {len(critical)}",
    )

    # Read stats after storing so they include this episode.
    mem_stats = memory.get_stats()

    if verbose:
        print("\n" + "█" * 60)
        print(" EPISODE COMPLETE — FINAL SUMMARY")
        print("█" * 60)
        print(f" Person : {person.name}")
        print(f" Conflicts Seen : {' → '.join(conflicts_seen)}")
        print(f" Steps Taken : {env.state.step_count}")
        print(f" Total Reward : {total_reward:.4f}")
        print(f" Critical (<20) : {critical or 'None'}")
        print(f" Thriving (>70) : {len(improved)} metrics")
        print(f"\n Step-by-Step Log:")
        for s in step_log:
            flag = " ⚠️ " if s["penalties"] else " ✅"
            print(f" {flag} Step {s['step']}: [{s['action']}] on {s['domain']} → {s['reward']:.3f}")
        print(f"\n Memory Bank : {mem_stats['total_memories']} decisions stored (avg reward: {mem_stats['average_reward']})")
        print("█" * 60)

    return {
        "person": person.name,
        "initial_conflict_id": initial_conflict_id,
        "total_reward": round(total_reward, 4),
        "steps": env.state.step_count,
        "conflicts_seen": conflicts_seen,
        "critical_metrics": critical,
        "thriving_count": len(improved),
        "step_log": step_log,
        "memory_stats": mem_stats,
    }


def main() -> None:
    """Command-line entry point: run three episodes with a shared agent and memory."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=None,
                        help="Path to trained GRPO model (default: auto-detect ./lifestack_model or LIFESTACK_MODEL_PATH)")
    parser.add_argument("--difficulty", type=int, default=None,
                        help="Fixed difficulty 1-5 (default: varies)")
    args = parser.parse_args()

    # Build the agent and memory once and reuse them across all episodes.
    shared_agent = LifeStackAgent(local_model_path=args.model)
    shared_memory = LifeStackMemory(silent=True)

    difficulties = [args.difficulty] * 3 if args.difficulty else [2, 3, 5]
    for d in difficulties:
        print(f"\n{'═'*60}")
        print(f" STARTING EPISODE AT DIFFICULTY {d}")
        print(f"{'═'*60}")
        summary = run_episode(difficulty=d, verbose=True,
                              agent=shared_agent, memory=shared_memory)
        print(f"\n → Total Reward: {summary['total_reward']}")


if __name__ == "__main__":
    main()