Spaces:

s-b3
/

LifeStack

Sleeping

Soham Banerjee

deploy: pure lifestack with partitioned wisdom pool

77da5ce about 1 month ago

11.4 kB

	"""
	run_episode.py — LifeStack Full Episode Runner

	Orchestrates a complete episode:
	1. Generate a Task (with correct horizon from task.horizon) and a ConflictEvent
	2. Initialize environment, agent, person, and memory
	3. Loop up to task.horizon steps: agent decides → action applied → reward computed → memory updated
	4. Print a rich episode summary at the end
	"""

	import sys, os; sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	import random
	from core.life_state import LifeMetrics, ResourceBudget
	from core.lifestack_env import LifeStackEnv, LifeStackAction
	from agent.agent import LifeStackAgent
	from intake.simperson import SimPerson
	from agent.conflict_generator import generate_conflict, escalate_conflict, adaptive_escalate, TaskGenerator
	from core.action_space import apply_action, validate_action
	from agent.memory import LifeStackMemory
	from core.reward import compute_reward
	import copy

	# Module-level TaskGenerator instance, created once at import time and
	# reused by every run_episode() call to build the episode's Task.
	_TASK_GENERATOR = TaskGenerator()


	def _scale_metric_changes(primary, uptake_score):
	    """Return ``primary.metric_changes`` scaled by ``uptake_score``.

	    Keys are normalised to the ``'domain.submetric'`` form by prepending
	    ``primary.target_domain`` when the path has no dot. Deltas that cannot be
	    converted to ``float`` (e.g. ``None``, lists, non-numeric strings) are
	    dropped instead of aborting the episode.
	    """
	    scaled = {}
	    for path, delta in primary.metric_changes.items():
	        if "." not in path:  # the LLM forgot the domain prefix
	            path = f"{primary.target_domain}.{path}"
	        try:
	            scaled[path] = float(delta) * uptake_score
	        except (TypeError, ValueError):
	            # float() raises TypeError for None/containers and ValueError
	            # for unparsable strings; both mean "unusable delta".
	            continue
	    return scaled


	def _format_metric_deltas(initial_flat, final_flat):
	    """Summarise metric movements of at least 1.0 as ``'name:+d.d, ...'``.

	    Returns ``"no_change"`` when no metric moved by 1.0 or more.
	    """
	    diffs = []
	    for key, end_value in final_flat.items():
	        delta = end_value - initial_flat.get(key, 0.0)
	        if abs(delta) >= 1.0:
	            name = key.split(".")[-1]
	            sign = "+" if delta > 0 else ""
	            diffs.append(f"{name}:{sign}{delta:.1f}")
	    return ", ".join(diffs) if diffs else "no_change"


	def run_episode(
	    difficulty: int = None,
	    verbose: bool = True,
	    memory: "LifeStackMemory" = None,
	    agent: "LifeStackAgent" = None,
	    agent_history: list = None,
	    model_path: str = None,
	) -> dict:
	    """
	    Runs one full LifeStack episode.

	    Args:
	        difficulty: Difficulty passed to generate_conflict() and to the task
	            generator. When None (or 0) the task difficulty is drawn from
	            random.randint(1, 3) and the "flight_crisis" domain is used;
	            values above 3 select the "code_merge_crisis" domain.
	        verbose: When True, prints the episode banner, every step and the
	            final summary. The returned summary does not depend on it.
	        memory: Optional shared LifeStackMemory instance (avoids re-loading the
	            sentence-transformer model on every episode).
	        agent: Optional shared LifeStackAgent instance (avoids re-creating the
	            Groq client on every episode).
	        agent_history: Optional list of (conflict_id, reward) tuples from prior
	            episodes. Forwarded to env.reset().
	        model_path: Local model path handed to LifeStackAgent; only used when
	            no ``agent`` is supplied.

	    Returns:
	        summary dict with person, initial_conflict_id, total_reward, steps,
	        conflicts_seen, critical_metrics, thriving_count, step_log and
	        memory_stats.
	    """
	    # --------------------------------------------------
	    # 1. SETUP
	    # --------------------------------------------------
	    if agent is None:
	        agent = LifeStackAgent(local_model_path=model_path)
	    if memory is None:
	        memory = LifeStackMemory()
	    if agent_history is None:
	        agent_history = []

	    # Pick a SimPerson from a diverse pool
	    person_pool = [
	        SimPerson(name="Alex (Executive)", openness=0.4, conscientiousness=0.9, extraversion=0.7, agreeableness=0.25, neuroticism=0.8),
	        SimPerson(name="Chloe (Creative)", openness=0.9, conscientiousness=0.2, extraversion=0.5, agreeableness=0.70, neuroticism=0.15),
	        SimPerson(name="Sam (Introvert)", openness=0.5, conscientiousness=0.6, extraversion=0.1, agreeableness=0.65, neuroticism=0.9),
	        SimPerson(name="Maya (Family)", openness=0.5, conscientiousness=0.7, extraversion=0.5, agreeableness=0.95, neuroticism=0.3),
	        SimPerson(name="Leo (Student)", openness=0.85, conscientiousness=0.8, extraversion=0.4, agreeableness=0.4, neuroticism=0.55),
	    ]
	    person = random.choice(person_pool)

	    # Generate a Task object so task.horizon is respected.
	    # Domain follows difficulty: easy conflicts → flight_crisis, harder → code_merge_crisis
	    domain = "flight_crisis" if (difficulty or 2) <= 3 else "code_merge_crisis"
	    task = _TASK_GENERATOR.generate(domain=domain, difficulty=difficulty or random.randint(1, 3))

	    # Generate starting conflict (legacy ConflictEvent for disruption/budget)
	    conflict = generate_conflict(difficulty)
	    initial_conflict_id = conflict.id

	    # Create the env with the task and pass task= to reset() as well, so the
	    # episode length comes from the task rather than a hardcoded value.
	    env = LifeStackEnv(task=task)
	    obs = env.reset(task=task, conflict=conflict, budget=conflict.resource_budget,
	                    person=person, agent_history=agent_history)
	    done = obs.done

	    # --------------------------------------------------
	    # 2. EPISODE LOOP
	    # --------------------------------------------------
	    total_reward = 0.0
	    step_log = []
	    conflicts_seen = [conflict.title]
	    route_taken = []
	    initial_metrics_flat = env.state.current_metrics.flatten()

	    if verbose:
	        print("\n" + "◆" * 60)
	        print(f" LIFESTACK EPISODE — {conflict.title}")
	        print(f" Person : {person.name}")
	        print(f" Hint : {person.get_personality_hint()}")
	        print(f" Story : {conflict.story}")
	        print("◆" * 60)
	        env.render()

	    while not done:
	        step = obs.step

	        # Inject few-shot context into agent memory
	        few_shot = memory.build_few_shot_prompt(conflict.title, env.state.current_metrics.flatten())

	        # Agent decision
	        action = agent.get_action(env.state.current_metrics, env.state.budget, conflict, person, few_shot_context=few_shot)

	        # Validate resource cost; an unaffordable action is replaced by a
	        # zero-cost "rest" effect so the episode can continue.
	        is_valid, reason = validate_action(action, env.state.budget)
	        if not is_valid:
	            if verbose:
	                print(f"\n ⚠️ Step {step+1}: Action unaffordable ({reason}). Forcing rest.")
	            action.primary.metric_changes = {"mental_wellbeing.stress_level": -3.0}
	            action.primary.resource_cost = {}

	        # Scale metric changes by personality uptake
	        current_stress = env.state.current_metrics.mental_wellbeing.stress_level
	        uptake_score = person.respond_to_action(
	            action.primary.action_type,
	            action.primary.resource_cost,
	            current_stress,
	        )
	        scaled_changes = _scale_metric_changes(action.primary, uptake_score)

	        # Apply action (with the scaled changes) through the environment
	        env_action = LifeStackAction.from_agent_action(action)
	        env_action.metric_changes = scaled_changes
	        obs = env.step(env_action)
	        step_reward = obs.reward or 0.0
	        done = obs.done
	        total_reward += step_reward

	        # Store in transient agent memory
	        agent.store_decision(action, step_reward)
	        route_taken.append(f"{action.primary.action_type}({action.primary.target_domain})")

	        # Log the step
	        penalties = obs.metadata.get("breakdown", {}).get("penalties_fired", [])
	        step_log.append({
	            "step": step + 1,
	            "action": action.primary.action_type,
	            "domain": action.primary.target_domain,
	            "description": action.primary.description,
	            "reward": round(step_reward, 3),
	            "penalties": penalties,
	        })

	        if verbose:
	            print(f"\n{'─'*60}")
	            print(f" STEP {step+1} → {action.primary.action_type.upper()} on {action.primary.target_domain}")
	            print(f" \"{action.primary.description}\"")
	            if action.communication:
	                print(f" 💬 [{action.communication.recipient}] ({action.communication.tone}): {action.communication.content}")
	            print(f" Reward: {step_reward:.3f} | Penalties: {penalties or 'none'}")

	        # Drift/Escalation info from metadata.info. Escalations are recorded
	        # regardless of verbosity so the returned summary is the same either
	        # way; only the printing is gated on `verbose`.
	        for msg in obs.metadata.get("info", []):
	            if msg.startswith("DRIFT:"):
	                if verbose:
	                    print(f"\n[DRIFT] {msg[6:]}")
	            elif msg.startswith("ESCALATION:"):
	                parts = msg[11:].split(" -> ")
	                escalation_reason = parts[0]
	                # A message without " -> " carries no new conflict title.
	                new_title = parts[1] if len(parts) > 1 else None
	                if new_title is not None:
	                    conflicts_seen.append(new_title)
	                if verbose:
	                    print(f"\n🔥 ADAPTIVE ESCALATION: {escalation_reason}")
	                    if new_title is not None:
	                        print(f" New conflict: {new_title}")

	        if verbose:
	            env.render()

	    # --------------------------------------------------
	    # 3. EPISODE SUMMARY
	    # --------------------------------------------------
	    final_flat = env.state.current_metrics.flatten()
	    critical = [k for k, v in final_flat.items() if v < 20]
	    thriving = [k for k, v in final_flat.items() if v > 70]
	    metrics_diff_str = _format_metric_deltas(initial_metrics_flat, final_flat)

	    # Persist the full trajectory in the long-term memory store
	    memory.store_trajectory(
	        conflict_title=conflict.title,
	        route_taken=" -> ".join(route_taken),
	        total_reward=total_reward,
	        metrics_diff_str=metrics_diff_str,
	        reasoning=f"Resolved with {env.state.step_count} steps. End critical: {len(critical)}",
	    )
	    mem_stats = memory.get_stats()

	    if verbose:
	        print("\n" + "█" * 60)
	        print(" EPISODE COMPLETE — FINAL SUMMARY")
	        print("█" * 60)
	        print(f" Person : {person.name}")
	        print(f" Conflicts Seen : {' → '.join(conflicts_seen)}")
	        print(f" Steps Taken : {env.state.step_count}")
	        print(f" Total Reward : {total_reward:.4f}")
	        print(f" Critical (<20) : {critical or 'None'}")
	        print(f" Thriving (>70) : {len(thriving)} metrics")
	        print(f"\n Step-by-Step Log:")
	        for s in step_log:
	            flag = " ⚠️ " if s["penalties"] else " ✅"
	            print(f" {flag} Step {s['step']}: [{s['action']}] on {s['domain']} → {s['reward']:.3f}")
	        print(f"\n Memory Bank : {mem_stats['total_memories']} decisions stored (avg reward: {mem_stats['average_reward']})")
	        print("█" * 60)

	    return {
	        "person": person.name,
	        "initial_conflict_id": initial_conflict_id,
	        "total_reward": round(total_reward, 4),
	        "steps": env.state.step_count,
	        "conflicts_seen": conflicts_seen,
	        "critical_metrics": critical,
	        "thriving_count": len(thriving),
	        "step_log": step_log,
	        "memory_stats": mem_stats,
	    }


	if __name__ == "__main__":
	    # Script entry point: run three verbose episodes back to back, sharing
	    # one agent and one memory instance between them.
	    import argparse

	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "--model",
	        default=None,
	        help="Path to trained GRPO model (default: auto-detect ./lifestack_model or LIFESTACK_MODEL_PATH)",
	    )
	    cli.add_argument(
	        "--difficulty",
	        type=int,
	        default=None,
	        help="Fixed difficulty 1-5 (default: varies)",
	    )
	    options = cli.parse_args()

	    # Built once so the three episodes reuse the same agent and memory.
	    episode_agent = LifeStackAgent(local_model_path=options.model)
	    episode_memory = LifeStackMemory(silent=True)

	    # A fixed difficulty is repeated three times; otherwise ramp 2 → 3 → 5.
	    if options.difficulty:
	        schedule = [options.difficulty for _ in range(3)]
	    else:
	        schedule = [2, 3, 5]

	    banner = "═" * 60
	    for level in schedule:
	        print(f"\n{banner}")
	        print(f" STARTING EPISODE AT DIFFICULTY {level}")
	        print(f"{banner}")
	        result = run_episode(
	            difficulty=level,
	            verbose=True,
	            agent=episode_agent,
	            memory=episode_memory,
	        )
	        print(f"\n → Total Reward: {result['total_reward']}")