# Spaces: jdsb06 / meta-r2 (Sleeping)
# File size: 13,025 Bytes
# ddbc1ba

"""
train.py — LifeStack Training Loop

Runs a curriculum of episodes at increasing difficulty, logs rewards,
generates a learning curve plot, and compares agent performance
before and after memory accumulation.
"""

import sys, os; sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import json
import random
import shutil
import matplotlib
matplotlib.use("Agg")  # Non-interactive backend — safe for headless runs
import matplotlib.pyplot as plt

from scripts.run_episode import run_episode
from agent.memory import LifeStackMemory
from agent.agent import LifeStackAgent


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _difficulty_for_episode(episode: int) -> int:
    """Draw a random difficulty for ``episode`` from the curriculum schedule.

    Episodes up to 25 draw from difficulties 1-2, episodes 26-50 from 2-3,
    episodes 51-75 from 3-4, and every later episode from 4-5.
    """
    # (last episode of the phase, lowest difficulty, highest difficulty)
    schedule = ((25, 1, 2), (50, 2, 3), (75, 3, 4))
    for last_episode, low, high in schedule:
        if episode <= last_episode:
            return random.randint(low, high)
    # Past the final boundary: the hardest band.
    return random.randint(4, 5)


def _rolling_avg(values: list, window: int = 5) -> list:
    """Return the trailing rolling mean of ``values``.

    Element ``i`` of the result is the mean of the (up to) ``window`` values
    ending at index ``i``. The first ``window - 1`` elements average over
    however many values are available so far, so the output always has the
    same length as the input.

    Args:
        values: Sequence of numbers to smooth.
        window: Maximum number of trailing values per average; must be >= 1.

    Returns:
        A list of floats, one per input value (empty for empty input).

    Raises:
        ValueError: If ``window`` is smaller than 1. Without this check a
            zero window divides by zero and a negative one silently yields
            meaningless values.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")

    averages = []
    for end in range(1, len(values) + 1):
        # Slice is never empty here: end >= 1 and window >= 1.
        chunk = values[max(0, end - window):end]
        averages.append(sum(chunk) / len(chunk))
    return averages


def _phase_avg(rewards: list, start: int, end: int) -> float:
    """Mean reward over episodes ``start``..``end`` (1-indexed, inclusive).

    The mean is rounded to three decimals. If the requested range contains
    no recorded rewards, 0.0 is returned instead.
    """
    phase_rewards = rewards[start - 1 : end]
    if not phase_rewards:
        return 0.0
    mean = sum(phase_rewards) / len(phase_rewards)
    return round(mean, 3)


# ---------------------------------------------------------------------------
# Main training function
# ---------------------------------------------------------------------------

def _project_path(*parts: str) -> str:
    """Build a path under the directory one level above this file's folder."""
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(project_root, *parts)


def _run_comparison_batch(n_runs: int) -> list:
    """Run ``n_runs`` difficulty-5 episodes and summarize each one as a dict."""
    summaries = []
    for run_number in range(1, n_runs + 1):
        # No memory/agent is passed, so run_episode uses its own defaults.
        result = run_episode(difficulty=5, verbose=False)
        step_log = result["step_log"]
        first_step = step_log[0] if step_log else {}
        summaries.append({
            "run": run_number,
            "total_reward": result["total_reward"],
            "first_action": first_step.get("action", "unknown"),
            "first_domain": first_step.get("domain", "unknown"),
            "has_communication": any(
                s.get("action") == "communicate" for s in step_log
            ),
            "steps": result["steps"],
        })
    return summaries


def run_training(n_episodes: int = 50, save_plot: bool = True) -> dict:
    """
    Runs the full LifeStack curriculum training loop.

    Steps, in order:
      1. Run ``n_episodes`` episodes with one shared memory and agent,
         picking each episode's difficulty from the curriculum schedule.
      2. Print per-phase average rewards (phases with no episodes are
         shown as "n/a").
      3. Write ``data/training_log.json`` and, if ``save_plot`` is true,
         ``data/reward_curve.png`` (the ``data`` folder is created if needed).
      4. Run a 5-vs-5 difficulty-5 comparison with the ``lifestack_memory``
         folder temporarily moved aside and then restored, and write
         ``data/before_after_comparison.json``.

    Args:
        n_episodes: Number of training episodes to run.
        save_plot: Whether to render and save the learning-curve image.

    Returns:
        summary dict with keys ``episode_log`` (per-episode records),
        ``phase_averages`` (``early``/``mid``/``late``/``final``/``overall``;
        0.0 for phases that had no episodes) and ``comparison``.
    """
    from collections import Counter  # only used by the comparison statistics

    episode_log = []
    rewards = []
    agent_history = []

    print(f"\n{'═' * 50}")
    print(f"  LIFESTACK TRAINING — {n_episodes} EPISODES")
    print(f"{'═' * 50}\n")

    # Initialize shared instances once — avoids reloading model weights each episode
    print("  Initializing shared agent and memory (one-time load)...")
    shared_memory = LifeStackMemory(silent=True)  # suppress per-decision spam
    shared_agent = LifeStackAgent()
    print("  ✅ Ready.\n")

    for ep in range(1, n_episodes + 1):
        difficulty = _difficulty_for_episode(ep)

        # Run episode with shared memory + agent + history tracking
        result = run_episode(difficulty=difficulty, verbose=False,
                             memory=shared_memory, agent=shared_agent,
                             agent_history=agent_history)

        total_reward = result["total_reward"]
        rewards.append(total_reward)
        agent_history.append((result["initial_conflict_id"], total_reward))

        episode_log.append({
            "episode": ep,
            "reward": total_reward,
            "difficulty": difficulty,
            "person": result["person"],
            "conflicts_seen": result["conflicts_seen"],
            "steps": result["steps"],
        })

        # Progress: print every episode
        mem_count = result["memory_stats"]["total_memories"]
        print(
            f"  Episode {ep:>3}/{n_episodes} | "
            f"Reward: {total_reward:.3f} | "
            f"Difficulty: {difficulty} | "
            f"Memories: {mem_count}"
        )

    # ------------------------------------------------------------------
    # Phase averages
    # ------------------------------------------------------------------
    # (result key, table title, first episode, last episode, plot color, plot label)
    phases = (
        ("early", "Early", 1, 25, "green", "Easy (diff 1-2)"),
        ("mid", "Mid", 26, 50, "orange", "Mid (diff 2-3)"),
        ("late", "Late", 51, 75, "red", "Hard (diff 3-4)"),
        ("final", "Final", 76, n_episodes, "purple", "Extreme (diff 4-5)"),
    )
    phase_averages = {
        key: _phase_avg(rewards, first, last)
        for key, _title, first, last, _color, _label in phases
    }
    # Guard against n_episodes <= 0, which would otherwise divide by zero.
    overall = round(sum(rewards) / len(rewards), 3) if rewards else 0.0
    phase_averages["overall"] = overall

    print(f"\n{'═' * 42}")
    print("  TRAINING SUMMARY")
    print(f"{'═' * 42}")
    print(f"  {'Phase':<10} {'Episodes':<12} {'Avg Reward'}")
    print(f"  {'-'*38}")
    for key, title, first, last, _color, _label in phases:
        if first > n_episodes:
            # The phase never ran: a "0.000" average here would be misleading.
            print(f"  {title:<10} {'-':<12} n/a")
        else:
            span = f"{first}-{min(last, n_episodes)}"
            print(f"  {title:<10} {span:<12} {phase_averages[key]:.3f}")
    print(f"  {'Overall':<10} {'1-' + str(n_episodes):<12} {overall:.3f}")
    print(f"{'═' * 42}\n")

    # ------------------------------------------------------------------
    # Save training log
    # ------------------------------------------------------------------
    data_dir = _project_path("data")
    os.makedirs(data_dir, exist_ok=True)  # a fresh checkout may lack data/
    log_path = os.path.join(data_dir, "training_log.json")
    with open(log_path, "w") as f:
        json.dump(episode_log, f, indent=2)
    print(f"  📄 Training log saved → {log_path}")

    # ------------------------------------------------------------------
    # Matplotlib learning curve
    # ------------------------------------------------------------------
    if save_plot:
        ep_nums = [r["episode"] for r in episode_log]
        raw = [r["reward"] for r in episode_log]
        rolling = _rolling_avg(raw, window=5)

        fig, ax = plt.subplots(figsize=(12, 5))
        ax.plot(ep_nums, raw, color="steelblue", alpha=0.6, linewidth=1.2, label="Episode Reward")
        ax.plot(ep_nums, rolling, color="crimson", linewidth=2.0, linestyle="--", label="5-Episode Rolling Avg")
        ax.axhline(y=0, color="gray", linewidth=0.8, linestyle="--", alpha=0.7)

        # Phase boundary shading — only for phases that actually ran, clamped
        # to the last episode so the x-axis is not stretched past the data.
        for _key, _title, first, last, color, label in phases:
            if first <= n_episodes:
                ax.axvspan(first, min(last, n_episodes), alpha=0.04, color=color, label=label)

        ax.set_title("LifeStack Agent Learning Curve", fontsize=14, fontweight="bold")
        ax.set_xlabel("Episode", fontsize=11)
        ax.set_ylabel("Total Reward", fontsize=11)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)
        fig.tight_layout()

        plot_path = os.path.join(data_dir, "reward_curve.png")
        fig.savefig(plot_path, dpi=150)
        plt.close(fig)
        print(f"  📊 Learning curve saved → {plot_path}")

    # ------------------------------------------------------------------
    # BEHAVIORAL COMPARISON — Friday 6PM (5 runs each)
    # ------------------------------------------------------------------
    N_COMPARE = 5
    print(f"\n{'═' * 58}")
    print(f"  BEHAVIORAL COMPARISON — Friday 6PM Crisis ({N_COMPARE} runs each)")
    print(f"{'═' * 58}")

    memory_dir = _project_path("lifestack_memory")
    memory_backup = memory_dir + "_backup"

    # --- WITHOUT memory: temporarily hide the ChromaDB folder ---
    # NOTE(review): shared_memory above is still alive while the folder is
    # moved; presumably it holds no open handles that break — confirm.
    had_memory = os.path.exists(memory_dir)
    if had_memory:
        # shutil.move into an EXISTING directory nests the source inside it,
        # so a stale backup from an interrupted run would corrupt the restore.
        # Pick an unused backup name instead of touching the stale one.
        suffix = 0
        while os.path.exists(memory_backup):
            suffix += 1
            memory_backup = f"{memory_dir}_backup_{suffix}"
        shutil.move(memory_dir, memory_backup)

    try:
        no_mem_results = _run_comparison_batch(N_COMPARE)
    finally:
        # Restore memory, discarding any folder the baseline runs created
        if had_memory and os.path.exists(memory_backup):
            if os.path.exists(memory_dir):
                shutil.rmtree(memory_dir)
            shutil.move(memory_backup, memory_dir)

    # --- WITH memory ---
    with_mem_results = _run_comparison_batch(N_COMPARE)

    # --- Compute stats ---
    avg_no = sum(r["total_reward"] for r in no_mem_results) / N_COMPARE
    avg_yes = sum(r["total_reward"] for r in with_mem_results) / N_COMPARE
    improvement = avg_yes - avg_no
    # Relative change is undefined for a zero baseline; report 0 in that case.
    pct = (improvement / abs(avg_no) * 100) if avg_no != 0 else 0

    # Most common first action / domain per condition
    top_action_no = Counter(r["first_action"] for r in no_mem_results).most_common(1)[0][0]
    top_action_yes = Counter(r["first_action"] for r in with_mem_results).most_common(1)[0][0]
    top_domain_no = Counter(r["first_domain"] for r in no_mem_results).most_common(1)[0][0]
    top_domain_yes = Counter(r["first_domain"] for r in with_mem_results).most_common(1)[0][0]
    no_comm_pct = sum(1 for r in no_mem_results if r["has_communication"]) / N_COMPARE * 100
    yes_comm_pct = sum(1 for r in with_mem_results if r["has_communication"]) / N_COMPARE * 100

    # --- Print table ---
    print(f"\n  {'WITHOUT MEMORY':<28} {'WITH MEMORY':<28}")
    for nr, wr in zip(no_mem_results, with_mem_results):
        print(f"  Run {nr['run']}: {nr['total_reward']:.3f} "
              f"({nr['first_action']:<14})"
              f"  Run {wr['run']}: {wr['total_reward']:.3f} "
              f"({wr['first_action']:<14})")
    print(f"  {'─' * 54}")
    print(f"  Avg:   {avg_no:.3f}                    Avg:   {avg_yes:.3f}")
    sign = "+" if improvement >= 0 else ""
    print(f"  Improvement: {sign}{improvement:.3f} ({sign}{pct:.1f}%)")

    print(f"\n  {'─' * 54}")
    print(f"  Most common 1st action WITHOUT memory: {top_action_no}")
    print(f"  Most common 1st action WITH memory:    {top_action_yes}")
    print(f"  Most common 1st domain WITHOUT memory: {top_domain_no}")
    print(f"  Most common 1st domain WITH memory:    {top_domain_yes}")
    print(f"  Communication used WITHOUT memory:     {no_comm_pct:.0f}% of runs")
    print(f"  Communication used WITH memory:        {yes_comm_pct:.0f}% of runs")

    # --- Behavioral insight ---
    if top_action_yes != top_action_no:
        print(f"\n  💡 Memory changed the agent's primary strategy from "
              f"'{top_action_no}' to '{top_action_yes}'")
    if yes_comm_pct > no_comm_pct:
        print("  💡 Memory taught the agent to include communication actions more often")
    print(f"{'═' * 58}\n")

    # --- Save comparison ---
    comparison = {
        "scenario": "Friday 6PM (difficulty 5)",
        "runs_per_condition": N_COMPARE,
        "without_memory": {
            "results": no_mem_results,
            "avg_reward": round(avg_no, 3),
            "most_common_first_action": top_action_no,
            "most_common_first_domain": top_domain_no,
            "communication_rate": round(no_comm_pct, 1),
        },
        "with_memory": {
            "results": with_mem_results,
            "avg_reward": round(avg_yes, 3),
            "most_common_first_action": top_action_yes,
            "most_common_first_domain": top_domain_yes,
            "communication_rate": round(yes_comm_pct, 1),
        },
        "improvement": {
            "absolute": round(improvement, 3),
            "percentage": round(pct, 1),
        },
    }
    comp_path = os.path.join(data_dir, "before_after_comparison.json")
    with open(comp_path, "w") as f:
        json.dump(comparison, f, indent=2)
    print(f"  📄 Behavioral comparison saved → {comp_path}")

    return {
        "episode_log": episode_log,
        "phase_averages": phase_averages,
        "comparison": comparison,
    }


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main():
    """Script entry point: run the training loop for 100 episodes."""
    total_episodes = 100
    run_training(n_episodes=total_episodes)


# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()