#!/usr/bin/env python3
"""
Example: Using Hypothesis Engine for LLM RL Training

This script demonstrates how the Hypothesis Engine can be used in a standard
reinforcement learning training loop for LLMs.

Three integration patterns are shown:
    1. Basic RL loop (any framework)
    2. Gymnasium-compatible wrapper
    3. Multi-episode training with auto-curriculum

Usage:
    python examples/training_loop.py
"""

import sys
import os

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def example_basic_loop():
    """
    Example 1: Basic RL Training Loop

    Drives the core HypothesisEngine API directly. Three episodes are
    run; each one resets the engine and then steps it until the engine
    reports ``done`` or 40 steps have been taken, whichever comes first.
    A one-line summary is printed per episode.
    """
    rule = "=" * 60
    print(rule)
    print("  Example 1: Basic RL Training Loop")
    print(rule)

    from hypothesis_engine import HypothesisEngine

    engine = HypothesisEngine(difficulty=1, experiment_budget=20, seed=42)

    # Standard RL loop
    for episode_number in range(1, 4):
        observation = engine.reset()
        episode_return = 0.0
        steps_taken = 0

        # Hard cap of 40 steps so a policy that never ends the episode
        # cannot spin forever.
        for step_index in range(40):
            # YOUR AGENT SELECTS AN ACTION HERE
            # In real training, this comes from your LLM policy network
            chosen = select_action(observation, step_index)

            observation, reward, finished, _info = engine.step(chosen)
            episode_return += reward
            steps_taken = step_index + 1

            # YOUR AGENT LEARNS HERE
            # e.g., store transition in replay buffer, compute policy gradient, etc.

            if finished:
                break

        summary = engine.get_episode_summary()
        world_name = summary['world_name']
        print(f"  Episode {episode_number}: steps={steps_taken}, "
              f"reward={episode_return:.1f}, world='{world_name}'")

    print()


def example_gymnasium_wrapper():
    """
    Example 2: Gymnasium-Compatible Wrapper

    Uses the standard gymnasium interface for compatibility with
    Stable-Baselines3, RLlib, TRL, and other RL frameworks.

    Three episodes are run against the environment returned by
    ``make_env``. Each episode steps until the environment reports
    ``terminated`` or ``truncated``, or until 40 steps have been taken.
    The environment is always closed on exit, even if a ``reset`` or
    ``step`` call raises.
    """
    print("=" * 60)
    print("  Example 2: Gymnasium-Compatible Wrapper")
    print("=" * 60)

    from hypothesis_engine.gym_wrapper import make_env

    env = make_env(difficulty=2, experiment_budget=20, seed=42)

    # try/finally guarantees env.close() runs when an episode fails midway,
    # so the environment is not leaked on error.
    try:
        for episode in range(3):
            obs_text, info = env.reset()
            terminated = False
            truncated = False
            total_reward = 0.0
            step = 0

            print(f"\n  Episode {episode + 1}: {info.get('world_name', '?')}")

            # The step cap bounds the episode even if the environment never
            # signals termination or truncation.
            while not terminated and not truncated and step < 40:
                # In a real setup, your LLM generates this action text
                action_text = generate_action_text(obs_text, step)

                obs_text, reward, terminated, truncated, info = env.step(action_text)
                total_reward += reward
                step += 1

            print(f"    Steps: {step}, Total Reward: {total_reward:.1f}")
    finally:
        env.close()

    print()


def example_curriculum_training():
    """
    Example 3: Multi-Episode Training with Auto-Curriculum

    Runs ten episodes against a HypothesisEngine created with
    ``auto_curriculum=True`` and prints, per episode, the reported level,
    the episode reward, and the mean reward over the last (up to) five
    episodes. A final line repeats that mean after the last episode.
    """
    rule = "=" * 60
    print(rule)
    print("  Example 3: Auto-Curriculum Training")
    print(rule)

    from hypothesis_engine import HypothesisEngine

    engine = HypothesisEngine(
        difficulty=1,
        experiment_budget=25,
        auto_curriculum=True,
        advance_threshold=60.0,
    )

    reward_history = []

    for episode_number in range(1, 11):
        observation = engine.reset()
        episode_return = 0.0

        # At most 40 steps per episode; stop early once the engine is done.
        for step_index in range(40):
            chosen = select_action(observation, step_index)
            observation, reward, finished, _info = engine.step(chosen)
            episode_return += reward
            if finished:
                break

        reward_history.append(episode_return)
        world_level = engine.world.difficulty if engine.world else "?"

        # Prefer the curriculum's own view of the level when one is attached;
        # otherwise fall back to the difficulty of the current world.
        shown_level = (
            engine.curriculum.get_progress_summary().get("current_difficulty", "?")
            if engine.curriculum
            else world_level
        )

        window = reward_history[-5:]
        running_mean = sum(window) / len(window)

        print(f"  Episode {episode_number:>2d}: Level {shown_level}, "
              f"Reward: {episode_return:>6.1f}, "
              f"Running Avg: {running_mean:>6.1f}")

    last_window = reward_history[-5:]
    final_mean = sum(last_window) / len(last_window)
    print(f"\n  Final average reward (last 5): "
          f"{final_mean:.1f}")
    print()


# ── Helper: Simple action selector (replace with your LLM) ───────────

def select_action(obs, step):
    """
    Scripted stand-in for a policy, used only for demonstration.

    The schedule is fixed: while ``step`` is below 8 and more than two
    experiments remain, run an experiment at a preset probe value (clamped
    to each variable's range); at step 8 submit a fixed hypothesis;
    otherwise submit all-zero predictions, one per test case (20 if the
    observation lists none).

    In real training, replace this with your LLM policy:
        - Feed the observation to the LLM
        - Parse the LLM output as a JSON action
        - Return the action dict
    """
    world_info = obs.get("world", {})
    var_names = world_info.get("variables", ["x"])
    bounds = world_info.get("variable_ranges", {"x": [-10, 10]})
    cases = obs.get("test_cases", [])
    budget_left = obs.get("experiments_remaining", 0)

    still_probing = step < 8 and budget_left > 2

    if not still_probing:
        if step == 8:
            # Simple hypothesis
            return {"action": "hypothesize", "expression": "2*x + 1"}

        # Predict (ends episode)
        count = len(cases) if cases else 20
        return {"action": "predict", "predictions": [0.0] * count}

    # Experiment phase: every variable gets the same probe for this step,
    # clamped into that variable's own [lo, hi] range.
    probe_points = [0, 1, -1, 2, -2, 3, -3, 5]

    def clamped_probe(name):
        low, high = bounds.get(name, [-10, 10])
        raw = float(probe_points[step % len(probe_points)])
        return max(low, min(high, raw))

    return {
        "action": "experiment",
        "inputs": {name: clamped_probe(name) for name in var_names},
    }


def generate_action_text(obs_text, step):
    """
    Produce a JSON-encoded action string for the gymnasium wrapper.

    ``obs_text`` is ignored by this scripted stand-in. Steps below 5 emit
    an experiment on ``x`` at a preset probe value, step 5 emits a fixed
    hypothesis, and every later step emits 20 zero predictions.

    In real training, this is where your LLM generates text responses.
    """
    import json

    if step < 5:
        probe_points = [0, 1, -1, 2, -2]
        payload = {
            "action": "experiment",
            "inputs": {"x": probe_points[step % len(probe_points)]},
        }
    elif step == 5:
        payload = {
            "action": "hypothesize",
            "expression": "2*x + 1",
        }
    else:
        payload = {
            "action": "predict",
            "predictions": [0.0] * 20,
        }

    return json.dumps(payload)


# ── Integration Guide ─────────────────────────────────────────────────

# Human-readable cheat sheet printed verbatim at the end of the script.
# The snippets inside are illustrative text only; nothing in this string
# is executed.
INTEGRATION_GUIDE = """
== INTEGRATION WITH POPULAR RL FRAMEWORKS ==

1. Stable-Baselines3 (via gymnasium wrapper):
   
   from hypothesis_engine.gym_wrapper import make_env
   from stable_baselines3 import PPO
   
   env = make_env(difficulty=3, experiment_budget=30)
   model = PPO("MlpPolicy", env)  # Use custom text policy
   model.learn(total_timesteps=10000)

2. TRL (Transformer RL for LLM fine-tuning):
   
   from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
   from trl import PPOTrainer
   
   env = HypothesisEngineGymEnv(difficulty=1, auto_curriculum=True)
   # Use env.reset() and env.step() in your TRL training loop
   # The text-based obs/action spaces work naturally with LLMs

3. Custom GRPO/RLHF Training:
   
   from hypothesis_engine import HypothesisEngine
   
   env = HypothesisEngine(difficulty=1, auto_curriculum=True)
   
   for episode in range(num_episodes):
       obs = env.reset()
       trajectory = []
       done = False
       
       while not done:
           # LLM generates action from observation text
           action = llm.generate(format_prompt(obs))
           next_obs, reward, done, info = env.step(parse_action(action))
           # Pair the action with the observation it was generated from
           trajectory.append((obs, action, reward))
           obs = next_obs
       
       # Use trajectory for GRPO/PPO update
       update_policy(trajectory)

4. Ray RLlib:
   
   from ray.rllib.algorithms.ppo import PPOConfig
   from hypothesis_engine.gym_wrapper import HypothesisEngineGymEnv
   
   config = PPOConfig().environment(
       env=HypothesisEngineGymEnv,
       env_config={"difficulty": 3, "experiment_budget": 30},
   )
"""


if __name__ == "__main__":
    # Banner: blank line, title, horizontal rule, blank line.
    print(
        "",
        "Hypothesis Engine — RL Training Integration Examples",
        "=" * 60,
        "",
        sep="\n",
    )

    # Run the three demos in the order the module docstring lists them.
    for run_example in (
        example_basic_loop,
        example_gymnasium_wrapper,
        example_curriculum_training,
    ):
        run_example()

    # Finish with the framework integration cheat sheet.
    print(INTEGRATION_GUIDE)