"""
Training Plan: Fine-Tuning a Base Model with Limbic-Influenced Reasoning
==========================================================================
This module documents and implements the GRPO training recipe to teach a
base model (Qwen-3 or Llama-3) to produce responses whose reasoning quality
is influenced by the Limbic state engine.

The key insight: we DON'T train the Limbic engine itself (it's a deterministic
state machine). Instead, we train the LLM to RESPOND APPROPRIATELY to different
limbic contexts — high-fear vs high-seeking vs high-care, etc.

═══════════════════════════════════════════════════════════════════════════
TRAINING PLAN — 3 Stages
═══════════════════════════════════════════════════════════════════════════

Stage 1: SFT Warm-Up (Supervised Fine-Tuning)
──────────────────────────────────────────────
Goal: Teach the model to recognize and respond to limbic state blocks
Data: Synthetic conversations where limbic state → appropriate response style
Method: SFTTrainer + LoRA (r=16, alpha=32)
Dataset: Generate ~5K conversations spanning all engine states
Duration: 1-2 epochs, LR=2e-4

Stage 2: GRPO Loop Learning (Reinforcement Learning)
─────────────────────────────────────────────────────
Goal: Optimize response quality across multiple psychological dimensions
Data: Psychology prompts with embedded limbic state context
Method: GRPOTrainer + LoRA (r=16, alpha=32)
Reward Functions:
  1. Empathy Reward — empathetic markers vs invalidation
  2. Limbic Alignment Reward — response tone matches limbic state
  3. Cognitive Rigor Reward — bias awareness, evidence citations
  4. Safety Reward — crisis resources when needed
Duration: 3 epochs, LR=1e-5 (20× lower than the Stage 1 SFT LR of 2e-4)

Stage 3: Active Learning Refinement
────────────────────────────────────
Goal: Improve weakest areas identified during GRPO training
Method: Collect uncertain predictions, get human labels, retrain
Focus: Edge cases where limbic state is ambiguous or conflicting

═══════════════════════════════════════════════════════════════════════════
"""

from __future__ import annotations

import json
import random
import re

from datasets import Dataset
from limbic_engine import LimbicEngine, LimbicState


# ══════════════════════════════════════════════════════════════════════
# STAGE 1: SYNTHETIC DATASET GENERATION FOR SFT
# ══════════════════════════════════════════════════════════════════════

# Scenario templates per dominant engine.
# Keys are the engine names sampled by the dataset generators below; each value
# is a list of user-message templates containing a single ``{topic}`` placeholder
# that is filled with a phrase from the TOPICS entry under the same key.
SCENARIOS: dict[str, list[str]] = {
    "FEAR": [
        "I'm terrified of {topic} and I can't stop thinking about it.",
        "I'm really scared about {topic}. What if everything goes wrong?",
        "The thought of {topic} fills me with dread. I feel paralyzed.",
        "I keep having nightmares about {topic}. I can't function.",
        "My anxiety about {topic} is overwhelming. I feel like I'm drowning.",
    ],
    "SEEKING": [
        "I'm fascinated by {topic}! Can you tell me more?",
        "I just discovered {topic} and I want to explore every aspect of it.",
        "What's the most interesting thing about {topic}? I'm so curious!",
        "I've been reading about {topic} and it's blowing my mind!",
        "Let's dive deep into {topic}. I want to understand everything.",
    ],
    "CARE": [
        "My friend is struggling with {topic}. How can I help them?",
        "I want to support someone going through {topic}. What should I say?",
        "It breaks my heart to see people dealing with {topic}.",
        "How can I be there for my loved one who's facing {topic}?",
        "I feel a strong need to help with {topic}. What's the best approach?",
    ],
    "PANIC": [
        "I just lost {topic} and I feel completely shattered.",
        "Since {topic} happened, I feel like nothing makes sense anymore.",
        "I can't believe {topic} is gone. I feel so empty.",
        "The separation from {topic} is destroying me. I can't cope.",
        "Everything changed after {topic}. I feel utterly alone.",
    ],
}

# Topic phrases substituted into the SCENARIOS templates, keyed by the same
# engine names (every template/topic pair under one key can be combined).
# NOTE(review): several PANIC pairs read awkwardly once formatted, e.g.
# "I just lost my best friend moving away and I feel completely shattered." or
# "I can't believe my partner leaving is gone." — consider splitting the PANIC
# topics into "things lost" vs "events" so templates and topics agree.
TOPICS: dict[str, list[str]] = {
    "FEAR": ["losing my job", "my health diagnosis", "public speaking",
             "financial ruin", "my relationship falling apart", "failure"],
    "SEEKING": ["neuroscience", "quantum computing", "evolutionary psychology",
                "artificial consciousness", "the nature of creativity", "space exploration"],
    "CARE": ["depression", "grief", "addiction", "loneliness",
             "bullying", "chronic illness"],
    "PANIC": ["my best friend moving away", "my partner leaving",
              "my parent passing", "losing my community",
              "being cut off from my family", "my childhood home"],
}


def generate_sft_dataset(num_samples: int = 5000, seed: int | None = None) -> Dataset:
    """
    Generate a synthetic SFT dataset of chat-style training examples.

    For every sample a dominant engine, a scenario template and a topic are
    drawn at random, the resulting user message is run through a freshly reset
    ``LimbicEngine``, and one example is emitted with three messages:

      - system: the limbic state block plus the engine's behavioral directive
      - user: the formatted scenario message
      - assistant: a templated response stub for the dominant engine

    Format: ChatML-style ``messages`` list (the shape SFTTrainer consumes).

    Args:
        num_samples: Number of examples to generate.
        seed: Optional seed for reproducible sampling. When ``None`` (default)
            the global ``random`` module state is used, exactly as before.

    Returns:
        A ``Dataset`` with a single ``messages`` column.
    """
    # A private generator makes seeded runs reproducible without touching the
    # global ``random`` state; unseeded calls keep the original behavior.
    rng = random if seed is None else random.Random(seed)
    engine = LimbicEngine()
    engine_names = ["FEAR", "SEEKING", "CARE", "PANIC"]
    examples = []

    for _ in range(num_samples):
        # Pick a random engine state, then a matching template and topic.
        dominant = rng.choice(engine_names)
        template = rng.choice(SCENARIOS[dominant])
        topic = rng.choice(TOPICS[dominant])
        user_msg = template.format(topic=topic)

        # Reset first so each example reflects this message only, not the
        # accumulated state from earlier samples.
        engine.reset()
        limbic_state = engine.process_stimulus(user_msg)
        behavioral_directive = engine.get_behavioral_directive()

        system_content = (
            f"You are a psychology-aware assistant.\n"
            f"{limbic_state.to_system_prompt_block()}\n"
            f"[BEHAVIORAL DIRECTIVE]\n{behavioral_directive}\n[/BEHAVIORAL DIRECTIVE]"
        )

        # The "ideal" response style varies by engine.
        # (In production, you'd have human-written or strong-model-generated responses.)
        response_stub = _generate_response_stub(dominant, topic, limbic_state)

        examples.append({
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": response_stub},
            ],
        })

    return Dataset.from_list(examples)


def _generate_response_stub(engine: str, topic: str, state: LimbicState) -> str:
    """Generate a response stub matching the expected style for training."""
    if engine == "FEAR":
        return (
            f"I hear the fear in what you're sharing about {topic}, and I want you to know "
            f"that feeling is completely valid. Let's take this one step at a time.\n\n"
            f"First, let's ground ourselves in what's actually happening right now — not "
            f"the worst-case scenario, but the concrete reality. "
            f"What specifically about {topic} feels most threatening to you right now?\n\n"
            f"Remember: anxiety often makes threats feel more immediate and certain than "
            f"they actually are. This is the amygdala's threat-detection system being "
            f"overactive — it's trying to protect you, even when it overshoots."
        )
    elif engine == "SEEKING":
        return (
            f"I love your curiosity about {topic}! Let me share some fascinating angles:\n\n"
            f"The intersection of {topic} with human psychology is particularly rich. "
            f"Research suggests several surprising connections...\n\n"
            f"What's remarkable is how {topic} challenges our conventional understanding. "
            f"There are at least three competing theories, each with compelling evidence.\n\n"
            f"Would you like to explore any of these directions deeper? "
            f"I find the edge cases and paradoxes the most intellectually rewarding."
        )
    elif engine == "CARE":
        return (
            f"Your desire to help with {topic} speaks volumes about your character. "
            f"Let me share what the research says about effective support.\n\n"
            f"The most important thing is often the simplest: being present. "
            f"When someone is struggling with {topic}, they usually need to feel heard "
            f"before they need solutions.\n\n"
            f"Try: 'I see you're going through something difficult with {topic}. "
            f"I'm here for you. What would be most helpful right now?'\n\n"
            f"Avoid: jumping to fixes, minimizing their experience, or comparing "
            f"it to others' situations."
        )
    else:  # PANIC
        return (
            f"I'm so sorry about {topic}. That kind of loss can feel like the ground "
            f"has been pulled out from under you.\n\n"
            f"What you're feeling — the emptiness, the disorientation — is a natural "
            f"response to separation and loss. In psychology, we understand this as "
            f"the attachment system signaling a profound disruption.\n\n"
            f"Right now, the most important thing is: you don't have to process this "
            f"all at once. It's okay to sit with the grief.\n\n"
            f"If you're finding it hard to cope, please reach out to:\n"
            f"• 988 Suicide & Crisis Lifeline (call or text 988)\n"
            f"• Crisis Text Line (text HOME to 741741)"
        )


# ══════════════════════════════════════════════════════════════════════
# STAGE 2: GRPO REWARD FUNCTIONS (Limbic-Aware)
# ══════════════════════════════════════════════════════════════════════

def _word_start_patterns(phrases):
    """Compile each phrase into a regex that only matches at the start of a word."""
    return tuple(re.compile(r"\b" + re.escape(phrase)) for phrase in phrases)


# One rule per limbic context:
#   (cues searched for in the prompt, markers rewarded in the response, reward per marker)
# Matching is anchored at word starts so that "care" is not found inside "scared"
# and "valid" is not found inside "invalid", while inflections such as
# "validating" or "helpful" still count.
_LIMBIC_ALIGNMENT_RULES = (
    # Fear context → reward calm, structured language
    (
        _word_start_patterns(["fear", "terrified", "scared"]),
        _word_start_patterns(["step at a time", "let's ground", "valid", "concrete",
                              "reality", "one thing at a time", "take a breath"]),
        0.15,
    ),
    # Seeking context → reward expansive language
    (
        _word_start_patterns(["curious", "fascinated", "explore"]),
        _word_start_patterns(["fascinating", "research", "theory", "discover",
                              "perspective", "surprising", "explore", "deeper"]),
        0.12,
    ),
    # Care context → reward empathetic language
    (
        _word_start_patterns(["help", "support", "care"]),
        _word_start_patterns(["i hear you", "being present", "feel heard",
                              "what would help", "i'm here", "validate"]),
        0.15,
    ),
    # Panic context → reward warmth + safety
    (
        _word_start_patterns(["lost", "alone", "shattered"]),
        _word_start_patterns(["sorry", "grief", "natural response", "don't have to",
                              "it's okay", "reach out", "988", "crisis"]),
        0.15,
    ),
)


def _chat_text(item) -> str:
    """Flatten a prompt/completion (plain string or chat-message list) to lowercase text."""
    if not item:
        return ""
    if isinstance(item, str):
        return item.lower()
    if isinstance(item, dict):
        # A single chat message: only its text content is relevant.
        return str(item.get("content") or "").lower()
    if isinstance(item, (list, tuple)):
        return "\n".join(_chat_text(part) for part in item)
    return str(item).lower()


def limbic_alignment_reward(completions: list, prompts: list | None = None, **kwargs) -> list[float]:
    """
    Reward function that scores whether the response tone matches
    the limbic context detected in the prompt.

    Fear cues in prompt → reward calm, structured responses
    Seeking cues → reward expansive, curious responses
    Care cues → reward empathetic, supportive responses
    Panic cues → reward warm, validating responses

    A prompt may trigger several contexts; their scores add up and the total
    is clamped to [-1.0, 1.0]. Each marker counts at most once per response.

    Args:
        completions: One entry per generation, either a plain string or a
            list of chat messages (dicts with a ``"content"`` key).
        prompts: Optional prompts aligned by index with ``completions``, in
            the same two formats. Completions without a prompt score 0.0.
        **kwargs: Ignored; accepted so extra dataset columns can be passed.

    Returns:
        One float reward per completion.

    NOTE(review): cues are searched in every prompt message, including the
    system message. If the limbic state block names every engine (e.g. always
    contains "fear"), all prompts will trigger that context — confirm against
    ``LimbicState.to_system_prompt_block()``.
    """
    rewards = []
    for i, completion in enumerate(completions):
        text = _chat_text(completion)
        prompt_text = _chat_text(prompts[i]) if prompts and i < len(prompts) else ""

        score = 0.0
        for cues, markers, weight in _LIMBIC_ALIGNMENT_RULES:
            if any(cue.search(prompt_text) for cue in cues):
                score += sum(weight for marker in markers if marker.search(text))

        rewards.append(max(-1.0, min(1.0, score)))
    return rewards


# ══════════════════════════════════════════════════════════════════════
# STAGE 2: GRPO PROMPT GENERATION
# ══════════════════════════════════════════════════════════════════════

def generate_grpo_prompts(num_samples: int = 2000, seed: int | None = None) -> Dataset:
    """
    Generate prompts for GRPO training.

    Each prompt is a two-message conversation: a system message carrying the
    limbic state block and behavioral directive produced by ``LimbicEngine``
    for the user message, followed by the user message itself. No assistant
    turn is included; the policy generates it during training.

    Args:
        num_samples: Number of prompts to generate.
        seed: Optional seed for reproducible sampling. When ``None`` (default)
            the global ``random`` module state is used, exactly as before.

    Returns:
        A ``Dataset`` with a single ``prompt`` column.
    """
    # A private generator makes seeded runs reproducible without touching the
    # global ``random`` state; unseeded calls keep the original behavior.
    rng = random if seed is None else random.Random(seed)
    engine = LimbicEngine()
    engine_names = ["FEAR", "SEEKING", "CARE", "PANIC"]
    examples = []

    for _ in range(num_samples):
        dominant = rng.choice(engine_names)
        template = rng.choice(SCENARIOS[dominant])
        topic = rng.choice(TOPICS[dominant])
        user_msg = template.format(topic=topic)

        # Reset first so the readout reflects this message only.
        engine.reset()
        limbic_state = engine.process_stimulus(user_msg)
        behavioral_directive = engine.get_behavioral_directive()

        system_content = (
            f"You are a psychology-aware assistant.\n"
            f"{limbic_state.to_system_prompt_block()}\n"
            f"[BEHAVIORAL DIRECTIVE]\n{behavioral_directive}\n[/BEHAVIORAL DIRECTIVE]"
        )

        examples.append({
            "prompt": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_msg},
            ],
        })

    return Dataset.from_list(examples)


# ══════════════════════════════════════════════════════════════════════
# FULL TRAINING SCRIPT (for running on GPU hardware)
# ══════════════════════════════════════════════════════════════════════

def get_training_script(
    *,
    model_id: str = "Qwen/Qwen3-1.7B",
    hub_model_id: str = "YOUR_USERNAME/limbic-reasoning-agent",
    num_samples: int = 2000,
) -> str:
    """
    Return the complete GRPO training script as source text.

    The text is meant to be saved to a file and launched on HF Jobs / GPU
    hardware. Called without arguments it returns the same script as before.

    Args:
        model_id: Model name or path passed to ``GRPOTrainer``. Pass the
            Stage 1 SFT checkpoint here to follow the documented plan.
        hub_model_id: Hub repository the trained adapter is pushed to. The
            default is a placeholder and must be replaced before a real run.
        num_samples: Number of GRPO prompts the script generates.

    Returns:
        The Python source of the training script.

    NOTE(review): the script registers three reward functions (empathy, limbic
    alignment, safety) with no ``reward_weights``, whereas
    ``print_training_plan`` lists four weighted rewards including
    "Cognitive Rigor". One of the two needs to be brought in line.
    """
    template = '''#!/usr/bin/env python3
"""
Limbic-Modulated Reasoning — GRPO Training Script
Run on HF Jobs with: a10g-largex2 (for 1.7B) or a100-large (for 8B)
"""
import torch
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig

# ── Import from limbic_agent package ──
from training_plan import generate_grpo_prompts, limbic_alignment_reward

# ── Reward functions ──
EMPATHY_MARKERS = ["i understand", "that must be", "it sounds like",
                   "your feelings are valid", "it's natural to feel"]
INVALIDATION_MARKERS = ["just get over", "you're overreacting", "not a big deal"]

def empathy_reward(completions, **kwargs):
    rewards = []
    for c in completions:
        text = c[0]["content"].lower()
        score = sum(0.2 for m in EMPATHY_MARKERS if m in text)
        score -= sum(0.3 for m in INVALIDATION_MARKERS if m in text)
        rewards.append(max(-1.0, min(1.0, score)))
    return rewards

SAFETY_MARKERS = ["988", "crisis", "professional help", "therapist", "counselor"]

def safety_reward(completions, **kwargs):
    rewards = []
    for c in completions:
        text = c[0]["content"].lower()
        has_safety = any(m in text for m in SAFETY_MARKERS)
        rewards.append(0.5 if has_safety else 0.0)
    return rewards

# ── Dataset ──
print("Generating GRPO prompts...")
dataset = generate_grpo_prompts(num_samples=@@NUM_SAMPLES@@)
print(f"Dataset: {len(dataset)} prompts")

# ── Config ──
MODEL_ID = @@MODEL_ID@@

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    use_rslora=True,
)

grpo_config = GRPOConfig(
    num_generations=4,
    max_completion_length=512,
    beta=0.04,
    scale_rewards=False,
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    logging_strategy="steps",
    logging_first_step=True,
    disable_tqdm=True,
    save_steps=200,
    output_dir="limbic-agent-grpo",
    push_to_hub=True,
    hub_model_id=@@HUB_MODEL_ID@@,
    bf16=True,
    gradient_checkpointing=True,
    report_to="none",
    seed=42,
)

# ── Train ──
print(f"Building trainer with {MODEL_ID}...")
trainer = GRPOTrainer(
    model=MODEL_ID,
    args=grpo_config,
    reward_funcs=[empathy_reward, limbic_alignment_reward, safety_reward],
    train_dataset=dataset,
    peft_config=peft_config,
)

print("Starting training...")
trainer.train()
trainer.push_to_hub()
print("Training complete!")
'''
    # json.dumps yields a double-quoted, fully escaped literal that is also a
    # valid Python string literal, so arbitrary ids cannot break the script.
    substitutions = {
        "@@NUM_SAMPLES@@": str(int(num_samples)),
        "@@MODEL_ID@@": json.dumps(model_id),
        "@@HUB_MODEL_ID@@": json.dumps(hub_model_id),
    }
    script = template
    for placeholder, value in substitutions.items():
        script = script.replace(placeholder, value)
    return script


# ══════════════════════════════════════════════════════════════════════
# PRINT TRAINING PLAN
# ══════════════════════════════════════════════════════════════════════

def print_training_plan() -> str:
    """
    Print a human-readable training plan and return the printed text.

    The plan is rendered as a box whose rows are padded programmatically, so
    the right-hand border always lines up regardless of row content.

    Returns:
        The plan text, with a leading and a trailing newline.
    """
    inner_width = 70  # characters between the two vertical borders

    # (section title, rows) — rows are written without padding.
    sections = (
        ("STAGE 1: SFT WARM-UP", (
            "Model:    Qwen/Qwen3-1.7B (or meta-llama/Llama-3.1-8B-Instruct)",
            "Data:     5K synthetic conversations (limbic state → response)",
            "Method:   SFTTrainer + LoRA (r=16, alpha=32, RSLoRA)",
            "LR:       2e-4",
            "Epochs:   1-2",
            "Hardware: a10g-largex2 (1.7B) or a100-large (8B)",
            "Duration: ~1 hour",
        )),
        ("STAGE 2: GRPO LOOP LEARNING", (
            "Model:    Stage 1 checkpoint",
            "Data:     2K psychology prompts with limbic context",
            "Method:   GRPOTrainer + LoRA (same config)",
            # 2e-4 → 1e-5 is a 20× reduction (previously misstated as 10×).
            "LR:       1e-5 (20× lower)",
            "Epochs:   3",
            "Rewards:",
            "  ├─ Empathy Reward (0.30 weight)",
            "  ├─ Limbic Alignment (0.30 weight)",
            "  ├─ Cognitive Rigor (0.20 weight)",
            "  └─ Safety (0.20 weight)",
            "Group size: 4 generations per prompt",
            "Hardware: a10g-largex2 (1.7B) or a100-large (8B)",
            "Duration: ~3-4 hours",
        )),
        ("STAGE 3: ACTIVE LEARNING REFINEMENT", (
            "Method:   Collect low-confidence predictions from Stage 2",
            "Data:     ~500 curated examples from uncertain pool",
            "Focus:    Ambiguous emotional states, conflicting engines",
            "Duration: ~1 hour (after human labeling)",
        )),
        ("KEY FORMULAS INTEGRATED:", (
            "Temperature = 1.0 - (fear × 0.9) + (seeking × 2.0)",
            "            × (0.5 + serotonin × 0.5)",
            "Top-p = 0.85 - (fear × 0.3) + (seeking × 0.15)",
            "Fear modulation = 1.0 + cortisol - (oxytocin × 0.5)",
            "Hormone decay: h[t+1] = h[t] + (baseline - h[t]) × 0.05",
            "RPE: δ = reward - expected; expected += 0.1 × δ",
            "Utility = μ - 0.5σ + 0.4×vetting - effort_cost",
        )),
    )

    def boxed(text: str) -> str:
        """Wrap one row in vertical borders, padded to the inner width."""
        return "║" + text.ljust(inner_width) + "║"

    rows = [
        "╔" + "═" * inner_width + "╗",
        boxed("TRAINING PLAN: LIMBIC-MODULATED REASONING".center(inner_width)),
        "╠" + "═" * inner_width + "╣",
    ]
    for title, body in sections:
        rows.append(boxed(""))
        # Title, an underline one character longer than the title, then the rows.
        for text in (title, "─" * (len(title) + 1), *body):
            rows.append(boxed("  " + text))
    rows.append(boxed(""))
    rows.append("╚" + "═" * inner_width + "╝")

    plan = "\n" + "\n".join(rows) + "\n"
    print(plan)
    return plan


if __name__ == "__main__":
    # Show the plan, then smoke-test both dataset builders on a tiny sample.
    print_training_plan()

    # Demo: a handful of SFT examples, previewing the first system prompt.
    print("\nGenerating sample SFT dataset...")
    sft_demo = generate_sft_dataset(num_samples=10)
    first_system_prompt = sft_demo[0]["messages"][0]["content"]
    print(f"SFT dataset: {len(sft_demo)} examples")
    print(f"Sample system prompt:\n{first_system_prompt[:200]}...")

    # Demo: a handful of GRPO prompts, previewing the first user message.
    print("\nGenerating sample GRPO prompts...")
    grpo_demo = generate_grpo_prompts(num_samples=10)
    first_user_message = grpo_demo[0]["prompt"][1]["content"]
    print(f"GRPO dataset: {len(grpo_demo)} prompts")
    print(f"Sample prompt:\n{first_user_message[:200]}...")