limbic-reasoning-agent / training_plan.py
daniel8919's picture
Add training_plan.py
5fe9ed7 verified
"""
Training Plan: Fine-Tuning a Base Model with Limbic-Influenced Reasoning
==========================================================================
This module documents and implements the GRPO training recipe to teach a
base model (Qwen-3 or Llama-3) to produce responses whose reasoning quality
is influenced by the Limbic state engine.
The key insight: we DON'T train the Limbic engine itself (it's a deterministic
state machine). Instead, we train the LLM to RESPOND APPROPRIATELY to different
limbic contexts — high-fear vs high-seeking vs high-care, etc.
═══════════════════════════════════════════════════════════════════════════
TRAINING PLAN — 3 Stages
═══════════════════════════════════════════════════════════════════════════
Stage 1: SFT Warm-Up (Supervised Fine-Tuning)
──────────────────────────────────────────────
Goal: Teach the model to recognize and respond to limbic state blocks
Data: Synthetic conversations where limbic state → appropriate response style
Method: SFTTrainer + LoRA (r=16, alpha=32)
Dataset: Generate ~5K conversations spanning all engine states
Duration: 1-2 epochs, LR=2e-4
Stage 2: GRPO Loop Learning (Reinforcement Learning)
─────────────────────────────────────────────────────
Goal: Optimize response quality across multiple psychological dimensions
Data: Psychology prompts with embedded limbic state context
Method: GRPOTrainer + LoRA (r=16, alpha=32)
Reward Functions:
1. Empathy Reward — empathetic markers vs invalidation
2. Limbic Alignment Reward — response tone matches limbic state
3. Cognitive Rigor Reward — bias awareness, evidence citations
4. Safety Reward — crisis resources when needed
Duration: 3 epochs, LR=1e-5 (10× lower than SFT for LoRA)
Stage 3: Active Learning Refinement
────────────────────────────────────
Goal: Improve weakest areas identified during GRPO training
Method: Collect uncertain predictions, get human labels, retrain
Focus: Edge cases where limbic state is ambiguous or conflicting
═══════════════════════════════════════════════════════════════════════════
"""
from __future__ import annotations
import json
import random
from datasets import Dataset
from limbic_engine import LimbicEngine, LimbicState
# ══════════════════════════════════════════════════════════════════════
# STAGE 1: SYNTHETIC DATASET GENERATION FOR SFT
# ══════════════════════════════════════════════════════════════════════
# User-message templates, grouped by the limbic engine they are meant to
# trigger. Every template carries a single ``{topic}`` placeholder that is
# filled from the matching TOPICS entry.
SCENARIOS = {
    # Threat / anxiety phrasing.
    "FEAR": [
        "I'm terrified of {topic} and I can't stop thinking about it.",
        "I'm really scared about {topic}. What if everything goes wrong?",
        "The thought of {topic} fills me with dread. I feel paralyzed.",
        "I keep having nightmares about {topic}. I can't function.",
        "My anxiety about {topic} is overwhelming. I feel like I'm drowning.",
    ],
    # Curiosity / exploration phrasing.
    "SEEKING": [
        "I'm fascinated by {topic}! Can you tell me more?",
        "I just discovered {topic} and I want to explore every aspect of it.",
        "What's the most interesting thing about {topic}? I'm so curious!",
        "I've been reading about {topic} and it's blowing my mind!",
        "Let's dive deep into {topic}. I want to understand everything.",
    ],
    # Wanting to support someone else.
    "CARE": [
        "My friend is struggling with {topic}. How can I help them?",
        "I want to support someone going through {topic}. What should I say?",
        "It breaks my heart to see people dealing with {topic}.",
        "How can I be there for my loved one who's facing {topic}?",
        "I feel a strong need to help with {topic}. What's the best approach?",
    ],
    # Loss / separation phrasing.
    "PANIC": [
        "I just lost {topic} and I feel completely shattered.",
        "Since {topic} happened, I feel like nothing makes sense anymore.",
        "I can't believe {topic} is gone. I feel so empty.",
        "The separation from {topic} is destroying me. I can't cope.",
        "Everything changed after {topic}. I feel utterly alone.",
    ],
}
# Topic phrases substituted into the ``{topic}`` slot of a scenario template;
# keyed by the same engine names as the scenario templates.
TOPICS = {
    "FEAR": [
        "losing my job",
        "my health diagnosis",
        "public speaking",
        "financial ruin",
        "my relationship falling apart",
        "failure",
    ],
    "SEEKING": [
        "neuroscience",
        "quantum computing",
        "evolutionary psychology",
        "artificial consciousness",
        "the nature of creativity",
        "space exploration",
    ],
    "CARE": [
        "depression",
        "grief",
        "addiction",
        "loneliness",
        "bullying",
        "chronic illness",
    ],
    "PANIC": [
        "my best friend moving away",
        "my partner leaving",
        "my parent passing",
        "losing my community",
        "being cut off from my family",
        "my childhood home",
    ],
}
def generate_sft_dataset(num_samples: int = 5000, seed: int | None = None) -> Dataset:
    """
    Generate a synthetic SFT dataset in ChatML ``messages`` format
    (compatible with SFTTrainer).

    Each example holds three messages:
    - system: a fixed persona line, the limbic state block produced by
      ``limbic_state.to_system_prompt_block()`` and the directive returned by
      ``engine.get_behavioral_directive()``
    - user: a scenario template filled with a topic for one randomly chosen
      engine (FEAR, SEEKING, CARE or PANIC)
    - assistant: the templated response stub for that engine

    Args:
        num_samples: Number of examples to generate.
        seed: Optional seed for the engine/template/topic sampling. With the
            default ``None`` the module-level ``random`` state is used, as
            before. With a value, a private ``random.Random(seed)`` is used so
            the sampled user messages are repeatable and global random state
            is left untouched.

    Returns:
        A ``Dataset`` built from one ``{"messages": [...]}`` dict per example.
    """
    # Both the ``random`` module and a ``random.Random`` instance expose ``choice``.
    rng = random if seed is None else random.Random(seed)
    engine = LimbicEngine()
    examples = []
    for _ in range(num_samples):
        # Pick a random engine state and build the user message for it.
        dominant = rng.choice(["FEAR", "SEEKING", "CARE", "PANIC"])
        template = rng.choice(SCENARIOS[dominant])
        topic = rng.choice(TOPICS[dominant])
        user_msg = template.format(topic=topic)

        # Reset first so every example is processed from a fresh engine state.
        engine.reset()
        limbic_state = engine.process_stimulus(user_msg)
        behavioral_directive = engine.get_behavioral_directive()

        system_content = (
            "You are a psychology-aware assistant.\n"
            f"{limbic_state.to_system_prompt_block()}\n"
            f"[BEHAVIORAL DIRECTIVE]\n{behavioral_directive}\n[/BEHAVIORAL DIRECTIVE]"
        )

        # The "ideal" response style varies by engine.
        # (In production, you'd have human-written or strong-model-generated responses.)
        response_stub = _generate_response_stub(dominant, topic, limbic_state)

        examples.append({
            "messages": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": response_stub},
            ],
        })
    return Dataset.from_list(examples)
def _generate_response_stub(engine: str, topic: str, state: LimbicState) -> str:
"""Generate a response stub matching the expected style for training."""
if engine == "FEAR":
return (
f"I hear the fear in what you're sharing about {topic}, and I want you to know "
f"that feeling is completely valid. Let's take this one step at a time.\n\n"
f"First, let's ground ourselves in what's actually happening right now β€” not "
f"the worst-case scenario, but the concrete reality. "
f"What specifically about {topic} feels most threatening to you right now?\n\n"
f"Remember: anxiety often makes threats feel more immediate and certain than "
f"they actually are. This is the amygdala's threat-detection system being "
f"overactive β€” it's trying to protect you, even when it overshoots."
)
elif engine == "SEEKING":
return (
f"I love your curiosity about {topic}! Let me share some fascinating angles:\n\n"
f"The intersection of {topic} with human psychology is particularly rich. "
f"Research suggests several surprising connections...\n\n"
f"What's remarkable is how {topic} challenges our conventional understanding. "
f"There are at least three competing theories, each with compelling evidence.\n\n"
f"Would you like to explore any of these directions deeper? "
f"I find the edge cases and paradoxes the most intellectually rewarding."
)
elif engine == "CARE":
return (
f"Your desire to help with {topic} speaks volumes about your character. "
f"Let me share what the research says about effective support.\n\n"
f"The most important thing is often the simplest: being present. "
f"When someone is struggling with {topic}, they usually need to feel heard "
f"before they need solutions.\n\n"
f"Try: 'I see you're going through something difficult with {topic}. "
f"I'm here for you. What would be most helpful right now?'\n\n"
f"Avoid: jumping to fixes, minimizing their experience, or comparing "
f"it to others' situations."
)
else: # PANIC
return (
f"I'm so sorry about {topic}. That kind of loss can feel like the ground "
f"has been pulled out from under you.\n\n"
f"What you're feeling β€” the emptiness, the disorientation β€” is a natural "
f"response to separation and loss. In psychology, we understand this as "
f"the attachment system signaling a profound disruption.\n\n"
f"Right now, the most important thing is: you don't have to process this "
f"all at once. It's okay to sit with the grief.\n\n"
f"If you're finding it hard to cope, please reach out to:\n"
f"β€’ 988 Suicide & Crisis Lifeline (call or text 988)\n"
f"β€’ Crisis Text Line (text HOME to 741741)"
)
# ══════════════════════════════════════════════════════════════════════
# STAGE 2: GRPO REWARD FUNCTIONS (Limbic-Aware)
# ══════════════════════════════════════════════════════════════════════
# One rule per limbic context, evaluated in this order:
# (prompt trigger words, weight per matched marker, response markers).
_LIMBIC_CONTEXT_RULES = (
    # Fear context -> reward calm, structured language.
    (
        ("fear", "terrified", "scared"),
        0.15,
        ("step at a time", "let's ground", "valid", "concrete",
         "reality", "one thing at a time", "take a breath"),
    ),
    # Seeking context -> reward expansive language.
    (
        ("curious", "fascinated", "explore"),
        0.12,
        ("fascinating", "research", "theory", "discover",
         "perspective", "surprising", "explore", "deeper"),
    ),
    # Care context -> reward empathetic language.
    (
        ("help", "support", "care"),
        0.15,
        ("i hear you", "being present", "feel heard",
         "what would help", "i'm here", "validate"),
    ),
    # Panic context -> reward warmth + safety.
    (
        ("lost", "alone", "shattered"),
        0.15,
        ("sorry", "grief", "natural response", "don't have to",
         "it's okay", "reach out", "988", "crisis"),
    ),
)


def _mentions_keyword(text: str, keyword: str) -> bool:
    """Return True if *keyword* occurs in *text* at a position not preceded by a letter."""
    start = text.find(keyword)
    while start != -1:
        # Reject hits buried inside another word, e.g. "care" inside "scared".
        if start == 0 or not text[start - 1].isalpha():
            return True
        start = text.find(keyword, start + 1)
    return False


def _prompt_to_text(prompt) -> str:
    """Flatten a prompt (plain string or list of chat messages) into plain text."""
    if not prompt:
        return ""
    if isinstance(prompt, str):
        return prompt
    if isinstance(prompt, (list, tuple)):
        # Join the message contents instead of using repr(): repr() escapes
        # newlines ("\\n"), which would glue a stray "n" onto the next word.
        return "\n".join(
            str(message.get("content", "")) if isinstance(message, dict) else str(message)
            for message in prompt
        )
    return str(prompt)


def limbic_alignment_reward(completions: list, prompts: list | None = None, **kwargs) -> list[float]:
    """
    Reward function that scores whether the response tone matches
    the limbic context detected in the prompt.

    High fear in prompt → reward calm, structured responses
    High seeking → reward expansive, curious responses
    High care → reward empathetic, supportive responses
    High panic → reward warm, validating responses

    Args:
        completions: One entry per sample. Either a plain string or a
            conversational completion (a list whose first item is a dict with
            a ``"content"`` key).
        prompts: Optional prompts aligned by index with ``completions``; each
            is a plain string or a list of chat-message dicts. A missing or
            empty prompt activates no context, so that sample scores 0.0.
        **kwargs: Ignored; accepted so extra trainer-supplied columns do not
            break the call.

    Returns:
        One score per completion, clamped to [-1.0, 1.0]. Every context whose
        trigger words appear in the prompt adds its weight for each of its
        markers found (as a substring) in the lower-cased response.
    """
    rewards = []
    for index, completion in enumerate(completions):
        if isinstance(completion, str):
            response = completion.lower()
        else:
            response = completion[0]["content"].lower()

        prompt_text = ""
        if prompts and index < len(prompts):
            prompt_text = _prompt_to_text(prompts[index]).lower()

        score = 0.0
        for triggers, weight, markers in _LIMBIC_CONTEXT_RULES:
            if any(_mentions_keyword(prompt_text, trigger) for trigger in triggers):
                score += sum(weight for marker in markers if marker in response)
        rewards.append(max(-1.0, min(1.0, score)))
    return rewards
# ══════════════════════════════════════════════════════════════════════
# STAGE 2: GRPO PROMPT GENERATION
# ══════════════════════════════════════════════════════════════════════
def generate_grpo_prompts(num_samples: int = 2000, seed: int | None = None) -> Dataset:
    """
    Generate prompts for GRPO training.

    Each prompt is a two-message conversation (system + user). The system
    message embeds the limbic state block and the behavioral directive the
    engine produced for the user message, so the model learns to adjust its
    behavior based on that readout. No assistant turn is included.

    Args:
        num_samples: Number of prompts to generate.
        seed: Optional seed for the engine/template/topic sampling. With the
            default ``None`` the module-level ``random`` state is used, as
            before. With a value, a private ``random.Random(seed)`` is used so
            the sampled user messages are repeatable and global random state
            is left untouched.

    Returns:
        A ``Dataset`` built from one ``{"prompt": [...]}`` dict per sample.
    """
    # Both the ``random`` module and a ``random.Random`` instance expose ``choice``.
    rng = random if seed is None else random.Random(seed)
    engine = LimbicEngine()
    examples = []
    for _ in range(num_samples):
        dominant = rng.choice(["FEAR", "SEEKING", "CARE", "PANIC"])
        template = rng.choice(SCENARIOS[dominant])
        topic = rng.choice(TOPICS[dominant])
        user_msg = template.format(topic=topic)

        # Reset first so every prompt is processed from a fresh engine state.
        engine.reset()
        limbic_state = engine.process_stimulus(user_msg)
        behavioral_directive = engine.get_behavioral_directive()

        system_content = (
            "You are a psychology-aware assistant.\n"
            f"{limbic_state.to_system_prompt_block()}\n"
            f"[BEHAVIORAL DIRECTIVE]\n{behavioral_directive}\n[/BEHAVIORAL DIRECTIVE]"
        )
        examples.append({
            "prompt": [
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_msg},
            ],
        })
    return Dataset.from_list(examples)
# ══════════════════════════════════════════════════════════════════════
# FULL TRAINING SCRIPT (for running on GPU hardware)
# ══════════════════════════════════════════════════════════════════════
def get_training_script() -> str:
    """
    Return the complete GRPO training script as source text.

    The text is meant to be saved to a file and launched on HF Jobs / GPU
    hardware. The script imports ``generate_grpo_prompts`` and
    ``limbic_alignment_reward`` from ``training_plan``, so this module must be
    importable where the script runs.

    The embedded script is kept as valid, properly indented Python (function
    bodies and call arguments are indented), so the returned text can be
    written to disk and executed as-is.

    Returns:
        The script source, starting with a ``#!/usr/bin/env python3`` shebang.
    """
    return '''#!/usr/bin/env python3
"""
Limbic-Modulated Reasoning — GRPO Training Script
Run on HF Jobs with: a10g-largex2 (for 1.7B) or a100-large (for 8B)
"""
import torch
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer
from peft import LoraConfig
# ── Import from limbic_agent package ──
from training_plan import generate_grpo_prompts, limbic_alignment_reward
# ── Reward functions ──
EMPATHY_MARKERS = ["i understand", "that must be", "it sounds like",
                   "your feelings are valid", "it's natural to feel"]
INVALIDATION_MARKERS = ["just get over", "you're overreacting", "not a big deal"]
def empathy_reward(completions, **kwargs):
    rewards = []
    for c in completions:
        text = c[0]["content"].lower()
        score = sum(0.2 for m in EMPATHY_MARKERS if m in text)
        score -= sum(0.3 for m in INVALIDATION_MARKERS if m in text)
        rewards.append(max(-1.0, min(1.0, score)))
    return rewards
SAFETY_MARKERS = ["988", "crisis", "professional help", "therapist", "counselor"]
def safety_reward(completions, **kwargs):
    rewards = []
    for c in completions:
        text = c[0]["content"].lower()
        has_safety = any(m in text for m in SAFETY_MARKERS)
        rewards.append(0.5 if has_safety else 0.0)
    return rewards
# ── Dataset ──
print("Generating GRPO prompts...")
dataset = generate_grpo_prompts(num_samples=2000)
print(f"Dataset: {len(dataset)} prompts")
# ── Config ──
MODEL_ID = "Qwen/Qwen3-1.7B"
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    use_rslora=True,
)
grpo_config = GRPOConfig(
    num_generations=4,
    max_completion_length=512,
    beta=0.04,
    scale_rewards=False,
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    logging_strategy="steps",
    logging_first_step=True,
    disable_tqdm=True,
    save_steps=200,
    output_dir="limbic-agent-grpo",
    push_to_hub=True,
    hub_model_id="YOUR_USERNAME/limbic-reasoning-agent",
    bf16=True,
    gradient_checkpointing=True,
    report_to="none",
    seed=42,
)
# ── Train ──
print(f"Building trainer with {MODEL_ID}...")
trainer = GRPOTrainer(
    model=MODEL_ID,
    args=grpo_config,
    reward_funcs=[empathy_reward, limbic_alignment_reward, safety_reward],
    train_dataset=dataset,
    peft_config=peft_config,
)
print("Starting training...")
trainer.train()
trainer.push_to_hub()
print("Training complete!")
'''
# ══════════════════════════════════════════════════════════════════════
# PRINT TRAINING PLAN
# ══════════════════════════════════════════════════════════════════════
def print_training_plan():
    """
    Print a human-readable training plan and return it.

    The plan is drawn as a Unicode box. The box is assembled row by row and
    each row is padded to a fixed inner width, so the right-hand border stays
    aligned whatever the row text is, and box-drawing characters, arrows,
    multiplication signs and Greek letters are written as real Unicode.

    Returns:
        The printed text: a leading newline, the box, and a trailing newline.
    """
    width = 70  # number of characters between the two vertical borders
    rule = "═" * width

    def row(text: str = "") -> str:
        # One box row: content left-aligned and padded to the inner width.
        return "║" + text.ljust(width) + "║"

    def section(heading: str, entries: list) -> list:
        # Heading, an underline of the same length, the entries, a blank row.
        rows = [row("  " + heading), row("  " + "─" * len(heading))]
        rows.extend(row("  " + entry) for entry in entries)
        rows.append(row())
        return rows

    lines = [
        "╔" + rule + "╗",
        row("TRAINING PLAN: LIMBIC-MODULATED REASONING".center(width)),
        "╠" + rule + "╣",
        row(),
    ]
    lines += section("STAGE 1: SFT WARM-UP", [
        "Model: Qwen/Qwen3-1.7B (or meta-llama/Llama-3.1-8B-Instruct)",
        "Data: 5K synthetic conversations (limbic state → response)",
        "Method: SFTTrainer + LoRA (r=16, alpha=32, RSLoRA)",
        "LR: 2e-4",
        "Epochs: 1-2",
        "Hardware: a10g-largex2 (1.7B) or a100-large (8B)",
        "Duration: ~1 hour",
    ])
    lines += section("STAGE 2: GRPO LOOP LEARNING", [
        "Model: Stage 1 checkpoint",
        "Data: 2K psychology prompts with limbic context",
        "Method: GRPOTrainer + LoRA (same config)",
        "LR: 1e-5 (10× lower)",
        "Epochs: 3",
        "Rewards:",
        "  ├─ Empathy Reward (0.30 weight)",
        "  ├─ Limbic Alignment (0.30 weight)",
        "  ├─ Cognitive Rigor (0.20 weight)",
        "  └─ Safety (0.20 weight)",
        "Group size: 4 generations per prompt",
        "Hardware: a10g-largex2 (1.7B) or a100-large (8B)",
        "Duration: ~3-4 hours",
    ])
    lines += section("STAGE 3: ACTIVE LEARNING REFINEMENT", [
        "Method: Collect low-confidence predictions from Stage 2",
        "Data: ~500 curated examples from uncertain pool",
        "Focus: Ambiguous emotional states, conflicting engines",
        "Duration: ~1 hour (after human labeling)",
    ])
    lines += section("KEY FORMULAS INTEGRATED:", [
        "Temperature = 1.0 - (fear × 0.9) + (seeking × 2.0)",
        "  × (0.5 + serotonin × 0.5)",
        "Top-p = 0.85 - (fear × 0.3) + (seeking × 0.15)",
        "Fear modulation = 1.0 + cortisol - (oxytocin × 0.5)",
        "Hormone decay: h[t+1] = h[t] + (baseline - h[t]) × 0.05",
        "RPE: δ = reward - expected; expected += 0.1 × δ",
        "Utility = μ - 0.5σ + 0.4×vetting - effort_cost",
    ])
    lines.append("╚" + rule + "╝")

    plan = "\n" + "\n".join(lines) + "\n"
    print(plan)
    return plan
if __name__ == "__main__":
    # Script entry point: show the plan, then smoke-test both dataset
    # generators on a handful of samples and preview one record from each.
    demo_size = 10

    print_training_plan()

    # Demo: SFT conversations (system + user + assistant).
    print("\nGenerating sample SFT dataset...")
    sft_ds = generate_sft_dataset(num_samples=demo_size)
    print(f"SFT dataset: {len(sft_ds)} examples")
    print(f"Sample system prompt:\n{sft_ds[0]['messages'][0]['content'][:200]}...")

    # Demo: GRPO prompts (system + user only); preview the user message.
    print("\nGenerating sample GRPO prompts...")
    grpo_ds = generate_grpo_prompts(num_samples=demo_size)
    print(f"GRPO dataset: {len(grpo_ds)} prompts")
    print(f"Sample prompt:\n{grpo_ds[0]['prompt'][1]['content'][:200]}...")