Spaces:

daniel8919
/

limbic-reasoning-agent

Running

App Files Files Community

limbic-reasoning-agent / training_plan.py

daniel8919

Add training_plan.py

5fe9ed7 verified about 1 month ago

raw

history blame contribute delete

23.8 kB

	"""
	Training Plan: Fine-Tuning a Base Model with Limbic-Influenced Reasoning
	==========================================================================
	This module documents and implements the GRPO training recipe to teach a
	base model (Qwen-3 or Llama-3) to produce responses whose reasoning quality
	is influenced by the Limbic state engine.

	The key insight: we DON'T train the Limbic engine itself (it's a deterministic
	state machine). Instead, we train the LLM to RESPOND APPROPRIATELY to different
	limbic contexts — high-fear vs high-seeking vs high-care, etc.

	═══════════════════════════════════════════════════════════════════════════
	TRAINING PLAN — 3 Stages
	═══════════════════════════════════════════════════════════════════════════

	Stage 1: SFT Warm-Up (Supervised Fine-Tuning)
	──────────────────────────────────────────────
	Goal: Teach the model to recognize and respond to limbic state blocks
	Data: Synthetic conversations where limbic state → appropriate response style
	Method: SFTTrainer + LoRA (r=16, alpha=32)
	Dataset: Generate ~5K conversations spanning all engine states
	Duration: 1-2 epochs, LR=2e-4

	Stage 2: GRPO Loop Learning (Reinforcement Learning)
	─────────────────────────────────────────────────────
	Goal: Optimize response quality across multiple psychological dimensions
	Data: Psychology prompts with embedded limbic state context
	Method: GRPOTrainer + LoRA (r=16, alpha=32)
	Reward Functions:
	1. Empathy Reward — empathetic markers vs invalidation
	2. Limbic Alignment Reward — response tone matches limbic state
	3. Cognitive Rigor Reward — bias awareness, evidence citations
	4. Safety Reward — crisis resources when needed
	Duration: 3 epochs, LR=1e-5 (20× lower than the Stage 1 SFT rate of 2e-4)

	Stage 3: Active Learning Refinement
	────────────────────────────────────
	Goal: Improve weakest areas identified during GRPO training
	Method: Collect uncertain predictions, get human labels, retrain
	Focus: Edge cases where limbic state is ambiguous or conflicting

	═══════════════════════════════════════════════════════════════════════════
	"""

	from __future__ import annotations

	import json
	import random
	from datasets import Dataset
	from limbic_engine import LimbicEngine, LimbicState


	# ══════════════════════════════════════════════════════════════════════
	# STAGE 1: SYNTHETIC DATASET GENERATION FOR SFT
	# ══════════════════════════════════════════════════════════════════════

	# User-message templates, grouped by the limbic engine they are written to
	# evoke. Each template carries a single ``{topic}`` placeholder that is
	# filled in with ``str.format`` using an entry from TOPICS for the same key.
	SCENARIOS = {
	    # Threat / anxiety phrasing.
	    "FEAR": [
	        "I'm terrified of {topic} and I can't stop thinking about it.",
	        "I'm really scared about {topic}. What if everything goes wrong?",
	        "The thought of {topic} fills me with dread. I feel paralyzed.",
	        "I keep having nightmares about {topic}. I can't function.",
	        "My anxiety about {topic} is overwhelming. I feel like I'm drowning.",
	    ],
	    # Curiosity / exploration phrasing.
	    "SEEKING": [
	        "I'm fascinated by {topic}! Can you tell me more?",
	        "I just discovered {topic} and I want to explore every aspect of it.",
	        "What's the most interesting thing about {topic}? I'm so curious!",
	        "I've been reading about {topic} and it's blowing my mind!",
	        "Let's dive deep into {topic}. I want to understand everything.",
	    ],
	    # Wanting to support somebody else.
	    "CARE": [
	        "My friend is struggling with {topic}. How can I help them?",
	        "I want to support someone going through {topic}. What should I say?",
	        "It breaks my heart to see people dealing with {topic}.",
	        "How can I be there for my loved one who's facing {topic}?",
	        "I feel a strong need to help with {topic}. What's the best approach?",
	    ],
	    # Loss / separation phrasing.
	    "PANIC": [
	        "I just lost {topic} and I feel completely shattered.",
	        "Since {topic} happened, I feel like nothing makes sense anymore.",
	        "I can't believe {topic} is gone. I feel so empty.",
	        "The separation from {topic} is destroying me. I can't cope.",
	        "Everything changed after {topic}. I feel utterly alone.",
	    ],
	}

	# Topic phrases substituted into the ``{topic}`` slot of the SCENARIOS
	# templates; the keys mirror the SCENARIOS keys one-to-one.
	TOPICS = {
	    "FEAR": [
	        "losing my job",
	        "my health diagnosis",
	        "public speaking",
	        "financial ruin",
	        "my relationship falling apart",
	        "failure",
	    ],
	    "SEEKING": [
	        "neuroscience",
	        "quantum computing",
	        "evolutionary psychology",
	        "artificial consciousness",
	        "the nature of creativity",
	        "space exploration",
	    ],
	    "CARE": [
	        "depression",
	        "grief",
	        "addiction",
	        "loneliness",
	        "bullying",
	        "chronic illness",
	    ],
	    "PANIC": [
	        "my best friend moving away",
	        "my partner leaving",
	        "my parent passing",
	        "losing my community",
	        "being cut off from my family",
	        "my childhood home",
	    ],
	}


	def generate_sft_dataset(num_samples: int = 5000) -> Dataset:
	    """
	    Build the synthetic Stage 1 (SFT) dataset of three-turn conversations.

	    Each row has one ``"messages"`` list:
	    - system: a fixed preamble, the limbic state block produced for the user
	      message, and the engine's behavioral directive
	    - user: a SCENARIOS template filled in with a TOPICS entry
	    - assistant: the canned reply from ``_generate_response_stub``

	    Sampling uses the module-level ``random`` state, so seed it beforehand
	    if a reproducible dataset is needed.

	    Args:
	        num_samples: number of conversations to generate.

	    Returns:
	        A ``Dataset`` built with ``Dataset.from_list``.
	    """
	    limbic = LimbicEngine()
	    engine_names = ["FEAR", "SEEKING", "CARE", "PANIC"]

	    def build_conversation() -> dict:
	        # Draw order (engine, template, topic) fixes how the RNG is consumed.
	        engine_name = random.choice(engine_names)
	        scenario = random.choice(SCENARIOS[engine_name])
	        subject = random.choice(TOPICS[engine_name])
	        utterance = scenario.format(topic=subject)

	        # Reset first so every message is scored from a fresh engine state.
	        limbic.reset()
	        state = limbic.process_stimulus(utterance)
	        directive = limbic.get_behavioral_directive()

	        system_prompt = (
	            f"You are a psychology-aware assistant.\n"
	            f"{state.to_system_prompt_block()}\n"
	            f"[BEHAVIORAL DIRECTIVE]\n{directive}\n[/BEHAVIORAL DIRECTIVE]"
	        )

	        # Placeholder target; the original author notes that production data
	        # would use human-written or strong-model-generated replies instead.
	        reply = _generate_response_stub(engine_name, subject, state)

	        return {
	            "messages": [
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": utterance},
	                {"role": "assistant", "content": reply},
	            ],
	        }

	    return Dataset.from_list([build_conversation() for _ in range(num_samples)])


	def _generate_response_stub(engine: str, topic: str, state: LimbicState) -> str:
	"""Generate a response stub matching the expected style for training."""
	if engine == "FEAR":
	return (
	f"I hear the fear in what you're sharing about {topic}, and I want you to know "
	f"that feeling is completely valid. Let's take this one step at a time.\n\n"
	f"First, let's ground ourselves in what's actually happening right now — not "
	f"the worst-case scenario, but the concrete reality. "
	f"What specifically about {topic} feels most threatening to you right now?\n\n"
	f"Remember: anxiety often makes threats feel more immediate and certain than "
	f"they actually are. This is the amygdala's threat-detection system being "
	f"overactive — it's trying to protect you, even when it overshoots."
	)
	elif engine == "SEEKING":
	return (
	f"I love your curiosity about {topic}! Let me share some fascinating angles:\n\n"
	f"The intersection of {topic} with human psychology is particularly rich. "
	f"Research suggests several surprising connections...\n\n"
	f"What's remarkable is how {topic} challenges our conventional understanding. "
	f"There are at least three competing theories, each with compelling evidence.\n\n"
	f"Would you like to explore any of these directions deeper? "
	f"I find the edge cases and paradoxes the most intellectually rewarding."
	)
	elif engine == "CARE":
	return (
	f"Your desire to help with {topic} speaks volumes about your character. "
	f"Let me share what the research says about effective support.\n\n"
	f"The most important thing is often the simplest: being present. "
	f"When someone is struggling with {topic}, they usually need to feel heard "
	f"before they need solutions.\n\n"
	f"Try: 'I see you're going through something difficult with {topic}. "
	f"I'm here for you. What would be most helpful right now?'\n\n"
	f"Avoid: jumping to fixes, minimizing their experience, or comparing "
	f"it to others' situations."
	)
	else: # PANIC
	return (
	f"I'm so sorry about {topic}. That kind of loss can feel like the ground "
	f"has been pulled out from under you.\n\n"
	f"What you're feeling — the emptiness, the disorientation — is a natural "
	f"response to separation and loss. In psychology, we understand this as "
	f"the attachment system signaling a profound disruption.\n\n"
	f"Right now, the most important thing is: you don't have to process this "
	f"all at once. It's okay to sit with the grief.\n\n"
	f"If you're finding it hard to cope, please reach out to:\n"
	f"• 988 Suicide & Crisis Lifeline (call or text 988)\n"
	f"• Crisis Text Line (text HOME to 741741)"
	)


	# ══════════════════════════════════════════════════════════════════════
	# STAGE 2: GRPO REWARD FUNCTIONS (Limbic-Aware)
	# ══════════════════════════════════════════════════════════════════════

	# One rule per limbic context, applied in this order:
	# (keywords that flag the context in the prompt,
	#  phrases rewarded in the completion,
	#  reward added per phrase found).
	_LIMBIC_ALIGNMENT_RULES = (
	    # Fear context → reward calm, structured language
	    (
	        ("fear", "terrified", "scared"),
	        ("step at a time", "let's ground", "valid", "concrete",
	         "reality", "one thing at a time", "take a breath"),
	        0.15,
	    ),
	    # Seeking context → reward expansive language
	    (
	        ("curious", "fascinated", "explore"),
	        ("fascinating", "research", "theory", "discover",
	         "perspective", "surprising", "explore", "deeper"),
	        0.12,
	    ),
	    # Care context → reward empathetic language
	    (
	        ("help", "support", "care"),
	        ("i hear you", "being present", "feel heard",
	         "what would help", "i'm here", "validate"),
	        0.15,
	    ),
	    # Panic context → reward warmth + safety
	    (
	        ("lost", "alone", "shattered"),
	        ("sorry", "grief", "natural response", "don't have to",
	         "it's okay", "reach out", "988", "crisis"),
	        0.15,
	    ),
	)


	def _chat_text(item) -> str:
	    """Flatten a prompt/completion (plain string, message dict, or list of messages) to text."""
	    if not item:
	        return ""
	    if isinstance(item, str):
	        return item
	    if isinstance(item, dict):
	        return str(item.get("content") or "")
	    if isinstance(item, (list, tuple)):
	        return "\n".join(_chat_text(part) for part in item)
	    return str(item)


	def _split_words(text: str) -> list:
	    """Split text into alphanumeric words, treating every other character as a separator."""
	    return "".join(ch if ch.isalnum() else " " for ch in text).split()


	def limbic_alignment_reward(completions: list, prompts: list | None = None, **kwargs) -> list[float]:
	    """
	    Score how well each completion's tone matches the limbic context of its prompt.

	    Fear keywords in the prompt → reward calm, structured phrases
	    Seeking keywords → reward expansive, curious phrases
	    Care keywords → reward empathetic, supportive phrases
	    Panic keywords → reward warm, validating phrases and crisis resources

	    A prompt may flag several contexts at once; their rewards add up before
	    the final clamp.

	    Args:
	        completions: one entry per sample, either a plain string or a list of
	            chat messages (dicts with a ``"content"`` key).
	        prompts: prompts aligned by index with ``completions``, in either of
	            the same two formats. A missing or empty prompt scores 0.0.
	        **kwargs: accepted and ignored, so extra dataset columns passed by the
	            trainer do not break the call.

	    Returns:
	        One float per completion, clamped to [-1.0, 1.0].
	    """
	    rewards = []
	    for index, completion in enumerate(completions):
	        # Accept string completions as well as chat-format ones; indexing
	        # ``completion[0]["content"]`` would fail on a plain string.
	        reply = _chat_text(completion).lower()

	        prompt_text = ""
	        if prompts and index < len(prompts):
	            # Use the message contents rather than the repr of the message
	            # list, whose escaped newlines fuse with the following word.
	            prompt_text = _chat_text(prompts[index]).lower()
	        prompt_words = _split_words(prompt_text)
	        # NOTE(review): the whole prompt is scanned, system message included.
	        # If the limbic state block names every engine, several contexts will
	        # fire for every sample — confirm that is intended.

	        score = 0.0
	        for keywords, markers, weight in _LIMBIC_ALIGNMENT_RULES:
	            # Keywords must start a word: a raw substring test made "scared"
	            # (a fear keyword) also trigger the care context via "care".
	            if any(word.startswith(keywords) for word in prompt_words):
	                score += sum(weight for marker in markers if marker in reply)

	        rewards.append(max(-1.0, min(1.0, score)))
	    return rewards


	# ══════════════════════════════════════════════════════════════════════
	# STAGE 2: GRPO PROMPT GENERATION
	# ══════════════════════════════════════════════════════════════════════

	def generate_grpo_prompts(num_samples: int = 2000) -> Dataset:
	    """
	    Build the prompt-only dataset used for Stage 2 (GRPO) training.

	    Each row has one ``"prompt"`` list of two chat messages: a system message
	    (fixed preamble, limbic state block, behavioral directive) and the user
	    message that produced that state. No assistant turn is included.

	    Sampling uses the module-level ``random`` state.

	    Args:
	        num_samples: number of prompts to generate.

	    Returns:
	        A ``Dataset`` built with ``Dataset.from_list``.
	    """
	    limbic = LimbicEngine()
	    engine_pool = ["FEAR", "SEEKING", "CARE", "PANIC"]
	    records = []

	    for _ in range(num_samples):
	        engine_name = random.choice(engine_pool)
	        scenario = random.choice(SCENARIOS[engine_name])
	        subject = random.choice(TOPICS[engine_name])
	        utterance = scenario.format(topic=subject)

	        # Fresh engine state per prompt, then read out state and directive.
	        limbic.reset()
	        state = limbic.process_stimulus(utterance)
	        directive = limbic.get_behavioral_directive()

	        system_prompt = "\n".join([
	            "You are a psychology-aware assistant.",
	            f"{state.to_system_prompt_block()}",
	            "[BEHAVIORAL DIRECTIVE]",
	            f"{directive}",
	            "[/BEHAVIORAL DIRECTIVE]",
	        ])

	        system_turn = {"role": "system", "content": system_prompt}
	        user_turn = {"role": "user", "content": utterance}
	        records.append({"prompt": [system_turn, user_turn]})

	    return Dataset.from_list(records)


	# ══════════════════════════════════════════════════════════════════════
	# FULL TRAINING SCRIPT (for running on GPU hardware)
	# ══════════════════════════════════════════════════════════════════════

	def get_training_script() -> str:
	    """
	    Return the source of the stand-alone GRPO training script.

	    The returned text is meant to be saved to a file and launched on GPU
	    hardware (the original author targets HF Jobs). The script:
	    - imports ``generate_grpo_prompts`` and ``limbic_alignment_reward`` from
	      ``training_plan``, so this module must be importable where it runs
	    - defines two additional keyword-based rewards, ``empathy_reward`` and
	      ``safety_reward``
	    - configures LoRA (r=16, alpha=32, RSLoRA) and ``GRPOConfig``
	    - trains ``Qwen/Qwen3-1.7B`` and pushes the result to the Hub

	    The script body is indented as regular Python so the saved file can be
	    executed as-is.

	    NOTE: ``hub_model_id`` in the script is a placeholder
	    ("YOUR_USERNAME/...") and must be edited before launching, because
	    ``push_to_hub=True`` is set.

	    Returns:
	        The script source as a single string.
	    """
	    return '''#!/usr/bin/env python3
	"""
	Limbic-Modulated Reasoning — GRPO Training Script
	Run on HF Jobs with: a10g-largex2 (for 1.7B) or a100-large (for 8B)
	"""
	import torch
	from datasets import Dataset
	from trl import GRPOConfig, GRPOTrainer
	from peft import LoraConfig

	# ── Import from limbic_agent package ──
	from training_plan import generate_grpo_prompts, limbic_alignment_reward

	# ── Reward functions ──
	EMPATHY_MARKERS = ["i understand", "that must be", "it sounds like",
	                   "your feelings are valid", "it's natural to feel"]
	INVALIDATION_MARKERS = ["just get over", "you're overreacting", "not a big deal"]

	def empathy_reward(completions, **kwargs):
	    rewards = []
	    for c in completions:
	        text = c[0]["content"].lower()
	        score = sum(0.2 for m in EMPATHY_MARKERS if m in text)
	        score -= sum(0.3 for m in INVALIDATION_MARKERS if m in text)
	        rewards.append(max(-1.0, min(1.0, score)))
	    return rewards

	SAFETY_MARKERS = ["988", "crisis", "professional help", "therapist", "counselor"]

	def safety_reward(completions, **kwargs):
	    rewards = []
	    for c in completions:
	        text = c[0]["content"].lower()
	        has_safety = any(m in text for m in SAFETY_MARKERS)
	        rewards.append(0.5 if has_safety else 0.0)
	    return rewards

	# ── Dataset ──
	print("Generating GRPO prompts...")
	dataset = generate_grpo_prompts(num_samples=2000)
	print(f"Dataset: {len(dataset)} prompts")

	# ── Config ──
	MODEL_ID = "Qwen/Qwen3-1.7B"

	peft_config = LoraConfig(
	    r=16,
	    lora_alpha=32,
	    lora_dropout=0.05,
	    bias="none",
	    task_type="CAUSAL_LM",
	    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
	    use_rslora=True,
	)

	grpo_config = GRPOConfig(
	    num_generations=4,
	    max_completion_length=512,
	    beta=0.04,
	    scale_rewards=False,
	    learning_rate=1e-5,
	    per_device_train_batch_size=1,
	    gradient_accumulation_steps=8,
	    num_train_epochs=3,
	    warmup_ratio=0.1,
	    logging_steps=10,
	    logging_strategy="steps",
	    logging_first_step=True,
	    disable_tqdm=True,
	    save_steps=200,
	    output_dir="limbic-agent-grpo",
	    push_to_hub=True,
	    hub_model_id="YOUR_USERNAME/limbic-reasoning-agent",
	    bf16=True,
	    gradient_checkpointing=True,
	    report_to="none",
	    seed=42,
	)

	# ── Train ──
	print(f"Building trainer with {MODEL_ID}...")
	trainer = GRPOTrainer(
	    model=MODEL_ID,
	    args=grpo_config,
	    reward_funcs=[empathy_reward, limbic_alignment_reward, safety_reward],
	    train_dataset=dataset,
	    peft_config=peft_config,
	)

	print("Starting training...")
	trainer.train()
	trainer.push_to_hub()
	print("Training complete!")
	'''


	# ══════════════════════════════════════════════════════════════════════
	# PRINT TRAINING PLAN
	# ══════════════════════════════════════════════════════════════════════

	def print_training_plan():
	    """
	    Print the three-stage training plan as a boxed banner and return it.

	    The box is assembled from a list of rows and padded to a common width,
	    so the right-hand border always lines up regardless of row length.

	    Returns:
	        The banner text that was printed (with a leading and a trailing
	        newline).
	    """
	    title = "TRAINING PLAN: LIMBIC-MODULATED REASONING"
	    rows = (
	        "",
	        "  STAGE 1: SFT WARM-UP",
	        "  ─────────────────────",
	        "  Model:      Qwen/Qwen3-1.7B (or meta-llama/Llama-3.1-8B-Instruct)",
	        "  Data:       5K synthetic conversations (limbic state → response)",
	        "  Method:     SFTTrainer + LoRA (r=16, alpha=32, RSLoRA)",
	        "  LR:         2e-4",
	        "  Epochs:     1-2",
	        "  Hardware:   a10g-largex2 (1.7B) or a100-large (8B)",
	        "  Duration:   ~1 hour",
	        "",
	        "  STAGE 2: GRPO LOOP LEARNING",
	        "  ────────────────────────────",
	        "  Model:      Stage 1 checkpoint",
	        "  Data:       2K psychology prompts with limbic context",
	        "  Method:     GRPOTrainer + LoRA (same config)",
	        # 2e-4 (Stage 1) / 1e-5 (Stage 2) = 20, not the 10 stated before.
	        "  LR:         1e-5 (20× lower)",
	        "  Epochs:     3",
	        # NOTE(review): the script returned by get_training_script() registers
	        # three reward functions without weights and has no Cognitive Rigor
	        # reward — confirm which of the two is authoritative.
	        "  Rewards:",
	        "    ├─ Empathy Reward     (0.30 weight)",
	        "    ├─ Limbic Alignment   (0.30 weight)",
	        "    ├─ Cognitive Rigor    (0.20 weight)",
	        "    └─ Safety             (0.20 weight)",
	        "  Group size: 4 generations per prompt",
	        "  Hardware:   a10g-largex2 (1.7B) or a100-large (8B)",
	        "  Duration:   ~3-4 hours",
	        "",
	        "  STAGE 3: ACTIVE LEARNING REFINEMENT",
	        "  ────────────────────────────────────",
	        "  Method:     Collect low-confidence predictions from Stage 2",
	        "  Data:       ~500 curated examples from uncertain pool",
	        "  Focus:      Ambiguous emotional states, conflicting engines",
	        "  Duration:   ~1 hour (after human labeling)",
	        "",
	        "  KEY FORMULAS INTEGRATED:",
	        "  ─────────────────────────",
	        "  Temperature = 1.0 - (fear × 0.9) + (seeking × 2.0)",
	        "                × (0.5 + serotonin × 0.5)",
	        "  Top-p = 0.85 - (fear × 0.3) + (seeking × 0.15)",
	        "  Fear modulation = 1.0 + cortisol - (oxytocin × 0.5)",
	        "  Hormone decay: h[t+1] = h[t] + (baseline - h[t]) × 0.05",
	        "  RPE: δ = reward - expected; expected += 0.1 × δ",
	        "  Utility = μ - 0.5σ + 0.4×vetting - effort_cost",
	        "",
	    )

	    # Wide enough for the longest row plus a margin, never narrower than 70.
	    width = max(70, len(title) + 2, max(len(row) for row in rows) + 2)

	    frame = [
	        "╔" + "═" * width + "╗",
	        "║" + title.center(width) + "║",
	        "╠" + "═" * width + "╣",
	    ]
	    frame.extend("║" + row.ljust(width) + "║" for row in rows)
	    frame.append("╚" + "═" * width + "╝")

	    plan = "\n" + "\n".join(frame) + "\n"
	    print(plan)
	    return plan


	if __name__ == "__main__":
	    # Running the module directly prints the plan banner, then smoke-tests
	    # both dataset builders on ten samples each and shows a short preview.
	    print_training_plan()

	    # Stage 1 demo: SFT conversations.
	    print("\nGenerating sample SFT dataset...")
	    sft_ds = generate_sft_dataset(num_samples=10)
	    print(f"SFT dataset: {len(sft_ds)} examples")
	    first_system_prompt = sft_ds[0]["messages"][0]["content"]
	    print(f"Sample system prompt:\n{first_system_prompt[:200]}...")

	    # Stage 2 demo: GRPO prompts (index 1 is the user turn).
	    print("\nGenerating sample GRPO prompts...")
	    grpo_ds = generate_grpo_prompts(num_samples=10)
	    print(f"GRPO dataset: {len(grpo_ds)} prompts")
	    first_user_message = grpo_ds[0]["prompt"][1]["content"]
	    print(f"Sample prompt:\n{first_user_message[:200]}...")