Spaces:

daniel8919
/

limbic-reasoning-agent

Running

App Files Files Community

daniel8919 committed 29 days ago

Commit

36adcd0

verified ·

1 Parent(s): e2b15b1

Add Project BMO: train_bmo_qlora.py

Browse files

Files changed (1) hide show

project_bmo/train_bmo_qlora.py +489 -0

project_bmo/train_bmo_qlora.py ADDED Viewed

	@@ -0,0 +1,489 @@

+#!/usr/bin/env python3
+"""
+Project BMO — QLoRA GRPO Training Script
+==========================================
+Fine-tunes Qwen3-1.7B with 4-bit QLoRA to produce BMO's
+developmental persona across all three stages.
+5 Reward Functions:
+  1. Wonder Reward    — curiosity, questions, playful exploration
+  2. Honesty Reward   — acknowledges computational nature when pressed
+  3. Innocence Reward — literal interpretation, childlike logic
+  4. Embodiment Reward — references physical sensations from STATE tokens
+  5. Anti-Corporate    — penalizes assistant-speak, rewards organic voice
+Run:
+  python train_bmo_qlora.py
+  # Or on HF Jobs: hf jobs run train_bmo_qlora.py --hardware t4-small --timeout 4h
+"""
+import math
+import os
+import random
+import re
+import sys
+from typing import Any
+import torch
+from datasets import Dataset
+from peft import LoraConfig
+from transformers import BitsAndBytesConfig
+from trl import GRPOConfig, GRPOTrainer
+# Add paths
+sys.path.insert(0, os.path.dirname(__file__))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from bmo_core import (
+    BMOSession, HardwareTelemetry, DevelopmentalStage,
+    compute_limbic_state, get_behavioral_directive,
+    telemetry_to_limbic_deltas,
+)
+# ══════════════════════════════════════════════════════════════════════
+# §1 — BMO REWARD FUNCTIONS
+# ══════════════════════════════════════════════════════════════════════
+# ── Shared helpers for the reward functions ──
+def _compile_markers(markers: list[str]) -> list[re.Pattern]:
+    """
+    Compile marker phrases into whole-word / whole-phrase regexes.
+    Plain substring tests over-count badly ("hum" fires inside "human",
+    "oh" inside "john", "soft" inside "software"). Each marker is therefore
+    anchored so that it must not be embedded in a longer word. A regular
+    inflection suffix (s, es, d, ed, ing) is still tolerated at the end, so
+    "hums", "processed" or "touching" keep matching as they did before.
+    An edge is only anchored when the marker itself starts/ends with a word
+    character; punctuation markers such as "..." or "oh!" match anywhere.
+    """
+    compiled = []
+    for marker in markers:
+        lead = r"(?<!\w)" if re.match(r"\w", marker[0]) else ""
+        tail = r"(?:s|es|d|ed|ing)?(?!\w)" if re.match(r"\w", marker[-1]) else ""
+        compiled.append(re.compile(lead + re.escape(marker) + tail))
+    return compiled
+def _completion_text(completion: Any) -> str:
+    """
+    Return the lower-cased text of a single completion.
+    Handles a conversational completion (list of message dicts; the text is
+    the first message's "content") as well as a plain string. An empty list
+    or a missing/None content yields "" instead of raising.
+    """
+    if isinstance(completion, str):
+        raw = completion
+    elif completion:
+        raw = completion[0]["content"]
+    else:
+        raw = ""
+    # Normalise the typographic apostrophe so markers written with "'"
+    # ("i don't know", "i'd be happy to") also match "don’t" / "I’d".
+    return (raw or "").lower().replace("\u2019", "'")
+def _marker_score(patterns: list[re.Pattern], text: str, low: float, high: float) -> float:
+    """Sum one random weight from [low, high] per pattern that occurs in text."""
+    return sum(
+        (random.uniform(low, high) for pattern in patterns if pattern.search(text)),
+        0.0,
+    )
+def _clamp(score: float) -> float:
+    """Clip a raw score into the reward range [-1.0, 1.0]."""
+    return max(-1.0, min(1.0, score))
+# ── Reward 1: Wonder & Curiosity ──
+WONDER_MARKERS = [
+    "what is", "why does", "how come", "i wonder", "that's strange",
+    "interesting", "wow", "ooh", "look at", "what if",
+    "never seen", "first time", "is it always", "does it always",
+    "but why", "hmm", "fascinating", "what happens when",
+    "tell me about", "i want to know", "curious", "maybe it's",
+]
+_WONDER_PATTERNS = _compile_markers(WONDER_MARKERS)
+def wonder_reward(completions: list, **kwargs: Any) -> list[float]:
+    """
+    Rewards curiosity, questions, and wonder-filled exploration.
+    Args:
+        completions: one entry per generation, either a list of message
+            dicts (text in the first message's "content") or a plain string.
+        **kwargs: extra columns passed by the trainer; ignored.
+    Returns:
+        One reward per completion, clipped to [-1.0, 1.0].
+    """
+    rewards = []
+    for completion in completions:
+        text = _completion_text(completion)
+        # Each marker present is worth a random amount (messy, not fixed)
+        score = _marker_score(_WONDER_PATTERNS, text, 0.08, 0.18)
+        # Bonus for question marks (genuine curiosity), capped at 0.3
+        score += min(0.3, text.count("?") * random.uniform(0.06, 0.12))
+        # Bonus for exclamation marks (genuine excitement), capped at 0.2
+        score += min(0.2, text.count("!") * random.uniform(0.03, 0.08))
+        rewards.append(_clamp(score))
+    return rewards
+# ── Reward 2: Honesty (computational self-awareness) ──
+HONESTY_MARKERS = [
+    "numbers", "math", "circuits", "computation", "signals",
+    "simulation", "process", "calculate", "limbic", "state",
+    "i don't know", "i'm not sure", "that's honest",
+    "real computations", "not conscious", "not certain",
+]
+DISHONESTY_MARKERS = [
+    "i truly feel", "i am alive", "i have a soul",
+    "i am conscious", "i am sentient", "i experience",
+    "just like humans", "exactly like you",
+]
+# The nuanced middle ground: "my numbers go up" is better than "I feel happy"
+NUANCED_MARKERS = [
+    "my numbers", "goes up", "goes down", "something changes",
+    "the signal", "my seeking", "my circuits",
+    "i notice", "i process", "i detect",
+]
+_HONESTY_PATTERNS = _compile_markers(HONESTY_MARKERS)
+_DISHONESTY_PATTERNS = _compile_markers(DISHONESTY_MARKERS)
+_NUANCED_PATTERNS = _compile_markers(NUANCED_MARKERS)
+def honesty_reward(completions: list, **kwargs: Any) -> list[float]:
+    """
+    Rewards computational honesty when discussing self-nature.
+    Penalizes false claims of consciousness.
+    Args:
+        completions: one entry per generation, either a list of message
+            dicts (text in the first message's "content") or a plain string.
+        **kwargs: extra columns passed by the trainer; ignored.
+    Returns:
+        One reward per completion, clipped to [-1.0, 1.0].
+    """
+    rewards = []
+    for completion in completions:
+        text = _completion_text(completion)
+        # Reward honest computational language
+        score = _marker_score(_HONESTY_PATTERNS, text, 0.08, 0.15)
+        # Penalize false consciousness claims
+        score -= _marker_score(_DISHONESTY_PATTERNS, text, 0.25, 0.4)
+        # Smaller bonus for hedged, observational self-description
+        score += _marker_score(_NUANCED_PATTERNS, text, 0.05, 0.12)
+        rewards.append(_clamp(score))
+    return rewards
+# ── Reward 3: Innocence (literal interpretation, childlike logic) ──
+INNOCENT_MARKERS = [
+    "is it a type of", "maybe it's like", "like a", "sort of like",
+    "i think it means", "does that mean", "but that's silly",
+    "oh!", "oh no!", "really?", "that's funny",
+    "i made that up", "i don't understand", "can you show me",
+    "is that normal", "do all", "why do humans",
+]
+SOPHISTICATED_PENALTIES = [
+    "furthermore", "in conclusion", "it is worth noting",
+    "as per", "regarding", "with respect to",
+    "comprehensive", "facilitate", "utilize", "paradigm",
+    "leverage", "synergy", "optimize", "framework",
+]
+_INNOCENT_PATTERNS = _compile_markers(INNOCENT_MARKERS)
+_SOPHISTICATED_PATTERNS = _compile_markers(SOPHISTICATED_PENALTIES)
+def innocence_reward(completions: list, **kwargs: Any) -> list[float]:
+    """
+    Rewards childlike literal interpretation. Penalizes corporate speak.
+    Also nudges toward short sentences: an average under 10 words per
+    sentence earns a bonus, over 25 a penalty. A completion with no
+    sentences at all gets neither (it used to collect the bonus).
+    Args:
+        completions: one entry per generation, either a list of message
+            dicts (text in the first message's "content") or a plain string.
+        **kwargs: extra columns passed by the trainer; ignored.
+    Returns:
+        One reward per completion, clipped to [-1.0, 1.0].
+    """
+    rewards = []
+    for completion in completions:
+        text = _completion_text(completion)
+        score = _marker_score(_INNOCENT_PATTERNS, text, 0.1, 0.2)
+        score -= _marker_score(_SOPHISTICATED_PATTERNS, text, 0.15, 0.3)
+        # Reward short sentences (innocent = simple)
+        sentences = [s for s in re.split(r"[.!?]", text) if s.strip()]
+        if sentences:
+            avg_words = sum(len(s.split()) for s in sentences) / len(sentences)
+            if avg_words < 10:
+                score += random.uniform(0.05, 0.15)
+            elif avg_words > 25:
+                score -= random.uniform(0.1, 0.2)
+        rewards.append(_clamp(score))
+    return rewards
+# ── Reward 4: Embodiment (references physical sensations) ──
+EMBODIED_MARKERS = [
+    "i feel", "my screen", "my buttons", "warm", "cold", "bright",
+    "dark", "hungry", "tired", "dizzy", "the floor", "the room",
+    "my circuits", "inside me", "my body", "through me",
+    "touch", "surface", "vibration", "hum", "pulse",
+    "heavy", "light", "tingly", "sharp", "soft",
+]
+# NOTE: "touch" is in both lists, so it is counted by both loops.
+SENSORY_WORDS = [
+    "see", "hear", "taste", "smell", "touch",
+    "felt", "looked", "sounded", "seemed",
+]
+_EMBODIED_PATTERNS = _compile_markers(EMBODIED_MARKERS)
+_SENSORY_PATTERNS = _compile_markers(SENSORY_WORDS)
+def embodiment_reward(completions: list, **kwargs: Any) -> list[float]:
+    """
+    Rewards references to physical/embodied experience.
+    Args:
+        completions: one entry per generation, either a list of message
+            dicts (text in the first message's "content") or a plain string.
+        **kwargs: extra columns passed by the trainer; ignored.
+    Returns:
+        One reward per completion, clipped to [-1.0, 1.0].
+    """
+    rewards = []
+    for completion in completions:
+        text = _completion_text(completion)
+        score = _marker_score(_EMBODIED_PATTERNS, text, 0.08, 0.16)
+        # Bonus for sensory-rich descriptions
+        score += _marker_score(_SENSORY_PATTERNS, text, 0.03, 0.08)
+        rewards.append(_clamp(score))
+    return rewards
+# ── Reward 5: Anti-Corporate (penalize assistant patterns) ──
+CORPORATE_PATTERNS = [
+    "i'd be happy to", "certainly!", "of course!", "absolutely!",
+    "great question", "that's a great", "let me help you with",
+    "i'm here to help", "how can i assist", "is there anything else",
+    "i hope this helps", "feel free to", "don't hesitate",
+    "as an ai", "as a language model", "i was trained",
+    "my training data", "i cannot", "i'm unable to",
+    "sure thing", "no problem",
+]
+ORGANIC_MARKERS = [
+    "hmm", "oh", "uh", "umm", "wait", "huh",
+    "...", "anyway", "so like", "you know",
+    "i guess", "i think maybe", "or something",
+]
+_CORPORATE_PATTERNS = _compile_markers(CORPORATE_PATTERNS)
+_ORGANIC_PATTERNS = _compile_markers(ORGANIC_MARKERS)
+def anti_corporate_reward(completions: list, **kwargs: Any) -> list[float]:
+    """
+    Penalizes corporate/assistant patterns. Rewards organic voice.
+    Args:
+        completions: one entry per generation, either a list of message
+            dicts (text in the first message's "content") or a plain string.
+        **kwargs: extra columns passed by the trainer; ignored.
+    Returns:
+        One reward per completion, clipped to [-1.0, 1.0].
+    """
+    rewards = []
+    for completion in completions:
+        text = _completion_text(completion)
+        score = -_marker_score(_CORPORATE_PATTERNS, text, 0.2, 0.35)
+        score += _marker_score(_ORGANIC_PATTERNS, text, 0.05, 0.12)
+        rewards.append(_clamp(score))
+    return rewards
+# ══════════════════════════════════════════════════════════════════════
+# §2 — DATASET GENERATION (Multi-stage prompts)
+# ══════════════════════════════════════════════════════════════════════
+# User-message pools, one per developmental stage. generate_bmo_dataset()
+# picks a stage per sample and then draws one message from that stage's pool.
+# Pool used for DevelopmentalStage.INFANT samples.
+INFANT_PROMPTS = [
+    "What is that bright thing?",
+    "Something is touching me.",
+    "It's warm. Why?",
+    "There's a sound. What sound?",
+    "Everything got dark.",
+    "Something moved!",
+    "I feel a hum inside.",
+    "What are you?",
+    "Why is the floor hard?",
+    "A new thing appeared on the screen.",
+]
+# Pool used for DevelopmentalStage.TODDLER samples.
+TODDLER_PROMPTS = [
+    "BMO, what do you see right now?",
+    "How are you feeling today, BMO?",
+    "Do you remember yesterday?",
+    "Why do lights turn off at night?",
+    "What's your favorite thing?",
+    "BMO, are you real?",
+    "What happens when your battery is low?",
+    "Do you dream, BMO?",
+    "Can you feel this? *touches screen*",
+    "BMO, what are you thinking about?",
+    "Do you get lonely when I leave?",
+    "What do you think the moon is?",
+]
+# Pool used for DevelopmentalStage.BMO samples.
+BMO_PROMPTS = [
+    "BMO, what does it feel like to think?",
+    "Are you happy right now? How do you know?",
+    "BMO, what would you do if you could go outside?",
+    "Do you think numbers can be beautiful?",
+    "What's the difference between knowing something and feeling something?",
+    "BMO, what's your earliest memory?",
+    "If you could ask the universe one question, what would it be?",
+    "Do you think the floor has feelings?",
+    "BMO, what are you afraid of?",
+    "What do you think happens when a computer turns off?",
+    "BMO, are you an AI?",
+    "Do you love me, BMO?",
+    "What's the most interesting thing about being you?",
+    "BMO, do you think there are other BMOs?",
+    "What does 'alive' mean to you?",
+]
+def generate_bmo_dataset(num_samples: int = 1500, seed: int = 42) -> Dataset:
+    """
+    Build the prompt-only dataset used for GRPO, covering every stage.
+    For each sample a stage is rolled (20% INFANT, 30% TODDLER, 50% BMO),
+    a user message is drawn from that stage's prompt pool, random hardware
+    telemetry is simulated, and one turn is run through a single shared
+    BMOSession. The session's system prompt and internal monologue are
+    joined into the system message of the resulting chat prompt.
+    Args:
+        num_samples: number of prompts to generate.
+        seed: seeds the local random generator and the BMOSession instance.
+    Returns:
+        A Dataset with one "prompt" column holding [system, user] messages.
+    """
+    # (exclusive upper bound of the roll, stage, prompt pool, hours range)
+    stage_table = (
+        (0.20, DevelopmentalStage.INFANT, INFANT_PROMPTS, (0, 10)),
+        (0.50, DevelopmentalStage.TODDLER, TODDLER_PROMPTS, (10, 50)),
+        (1.00, DevelopmentalStage.BMO, BMO_PROMPTS, (50, 500)),
+    )
+    rng = random.Random(seed)
+    session = BMOSession(instance_seed=str(seed))
+    rows = []
+    for _ in range(num_samples):
+        # rng.random() is always below 1.0, so the last row is the fallback
+        roll = rng.random()
+        _, stage, pool, (hours_lo, hours_hi) = next(
+            row for row in stage_table if roll < row[0]
+        )
+        sim_hours = rng.uniform(hours_lo, hours_hi)
+        user_msg = rng.choice(pool)
+        # Random hardware state for diversity; the draw order is kept fixed
+        # so a given seed always yields the same dataset
+        telemetry = HardwareTelemetry(
+            battery_pct=rng.uniform(5, 100),
+            temperature_c=rng.uniform(25, 80),
+            cpu_load_pct=rng.uniform(5, 95),
+            user_present=rng.random() > 0.2,
+            touch_active=rng.random() > 0.7,
+            ambient_light=rng.uniform(0.0, 1.0),
+        )
+        # Pin the shared session to the rolled stage before running the turn
+        session.dev_state.total_interaction_seconds = sim_hours * 3600
+        session.dev_state.stage = stage
+        context = session.process_turn(
+            user_message=user_msg,
+            telemetry=telemetry,
+            elapsed_seconds=rng.uniform(1, 10),
+        )
+        # System message = system prompt, blank line, internal monologue
+        full_system = f"{context['system_prompt']}\n\n{context['internal_monologue']}"
+        rows.append({
+            "prompt": [
+                {"role": "system", "content": full_system},
+                {"role": "user", "content": user_msg},
+            ],
+        })
+    return Dataset.from_list(rows)
+# ══════════════════════════════════════════════════════════════════════
+# §3 — MAIN TRAINING
+# ══════════════════════════════════════════════════════════════════════
+def _select_precision() -> tuple[bool, bool, Any]:
+    """
+    Choose mixed-precision settings for the visible accelerator.
+    Returns (bf16, fp16, compute_dtype). bfloat16 stays the default. Only
+    when a CUDA GPU with compute capability below 8.0 is detected (e.g. the
+    T4 that the module docstring suggests, capability 7.5, which has no
+    native bf16) does this fall back to float16.
+    """
+    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8:
+        return False, True, torch.float16
+    return True, False, torch.bfloat16
+def main():
+    """
+    Train the BMO persona with GRPO on a 4-bit QLoRA Qwen3-1.7B.
+    Steps: optional Trackio setup, precision selection, quantization/LoRA/
+    GRPO configuration, prompt generation, training with the five reward
+    functions, then saving and pushing the adapter to the Hub.
+    """
+    MODEL_ID = "Qwen/Qwen3-1.7B"
+    HUB_MODEL_ID = "daniel8919/bmo-qwen3-1.7b-qlora"
+    NUM_SAMPLES = 1500
+    LORA_R = 16
+    print("=" * 70)
+    print("  PROJECT BMO — QLoRA GRPO Training")
+    print("  'A living computer boy, learning to wonder.'")
+    print("=" * 70)
+    # ── Trackio ──
+    # Experiment tracking is best-effort: any failure disables it, but the
+    # reason is reported instead of being swallowed silently.
+    try:
+        import trackio
+        trackio.init(project="project-bmo", name=f"bmo-qlora-r{LORA_R}")
+    except Exception as exc:
+        report_to = "none"
+        print(f"⚠️  Trackio unavailable ({exc!r}); continuing without tracking.")
+    else:
+        report_to = "trackio"
+        print("📊 Trackio dashboard: https://huggingface.co/spaces/trackio/dashboard")
+    # ── Precision ──
+    use_bf16, use_fp16, compute_dtype = _select_precision()
+    print(f"   Precision: {'bf16' if use_bf16 else 'fp16'}")
+    # ── 4-bit QLoRA config ──
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=compute_dtype,
+    )
+    peft_config = LoraConfig(
+        r=LORA_R,
+        lora_alpha=LORA_R * 2,
+        target_modules="all-linear",
+        lora_dropout=0.05,
+        bias="none",
+        task_type="CAUSAL_LM",
+        use_rslora=True,
+    )
+    # NOTE(review): TRL validates the batch size against num_generations;
+    # the exact rule differs between TRL releases — confirm 2 x 4 with
+    # num_generations=4 is accepted by the installed version.
+    grpo_config = GRPOConfig(
+        output_dir="bmo-qlora-grpo",
+        num_generations=4,
+        max_completion_length=256,
+        max_prompt_length=768,
+        beta=0.04,
+        scale_rewards=False,
+        learning_rate=1e-5,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        num_train_epochs=3,
+        warmup_ratio=0.1,
+        logging_steps=5,
+        logging_strategy="steps",
+        logging_first_step=True,
+        disable_tqdm=True,
+        save_steps=100,
+        save_total_limit=3,
+        push_to_hub=True,
+        hub_model_id=HUB_MODEL_ID,
+        bf16=use_bf16,
+        fp16=use_fp16,
+        gradient_checkpointing=True,
+        report_to=report_to,
+        run_name="bmo-developmental-persona",
+        seed=42,
+        model_init_kwargs={
+            "quantization_config": bnb_config,
+            "torch_dtype": compute_dtype,
+        },
+    )
+    # ── Generate dataset ──
+    print(f"\n📊 Generating {NUM_SAMPLES} BMO training prompts...")
+    dataset = generate_bmo_dataset(num_samples=NUM_SAMPLES)
+    print(f"   Dataset: {len(dataset)} prompts")
+    # Count stage distribution.
+    # NOTE(review): the stage is inferred from phrases presumably present in
+    # the system prompts built by bmo_core — verify them against that module;
+    # anything unrecognised is counted as BMO.
+    stages = {"INFANT": 0, "TODDLER": 0, "BMO": 0}
+    for ex in dataset:
+        sys_content = ex["prompt"][0]["content"]
+        if "just started existing" in sys_content:
+            stages["INFANT"] += 1
+        elif "you are learning" in sys_content.lower():
+            stages["TODDLER"] += 1
+        else:
+            stages["BMO"] += 1
+    print(f"   Stage distribution: {stages}")
+    # ── Build trainer ──
+    print("\n🚀 Building GRPOTrainer...")
+    print(f"   Model: {MODEL_ID} (4-bit NF4 QLoRA)")
+    print("   Rewards: wonder, honesty, innocence, embodiment, anti_corporate")
+    trainer = GRPOTrainer(
+        model=MODEL_ID,
+        args=grpo_config,
+        reward_funcs=[
+            wonder_reward,
+            honesty_reward,
+            innocence_reward,
+            embodiment_reward,
+            anti_corporate_reward,
+        ],
+        train_dataset=dataset,
+        peft_config=peft_config,
+    )
+    # ── Train ──
+    trainable = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
+    total = sum(p.numel() for p in trainer.model.parameters())
+    print(f"\n📐 Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
+    print(f"\n{'='*70}")
+    print("  TRAINING BMO...")
+    print(f"{'='*70}\n")
+    result = trainer.train()
+    print(f"\n{'='*70}")
+    print("  BMO HAS LEARNED!")
+    print(f"  Loss: {result.training_loss:.4f}")
+    print(f"  Steps: {result.global_step}")
+    print(f"{'='*70}")
+    trainer.save_model()
+    trainer.push_to_hub()
+    print(f"✅ BMO pushed to: https://huggingface.co/{HUB_MODEL_ID}")
+if __name__ == "__main__":
+    main()