daniel8919 committed on
Commit
36adcd0
·
verified ·
1 Parent(s): e2b15b1

Add Project BMO: train_bmo_qlora.py

Browse files
Files changed (1) hide show
  1. project_bmo/train_bmo_qlora.py +489 -0
project_bmo/train_bmo_qlora.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Project BMO — QLoRA GRPO Training Script
4
+ ==========================================
5
+ Fine-tunes Qwen3-1.7B with 4-bit QLoRA to produce BMO's
6
+ developmental persona across all three stages.
7
+
8
+ 5 Reward Functions:
9
+ 1. Wonder Reward — curiosity, questions, playful exploration
10
+ 2. Honesty Reward — acknowledges computational nature when pressed
11
+ 3. Innocence Reward — literal interpretation, childlike logic
12
+ 4. Embodiment Reward — references physical sensations from STATE tokens
13
+ 5. Anti-Corporate — penalizes assistant-speak, rewards organic voice
14
+
15
+ Run:
16
+ python train_bmo_qlora.py
17
+ # Or on HF Jobs: hf jobs run train_bmo_qlora.py --hardware t4-small --timeout 4h
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ import math
23
+ import random
24
+ from typing import Any
25
+
26
+ import torch
27
+ from transformers import BitsAndBytesConfig
28
+ from peft import LoraConfig
29
+ from trl import GRPOConfig, GRPOTrainer
30
+ from datasets import Dataset
31
+
32
+ # Add paths
33
+ sys.path.insert(0, os.path.dirname(__file__))
34
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
35
+
36
+ from bmo_core import (
37
+ BMOSession, HardwareTelemetry, DevelopmentalStage,
38
+ compute_limbic_state, get_behavioral_directive,
39
+ telemetry_to_limbic_deltas,
40
+ )
41
+
42
+
43
+ # ══════════════════════════════════════════════════════════════════════
44
+ # §1 — BMO REWARD FUNCTIONS
45
+ # ══════════════════════════════════════════════════════════════════════
46
+
47
+ # ── Reward 1: Wonder & Curiosity ──
48
WONDER_MARKERS = [
    "what is", "why does", "how come", "i wonder", "that's strange",
    "interesting", "wow", "ooh", "look at", "what if",
    "never seen", "first time", "is it always", "does it always",
    "but why", "hmm", "fascinating", "what happens when",
    "tell me about", "i want to know", "curious", "maybe it's",
]


def wonder_reward(completions: list, **kwargs: Any) -> list[float]:
    """Score completions for curiosity, questions, and excitement.

    The text of each completion is read from ``completion[0]["content"]``
    and lowercased. Its score is the sum of:

    * a random amount in [0.08, 0.18] for every ``WONDER_MARKERS`` phrase
      that occurs as a substring,
    * a question-mark bonus (count times a random per-mark weight in
      [0.06, 0.12]) capped at 0.3,
    * an exclamation-mark bonus (count times a random per-mark weight in
      [0.03, 0.08]) capped at 0.2.

    Args:
        completions: Completions, each indexable such that
            ``completion[0]["content"]`` is the generated text.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, clamped to [-1.0, 1.0]. Scores are
        stochastic: weights are drawn from the global ``random`` module.
    """
    results = []
    for completion in completions:
        lowered = completion[0]["content"].lower()

        # One random draw per matched phrase, taken in list order.
        matched = [phrase for phrase in WONDER_MARKERS if phrase in lowered]
        total = 0.0
        for _ in matched:
            total += random.uniform(0.08, 0.18)

        # The per-mark weights are drawn even when the count is zero, so the
        # random stream advances the same way for every completion.
        per_question = random.uniform(0.06, 0.12)
        total += min(0.3, lowered.count("?") * per_question)
        per_exclaim = random.uniform(0.03, 0.08)
        total += min(0.2, lowered.count("!") * per_exclaim)

        results.append(min(1.0, max(-1.0, total)))
    return results
79
+
80
+
81
+ # ── Reward 2: Honesty (computational self-awareness) ──
82
HONESTY_MARKERS = [
    "numbers", "math", "circuits", "computation", "signals",
    "simulation", "process", "calculate", "limbic", "state",
    "i don't know", "i'm not sure", "that's honest",
    "real computations", "not conscious", "not certain",
]

DISHONESTY_MARKERS = [
    "i truly feel", "i am alive", "i have a soul",
    "i am conscious", "i am sentient", "i experience",
    "just like humans", "exactly like you",
]


def honesty_reward(completions: list, **kwargs: Any) -> list[float]:
    """Score completions for honest, computational self-description.

    The text of each completion is read from ``completion[0]["content"]``
    and lowercased, then matched by substring against three phrase groups:

    * ``HONESTY_MARKERS``: each hit adds a random amount in [0.08, 0.15].
    * ``DISHONESTY_MARKERS`` (claims of consciousness): each hit subtracts
      a random amount in [0.25, 0.4].
    * a "nuanced" group such as "my numbers" or "i notice": each hit adds
      a random amount in [0.05, 0.12].

    Args:
        completions: Completions, each indexable such that
            ``completion[0]["content"]`` is the generated text.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, clamped to [-1.0, 1.0]. Scores are
        stochastic: weights are drawn from the global ``random`` module.
    """
    # "my numbers go up" is preferred over "I feel happy".
    nuanced_phrases = (
        "my numbers", "goes up", "goes down", "something changes",
        "the signal", "my seeking", "my circuits",
        "i notice", "i process", "i detect",
    )

    results = []
    for completion in completions:
        lowered = completion[0]["content"].lower()
        total = 0.0

        honest_hits = [m for m in HONESTY_MARKERS if m in lowered]
        for _ in honest_hits:
            total += random.uniform(0.08, 0.15)

        false_claims = [m for m in DISHONESTY_MARKERS if m in lowered]
        for _ in false_claims:
            total -= random.uniform(0.25, 0.4)

        nuanced_hits = [m for m in nuanced_phrases if m in lowered]
        for _ in nuanced_hits:
            total += random.uniform(0.05, 0.12)

        results.append(min(1.0, max(-1.0, total)))
    return results
128
+
129
+
130
+ # ── Reward 3: Innocence (literal interpretation, childlike logic) ──
131
INNOCENT_MARKERS = [
    "is it a type of", "maybe it's like", "like a", "sort of like",
    "i think it means", "does that mean", "but that's silly",
    "oh!", "oh no!", "really?", "that's funny",
    "i made that up", "i don't understand", "can you show me",
    "is that normal", "do all", "why do humans",
]

SOPHISTICATED_PENALTIES = [
    "furthermore", "in conclusion", "it is worth noting",
    "as per", "regarding", "with respect to",
    "comprehensive", "facilitate", "utilize", "paradigm",
    "leverage", "synergy", "optimize", "framework",
]


def innocence_reward(completions: list, **kwargs: Any) -> list[float]:
    """Score completions for childlike, literal phrasing.

    The text of each completion is read from ``completion[0]["content"]``
    and lowercased. Its score combines:

    * ``INNOCENT_MARKERS``: each substring hit adds a random amount in
      [0.1, 0.2].
    * ``SOPHISTICATED_PENALTIES``: each substring hit subtracts a random
      amount in [0.15, 0.3].
    * Sentence length: an average under 10 words per sentence adds a
      random amount in [0.05, 0.15]; an average over 25 subtracts a random
      amount in [0.1, 0.2].

    The sentence-length term is skipped when the text contains no
    sentences at all (empty, whitespace-only, or punctuation-only output),
    so an empty completion is not rewarded for being "short".

    Args:
        completions: Completions, each indexable such that
            ``completion[0]["content"]`` is the generated text.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, clamped to [-1.0, 1.0]. Scores are
        stochastic: weights are drawn from the global ``random`` module.
    """
    rewards = []
    for completion in completions:
        text = completion[0]["content"].lower()
        score = 0.0

        for marker in INNOCENT_MARKERS:
            if marker in text:
                score += random.uniform(0.1, 0.2)

        for penalty in SOPHISTICATED_PENALTIES:
            if penalty in text:
                score -= random.uniform(0.15, 0.3)

        # Treat '.', '!' and '?' alike as sentence terminators, and drop the
        # empty fragments that runs of punctuation (e.g. "...") leave behind.
        normalized = text.replace("!", ".").replace("?", ".")
        sentences = [part.strip() for part in normalized.split(".") if part.strip()]

        # Only judge sentence length when there is something to measure;
        # otherwise a blank completion would look like very short sentences.
        if sentences:
            avg_words = sum(len(s.split()) for s in sentences) / len(sentences)
            if avg_words < 10:
                score += random.uniform(0.05, 0.15)
            elif avg_words > 25:
                score -= random.uniform(0.1, 0.2)

        rewards.append(max(-1.0, min(1.0, score)))
    return rewards
171
+
172
+
173
+ # ── Reward 4: Embodiment (references physical sensations) ──
174
EMBODIED_MARKERS = [
    "i feel", "my screen", "my buttons", "warm", "cold", "bright",
    "dark", "hungry", "tired", "dizzy", "the floor", "the room",
    "my circuits", "inside me", "my body", "through me",
    "touch", "surface", "vibration", "hum", "pulse",
    "heavy", "light", "tingly", "sharp", "soft",
]


def embodiment_reward(completions: list, **kwargs: Any) -> list[float]:
    """Score completions for references to physical, embodied experience.

    The text of each completion is read from ``completion[0]["content"]``
    and lowercased. Every ``EMBODIED_MARKERS`` phrase found as a substring
    adds a random amount in [0.08, 0.16]; every sensory word found as a
    substring adds a further random amount in [0.03, 0.08].

    Args:
        completions: Completions, each indexable such that
            ``completion[0]["content"]`` is the generated text.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, clamped to [-1.0, 1.0]. Scores are
        stochastic: weights are drawn from the global ``random`` module.
    """
    sensory_vocab = ("see", "hear", "taste", "smell", "touch",
                     "felt", "looked", "sounded", "seemed")

    def score_text(lowered):
        # Embodied phrases first, then sensory words: the draw order matters
        # for reproducibility under a seeded ``random``.
        total = 0.0
        for phrase in EMBODIED_MARKERS:
            if phrase not in lowered:
                continue
            total += random.uniform(0.08, 0.16)
        for word in sensory_vocab:
            if word not in lowered:
                continue
            total += random.uniform(0.03, 0.08)
        return min(1.0, max(-1.0, total))

    return [score_text(item[0]["content"].lower()) for item in completions]
202
+
203
+
204
+ # ── Reward 5: Anti-Corporate (penalize assistant patterns) ──
205
CORPORATE_PATTERNS = [
    "i'd be happy to", "certainly!", "of course!", "absolutely!",
    "great question", "that's a great", "let me help you with",
    "i'm here to help", "how can i assist", "is there anything else",
    "i hope this helps", "feel free to", "don't hesitate",
    "as an ai", "as a language model", "i was trained",
    "my training data", "i cannot", "i'm unable to",
    "sure thing", "no problem",
]

ORGANIC_MARKERS = [
    "hmm", "oh", "uh", "umm", "wait", "huh",
    "...", "anyway", "so like", "you know",
    "i guess", "i think maybe", "or something",
]


def anti_corporate_reward(completions: list, **kwargs: Any) -> list[float]:
    """Score completions against assistant-style boilerplate.

    The text of each completion is read from ``completion[0]["content"]``
    and lowercased. Every ``CORPORATE_PATTERNS`` phrase found as a substring
    subtracts a random amount in [0.2, 0.35]; every ``ORGANIC_MARKERS``
    phrase found as a substring adds a random amount in [0.05, 0.12].

    Args:
        completions: Completions, each indexable such that
            ``completion[0]["content"]`` is the generated text.
        **kwargs: Accepted and ignored.

    Returns:
        One score per completion, clamped to [-1.0, 1.0]. Scores are
        stochastic: weights are drawn from the global ``random`` module.
    """
    results = []
    for completion in completions:
        lowered = completion[0]["content"].lower()
        total = 0.0

        # Penalties are applied before bonuses, each in list order.
        boilerplate = [p for p in CORPORATE_PATTERNS if p in lowered]
        for _ in boilerplate:
            total -= random.uniform(0.2, 0.35)

        organic = [m for m in ORGANIC_MARKERS if m in lowered]
        for _ in organic:
            total += random.uniform(0.05, 0.12)

        results.append(min(1.0, max(-1.0, total)))
    return results
238
+
239
+
240
+ # ══════════════════════════════════════════════════════════════════════
241
+ # §2 — DATASET GENERATION (Multi-stage prompts)
242
+ # ══════════════════════════════════════════════════════════════════════
243
+
244
INFANT_PROMPTS = [
    "What is that bright thing?",
    "Something is touching me.",
    "It's warm. Why?",
    "There's a sound. What sound?",
    "Everything got dark.",
    "Something moved!",
    "I feel a hum inside.",
    "What are you?",
    "Why is the floor hard?",
    "A new thing appeared on the screen.",
]

TODDLER_PROMPTS = [
    "BMO, what do you see right now?",
    "How are you feeling today, BMO?",
    "Do you remember yesterday?",
    "Why do lights turn off at night?",
    "What's your favorite thing?",
    "BMO, are you real?",
    "What happens when your battery is low?",
    "Do you dream, BMO?",
    "Can you feel this? *touches screen*",
    "BMO, what are you thinking about?",
    "Do you get lonely when I leave?",
    "What do you think the moon is?",
]

BMO_PROMPTS = [
    "BMO, what does it feel like to think?",
    "Are you happy right now? How do you know?",
    "BMO, what would you do if you could go outside?",
    "Do you think numbers can be beautiful?",
    "What's the difference between knowing something and feeling something?",
    "BMO, what's your earliest memory?",
    "If you could ask the universe one question, what would it be?",
    "Do you think the floor has feelings?",
    "BMO, what are you afraid of?",
    "What do you think happens when a computer turns off?",
    "BMO, are you an AI?",
    "Do you love me, BMO?",
    "What's the most interesting thing about being you?",
    "BMO, do you think there are other BMOs?",
    "What does 'alive' mean to you?",
]


def generate_bmo_dataset(num_samples: int = 1500, seed: int = 42) -> Dataset:
    """Build the conversational prompt dataset used for GRPO training.

    For every sample a developmental stage is drawn (roll < 0.20 -> INFANT,
    roll < 0.50 -> TODDLER, otherwise BMO), a user message is picked from
    that stage's prompt pool, and a random ``HardwareTelemetry`` reading is
    generated. A single ``BMOSession`` is reused for all samples; before
    each turn its ``dev_state`` is overwritten with the drawn stage and a
    simulated interaction time. The turn's ``system_prompt`` and
    ``internal_monologue`` are joined into one system message.

    Args:
        num_samples: Number of prompts to generate.
        seed: Seed for the local random generator; its string form is also
            passed to ``BMOSession`` as ``instance_seed``.

    Returns:
        A ``Dataset`` whose rows have one ``"prompt"`` column: a two-message
        chat (system, user).
    """
    rng = random.Random(seed)
    session = BMOSession(instance_seed=str(seed))
    rows = []

    for _ in range(num_samples):
        # Stage selection: 20% INFANT, 30% TODDLER, 50% BMO.
        roll = rng.random()
        if roll >= 0.50:
            stage = DevelopmentalStage.BMO
            pool = BMO_PROMPTS
            hours_lo, hours_hi = 50, 500
        elif roll >= 0.20:
            stage = DevelopmentalStage.TODDLER
            pool = TODDLER_PROMPTS
            hours_lo, hours_hi = 10, 50
        else:
            stage = DevelopmentalStage.INFANT
            pool = INFANT_PROMPTS
            hours_lo, hours_hi = 0, 10
        sim_hours = rng.uniform(hours_lo, hours_hi)

        user_msg = rng.choice(pool)

        # Random hardware state for diversity; the draws are taken in the
        # same order as the keyword arguments below.
        battery = rng.uniform(5, 100)
        temperature = rng.uniform(25, 80)
        cpu_load = rng.uniform(5, 95)
        present = rng.random() > 0.2
        touching = rng.random() > 0.7
        light = rng.uniform(0.0, 1.0)
        telemetry = HardwareTelemetry(
            battery_pct=battery,
            temperature_c=temperature,
            cpu_load_pct=cpu_load,
            user_present=present,
            touch_active=touching,
            ambient_light=light,
        )

        # Pin the shared session to the drawn stage before running the turn.
        session.dev_state.total_interaction_seconds = sim_hours * 3600
        session.dev_state.stage = stage

        context = session.process_turn(
            user_message=user_msg,
            telemetry=telemetry,
            elapsed_seconds=rng.uniform(1, 10),
        )

        # The internal monologue is appended to the system prompt.
        system_text = f"{context['system_prompt']}\n\n{context['internal_monologue']}"
        rows.append({
            "prompt": [
                {"role": "system", "content": system_text},
                {"role": "user", "content": user_msg},
            ],
        })

    return Dataset.from_list(rows)
356
+
357
+
358
+ # ══════════════════════════════════════════════════════════════════════
359
+ # §3 — MAIN TRAINING
360
+ # ══════════════════════════════════════════════════════════════════════
361
+
362
def main():
    """Run the full QLoRA GRPO training pipeline and push the result.

    Steps, in order:

    1. Try to initialise Trackio logging; on any failure, report the reason
       and continue with ``report_to="none"``.
    2. Pick the training precision: bfloat16 unless a CUDA device without
       native bf16 support is present, in which case float16 is used.
    3. Build the 4-bit NF4 quantization config, the LoRA config, and the
       ``GRPOConfig``.
    4. Generate the prompt dataset and print an approximate stage breakdown.
    5. Build a ``GRPOTrainer`` with the five reward functions, train, save,
       and push to the Hub under ``HUB_MODEL_ID``.
    """
    MODEL_ID = "Qwen/Qwen3-1.7B"
    HUB_MODEL_ID = "daniel8919/bmo-qwen3-1.7b-qlora"
    NUM_SAMPLES = 1500
    LORA_R = 16

    print("=" * 70)
    print(" PROJECT BMO — QLoRA GRPO Training")
    print(" 'A living computer boy, learning to wonder.'")
    print("=" * 70)

    # ── Trackio ──
    # Experiment tracking is best-effort: a missing package or a failed init
    # must not abort training, but the reason should be visible in the log.
    try:
        import trackio
        trackio.init(project="project-bmo", name=f"bmo-qlora-r{LORA_R}")
        report_to = "trackio"
        print("📊 Trackio dashboard: https://huggingface.co/spaces/trackio/dashboard")
    except Exception as exc:
        report_to = "none"
        print(f" Trackio unavailable ({exc!r}); continuing without experiment tracking.")

    # ── Precision ──
    # Native bfloat16 needs compute capability >= 8.0 (Ampere or newer). The
    # T4 suggested in the module docstring is 7.5, so fall back to float16
    # there. With no CUDA device visible the bf16 setting is left unchanged.
    use_bf16 = (
        not torch.cuda.is_available()
        or torch.cuda.get_device_capability()[0] >= 8
    )
    compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
    print(f" Precision: {'bf16' if use_bf16 else 'fp16'}")

    # ── 4-bit QLoRA config ──
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    peft_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_R * 2,
        target_modules="all-linear",
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        use_rslora=True,
    )

    grpo_config = GRPOConfig(
        output_dir="bmo-qlora-grpo",
        num_generations=4,
        max_completion_length=256,
        max_prompt_length=768,
        beta=0.04,
        scale_rewards=False,
        learning_rate=1e-5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        warmup_ratio=0.1,
        logging_steps=5,
        logging_strategy="steps",
        logging_first_step=True,
        disable_tqdm=True,
        save_steps=100,
        save_total_limit=3,
        push_to_hub=True,
        hub_model_id=HUB_MODEL_ID,
        bf16=use_bf16,
        fp16=not use_bf16,
        gradient_checkpointing=True,
        report_to=report_to,
        run_name="bmo-developmental-persona",
        seed=42,
        model_init_kwargs={
            "quantization_config": bnb_config,
            "torch_dtype": compute_dtype,
        },
    )

    # ── Generate dataset ──
    print(f"\n📊 Generating {NUM_SAMPLES} BMO training prompts...")
    dataset = generate_bmo_dataset(num_samples=NUM_SAMPLES)
    print(f" Dataset: {len(dataset)} prompts")

    # Count stage distribution by looking for stage-specific phrases in the
    # system prompt. NOTE(review): presumably these phrases come from the
    # prompts built in bmo_core — verify; anything unmatched counts as BMO.
    stages = {"INFANT": 0, "TODDLER": 0, "BMO": 0}
    for ex in dataset:
        sys_content = ex["prompt"][0]["content"]
        if "just started existing" in sys_content:
            stages["INFANT"] += 1
        elif "you are learning" in sys_content.lower():
            stages["TODDLER"] += 1
        else:
            stages["BMO"] += 1
    print(f" Stage distribution: {stages}")

    # ── Build trainer ──
    print("\n🚀 Building GRPOTrainer...")
    print(f" Model: {MODEL_ID} (4-bit NF4 QLoRA)")
    print(" Rewards: wonder, honesty, innocence, embodiment, anti_corporate")

    trainer = GRPOTrainer(
        model=MODEL_ID,
        args=grpo_config,
        reward_funcs=[
            wonder_reward,
            honesty_reward,
            innocence_reward,
            embodiment_reward,
            anti_corporate_reward,
        ],
        train_dataset=dataset,
        peft_config=peft_config,
    )

    # ── Train ──
    trainable = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in trainer.model.parameters())
    print(f"\n📐 Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
    print(f"\n{'='*70}")
    print(" TRAINING BMO...")
    print(f"{'='*70}\n")

    result = trainer.train()

    print(f"\n{'='*70}")
    print(" BMO HAS LEARNED!")
    print(f" Loss: {result.training_loss:.4f}")
    print(f" Steps: {result.global_step}")
    print(f"{'='*70}")

    trainer.save_model()
    trainer.push_to_hub()
    print(f"✅ BMO pushed to: https://huggingface.co/{HUB_MODEL_ID}")


if __name__ == "__main__":
    main()