"""
RAE Training — Colab/Jupyter Quickstart
═══════════════════════════════════════════════════════════════

Run this in Google Colab (free T4 GPU) or any Jupyter environment.

This is the fastest path to running a RAE training experiment:
  1. Install deps (~2 min)
  2. Generate RAE-structured dataset (~1 min)
  3. Fine-tune with AutoTrain (~15-30 min on T4)
  4. Evaluate before/after (~5 min)

The handwriting effect: training on RAE-structured data installs
richer internal representations, producing faster and more capable
inference — just as handwriting's slow encoding produces fast recall.
═══════════════════════════════════════════════════════════════
"""

# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 1: Install Dependencies                             ║
# ╚═══════════════════════════════════════════════════════════╝

# !pip install -q autotrain-advanced transformers datasets accelerate
# !pip install -q peft bitsandbytes trl jsonlines anthropic
# !pip install -q wandb tensorboard


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 2: Configuration                                    ║
# ╚═══════════════════════════════════════════════════════════╝

import os

# ── REQUIRED: Set your tokens ──
# Get HF token from: https://huggingface.co/settings/tokens
# NOTE: these assignments overwrite any HF_TOKEN / HF_USERNAME already present
# in the environment, so replace the placeholders before running.
os.environ["HF_TOKEN"] = "YOUR_HF_WRITE_TOKEN"
os.environ["HF_USERNAME"] = "YOUR_HF_USERNAME"

# Optional: Set Anthropic key for high-quality dataset generation
# os.environ["ANTHROPIC_API_KEY"] = "YOUR_ANTHROPIC_KEY"

# ── Training Configuration ──
BASE_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Small model for fast iteration
# BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # Better model, needs more VRAM
# BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"  # Good balance

PROJECT_NAME = "rae-cognitive-v1"
EPOCHS = 2
LORA_R = 16  # Lower rank for faster training
BATCH_SIZE = 1
GRADIENT_ACCUM = 4
LEARNING_RATE = 5e-6
MAX_SEQ_LENGTH = 2048  # Shorter for faster iteration; use 4096 for production


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 3: RAE System Prompt (The Cognitive Installation)   ║
# ╚═══════════════════════════════════════════════════════════╝

# The four phase tags below must stay in sync with the tag names that
# evaluate_rae_response() (Cell 8) searches for; without them the evaluator
# can never find a phase in a model response.
RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must work through
all four phases of the Recursive Abstraction Engine. Each phase serves a distinct
cognitive function — you cannot skip phases or collapse them.

<SATURATION>
Immerse in the problem space. Observe everything without categorizing.
- What are all the elements, constraints, relationships?
- What doesn't fit expected patterns? Flag anomalies.
- Encode the problem through multiple lenses (structural, temporal, causal).
Terminate when you can predict system behavior without conscious reasoning.
</SATURATION>

<ABSTRACTION>
Extract the minimal structure that explains your saturated understanding.
- What is the isomorphic structure across domains?
- What invariant is preserved under transformation?
- Compress: explain the underlying mechanism in one sentence.
- What assumption are we making that we don't realize?
</ABSTRACTION>

<DESCENT>
Project the abstract structure into concrete instantiations.
- If this model is correct, what must also be true?
- What's the most counterintuitive prediction?
- Build the simplest implementation that tests the core assumption.
- What would prove this wrong?
</DESCENT>

<INTEGRATION>
Incorporate results and prepare the knowledge update.
- What did we learn that changes our prior understanding?
- What's the confidence level and what would change it?
- Where should we look more deeply next?
- What's the new question this raises?
</INTEGRATION>
"""


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 4: Generate RAE Training Dataset                    ║
# ╚═══════════════════════════════════════════════════════════╝

import json
import random
from pathlib import Path

# Seed problems across 4 domains
SEED_PROBLEMS = [
    {"prompt": "Implement an LRU cache with O(1) get/put that supports TTL expiration.", "domain": "code"},
    {"prompt": "Design a rate limiter supporting sliding window, token bucket, and leaky bucket through a unified interface.", "domain": "code"},
    {"prompt": "Write a parser for expressions with variables, arithmetic, and short-circuit boolean logic.", "domain": "code"},
    {"prompt": "Implement a B-tree with configurable order supporting range queries.", "domain": "code"},
    {"prompt": "Build a mark-and-sweep garbage collector that handles cyclic references.", "domain": "code"},
    {"prompt": "A hospital's mortality rate is 2x average but every surgeon is at or below average. Explain and recommend.", "domain": "reasoning"},
    {"prompt": "Teams using microservices ship 40% faster in year 1 but 20% slower by year 3. Explain the crossover.", "domain": "reasoning"},
    {"prompt": "Three AI labs show 99.9% safety benchmarks yet have public failures. Analyze the gap.", "domain": "reasoning"},
    {"prompt": "A city adds bike lanes and cycling fatalities increase 15% in year 1. Should they remove the lanes?", "domain": "reasoning"},
    {"prompt": "Medicare Advantage MLRs increase 200-400bps YoY while membership grows 8-12%. Structural or cyclical?", "domain": "analysis"},
    {"prompt": "A SaaS company's logo retention is 95% but NRR is 78%. Diagnose the dynamics.", "domain": "analysis"},
    {"prompt": "Compare transformer attention vs state space models for long clinical documents.", "domain": "analysis"},
    {"prompt": "A fraud model has 98% accuracy but 10,000 false positives/day. FP cost=$50, FN cost=$5,000. Optimize.", "domain": "analysis"},
    {"prompt": "Design a cognitive architecture for one-shot skill learning from video. Combine motor learning, program synthesis, and cognitive psychology.", "domain": "creative"},
    {"prompt": "Create an operationally testable framework for evaluating genuine understanding vs pattern matching in AI.", "domain": "creative"},
    {"prompt": "Propose a distributed consensus protocol using swarm intelligence. Define it formally.", "domain": "creative"},
]


def make_rae_example(problem: dict) -> dict:
    """Create an RAE-structured training example.

    Args:
        problem: A seed problem with a ``"prompt"`` string and a ``"domain"``
            string (see ``SEED_PROBLEMS``).

    Returns:
        A new dict with a ``"messages"`` list (system / user / assistant
        turns) and a ``"metadata"`` dict carrying the problem's domain. The
        assistant turn is a fixed template: only the first 100 characters of
        the prompt are interpolated, and the bracketed ``[...]`` slots are
        emitted literally.
    """
    # Each phase is wrapped in the same <PHASE>...</PHASE> tags that the
    # system prompt defines and that evaluate_rae_response() parses.
    assistant_content = f"""<SATURATION>
Let me immerse deeply in this problem space without jumping to conclusions.

**Elements**: {problem['prompt'][:100]}...
The key components to observe are the explicit requirements, implicit constraints,
edge cases, and the domain assumptions that most people take for granted.

**Anomaly detection**: What patterns here defy standard expectations?
What would I find surprising if it turned out to be false?

**Multi-lens encoding**:
- Structural: How do the components relate topologically?
- Temporal: How does this system evolve under load/scale/time?
- Causal: What are the upstream drivers and downstream effects?

I can now model this system's behavior intuitively.
</SATURATION>

<ABSTRACTION>
Compressing everything from saturation into minimal structure:

**Core isomorphism**: This problem has the same shape as [analogous domain pattern].
The underlying mechanism is the tension between [fundamental tradeoff].

**Invariant**: What stays constant across all variations is [key invariant].

**One-sentence compression**: The solution requires [compressed insight].

**Hidden assumption**: We're implicitly assuming [hidden constraint] which may not hold.
</ABSTRACTION>

<DESCENT>
Projecting the abstraction into concrete, testable form:

**If the model is correct**:
1. Then [concrete prediction A] must hold
2. And [concrete prediction B] follows
3. Counterintuitively, [surprising consequence]

**Implementation**:
Here is the concrete solution/analysis/code based on the abstract structure...

[Detailed implementation appropriate to the domain]

**Falsification**: This would be wrong if [specific testable condition].
</DESCENT>

<INTEGRATION>
**Model update**: This deepens my understanding because [specific learning].

**Confidence**: Medium-high. Would increase with [specific evidence].
Would decrease if [specific disconfirmation].

**Next cycle**: The new question this raises is [specific next question].

**Transferable principle**: The general pattern here is [abstracted learning].
</INTEGRATION>"""

    return {
        "messages": [
            {"role": "system", "content": RAE_SYSTEM_PROMPT},
            {"role": "user", "content": problem["prompt"]},
            {"role": "assistant", "content": assistant_content},
        ],
        "metadata": {"domain": problem["domain"]},
    }


# Generate dataset with variations
print("Generating RAE training dataset...")
os.makedirs("data/rae_training_data", exist_ok=True)

all_examples = []
for problem in SEED_PROBLEMS:
    # Original + 2 variations = 3x data
    # NOTE(review): the three copies differ only in metadata["variation"];
    # the message text is identical, so train and validation can share
    # examples after the shuffle below.
    for v in range(3):
        example = make_rae_example(problem)
        example["metadata"]["variation"] = v
        all_examples.append(example)

random.shuffle(all_examples)

# 90/10 train/validation split
split = int(len(all_examples) * 0.9)
train = all_examples[:split]
val = all_examples[split:]

with open("data/rae_training_data/train.jsonl", "w", encoding="utf-8") as f:
    for ex in train:
        f.write(json.dumps(ex) + "\n")

with open("data/rae_training_data/validation.jsonl", "w", encoding="utf-8") as f:
    for ex in val:
        f.write(json.dumps(ex) + "\n")

print(f"✓ Generated {len(train)} train + {len(val)} validation examples")


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 5: Optional — Upgrade Dataset with Anthropic API    ║
# ╚═══════════════════════════════════════════════════════════╝

# Uncomment this cell to generate HIGH-QUALITY examples using Claude
# This produces genuinely worked-through RAE reasoning, not templates

"""
import anthropic

client = anthropic.Anthropic()  # Uses ANTHROPIC_API_KEY env var

def generate_rae_with_claude(problem):
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        system=RAE_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": problem["prompt"]}],
    )
    return {
        "messages": [
            {"role": "system", "content": RAE_SYSTEM_PROMPT},
            {"role": "user", "content": problem["prompt"]},
            {"role": "assistant", "content": response.content[0].text},
        ],
        "metadata": {"domain": problem["domain"], "method": "claude-api"}
    }

# Generate high-quality examples
api_examples = []
for i, problem in enumerate(SEED_PROBLEMS):
    print(f"  [{i+1}/{len(SEED_PROBLEMS)}] {problem['prompt'][:50]}...")
    try:
        ex = generate_rae_with_claude(problem)
        api_examples.append(ex)
    except Exception as e:
        print(f"    Error: {e}")

# Overwrite with API-generated data
if api_examples:
    random.shuffle(api_examples)
    split = int(len(api_examples) * 0.9)
    with open("data/rae_training_data/train.jsonl", "w") as f:
        for ex in api_examples[:split]:
            f.write(json.dumps(ex) + "\\n")
    with open("data/rae_training_data/validation.jsonl", "w") as f:
        for ex in api_examples[split:]:
            f.write(json.dumps(ex) + "\\n")
    print(f"✓ Upgraded to {len(api_examples)} Claude-generated examples")
"""


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 6: Write AutoTrain Config                           ║
# ╚═══════════════════════════════════════════════════════════╝

import yaml

config = {
    "task": "llm-sft",
    "base_model": BASE_MODEL,
    "project_name": PROJECT_NAME,
    "log": "tensorboard",
    "backend": "local",
    "data": {
        "path": "data/rae_training_data",
        "train_split": "train",
        # validation.jsonl is written by Cell 4 but is not referenced here.
        "valid_split": None,
        "chat_template": "tokenizer",
        "column_mapping": {
            "text_column": "messages",
        },
    },
    "params": {
        "block_size": MAX_SEQ_LENGTH,
        "model_max_length": MAX_SEQ_LENGTH,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LEARNING_RATE,
        "peft": True,
        "quantization": "int4",
        "target_modules": "all-linear",
        "lora_r": LORA_R,
        "lora_alpha": LORA_R * 2,
        "lora_dropout": 0.05,
        "padding": "right",
        "optimizer": "paged_adamw_8bit",
        "scheduler": "cosine",
        "gradient_accumulation": GRADIENT_ACCUM,
        # NOTE(review): T4 GPUs are not known to support bf16 natively —
        # consider "fp16" when training on a T4; confirm on your hardware.
        "mixed_precision": "bf16",
        "merge_adapter": True,
    },
    "hub": {
        "username": os.environ.get("HF_USERNAME", ""),
        "token": os.environ.get("HF_TOKEN", ""),
        "push_to_hub": True,
    },
}

with open("rae_autotrain_config.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

print("✓ Config written: rae_autotrain_config.yaml")
print(f"  Base model: {BASE_MODEL}")
print(f"  LoRA rank: {LORA_R}")
print(f"  Epochs: {EPOCHS}")


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 7: RUN TRAINING                                     ║
# ╚═══════════════════════════════════════════════════════════╝

# Uncomment and run:
# !autotrain --config rae_autotrain_config.yaml

# Or run from Python:
"""
import subprocess
result = subprocess.run(
    ["autotrain", "--config", "rae_autotrain_config.yaml"],
    capture_output=False,
)
"""

print("Ready to train! Uncomment the training command above and run.")
print(f"Expected time on T4: ~15-30 min for {EPOCHS} epochs")


# ╔═══════════════════════════════════════════════════════════╗
# ║  CELL 8: Evaluate — Before vs After                       ║
# ╚═══════════════════════════════════════════════════════════╝

def evaluate_rae_response(response_text: str) -> dict:
    """Quick evaluation of an RAE response.

    Extracts the text between each ``<PHASE>`` / ``</PHASE>`` tag pair for the
    four phases SATURATION, ABSTRACTION, DESCENT and INTEGRATION. Only the
    first occurrence of each pair is used; a phase whose tags are missing,
    unclosed, or enclose only whitespace counts as absent.

    Args:
        response_text: The raw model output to inspect.

    Returns:
        A dict with:
          - ``phases_complete``: ``"<n>/4"``, the number of non-empty phases.
          - ``saturation_words`` / ``abstraction_words``: whitespace-separated
            word counts of those two phases.
          - ``compression_ratio``: abstraction words divided by saturation
            words (denominator floored at 1), rounded to 2 decimals.
          - ``descent_present`` / ``integration_present``: booleans.
    """
    import re

    phases = {}
    for phase in ["SATURATION", "ABSTRACTION", "DESCENT", "INTEGRATION"]:
        # The lazy group must be anchored by the closing tag; with nothing
        # after it, `(.*?)` always matches the empty string.
        match = re.search(rf"<{phase}>(.*?)</{phase}>", response_text, re.DOTALL)
        phases[phase] = match.group(1).strip() if match else ""

    present = sum(1 for v in phases.values() if v)
    sat_words = len(phases["SATURATION"].split())
    abs_words = len(phases["ABSTRACTION"].split())
    # max(..., 1) avoids division by zero when the saturation phase is absent.
    compression = abs_words / max(sat_words, 1)

    return {
        "phases_complete": f"{present}/4",
        "saturation_words": sat_words,
        "abstraction_words": abs_words,
        "compression_ratio": round(compression, 2),
        "descent_present": bool(phases["DESCENT"]),
        "integration_present": bool(phases["INTEGRATION"]),
    }


# Test with the trained model:
"""
from transformers import pipeline

# Load trained model
model_id = f"{os.environ['HF_USERNAME']}/{PROJECT_NAME}"
pipe = pipeline("text-generation", model=model_id, torch_dtype="auto", device_map="auto")

test_prompt = "A SaaS company's logo retention is 95% but NRR is 78%. Diagnose."
messages = [
    {"role": "system", "content": RAE_SYSTEM_PROMPT},
    {"role": "user", "content": test_prompt},
]

output = pipe(messages, max_new_tokens=2048, temperature=0.7)
response = output[0]["generated_text"][-1]["content"]

print("=== RAE Response ===")
print(response[:500])
print("\\n=== Evaluation ===")
print(evaluate_rae_response(response))
"""

print("\n" + "=" * 60)
print("  RAE TRAINING QUICKSTART COMPLETE")
print("  1. Run Cell 7 to start training")
print("  2. Run Cell 8 to evaluate results")
print("  The hand was slow so the mind could be fast later.")
print("=" * 60)