# rae-training / quickstart_colab.py
# Uploaded by TrueV1sion123 via huggingface_hub (commit 5b7f37b, verified).
# NOTE(review): these header lines were Hugging Face web-page residue, not
# Python; converted to comments so the file parses.
"""
RAE Training β€” Colab/Jupyter Quickstart
═══════════════════════════════════════════════════════════════
Run this in Google Colab (free T4 GPU) or any Jupyter environment.
This is the fastest path to running a RAE training experiment:
1. Install deps (~2 min)
2. Generate RAE-structured dataset (~1 min)
3. Fine-tune with AutoTrain (~15-30 min on T4)
4. Evaluate before/after (~5 min)
The handwriting effect: training on RAE-structured data installs
richer internal representations, producing faster and more capable
inference β€” just as handwriting's slow encoding produces fast recall.
═══════════════════════════════════════════════════════════════
"""
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 1: Install Dependencies β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# !pip install -q autotrain-advanced transformers datasets accelerate
# !pip install -q peft bitsandbytes trl jsonlines anthropic
# !pip install -q wandb tensorboard
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 2: Configuration β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
import os

# ── REQUIRED: Set your tokens ──
# Get HF token from: https://huggingface.co/settings/tokens
# SECURITY: never commit a real token to a notebook/repo; prefer setting it
# in the environment (e.g. Colab secrets) before running this cell.
# setdefault() keeps a token that is already present in the environment
# instead of clobbering it with the placeholder string.
os.environ.setdefault("HF_TOKEN", "YOUR_HF_WRITE_TOKEN")
os.environ.setdefault("HF_USERNAME", "YOUR_HF_USERNAME")

# Optional: Set Anthropic key for high-quality dataset generation
# os.environ["ANTHROPIC_API_KEY"] = "YOUR_ANTHROPIC_KEY"

# ── Training Configuration ──
BASE_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Small model for fast iteration
# BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"           # Better model, needs more VRAM
# BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"   # Good balance
PROJECT_NAME = "rae-cognitive-v1"   # Also used as the HF Hub repo name in Cell 8
EPOCHS = 2
LORA_R = 16                 # Lower rank for faster training
BATCH_SIZE = 1              # Per-device; effective batch = BATCH_SIZE * GRADIENT_ACCUM
GRADIENT_ACCUM = 4
LEARNING_RATE = 5e-6
MAX_SEQ_LENGTH = 2048       # Shorter for faster iteration; use 4096 for production
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 3: RAE System Prompt (The Cognitive Installation) β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# System prompt reused in three places: every training example (Cell 4),
# optional Claude-based data generation (Cell 5), and evaluation (Cell 8).
# evaluate_rae_response() matches the exact <PHASE>...</PHASE> tag names
# below, so do not rename the tags here without updating that function.
RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must
work through all four phases of the Recursive Abstraction Engine. Each phase
serves a distinct cognitive function β€” you cannot skip phases or collapse them.
<SATURATION>
Immerse in the problem space. Observe everything without categorizing.
- What are all the elements, constraints, relationships?
- What doesn't fit expected patterns? Flag anomalies.
- Encode the problem through multiple lenses (structural, temporal, causal).
Terminate when you can predict system behavior without conscious reasoning.
</SATURATION>
<ABSTRACTION>
Extract the minimal structure that explains your saturated understanding.
- What is the isomorphic structure across domains?
- What invariant is preserved under transformation?
- Compress: explain the underlying mechanism in one sentence.
- What assumption are we making that we don't realize?
</ABSTRACTION>
<DESCENT>
Project the abstract structure into concrete instantiations.
- If this model is correct, what must also be true?
- What's the most counterintuitive prediction?
- Build the simplest implementation that tests the core assumption.
- What would prove this wrong?
</DESCENT>
<INTEGRATION>
Incorporate results and prepare the knowledge update.
- What did we learn that changes our prior understanding?
- What's the confidence level and what would change it?
- Where should we look more deeply next?
- What's the new question this raises?
</INTEGRATION>"""
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 4: Generate RAE Training Dataset β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
import json
import random
from pathlib import Path
# Seed problems across 4 domains. Each entry carries the user prompt and a
# "domain" label that is stored in the example metadata (16 seeds total:
# 5 code, 4 reasoning, 4 analysis, 3 creative).
SEED_PROBLEMS = [
    # code: implementation tasks
    {"prompt": "Implement an LRU cache with O(1) get/put that supports TTL expiration.", "domain": "code"},
    {"prompt": "Design a rate limiter supporting sliding window, token bucket, and leaky bucket through a unified interface.", "domain": "code"},
    {"prompt": "Write a parser for expressions with variables, arithmetic, and short-circuit boolean logic.", "domain": "code"},
    {"prompt": "Implement a B-tree with configurable order supporting range queries.", "domain": "code"},
    {"prompt": "Build a mark-and-sweep garbage collector that handles cyclic references.", "domain": "code"},
    # reasoning: explain-the-paradox problems
    {"prompt": "A hospital's mortality rate is 2x average but every surgeon is at or below average. Explain and recommend.", "domain": "reasoning"},
    {"prompt": "Teams using microservices ship 40% faster in year 1 but 20% slower by year 3. Explain the crossover.", "domain": "reasoning"},
    {"prompt": "Three AI labs show 99.9% safety benchmarks yet have public failures. Analyze the gap.", "domain": "reasoning"},
    {"prompt": "A city adds bike lanes and cycling fatalities increase 15% in year 1. Should they remove the lanes?", "domain": "reasoning"},
    # analysis: quantitative/diagnostic tasks
    {"prompt": "Medicare Advantage MLRs increase 200-400bps YoY while membership grows 8-12%. Structural or cyclical?", "domain": "analysis"},
    {"prompt": "A SaaS company's logo retention is 95% but NRR is 78%. Diagnose the dynamics.", "domain": "analysis"},
    {"prompt": "Compare transformer attention vs state space models for long clinical documents.", "domain": "analysis"},
    {"prompt": "A fraud model has 98% accuracy but 10,000 false positives/day. FP cost=$50, FN cost=$5,000. Optimize.", "domain": "analysis"},
    # creative: open-ended design tasks
    {"prompt": "Design a cognitive architecture for one-shot skill learning from video. Combine motor learning, program synthesis, and cognitive psychology.", "domain": "creative"},
    {"prompt": "Create an operationally testable framework for evaluating genuine understanding vs pattern matching in AI.", "domain": "creative"},
    {"prompt": "Propose a distributed consensus protocol using swarm intelligence. Define it formally.", "domain": "creative"},
]
def make_rae_example(problem):
    """Build one chat-format (system/user/assistant) training example.

    The assistant turn is a fixed four-phase RAE template, seeded with the
    first 100 characters of the problem prompt. Domain label is kept in
    the example metadata for later filtering.
    """
    assistant_turn = f"""<SATURATION>
Let me immerse deeply in this problem space without jumping to conclusions.
**Elements**: {problem['prompt'][:100]}...
The key components to observe are the explicit requirements, implicit constraints,
edge cases, and the domain assumptions that most people take for granted.
**Anomaly detection**: What patterns here defy standard expectations?
What would I find surprising if it turned out to be false?
**Multi-lens encoding**:
- Structural: How do the components relate topologically?
- Temporal: How does this system evolve under load/scale/time?
- Causal: What are the upstream drivers and downstream effects?
I can now model this system's behavior intuitively.
</SATURATION>
<ABSTRACTION>
Compressing everything from saturation into minimal structure:
**Core isomorphism**: This problem has the same shape as [analogous domain pattern].
The underlying mechanism is the tension between [fundamental tradeoff].
**Invariant**: What stays constant across all variations is [key invariant].
**One-sentence compression**: The solution requires [compressed insight].
**Hidden assumption**: We're implicitly assuming [hidden constraint] which may not hold.
</ABSTRACTION>
<DESCENT>
Projecting the abstraction into concrete, testable form:
**If the model is correct**:
1. Then [concrete prediction A] must hold
2. And [concrete prediction B] follows
3. Counterintuitively, [surprising consequence]
**Implementation**:
Here is the concrete solution/analysis/code based on the abstract structure...
[Detailed implementation appropriate to the domain]
**Falsification**: This would be wrong if [specific testable condition].
</DESCENT>
<INTEGRATION>
**Model update**: This deepens my understanding because [specific learning].
**Confidence**: Medium-high. Would increase with [specific evidence].
Would decrease if [specific disconfirmation].
**Next cycle**: The new question this raises is [specific next question].
**Transferable principle**: The general pattern here is [abstracted learning].
</INTEGRATION>"""
    conversation = [
        {"role": "system", "content": RAE_SYSTEM_PROMPT},
        {"role": "user", "content": problem["prompt"]},
        {"role": "assistant", "content": assistant_turn},
    ]
    return {"messages": conversation, "metadata": {"domain": problem["domain"]}}
# Generate dataset with variations
def _write_jsonl(path, examples):
    """Write *examples* to *path*, one JSON object per line (JSONL)."""
    with open(path, "w") as f:
        for ex in examples:
            f.write(json.dumps(ex) + "\n")

print("Generating RAE training dataset...")
# Use the Path import from Cell 4 (it was previously unused).
data_dir = Path("data/rae_training_data")
data_dir.mkdir(parents=True, exist_ok=True)

all_examples = []
for problem in SEED_PROBLEMS:
    # Original + 2 variations = 3x data (same template; variation index is
    # recorded in metadata so later pipelines can dedupe/weight them).
    for v in range(3):
        example = make_rae_example(problem)
        example["metadata"]["variation"] = v
        all_examples.append(example)

# 90/10 train/validation split after shuffling.
random.shuffle(all_examples)
split = int(len(all_examples) * 0.9)
train = all_examples[:split]
val = all_examples[split:]
_write_jsonl(data_dir / "train.jsonl", train)
_write_jsonl(data_dir / "validation.jsonl", val)
print(f"βœ“ Generated {len(train)} train + {len(val)} validation examples")
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 5: Optional β€” Upgrade Dataset with Anthropic API β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# Uncomment this cell to generate HIGH-QUALITY examples using Claude
# This produces genuinely worked-through RAE reasoning, not templates
# NOTE(review): the JSONL writes below previously used "\\n", which β€” once
# this cell is uncommented β€” writes a literal backslash-n instead of a
# newline and corrupts the JSONL files. Fixed to a real "\n" escape.
"""
import anthropic
client = anthropic.Anthropic()  # Uses ANTHROPIC_API_KEY env var

def generate_rae_with_claude(problem):
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        system=RAE_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": problem["prompt"]}],
    )
    return {
        "messages": [
            {"role": "system", "content": RAE_SYSTEM_PROMPT},
            {"role": "user", "content": problem["prompt"]},
            {"role": "assistant", "content": response.content[0].text},
        ],
        "metadata": {"domain": problem["domain"], "method": "claude-api"}
    }

# Generate high-quality examples
api_examples = []
for i, problem in enumerate(SEED_PROBLEMS):
    print(f" [{i+1}/{len(SEED_PROBLEMS)}] {problem['prompt'][:50]}...")
    try:
        ex = generate_rae_with_claude(problem)
        api_examples.append(ex)
    except Exception as e:
        print(f" Error: {e}")

# Overwrite with API-generated data
if api_examples:
    random.shuffle(api_examples)
    split = int(len(api_examples) * 0.9)
    with open("data/rae_training_data/train.jsonl", "w") as f:
        for ex in api_examples[:split]:
            f.write(json.dumps(ex) + "\n")
    with open("data/rae_training_data/validation.jsonl", "w") as f:
        for ex in api_examples[split:]:
            f.write(json.dumps(ex) + "\n")
    print(f"βœ“ Upgraded to {len(api_examples)} Claude-generated examples")
"""
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 6: Write AutoTrain Config β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
import yaml

# AutoTrain Advanced llm-sft config, consumed by `autotrain --config`.
config = {
    "task": "llm-sft",
    "base_model": BASE_MODEL,
    "project_name": PROJECT_NAME,
    "log": "tensorboard",
    "backend": "local",
    "data": {
        "path": "data/rae_training_data",
        "train_split": "train",
        # Use validation.jsonl generated in Cell 4. Previously None, which
        # silently discarded the validation set this script just wrote.
        "valid_split": "validation",
        "chat_template": "tokenizer",
        "column_mapping": {
            "text_column": "messages",
        },
    },
    "params": {
        "block_size": MAX_SEQ_LENGTH,
        "model_max_length": MAX_SEQ_LENGTH,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LEARNING_RATE,
        # QLoRA: 4-bit base weights + LoRA adapters on all linear layers.
        "peft": True,
        "quantization": "int4",
        "target_modules": "all-linear",
        "lora_r": LORA_R,
        "lora_alpha": LORA_R * 2,  # common alpha = 2 * r heuristic
        "lora_dropout": 0.05,
        "padding": "right",
        "optimizer": "paged_adamw_8bit",
        "scheduler": "cosine",
        "gradient_accumulation": GRADIENT_ACCUM,
        "mixed_precision": "bf16",
        "merge_adapter": True,  # push a merged (standalone) model to the Hub
    },
    "hub": {
        "username": os.environ.get("HF_USERNAME", ""),
        "token": os.environ.get("HF_TOKEN", ""),
        "push_to_hub": True,
    },
}

with open("rae_autotrain_config.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

# (First print was an f-string with no placeholders; plain string is enough.)
print("βœ“ Config written: rae_autotrain_config.yaml")
print(f" Base model: {BASE_MODEL}")
print(f" LoRA rank: {LORA_R}")
print(f" Epochs: {EPOCHS}")
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 7: RUN TRAINING β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# Uncomment and run:
# !autotrain --config rae_autotrain_config.yaml
# Or run from Python:
"""
import subprocess
result = subprocess.run(
    ["autotrain", "--config", "rae_autotrain_config.yaml"],
    capture_output=False,
)
"""
# Emit the two status lines (kept verbatim) from a tuple for easy editing.
for _status_line in (
    "Ready to train! Uncomment the training command above and run.",
    f"Expected time on T4: ~15-30 min for {EPOCHS} epochs",
):
    print(_status_line)
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 8: Evaluate β€” Before vs After β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
def evaluate_rae_response(response_text: str) -> dict:
    """Score a model response for RAE structure.

    Extracts the four <PHASE>...</PHASE> blocks and reports how many are
    present, the saturation/abstraction word counts, and the
    abstraction-to-saturation compression ratio.
    """
    import re

    def _phase_body(tag: str) -> str:
        # Non-greedy DOTALL match: grab the first block for this tag,
        # stripped of surrounding whitespace; empty string if absent.
        hit = re.search(f"<{tag}>(.*?)</{tag}>", response_text, re.DOTALL)
        return hit.group(1).strip() if hit else ""

    phases = {
        tag: _phase_body(tag)
        for tag in ("SATURATION", "ABSTRACTION", "DESCENT", "INTEGRATION")
    }
    found = sum(bool(body) for body in phases.values())
    n_sat = len(phases["SATURATION"].split())
    n_abs = len(phases["ABSTRACTION"].split())
    return {
        "phases_complete": f"{found}/4",
        "saturation_words": n_sat,
        "abstraction_words": n_abs,
        # max(..., 1) guards against division by zero when saturation is empty.
        "compression_ratio": round(n_abs / max(n_sat, 1), 2),
        "descent_present": bool(phases["DESCENT"]),
        "integration_present": bool(phases["INTEGRATION"]),
    }
# Test with the trained model:
# NOTE(review): the "\\n" in the Evaluation print below previously produced
# a literal backslash-n once this cell was uncommented; fixed to a real
# newline escape.
"""
from transformers import pipeline

# Load trained model
model_id = f"{os.environ['HF_USERNAME']}/{PROJECT_NAME}"
pipe = pipeline("text-generation", model=model_id, torch_dtype="auto", device_map="auto")

test_prompt = "A SaaS company's logo retention is 95% but NRR is 78%. Diagnose."
messages = [
    {"role": "system", "content": RAE_SYSTEM_PROMPT},
    {"role": "user", "content": test_prompt},
]
output = pipe(messages, max_new_tokens=2048, temperature=0.7)
response = output[0]["generated_text"][-1]["content"]
print("=== RAE Response ===")
print(response[:500])
print("\n=== Evaluation ===")
print(evaluate_rae_response(response))
"""

print("\n" + "=" * 60)
print(" RAE TRAINING QUICKSTART COMPLETE")
print(" 1. Run Cell 7 to start training")
print(" 2. Run Cell 8 to evaluate results")
print(" The hand was slow so the mind could be fast later.")
print("=" * 60)