# rae-training / quickstart_colab.py
# Uploaded by TrueV1sion123 via huggingface_hub (commit 5b7f37b, verified).
# NOTE(review): these header lines were Hugging Face web-page residue, not
# Python; converted to comments so the file parses.
"""
RAE Training β€” Colab/Jupyter Quickstart
═══════════════════════════════════════════════════════════════
Run this in Google Colab (free T4 GPU) or any Jupyter environment.
This is the fastest path to running a RAE training experiment:
1. Install deps (~2 min)
2. Generate RAE-structured dataset (~1 min)
3. Fine-tune with AutoTrain (~15-30 min on T4)
4. Evaluate before/after (~5 min)
The handwriting effect: training on RAE-structured data installs
richer internal representations, producing faster and more capable
inference β€” just as handwriting's slow encoding produces fast recall.
═══════════════════════════════════════════════════════════════
"""
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 1: Install Dependencies β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# !pip install -q autotrain-advanced transformers datasets accelerate
# !pip install -q peft bitsandbytes trl jsonlines anthropic
# !pip install -q wandb tensorboard
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 2: Configuration β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
import os

# ── REQUIRED: Set your tokens ──
# Get HF token from: https://huggingface.co/settings/tokens
# SECURITY: never commit a real token to a notebook/repo; prefer setting it
# in the environment (e.g. Colab secrets) before running this cell.
# setdefault() keeps a token that is already present in the environment
# instead of clobbering it with the placeholder string.
os.environ.setdefault("HF_TOKEN", "YOUR_HF_WRITE_TOKEN")
os.environ.setdefault("HF_USERNAME", "YOUR_HF_USERNAME")

# Optional: Set Anthropic key for high-quality dataset generation
# os.environ["ANTHROPIC_API_KEY"] = "YOUR_ANTHROPIC_KEY"

# ── Training Configuration ──
BASE_MODEL = "HuggingFaceTB/SmolLM2-1.7B-Instruct"  # Small model for fast iteration
# BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"           # Better model, needs more VRAM
# BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"   # Good balance
PROJECT_NAME = "rae-cognitive-v1"   # Also used as the HF Hub repo name in Cell 8
EPOCHS = 2
LORA_R = 16                 # Lower rank for faster training
BATCH_SIZE = 1              # Per-device; effective batch = BATCH_SIZE * GRADIENT_ACCUM
GRADIENT_ACCUM = 4
LEARNING_RATE = 5e-6
MAX_SEQ_LENGTH = 2048       # Shorter for faster iteration; use 4096 for production
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 3: RAE System Prompt (The Cognitive Installation) β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# System prompt reused in three places: every training example (Cell 4),
# optional Claude-based data generation (Cell 5), and evaluation (Cell 8).
# evaluate_rae_response() matches the exact <PHASE>...</PHASE> tag names
# below, so do not rename the tags here without updating that function.
RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must
work through all four phases of the Recursive Abstraction Engine. Each phase
serves a distinct cognitive function β€” you cannot skip phases or collapse them.
<SATURATION>
Immerse in the problem space. Observe everything without categorizing.
- What are all the elements, constraints, relationships?
- What doesn't fit expected patterns? Flag anomalies.
- Encode the problem through multiple lenses (structural, temporal, causal).
Terminate when you can predict system behavior without conscious reasoning.
</SATURATION>
<ABSTRACTION>
Extract the minimal structure that explains your saturated understanding.
- What is the isomorphic structure across domains?
- What invariant is preserved under transformation?
- Compress: explain the underlying mechanism in one sentence.
- What assumption are we making that we don't realize?
</ABSTRACTION>
<DESCENT>
Project the abstract structure into concrete instantiations.
- If this model is correct, what must also be true?
- What's the most counterintuitive prediction?
- Build the simplest implementation that tests the core assumption.
- What would prove this wrong?
</DESCENT>
<INTEGRATION>
Incorporate results and prepare the knowledge update.
- What did we learn that changes our prior understanding?
- What's the confidence level and what would change it?
- Where should we look more deeply next?
- What's the new question this raises?
</INTEGRATION>"""
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 4: Generate RAE Training Dataset β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
import json
import random
from pathlib import Path
# Seed problems across 4 domains. Each entry carries the user prompt and a
# "domain" label that is stored in the example metadata (16 seeds total:
# 5 code, 4 reasoning, 4 analysis, 3 creative).
SEED_PROBLEMS = [
    # code: implementation tasks
    {"prompt": "Implement an LRU cache with O(1) get/put that supports TTL expiration.", "domain": "code"},
    {"prompt": "Design a rate limiter supporting sliding window, token bucket, and leaky bucket through a unified interface.", "domain": "code"},
    {"prompt": "Write a parser for expressions with variables, arithmetic, and short-circuit boolean logic.", "domain": "code"},
    {"prompt": "Implement a B-tree with configurable order supporting range queries.", "domain": "code"},
    {"prompt": "Build a mark-and-sweep garbage collector that handles cyclic references.", "domain": "code"},
    # reasoning: explain-the-paradox problems
    {"prompt": "A hospital's mortality rate is 2x average but every surgeon is at or below average. Explain and recommend.", "domain": "reasoning"},
    {"prompt": "Teams using microservices ship 40% faster in year 1 but 20% slower by year 3. Explain the crossover.", "domain": "reasoning"},
    {"prompt": "Three AI labs show 99.9% safety benchmarks yet have public failures. Analyze the gap.", "domain": "reasoning"},
    {"prompt": "A city adds bike lanes and cycling fatalities increase 15% in year 1. Should they remove the lanes?", "domain": "reasoning"},
    # analysis: quantitative/diagnostic tasks
    {"prompt": "Medicare Advantage MLRs increase 200-400bps YoY while membership grows 8-12%. Structural or cyclical?", "domain": "analysis"},
    {"prompt": "A SaaS company's logo retention is 95% but NRR is 78%. Diagnose the dynamics.", "domain": "analysis"},
    {"prompt": "Compare transformer attention vs state space models for long clinical documents.", "domain": "analysis"},
    {"prompt": "A fraud model has 98% accuracy but 10,000 false positives/day. FP cost=$50, FN cost=$5,000. Optimize.", "domain": "analysis"},
    # creative: open-ended design tasks
    {"prompt": "Design a cognitive architecture for one-shot skill learning from video. Combine motor learning, program synthesis, and cognitive psychology.", "domain": "creative"},
    {"prompt": "Create an operationally testable framework for evaluating genuine understanding vs pattern matching in AI.", "domain": "creative"},
    {"prompt": "Propose a distributed consensus protocol using swarm intelligence. Define it formally.", "domain": "creative"},
]
def make_rae_example(problem):
    """Build one chat-format (system/user/assistant) training example.

    The assistant turn is a fixed four-phase RAE template, seeded with the
    first 100 characters of the problem prompt. Domain label is kept in
    the example metadata for later filtering.
    """
    assistant_turn = f"""<SATURATION>
Let me immerse deeply in this problem space without jumping to conclusions.
**Elements**: {problem['prompt'][:100]}...
The key components to observe are the explicit requirements, implicit constraints,
edge cases, and the domain assumptions that most people take for granted.
**Anomaly detection**: What patterns here defy standard expectations?
What would I find surprising if it turned out to be false?
**Multi-lens encoding**:
- Structural: How do the components relate topologically?
- Temporal: How does this system evolve under load/scale/time?
- Causal: What are the upstream drivers and downstream effects?
I can now model this system's behavior intuitively.
</SATURATION>
<ABSTRACTION>
Compressing everything from saturation into minimal structure:
**Core isomorphism**: This problem has the same shape as [analogous domain pattern].
The underlying mechanism is the tension between [fundamental tradeoff].
**Invariant**: What stays constant across all variations is [key invariant].
**One-sentence compression**: The solution requires [compressed insight].
**Hidden assumption**: We're implicitly assuming [hidden constraint] which may not hold.
</ABSTRACTION>
<DESCENT>
Projecting the abstraction into concrete, testable form:
**If the model is correct**:
1. Then [concrete prediction A] must hold
2. And [concrete prediction B] follows
3. Counterintuitively, [surprising consequence]
**Implementation**:
Here is the concrete solution/analysis/code based on the abstract structure...
[Detailed implementation appropriate to the domain]
**Falsification**: This would be wrong if [specific testable condition].
</DESCENT>
<INTEGRATION>
**Model update**: This deepens my understanding because [specific learning].
**Confidence**: Medium-high. Would increase with [specific evidence].
Would decrease if [specific disconfirmation].
**Next cycle**: The new question this raises is [specific next question].
**Transferable principle**: The general pattern here is [abstracted learning].
</INTEGRATION>"""
    conversation = [
        {"role": "system", "content": RAE_SYSTEM_PROMPT},
        {"role": "user", "content": problem["prompt"]},
        {"role": "assistant", "content": assistant_turn},
    ]
    return {"messages": conversation, "metadata": {"domain": problem["domain"]}}
# Generate dataset with variations
def _write_jsonl(path, examples):
    """Write *examples* to *path*, one JSON object per line (JSONL)."""
    with open(path, "w") as f:
        for ex in examples:
            f.write(json.dumps(ex) + "\n")

print("Generating RAE training dataset...")
# Use the Path import from Cell 4 (it was previously unused).
data_dir = Path("data/rae_training_data")
data_dir.mkdir(parents=True, exist_ok=True)

all_examples = []
for problem in SEED_PROBLEMS:
    # Original + 2 variations = 3x data (same template; variation index is
    # recorded in metadata so later pipelines can dedupe/weight them).
    for v in range(3):
        example = make_rae_example(problem)
        example["metadata"]["variation"] = v
        all_examples.append(example)

# 90/10 train/validation split after shuffling.
random.shuffle(all_examples)
split = int(len(all_examples) * 0.9)
train = all_examples[:split]
val = all_examples[split:]
_write_jsonl(data_dir / "train.jsonl", train)
_write_jsonl(data_dir / "validation.jsonl", val)
print(f"βœ“ Generated {len(train)} train + {len(val)} validation examples")
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 5: Optional β€” Upgrade Dataset with Anthropic API β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# Uncomment this cell to generate HIGH-QUALITY examples using Claude
# This produces genuinely worked-through RAE reasoning, not templates
# NOTE(review): the JSONL writes below previously used "\\n", which β€” once
# this cell is uncommented β€” writes a literal backslash-n instead of a
# newline and corrupts the JSONL files. Fixed to a real "\n" escape.
"""
import anthropic
client = anthropic.Anthropic()  # Uses ANTHROPIC_API_KEY env var

def generate_rae_with_claude(problem):
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        system=RAE_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": problem["prompt"]}],
    )
    return {
        "messages": [
            {"role": "system", "content": RAE_SYSTEM_PROMPT},
            {"role": "user", "content": problem["prompt"]},
            {"role": "assistant", "content": response.content[0].text},
        ],
        "metadata": {"domain": problem["domain"], "method": "claude-api"}
    }

# Generate high-quality examples
api_examples = []
for i, problem in enumerate(SEED_PROBLEMS):
    print(f" [{i+1}/{len(SEED_PROBLEMS)}] {problem['prompt'][:50]}...")
    try:
        ex = generate_rae_with_claude(problem)
        api_examples.append(ex)
    except Exception as e:
        print(f" Error: {e}")

# Overwrite with API-generated data
if api_examples:
    random.shuffle(api_examples)
    split = int(len(api_examples) * 0.9)
    with open("data/rae_training_data/train.jsonl", "w") as f:
        for ex in api_examples[:split]:
            f.write(json.dumps(ex) + "\n")
    with open("data/rae_training_data/validation.jsonl", "w") as f:
        for ex in api_examples[split:]:
            f.write(json.dumps(ex) + "\n")
    print(f"βœ“ Upgraded to {len(api_examples)} Claude-generated examples")
"""
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 6: Write AutoTrain Config β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
import yaml

# AutoTrain Advanced llm-sft config, consumed by `autotrain --config`.
config = {
    "task": "llm-sft",
    "base_model": BASE_MODEL,
    "project_name": PROJECT_NAME,
    "log": "tensorboard",
    "backend": "local",
    "data": {
        "path": "data/rae_training_data",
        "train_split": "train",
        # Use validation.jsonl generated in Cell 4. Previously None, which
        # silently discarded the validation set this script just wrote.
        "valid_split": "validation",
        "chat_template": "tokenizer",
        "column_mapping": {
            "text_column": "messages",
        },
    },
    "params": {
        "block_size": MAX_SEQ_LENGTH,
        "model_max_length": MAX_SEQ_LENGTH,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LEARNING_RATE,
        # QLoRA: 4-bit base weights + LoRA adapters on all linear layers.
        "peft": True,
        "quantization": "int4",
        "target_modules": "all-linear",
        "lora_r": LORA_R,
        "lora_alpha": LORA_R * 2,  # common alpha = 2 * r heuristic
        "lora_dropout": 0.05,
        "padding": "right",
        "optimizer": "paged_adamw_8bit",
        "scheduler": "cosine",
        "gradient_accumulation": GRADIENT_ACCUM,
        "mixed_precision": "bf16",
        "merge_adapter": True,  # push a merged (standalone) model to the Hub
    },
    "hub": {
        "username": os.environ.get("HF_USERNAME", ""),
        "token": os.environ.get("HF_TOKEN", ""),
        "push_to_hub": True,
    },
}

with open("rae_autotrain_config.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

# (First print was an f-string with no placeholders; plain string is enough.)
print("βœ“ Config written: rae_autotrain_config.yaml")
print(f" Base model: {BASE_MODEL}")
print(f" LoRA rank: {LORA_R}")
print(f" Epochs: {EPOCHS}")
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 7: RUN TRAINING β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
# Uncomment and run:
# !autotrain --config rae_autotrain_config.yaml
# Or run from Python:
"""
import subprocess
result = subprocess.run(
    ["autotrain", "--config", "rae_autotrain_config.yaml"],
    capture_output=False,
)
"""
# Emit the two status lines (kept verbatim) from a tuple for easy editing.
for _status_line in (
    "Ready to train! Uncomment the training command above and run.",
    f"Expected time on T4: ~15-30 min for {EPOCHS} epochs",
):
    print(_status_line)
# ╔═══════════════════════════════════════════════════════════╗
# β•‘ CELL 8: Evaluate β€” Before vs After β•‘
# β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
def evaluate_rae_response(response_text: str) -> dict:
    """Score a model response for RAE structure.

    Extracts the four <PHASE>...</PHASE> blocks and reports how many are
    present, the saturation/abstraction word counts, and the
    abstraction-to-saturation compression ratio.
    """
    import re

    def _phase_body(tag: str) -> str:
        # Non-greedy DOTALL match: grab the first block for this tag,
        # stripped of surrounding whitespace; empty string if absent.
        hit = re.search(f"<{tag}>(.*?)</{tag}>", response_text, re.DOTALL)
        return hit.group(1).strip() if hit else ""

    phases = {
        tag: _phase_body(tag)
        for tag in ("SATURATION", "ABSTRACTION", "DESCENT", "INTEGRATION")
    }
    found = sum(bool(body) for body in phases.values())
    n_sat = len(phases["SATURATION"].split())
    n_abs = len(phases["ABSTRACTION"].split())
    return {
        "phases_complete": f"{found}/4",
        "saturation_words": n_sat,
        "abstraction_words": n_abs,
        # max(..., 1) guards against division by zero when saturation is empty.
        "compression_ratio": round(n_abs / max(n_sat, 1), 2),
        "descent_present": bool(phases["DESCENT"]),
        "integration_present": bool(phases["INTEGRATION"]),
    }
# Test with the trained model:
# NOTE(review): the "\\n" in the Evaluation print below previously produced
# a literal backslash-n once this cell was uncommented; fixed to a real
# newline escape.
"""
from transformers import pipeline

# Load trained model
model_id = f"{os.environ['HF_USERNAME']}/{PROJECT_NAME}"
pipe = pipeline("text-generation", model=model_id, torch_dtype="auto", device_map="auto")

test_prompt = "A SaaS company's logo retention is 95% but NRR is 78%. Diagnose."
messages = [
    {"role": "system", "content": RAE_SYSTEM_PROMPT},
    {"role": "user", "content": test_prompt},
]
output = pipe(messages, max_new_tokens=2048, temperature=0.7)
response = output[0]["generated_text"][-1]["content"]
print("=== RAE Response ===")
print(response[:500])
print("\n=== Evaluation ===")
print(evaluate_rae_response(response))
"""

print("\n" + "=" * 60)
print(" RAE TRAINING QUICKSTART COMPLETE")
print(" 1. Run Cell 7 to start training")
print(" 2. Run Cell 8 to evaluate results")
print(" The hand was slow so the mind could be fast later.")
print("=" * 60)