rae-training / src /dataset_generator.py

Upload src/dataset_generator.py with huggingface_hub

6970bcf verified 8 days ago

20.4 kB

	"""
	RAE Dataset Generator
	═══════════════════════════════════════════════════════════════
	Generates training data structured as RAE cognitive cycles.

	The core innovation: instead of flat Q→A pairs, each training
	example forces the model through 4-phase generative reconstruction:

	SATURATION → ABSTRACTION → DESCENT → INTEGRATION

	This is the ML equivalent of handwriting — forced multi-modal
	sequential reconstruction under temporal bottleneck.

	Usage:
	python dataset_generator.py \
	--seed_problems data/seed_problems.jsonl \
	--output data/rae_training_data \
	--num_variations 2 \
	--train_split 0.9
	═══════════════════════════════════════════════════════════════
	"""

	import json
	import os
	import argparse
	import random
	from pathlib import Path
	from typing import Optional
	from tqdm import tqdm

	try:
	import anthropic
	HAS_ANTHROPIC = True
	except ImportError:
	HAS_ANTHROPIC = False

	# ── RAE System Prompt ─────────────────────────────────────────
	# Instruction text for the four-phase RAE protocol. It is used in two places:
	#   * as the `system` argument of the Anthropic API call in
	#     generate_rae_example_with_api(), and
	#   * as the "system" message stored in every emitted training example
	#     (both the API path and the template path).
	# The <SATURATION>/<ABSTRACTION>/<DESCENT>/<INTEGRATION> tags spelled out here
	# are the same tags the API path looks for when validating a model reply.

	RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must
	work through all four phases of the Recursive Abstraction Engine. Each phase
	serves a distinct cognitive function — you cannot skip phases or collapse them.

	## Phase Protocol

	<SATURATION>
	Immerse in the problem space. Observe everything without categorizing.
	- What are all the elements, constraints, relationships?
	- What doesn't fit expected patterns? Flag anomalies.
	- Encode the problem through multiple lenses (structural, temporal, causal).
	- What would surprise you if it weren't true?
	Terminate when you can "predict system behavior without conscious reasoning."
	</SATURATION>

	<ABSTRACTION>
	Extract the minimal structure that explains your saturated understanding.
	- What is the isomorphic structure across domains? ("What else has this shape?")
	- What invariant is preserved under transformation?
	- Compress: explain the underlying mechanism in one sentence.
	- What assumption are we making that we don't realize?
	This phase produces the CORE INSIGHT — the compressed representation.
	</ABSTRACTION>

	<DESCENT>
	Project the abstract structure into concrete instantiations.
	- If this model is correct, what must also be true?
	- What's the most counterintuitive prediction?
	- Build the simplest implementation that tests the core assumption.
	- What would prove this wrong?
	This phase produces CONCRETE OUTPUT — code, solutions, predictions.
	</DESCENT>

	<INTEGRATION>
	Incorporate results and prepare the knowledge update.
	- What did we learn that changes our prior understanding?
	- What's the confidence level and what would change it?
	- Where should we look more deeply next?
	- What's the new question this raises?
	This phase produces META-KNOWLEDGE — transferable understanding.
	</INTEGRATION>

	CRITICAL RULES:
	1. NEVER skip a phase. Each phase's output feeds the next.
	2. Saturation must be genuinely exploratory — not a restatement of the question.
	3. Abstraction must COMPRESS — it should be shorter than Saturation.
	4. Descent must produce concrete, testable output.
	5. Integration must identify what was LEARNED, not just summarize.
	"""

	# ── Domain-Specific Problem Templates ─────────────────────────
	# Maps a domain name ("code", "reasoning", "analysis", "creative") to a list
	# of prompt templates containing `{placeholder}` fields.
	# NOTE(review): nothing in the visible part of this file reads
	# DOMAIN_TEMPLATES or fills these placeholders — presumably they are meant
	# for str.format()-style expansion by a caller elsewhere; confirm before
	# relying on it.

	DOMAIN_TEMPLATES = {
	"code": [
	"Implement {algorithm} in Python. Consider edge cases, performance characteristics, and alternative approaches.",
	"Debug the following code that has a subtle error in its {concept} logic:\n```\n{code_snippet}\n```",
	"Design a data structure that supports {operations} in {complexity} time.",
	"Refactor this function to improve its {quality_attribute}:\n```\n{code_snippet}\n```",
	"Write a system that {system_description} handling {concurrency_pattern}.",
	],
	"reasoning": [
	"A company has {scenario}. What is the optimal strategy considering {constraints}?",
	"Given these observations: {observations}. What is the most likely underlying mechanism?",
	"Two experts disagree about {topic}. Expert A says {claim_a}. Expert B says {claim_b}. Analyze both positions.",
	"You discover that {surprising_fact}. How does this change our understanding of {domain}?",
	"Design an experiment to test whether {hypothesis}.",
	],
	"analysis": [
	"Analyze the competitive dynamics in {industry} considering {factors}.",
	"A {entity_type} is showing {metric_pattern}. Diagnose the root causes and recommend interventions.",
	"Compare {approach_a} vs {approach_b} for solving {problem_class}. When would you choose each?",
	"Model the second-order effects of {policy_change} on {system}.",
	"Evaluate the risks and opportunities of {strategy} in {context}.",
	],
	"creative": [
	"Design a novel approach to {problem} by combining insights from {domain_a} and {domain_b}.",
	"What would a solution to {challenge} look like if we inverted all standard assumptions?",
	"Create a framework for {task} that handles {edge_case} gracefully.",
	"Propose three fundamentally different architectures for {system}. Analyze tradeoffs.",
	"Synthesize {concept_a}, {concept_b}, and {concept_c} into a unified theory.",
	],
	}

	# ── Seed Problem Generators ───────────────────────────────────
	# Built-in seed problems. Each entry is a dict with three keys:
	#   "prompt"     – the user-facing problem text (required by the generators),
	#   "domain"     – copied into the example's metadata,
	#   "difficulty" – copied into the example's metadata.

	# Seed problems for the "code" domain.
	CODE_PROBLEMS = [
	{
	"prompt": "Implement a lock-free concurrent hash map in Python that supports linearizable get/put/delete operations.",
	"domain": "code",
	"difficulty": "hard",
	},
	{
	"prompt": "Write a function that determines if a given computational graph has a cycle, and if so, returns the minimal cycle. Handle both directed and undirected edges.",
	"domain": "code",
	"difficulty": "medium",
	},
	{
	"prompt": "Implement an LRU cache with O(1) get/put that also supports TTL (time-to-live) expiration on individual entries.",
	"domain": "code",
	"difficulty": "medium",
	},
	{
	"prompt": "Design and implement a rate limiter that supports sliding window, token bucket, and leaky bucket algorithms through a unified interface.",
	"domain": "code",
	"difficulty": "hard",
	},
	{
	"prompt": "Write a parser for a simple expression language that supports variables, arithmetic, comparisons, and short-circuit boolean logic. Include proper error messages with line/column information.",
	"domain": "code",
	"difficulty": "hard",
	},
	]

	# Seed problems for the "reasoning" domain (same prompt/domain/difficulty
	# dict layout as the other seed lists).
	REASONING_PROBLEMS = [
	{
	"prompt": "A hospital notices that its mortality rate for a specific surgery is 2x the national average, but every individual surgeon performs at or below the national average. Explain this paradox and recommend what the hospital should do.",
	"domain": "reasoning",
	"difficulty": "hard",
	},
	{
	"prompt": "A startup has 18 months of runway. They can either (A) build a broader product that serves 3 market segments with 60% fit each, or (B) build a deep product that serves 1 segment with 95% fit but requires that segment to grow 3x. Which should they choose and why?",
	"domain": "reasoning",
	"difficulty": "medium",
	},
	{
	"prompt": "You observe that teams using microservices ship features 40% faster than monolith teams in year 1, but 20% slower by year 3. What explains this crossover pattern and what does it imply for architecture decisions?",
	"domain": "reasoning",
	"difficulty": "hard",
	},
	{
	"prompt": "Three AI labs release safety benchmarks showing their models are 99.9% safe. Yet all three have had notable public safety failures. Analyze the gap between benchmark performance and real-world safety.",
	"domain": "reasoning",
	"difficulty": "hard",
	},
	]

	# Seed problems for the "analysis" domain (same prompt/domain/difficulty
	# dict layout as the other seed lists).
	ANALYSIS_PROBLEMS = [
	{
	"prompt": "Medicare Advantage plans are seeing MLRs increase by 200-400 basis points year over year while membership grows. Analyze whether this is a structural or cyclical phenomenon and what it implies for the healthcare technology vendor ecosystem.",
	"domain": "analysis",
	"difficulty": "hard",
	},
	{
	"prompt": "A SaaS company's logo retention is 95% but net revenue retention is 78%. Diagnose the likely dynamics and propose a measurement framework to identify the root causes.",
	"domain": "analysis",
	"difficulty": "medium",
	},
	{
	"prompt": "Compare transformer attention mechanisms vs. state space models (Mamba-style) for processing long clinical documents. When is each approach superior and why?",
	"domain": "analysis",
	"difficulty": "hard",
	},
	]

	# Seed problems for the "creative" domain (same prompt/domain/difficulty
	# dict layout as the other seed lists).
	CREATIVE_PROBLEMS = [
	{
	"prompt": "Design a cognitive architecture for an AI agent that can learn new skills from watching a single demonstration video. Combine insights from motor learning theory, program synthesis, and cognitive psychology.",
	"domain": "creative",
	"difficulty": "hard",
	},
	{
	"prompt": "Propose a novel approach to distributed consensus that uses biological swarm intelligence principles instead of traditional leader election. Define the protocol formally.",
	"domain": "creative",
	"difficulty": "hard",
	},
	{
	"prompt": "Create a framework for evaluating whether an AI system has developed genuine understanding vs. sophisticated pattern matching. Your framework must be operationally testable.",
	"domain": "creative",
	"difficulty": "hard",
	},
	]

	# Concatenation of the four built-in seed lists, in code → reasoning →
	# analysis → creative order. main() uses this when no seed file is loaded.
	ALL_SEED_PROBLEMS = CODE_PROBLEMS + REASONING_PROBLEMS + ANALYSIS_PROBLEMS + CREATIVE_PROBLEMS


	def generate_rae_example_with_api(
	    problem: dict,
	    client: "anthropic.Anthropic",
	    model: str = "claude-sonnet-4-20250514",
	) -> Optional[dict]:
	    """Generate a single RAE-structured training example using the Anthropic API.

	    Args:
	        problem: Seed problem dict. ``problem["prompt"]`` is sent as the user
	            message; optional ``"domain"`` and ``"difficulty"`` keys are
	            copied into the example metadata (defaults: ``"general"`` and
	            ``"medium"``).
	        client: Anthropic client whose ``messages.create`` is called.
	        model: Model identifier passed through to the API.

	    Returns:
	        A dict with ``"messages"`` (system / user / assistant chat turns) and
	        ``"metadata"``, or ``None`` when the API call fails or the reply does
	        not contain all four RAE phases in the expected order.
	    """
	    try:
	        response = client.messages.create(
	            model=model,
	            max_tokens=4096,
	            system=RAE_SYSTEM_PROMPT,
	            messages=[
	                {"role": "user", "content": problem["prompt"]}
	            ],
	        )
	        # A reply can carry several content blocks and the first one is not
	        # guaranteed to be text, so collect every text block rather than
	        # indexing content[0].
	        assistant_text = "".join(
	            block.text
	            for block in response.content
	            if getattr(block, "type", None) == "text"
	        )
	    except Exception as e:
	        # Deliberately best-effort: one failed problem must not abort the
	        # whole generation run; the caller skips examples that come back None.
	        print(f" ✗ API error: {e}")
	        return None

	    # Reject replies where a phase is missing, unclosed, or out of sequence.
	    # (A reply truncated at max_tokens fails here too, because the closing
	    # </INTEGRATION> tag is absent.)
	    if not _rae_phases_in_order(assistant_text):
	        print(f" ⚠ Incomplete phases for: {problem['prompt'][:50]}...")
	        return None

	    # Format as chat messages for SFT training
	    return {
	        "messages": [
	            {"role": "system", "content": RAE_SYSTEM_PROMPT},
	            {"role": "user", "content": problem["prompt"]},
	            {"role": "assistant", "content": assistant_text},
	        ],
	        "metadata": {
	            "domain": problem.get("domain", "general"),
	            "difficulty": problem.get("difficulty", "medium"),
	            "rae_version": "1.0",
	            "phases_present": 4,
	            # Mirrors the "generation_method" key written by the template path.
	            "generation_method": "api",
	        },
	    }


	def _rae_phases_in_order(text: str) -> bool:
	    """Return True if *text* opens and closes all four RAE phases in sequence."""
	    cursor = 0
	    for phase in ("SATURATION", "ABSTRACTION", "DESCENT", "INTEGRATION"):
	        for tag in (f"<{phase}>", f"</{phase}>"):
	            position = text.find(tag, cursor)
	            if position == -1:
	                return False
	            # Continue searching after this tag so order is enforced.
	            cursor = position + len(tag)
	    return True


	def generate_rae_example_template(problem: dict) -> dict:
	    """
	    Build a placeholder RAE training example without calling any API.

	    The assistant turn is a fixed four-phase skeleton whose bracketed
	    slots are left unfilled; only the problem's domain is interpolated.
	    The result has the same "messages"/"metadata" layout as the API path,
	    with metadata["generation_method"] set to "template".
	    """
	    user_prompt = problem["prompt"]
	    domain = problem.get("domain", "general")
	    difficulty = problem.get("difficulty", "medium")

	    # Fixed phase skeleton; the bracketed text is meant to be replaced later
	    # by API output or human-written content.
	    skeleton = f"""<SATURATION>
	Let me immerse in this problem space without jumping to conclusions.

	Elements present: [The problem involves {domain} concepts. Key components include...]
	Constraints identified: [Explicit and implicit constraints...]
	Anomalies flagged: [What doesn't fit expected patterns...]
	Multi-lens encoding:
	- Structural view: [How the components relate...]
	- Temporal view: [How this evolves over time...]
	- Causal view: [What drives what...]
	Surprise test: [What would surprise me if it weren't true...]

	I can now predict the system's behavior intuitively.
	</SATURATION>

	<ABSTRACTION>
	The minimal structure that explains everything above:

	Core isomorphism: [This has the same shape as...]
	Invariant: [What stays constant when everything else changes is...]
	One-sentence compression: [The underlying mechanism is...]
	Hidden assumption: [We're assuming X without realizing it...]

	The key insight is that [compressed representation of the solution space].
	</ABSTRACTION>

	<DESCENT>
	Projecting the abstract structure into concrete form:

	Implication chain: If the model above is correct, then:
	1. [Concrete prediction/implementation...]
	2. [Second-order consequence...]
	3. [Counterintuitive prediction...]

	Implementation:
	[Concrete code/solution/analysis...]

	Falsification test: This would be wrong if [specific condition].
	</DESCENT>

	<INTEGRATION>
	Model update: This changes my understanding of {domain} because [specific insight].
	Confidence: [Level and what would change it]
	Next cycle target: The new question this raises is [specific question].
	Transferable principle: [What generalizes beyond this specific problem].
	</INTEGRATION>"""

	    conversation = [
	        {"role": "system", "content": RAE_SYSTEM_PROMPT},
	        {"role": "user", "content": user_prompt},
	        {"role": "assistant", "content": skeleton},
	    ]
	    tags = {
	        "domain": domain,
	        "difficulty": difficulty,
	        "rae_version": "1.0",
	        "phases_present": 4,
	        "generation_method": "template",
	    }
	    return {"messages": conversation, "metadata": tags}


	def augment_with_variations(example: dict, num_variations: int = 2) -> list[dict]:
	    """
	    Return the example followed by tagged copies of it.

	    The first element is the original object itself. Each further element
	    is an independent JSON round-trip copy whose metadata gains
	    "variation_strategy" and "variation_index" (1-based). The message
	    content of the copies is not rewritten here; only the metadata differs.
	    At most one copy per known strategy is produced, so num_variations is
	    effectively capped at four.
	    """
	    strategy_names = (
	        "rephrase_problem",      # same problem, different framing
	        "increase_constraints",  # add constraints to force deeper reasoning
	        "shift_domain",          # apply same structure to a different domain
	        "invert_question",       # ask the opposite question
	    )

	    # Negative requests yield no copies, matching an empty range().
	    wanted = strategy_names[:max(num_variations, 0)]

	    results = [example]
	    for position, strategy in enumerate(wanted, start=1):
	        clone = json.loads(json.dumps(example))  # deep copy via JSON
	        clone_meta = clone["metadata"]
	        clone_meta["variation_strategy"] = strategy
	        clone_meta["variation_index"] = position
	        results.append(clone)
	    return results


	def create_dataset(
	    seed_problems: list[dict],
	    output_dir: str,
	    use_api: bool = False,
	    api_model: str = "claude-sonnet-4-20250514",
	    num_variations: int = 2,
	    train_split: float = 0.9,
	):
	    """Create the full RAE training dataset.

	    Generates one example per seed problem (through the Anthropic API when
	    available and requested, otherwise from the offline template), expands
	    each with augment_with_variations(), shuffles, splits, and writes
	    ``train.jsonl``, ``validation.jsonl`` and ``metadata.json`` into
	    ``output_dir`` (created if missing).

	    Args:
	        seed_problems: Seed problem dicts; each needs a ``"prompt"`` key.
	        output_dir: Directory that receives the three output files.
	        use_api: Request API generation. Falls back to templates, with a
	            warning, when the ``anthropic`` package is not installed or
	            ``ANTHROPIC_API_KEY`` is unset.
	        api_model: Model identifier forwarded to the API path.
	        num_variations: Variations requested per generated example.
	        train_split: Fraction of the shuffled examples written to the
	            training file; the remainder goes to validation.

	    Returns:
	        ``(train_data, eval_data)`` as two lists of example dicts.
	    """
	    output_path = Path(output_dir)
	    output_path.mkdir(parents=True, exist_ok=True)

	    client = None
	    if use_api:
	        # Every fallback clears use_api so the banner and metadata.json
	        # report the method that is really used.
	        if not HAS_ANTHROPIC:
	            print("⚠ anthropic package not installed, falling back to templates")
	            use_api = False
	        else:
	            api_key = os.environ.get("ANTHROPIC_API_KEY")
	            if api_key:
	                client = anthropic.Anthropic(api_key=api_key)
	                print("✓ Anthropic API client initialized")
	            else:
	                print("⚠ ANTHROPIC_API_KEY not set, falling back to templates")
	                use_api = False

	    all_examples = []

	    print(f"\n{'═' * 60}")
	    print(" RAE Dataset Generator")
	    print(f" Problems: {len(seed_problems)}")
	    print(f" Variations per problem: {num_variations}")
	    print(f" Expected total: ~{len(seed_problems) * (1 + num_variations)}")
	    print(f" Generation method: {'API' if use_api else 'Template'}")
	    print(f"{'═' * 60}\n")

	    for problem in tqdm(seed_problems, desc="Generating RAE examples"):
	        if client is not None:
	            example = generate_rae_example_with_api(problem, client, api_model)
	        else:
	            example = generate_rae_example_template(problem)

	        # The API path returns None on failure; such problems are skipped.
	        if example:
	            all_examples.extend(augment_with_variations(example, num_variations))

	    # NOTE(review): variations of one seed currently share identical message
	    # content, so shuffling before the split can put copies of the same
	    # example in both train and validation. Consider splitting per seed.
	    random.shuffle(all_examples)

	    # Split
	    split_idx = int(len(all_examples) * train_split)
	    train_data = all_examples[:split_idx]
	    eval_data = all_examples[split_idx:]

	    # Write JSONL files (explicit UTF-8 so output does not depend on locale)
	    train_path = output_path / "train.jsonl"
	    eval_path = output_path / "validation.jsonl"

	    for path, rows in ((train_path, train_data), (eval_path, eval_data)):
	        with open(path, "w", encoding="utf-8") as f:
	            f.writelines(json.dumps(row) + "\n" for row in rows)

	    # Write metadata
	    metadata = {
	        "total_examples": len(all_examples),
	        "train_examples": len(train_data),
	        "eval_examples": len(eval_data),
	        # Sorted so metadata.json is stable across runs; key=str tolerates
	        # non-string domain values coming from user-supplied seed files.
	        "domains": sorted({e["metadata"]["domain"] for e in all_examples}, key=str),
	        "rae_version": "1.0",
	        "generation_method": "api" if use_api else "template",
	        "methodology": "RAE-as-training-time-cognitive-installation",
	        "description": (
	            "Training data structured as 4-phase RAE cognitive cycles. "
	            "Each example forces the model through Saturation → Abstraction → "
	            "Descent → Integration, creating the ML equivalent of handwriting's "
	            "multi-circuit co-activation under temporal bottleneck."
	        ),
	    }

	    with open(output_path / "metadata.json", "w", encoding="utf-8") as f:
	        json.dump(metadata, f, indent=2)

	    print(f"\n{'═' * 60}")
	    print(" Dataset Generated")
	    print(f" Train: {len(train_data)} examples → {train_path}")
	    print(f" Eval: {len(eval_data)} examples → {eval_path}")
	    print(f" Metadata → {output_path / 'metadata.json'}")
	    print(f"{'═' * 60}\n")

	    return train_data, eval_data


	def main():
	    """Command-line entry point: parse flags, load seed problems, build the dataset.

	    Seed problems are read from the ``--seed_problems`` JSONL file (one JSON
	    object per line) when that file exists; otherwise the built-in
	    ALL_SEED_PROBLEMS list is used. All other flags are forwarded to
	    create_dataset().
	    """
	    parser = argparse.ArgumentParser(description="RAE Dataset Generator")
	    parser.add_argument("--seed_problems", type=str, default=None,
	                        help="Path to seed problems JSONL file")
	    parser.add_argument("--output", type=str, default="data/rae_training_data",
	                        help="Output directory for training data")
	    parser.add_argument("--use_api", action="store_true",
	                        help="Use Anthropic API for high-quality generation")
	    parser.add_argument("--api_model", type=str, default="claude-sonnet-4-20250514",
	                        help="Anthropic model to use for generation")
	    parser.add_argument("--num_variations", type=int, default=2,
	                        help="Number of variations per seed problem")
	    parser.add_argument("--train_split", type=float, default=0.9,
	                        help="Fraction of data for training")

	    args = parser.parse_args()

	    # Load seed problems
	    if args.seed_problems and Path(args.seed_problems).exists():
	        # Explicit UTF-8 so prompts with non-ASCII text load the same on every
	        # platform; blank lines (e.g. a trailing empty line) are skipped
	        # instead of crashing json.loads.
	        with open(args.seed_problems, encoding="utf-8") as f:
	            seed_problems = [json.loads(line) for line in f if line.strip()]
	        print(f"Loaded {len(seed_problems)} seed problems from {args.seed_problems}")
	    else:
	        if args.seed_problems:
	            # A path was given but does not exist: say so instead of silently
	            # switching to the built-in problems.
	            print(f"⚠ Seed problems file not found: {args.seed_problems}")
	        seed_problems = ALL_SEED_PROBLEMS
	        print(f"Using {len(seed_problems)} built-in seed problems")

	    create_dataset(
	        seed_problems=seed_problems,
	        output_dir=args.output,
	        use_api=args.use_api,
	        api_model=args.api_model,
	        num_variations=args.num_variations,
	        train_split=args.train_split,
	    )


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()