"""
RAE Dataset Generator
═══════════════════════════════════════════════════════════════
Generates training data structured as RAE cognitive cycles.

The core innovation: instead of flat Q→A pairs, each training
example forces the model through 4-phase generative reconstruction:

    SATURATION → ABSTRACTION → DESCENT → INTEGRATION

This is the ML equivalent of handwriting — forced multi-modal
sequential reconstruction under temporal bottleneck.

Usage:
    python dataset_generator.py \
        --seed_problems data/seed_problems.jsonl \
        --output data/rae_training_data \
        --num_variations 2 \
        --train_split 0.9 \
        --use_api
═══════════════════════════════════════════════════════════════
"""
|
|
| import json |
| import os |
| import argparse |
| import random |
| from pathlib import Path |
| from typing import Optional |
| from tqdm import tqdm |
|
|
| try: |
| import anthropic |
| HAS_ANTHROPIC = True |
| except ImportError: |
| HAS_ANTHROPIC = False |
|
|
| |
|
|
# System prompt describing the four-phase RAE protocol. It is passed as the
# `system` argument of the API call and is also stored as the system message of
# every emitted training example, so any edit here changes the training data.
# The dashes below are real em dashes (U+2014); earlier copies of this text
# carried a mis-decoded "β" in their place.
RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must
work through all four phases of the Recursive Abstraction Engine. Each phase
serves a distinct cognitive function — you cannot skip phases or collapse them.

## Phase Protocol

<SATURATION>
Immerse in the problem space. Observe everything without categorizing.
- What are all the elements, constraints, relationships?
- What doesn't fit expected patterns? Flag anomalies.
- Encode the problem through multiple lenses (structural, temporal, causal).
- What would surprise you if it weren't true?
Terminate when you can "predict system behavior without conscious reasoning."
</SATURATION>

<ABSTRACTION>
Extract the minimal structure that explains your saturated understanding.
- What is the isomorphic structure across domains? ("What else has this shape?")
- What invariant is preserved under transformation?
- Compress: explain the underlying mechanism in one sentence.
- What assumption are we making that we don't realize?
This phase produces the CORE INSIGHT — the compressed representation.
</ABSTRACTION>

<DESCENT>
Project the abstract structure into concrete instantiations.
- If this model is correct, what must also be true?
- What's the most counterintuitive prediction?
- Build the simplest implementation that tests the core assumption.
- What would prove this wrong?
This phase produces CONCRETE OUTPUT — code, solutions, predictions.
</DESCENT>

<INTEGRATION>
Incorporate results and prepare the knowledge update.
- What did we learn that changes our prior understanding?
- What's the confidence level and what would change it?
- Where should we look more deeply next?
- What's the new question this raises?
This phase produces META-KNOWLEDGE — transferable understanding.
</INTEGRATION>

CRITICAL RULES:
1. NEVER skip a phase. Each phase's output feeds the next.
2. Saturation must be genuinely exploratory — not a restatement of the question.
3. Abstraction must COMPRESS — it should be shorter than Saturation.
4. Descent must produce concrete, testable output.
5. Integration must identify what was LEARNED, not just summarize.
"""
|
|
| |
|
|
# Prompt skeletons grouped by problem domain. Each entry carries `{name}`
# placeholders (presumably filled with str.format-style substitution).
# NOTE(review): nothing in the visible part of this file reads
# DOMAIN_TEMPLATES — confirm whether another module consumes it.
DOMAIN_TEMPLATES = dict(
    code=[
        "Implement {algorithm} in Python. Consider edge cases, performance characteristics, and alternative approaches.",
        "Debug the following code that has a subtle error in its {concept} logic:\n```\n{code_snippet}\n```",
        "Design a data structure that supports {operations} in {complexity} time.",
        "Refactor this function to improve its {quality_attribute}:\n```\n{code_snippet}\n```",
        "Write a system that {system_description} handling {concurrency_pattern}.",
    ],
    reasoning=[
        "A company has {scenario}. What is the optimal strategy considering {constraints}?",
        "Given these observations: {observations}. What is the most likely underlying mechanism?",
        "Two experts disagree about {topic}. Expert A says {claim_a}. Expert B says {claim_b}. Analyze both positions.",
        "You discover that {surprising_fact}. How does this change our understanding of {domain}?",
        "Design an experiment to test whether {hypothesis}.",
    ],
    analysis=[
        "Analyze the competitive dynamics in {industry} considering {factors}.",
        "A {entity_type} is showing {metric_pattern}. Diagnose the root causes and recommend interventions.",
        "Compare {approach_a} vs {approach_b} for solving {problem_class}. When would you choose each?",
        "Model the second-order effects of {policy_change} on {system}.",
        "Evaluate the risks and opportunities of {strategy} in {context}.",
    ],
    creative=[
        "Design a novel approach to {problem} by combining insights from {domain_a} and {domain_b}.",
        "What would a solution to {challenge} look like if we inverted all standard assumptions?",
        "Create a framework for {task} that handles {edge_case} gracefully.",
        "Propose three fundamentally different architectures for {system}. Analyze tradeoffs.",
        "Synthesize {concept_a}, {concept_b}, and {concept_c} into a unified theory.",
    ],
)
|
|
| |
|
|
# Built-in seed problems for the "code" domain. Every entry is a dict with the
# keys "prompt", "domain" and "difficulty"; the pairs below are
# (difficulty, prompt).
CODE_PROBLEMS = [
    {"prompt": text, "domain": "code", "difficulty": level}
    for level, text in (
        (
            "hard",
            "Implement a lock-free concurrent hash map in Python that supports linearizable get/put/delete operations.",
        ),
        (
            "medium",
            "Write a function that determines if a given computational graph has a cycle, and if so, returns the minimal cycle. Handle both directed and undirected edges.",
        ),
        (
            "medium",
            "Implement an LRU cache with O(1) get/put that also supports TTL (time-to-live) expiration on individual entries.",
        ),
        (
            "hard",
            "Design and implement a rate limiter that supports sliding window, token bucket, and leaky bucket algorithms through a unified interface.",
        ),
        (
            "hard",
            "Write a parser for a simple expression language that supports variables, arithmetic, comparisons, and short-circuit boolean logic. Include proper error messages with line/column information.",
        ),
    )
]
|
|
# Built-in seed problems for the "reasoning" domain, in the same
# prompt/domain/difficulty shape; the pairs below are (difficulty, prompt).
REASONING_PROBLEMS = [
    {"prompt": text, "domain": "reasoning", "difficulty": level}
    for level, text in (
        (
            "hard",
            "A hospital notices that its mortality rate for a specific surgery is 2x the national average, but every individual surgeon performs at or below the national average. Explain this paradox and recommend what the hospital should do.",
        ),
        (
            "medium",
            "A startup has 18 months of runway. They can either (A) build a broader product that serves 3 market segments with 60% fit each, or (B) build a deep product that serves 1 segment with 95% fit but requires that segment to grow 3x. Which should they choose and why?",
        ),
        (
            "hard",
            "You observe that teams using microservices ship features 40% faster than monolith teams in year 1, but 20% slower by year 3. What explains this crossover pattern and what does it imply for architecture decisions?",
        ),
        (
            "hard",
            "Three AI labs release safety benchmarks showing their models are 99.9% safe. Yet all three have had notable public safety failures. Analyze the gap between benchmark performance and real-world safety.",
        ),
    )
]
|
|
# Built-in seed problems for the "analysis" domain, in the same
# prompt/domain/difficulty shape; the pairs below are (difficulty, prompt).
ANALYSIS_PROBLEMS = [
    {"prompt": text, "domain": "analysis", "difficulty": level}
    for level, text in (
        (
            "hard",
            "Medicare Advantage plans are seeing MLRs increase by 200-400 basis points year over year while membership grows. Analyze whether this is a structural or cyclical phenomenon and what it implies for the healthcare technology vendor ecosystem.",
        ),
        (
            "medium",
            "A SaaS company's logo retention is 95% but net revenue retention is 78%. Diagnose the likely dynamics and propose a measurement framework to identify the root causes.",
        ),
        (
            "hard",
            "Compare transformer attention mechanisms vs. state space models (Mamba-style) for processing long clinical documents. When is each approach superior and why?",
        ),
    )
]
|
|
# Built-in seed problems for the "creative" domain; all are rated "hard".
CREATIVE_PROBLEMS = [
    {"prompt": text, "domain": "creative", "difficulty": "hard"}
    for text in (
        "Design a cognitive architecture for an AI agent that can learn new skills from watching a single demonstration video. Combine insights from motor learning theory, program synthesis, and cognitive psychology.",
        "Propose a novel approach to distributed consensus that uses biological swarm intelligence principles instead of traditional leader election. Define the protocol formally.",
        "Create a framework for evaluating whether an AI system has developed genuine understanding vs. sophisticated pattern matching. Your framework must be operationally testable.",
    )
]
|
|
# Default problem pool used when no seed file is supplied: the four built-in
# domain lists concatenated in code, reasoning, analysis, creative order.
ALL_SEED_PROBLEMS = [
    *CODE_PROBLEMS,
    *REASONING_PROBLEMS,
    *ANALYSIS_PROBLEMS,
    *CREATIVE_PROBLEMS,
]
|
|
|
|
def _phases_in_order(text: str) -> bool:
    """Return True when all four RAE phase blocks appear in protocol order.

    Each opening tag must be followed by its closing tag, and the blocks must
    come in the sequence SATURATION, ABSTRACTION, DESCENT, INTEGRATION.
    """
    cursor = 0
    for phase in ("SATURATION", "ABSTRACTION", "DESCENT", "INTEGRATION"):
        for tag in (f"<{phase}>", f"</{phase}>"):
            found = text.find(tag, cursor)
            if found == -1:
                return False
            # Continue searching after this tag so later tags cannot match
            # text that precedes it.
            cursor = found + len(tag)
    return True


def generate_rae_example_with_api(
    problem: dict,
    client: "anthropic.Anthropic",
    model: str = "claude-sonnet-4-20250514",
) -> Optional[dict]:
    """Generate a single RAE-structured training example using the Anthropic API.

    Args:
        problem: Seed problem; must contain "prompt", may contain "domain"
            and "difficulty".
        client: Client object exposing ``messages.create``.
        model: Model identifier passed to the API.

    Returns:
        A dict with "messages" (system, user, assistant) and "metadata", or
        ``None`` when the call fails or the response does not contain the four
        phase blocks in order.
    """
    # Deliberately broad: one bad problem or transient API failure is logged
    # and skipped so a long generation run keeps going.
    try:
        response = client.messages.create(
            model=model,
            max_tokens=4096,
            system=RAE_SYSTEM_PROMPT,
            messages=[
                {"role": "user", "content": problem["prompt"]}
            ],
        )

        # A response is a list of content blocks; keep every text block rather
        # than assuming the first block is the only one. Blocks without a
        # `type` attribute are treated as text.
        assistant_text = "".join(
            block.text
            for block in response.content
            if getattr(block, "type", "text") == "text"
        )

        # Presence alone is not enough: the phases have to occur in the
        # protocol's order for the example to be a valid RAE cycle.
        if not _phases_in_order(assistant_text):
            print(f" ⚠ Incomplete phases for: {problem['prompt'][:50]}...")
            return None

        return {
            "messages": [
                {"role": "system", "content": RAE_SYSTEM_PROMPT},
                {"role": "user", "content": problem["prompt"]},
                {"role": "assistant", "content": assistant_text},
            ],
            "metadata": {
                "domain": problem.get("domain", "general"),
                "difficulty": problem.get("difficulty", "medium"),
                "rae_version": "1.0",
                "phases_present": 4,
                # Mirrors the "template" marker written by the offline path.
                "generation_method": "api",
            },
        }

    except Exception as e:
        print(f" ✗ API error: {e}")
        return None
|
|
|
|
def generate_rae_example_template(problem: dict) -> dict:
    """
    Build an RAE-structured training example from a fixed template.

    This is the OFFLINE path — no API required. The assistant turn is a
    four-phase skeleton with bracketed placeholders; only the problem's
    domain is interpolated into it. Use it to create the structural format,
    then optionally enhance with API-generated content.
    """
    prompt = problem["prompt"]
    domain = problem.get("domain", "general")

    assistant_text = f"""<SATURATION>
Let me immerse in this problem space without jumping to conclusions.

**Elements present**: [The problem involves {domain} concepts. Key components include...]
**Constraints identified**: [Explicit and implicit constraints...]
**Anomalies flagged**: [What doesn't fit expected patterns...]
**Multi-lens encoding**:
- Structural view: [How the components relate...]
- Temporal view: [How this evolves over time...]
- Causal view: [What drives what...]
**Surprise test**: [What would surprise me if it weren't true...]

I can now predict the system's behavior intuitively.
</SATURATION>

<ABSTRACTION>
The minimal structure that explains everything above:

**Core isomorphism**: [This has the same shape as...]
**Invariant**: [What stays constant when everything else changes is...]
**One-sentence compression**: [The underlying mechanism is...]
**Hidden assumption**: [We're assuming X without realizing it...]

The key insight is that [compressed representation of the solution space].
</ABSTRACTION>

<DESCENT>
Projecting the abstract structure into concrete form:

**Implication chain**: If the model above is correct, then:
1. [Concrete prediction/implementation...]
2. [Second-order consequence...]
3. [Counterintuitive prediction...]

**Implementation**:
[Concrete code/solution/analysis...]

**Falsification test**: This would be wrong if [specific condition].
</DESCENT>

<INTEGRATION>
**Model update**: This changes my understanding of {domain} because [specific insight].
**Confidence**: [Level and what would change it]
**Next cycle target**: The new question this raises is [specific question].
**Transferable principle**: [What generalizes beyond this specific problem].
</INTEGRATION>"""

    conversation = [
        {"role": "system", "content": RAE_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": assistant_text},
    ]
    details = {
        "domain": domain,
        "difficulty": problem.get("difficulty", "medium"),
        "rae_version": "1.0",
        "phases_present": 4,
        "generation_method": "template",
    }
    return {"messages": conversation, "metadata": details}
|
|
|
|
def augment_with_variations(example: dict, num_variations: int = 2) -> list[dict]:
    """
    Return the example followed by ``num_variations`` tagged copies of it.

    The VARIABILITY PRINCIPLE: No two handwritten letters are identical.
    Each variation is meant to force the model to extract invariant structure
    rather than memorize surface patterns.

    NOTE(review): the copies currently differ from the original only in their
    metadata ("variation_strategy", "variation_index"); the messages are
    duplicated verbatim. Rewriting the content per strategy is still to do.

    Args:
        example: Training example; must be JSON-serializable.
        num_variations: Number of copies to append. Values above the number
            of strategies cycle through the strategies again instead of being
            silently capped; zero or negative values append nothing.

    Returns:
        A list whose first element is ``example`` itself (not a copy),
        followed by independent deep copies.
    """
    strategies = (
        "rephrase_problem",
        "increase_constraints",
        "shift_domain",
        "invert_question",
    )

    variations = [example]
    if num_variations <= 0:
        return variations

    # Serialize once; each json.loads then yields an independent deep copy.
    serialized = json.dumps(example)
    for index in range(1, num_variations + 1):
        clone = json.loads(serialized)
        tags = clone.setdefault("metadata", {})
        tags["variation_strategy"] = strategies[(index - 1) % len(strategies)]
        tags["variation_index"] = index
        variations.append(clone)

    return variations
|
|
|
|
def _write_jsonl(path: Path, examples: list[dict]) -> None:
    """Write one JSON document per line to *path*, encoded as UTF-8."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(example) + "\n" for example in examples)


def create_dataset(
    seed_problems: list[dict],
    output_dir: str,
    use_api: bool = False,
    api_model: str = "claude-sonnet-4-20250514",
    num_variations: int = 2,
    train_split: float = 0.9,
):
    """Create the full RAE training dataset.

    Generates one example per seed problem (through the API when it is usable,
    otherwise from the template), expands each with variations, splits the
    result and writes ``train.jsonl``, ``validation.jsonl`` and
    ``metadata.json`` into ``output_dir``.

    Args:
        seed_problems: Problem dicts; each needs a "prompt" key.
        output_dir: Directory to write into; created if missing.
        use_api: Request API generation. Falls back to templates, with a
            warning, when the ``anthropic`` package or ``ANTHROPIC_API_KEY``
            is missing.
        api_model: Model identifier for API generation.
        num_variations: Variations requested per generated example.
        train_split: Fraction of seed problems assigned to the training set.

    Returns:
        ``(train_data, eval_data)`` as lists of example dicts.

    Raises:
        ValueError: If ``train_split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_split <= 1.0:
        raise ValueError(f"train_split must be within [0, 1], got {train_split}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    client = None
    if use_api:
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if not HAS_ANTHROPIC:
            print("⚠ anthropic package not installed, falling back to templates")
        elif not api_key:
            print("⚠ ANTHROPIC_API_KEY not set, falling back to templates")
        else:
            client = anthropic.Anthropic(api_key=api_key)
            print("✓ Anthropic API client initialized")
        # Keep the flag truthful: the banner and metadata below report it, so
        # it must reflect the method that will really be used.
        use_api = client is not None

    rule = "═" * 60

    print(f"\n{rule}")
    print(" RAE Dataset Generator")
    print(f" Problems: {len(seed_problems)}")
    print(f" Variations per problem: {num_variations}")
    print(f" Expected total: ~{len(seed_problems) * (1 + num_variations)}")
    print(f" Generation method: {'API' if use_api else 'Template'}")
    print(f"{rule}\n")

    # One group per seed problem: the generated example plus its variations.
    groups = []
    for problem in tqdm(seed_problems, desc="Generating RAE examples"):
        if client is not None:
            example = generate_rae_example_with_api(problem, client, api_model)
        else:
            example = generate_rae_example_template(problem)

        if example:
            groups.append(augment_with_variations(example, num_variations))

    # Split by group, not by individual example. Variations of one seed share
    # their content, so shuffling them individually would place near-identical
    # examples in both train and validation and make the validation signal
    # meaningless.
    random.shuffle(groups)
    split_idx = int(len(groups) * train_split)
    train_data = [example for group in groups[:split_idx] for example in group]
    eval_data = [example for group in groups[split_idx:] for example in group]

    # Shuffle within each split so variations of a seed are not adjacent.
    random.shuffle(train_data)
    random.shuffle(eval_data)
    all_examples = train_data + eval_data

    train_path = output_path / "train.jsonl"
    eval_path = output_path / "validation.jsonl"
    _write_jsonl(train_path, train_data)
    _write_jsonl(eval_path, eval_data)

    metadata = {
        "total_examples": len(all_examples),
        "train_examples": len(train_data),
        "eval_examples": len(eval_data),
        # Sorted so metadata.json is stable from run to run.
        "domains": sorted({e["metadata"]["domain"] for e in all_examples}),
        "rae_version": "1.0",
        "generation_method": "api" if use_api else "template",
        "methodology": "RAE-as-training-time-cognitive-installation",
        "description": (
            "Training data structured as 4-phase RAE cognitive cycles. "
            "Each example forces the model through Saturation → Abstraction → "
            "Descent → Integration, creating the ML equivalent of handwriting's "
            "multi-circuit co-activation under temporal bottleneck."
        ),
    }

    with open(output_path / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print(f"\n{rule}")
    print(" Dataset Generated")
    print(f" Train: {len(train_data)} examples → {train_path}")
    print(f" Eval: {len(eval_data)} examples → {eval_path}")
    print(f" Metadata → {output_path / 'metadata.json'}")
    print(f"{rule}\n")

    return train_data, eval_data
|
|
|
|
def _load_seed_problems(path: Path) -> list[dict]:
    """Read seed problems from a JSONL file, ignoring blank lines."""
    with open(path, encoding="utf-8") as f:
        # A trailing or stray empty line is not a JSON document and would make
        # json.loads raise, so skip whitespace-only lines.
        return [json.loads(line) for line in f if line.strip()]


def main():
    """Parse command-line arguments, pick the seed problems and build the dataset.

    Seed problems are read from ``--seed_problems`` when that path is an
    existing file; otherwise the built-in problems are used, with a warning
    if a path was given but could not be used.
    """
    parser = argparse.ArgumentParser(description="RAE Dataset Generator")
    parser.add_argument("--seed_problems", type=str, default=None,
                        help="Path to seed problems JSONL file")
    parser.add_argument("--output", type=str, default="data/rae_training_data",
                        help="Output directory for training data")
    parser.add_argument("--use_api", action="store_true",
                        help="Use Anthropic API for high-quality generation")
    parser.add_argument("--api_model", type=str, default="claude-sonnet-4-20250514",
                        help="Anthropic model to use for generation")
    parser.add_argument("--num_variations", type=int, default=2,
                        help="Number of variations per seed problem")
    parser.add_argument("--train_split", type=float, default=0.9,
                        help="Fraction of data for training")

    args = parser.parse_args()

    seed_file = Path(args.seed_problems) if args.seed_problems else None
    if seed_file is not None and seed_file.is_file():
        seed_problems = _load_seed_problems(seed_file)
        print(f"Loaded {len(seed_problems)} seed problems from {args.seed_problems}")
    else:
        if seed_file is not None:
            # Keep the fallback, but make it visible: a mistyped path should
            # not silently swap in the built-in problems.
            print(f"⚠ Seed problems file not found: {args.seed_problems}; "
                  "falling back to built-in problems")
        seed_problems = ALL_SEED_PROBLEMS
        print(f"Using {len(seed_problems)} built-in seed problems")

    create_dataset(
        seed_problems=seed_problems,
        output_dir=args.output,
        use_api=args.use_api,
        api_model=args.api_model,
        num_variations=args.num_variations,
        train_split=args.train_split,
    )
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|