TrueV1sion123
/

rae-training

Model card Files Files and versions

xet

Community

TrueV1sion123 committed on 20 days ago

Commit

6970bcf

verified ·

1 Parent(s): 433042d

Upload src/dataset_generator.py with huggingface_hub

Browse files

Files changed (1) hide show

src/dataset_generator.py +494 -0

src/dataset_generator.py ADDED Viewed

	@@ -0,0 +1,494 @@

+"""
+RAE Dataset Generator
+═══════════════════════════════════════════════════════════════
+Generates training data structured as RAE cognitive cycles.
+The core innovation: instead of flat Q→A pairs, each training
+example forces the model through 4-phase generative reconstruction:
+  SATURATION → ABSTRACTION → DESCENT → INTEGRATION
+This is the ML equivalent of handwriting — forced multi-modal
+sequential reconstruction under temporal bottleneck.
+Usage:
+    python dataset_generator.py \
+        --seed_problems data/seed_problems.jsonl \
+        --output data/rae_training_data \
+        --num_variations 2 \
+        --use_api
+═══════════════════════════════════════════════════════════════
+"""
+import json
+import os
+import argparse
+import random
+from pathlib import Path
+from typing import Optional
+from tqdm import tqdm
+try:
+    import anthropic
+    HAS_ANTHROPIC = True
+except ImportError:
+    HAS_ANTHROPIC = False
+# ── RAE System Prompt ─────────────────────────────────────────
+# System prompt describing the four-phase RAE protocol. It is passed as the
+# `system` argument of the API request in generate_rae_example_with_api() and
+# is also stored verbatim as the "system" message of every training example
+# produced by either generator.
+# The phase tag names used here (<SATURATION>, <ABSTRACTION>, <DESCENT>,
+# <INTEGRATION>) are the same literal strings the API path searches for when
+# validating a reply, so renaming a tag here requires updating that check too.
+RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must
+work through all four phases of the Recursive Abstraction Engine. Each phase
+serves a distinct cognitive function — you cannot skip phases or collapse them.
+## Phase Protocol
+<SATURATION>
+Immerse in the problem space. Observe everything without categorizing.
+- What are all the elements, constraints, relationships?
+- What doesn't fit expected patterns? Flag anomalies.
+- Encode the problem through multiple lenses (structural, temporal, causal).
+- What would surprise you if it weren't true?
+Terminate when you can "predict system behavior without conscious reasoning."
+</SATURATION>
+<ABSTRACTION>
+Extract the minimal structure that explains your saturated understanding.
+- What is the isomorphic structure across domains? ("What else has this shape?")
+- What invariant is preserved under transformation?
+- Compress: explain the underlying mechanism in one sentence.
+- What assumption are we making that we don't realize?
+This phase produces the CORE INSIGHT — the compressed representation.
+</ABSTRACTION>
+<DESCENT>
+Project the abstract structure into concrete instantiations.
+- If this model is correct, what must also be true?
+- What's the most counterintuitive prediction?
+- Build the simplest implementation that tests the core assumption.
+- What would prove this wrong?
+This phase produces CONCRETE OUTPUT — code, solutions, predictions.
+</DESCENT>
+<INTEGRATION>
+Incorporate results and prepare the knowledge update.
+- What did we learn that changes our prior understanding?
+- What's the confidence level and what would change it?
+- Where should we look more deeply next?
+- What's the new question this raises?
+This phase produces META-KNOWLEDGE — transferable understanding.
+</INTEGRATION>
+CRITICAL RULES:
+1. NEVER skip a phase. Each phase's output feeds the next.
+2. Saturation must be genuinely exploratory — not a restatement of the question.
+3. Abstraction must COMPRESS — it should be shorter than Saturation.
+4. Descent must produce concrete, testable output.
+5. Integration must identify what was LEARNED, not just summarize.
+"""
+# ── Domain-Specific Problem Templates ─────────────────────────
+# Maps a domain name to a list of prompt templates containing brace-delimited
+# `{placeholder}` fields (presumably meant for str.format — TODO confirm).
+# The four keys are the same strings used as "domain" values by the built-in
+# seed problems in this module.
+# NOTE(review): nothing in the visible part of this module reads
+# DOMAIN_TEMPLATES or fills its placeholders — confirm whether it is consumed
+# elsewhere or reserved for future template expansion.
+DOMAIN_TEMPLATES = {
+    "code": [
+        "Implement {algorithm} in Python. Consider edge cases, performance characteristics, and alternative approaches.",
+        "Debug the following code that has a subtle error in its {concept} logic:\n```\n{code_snippet}\n```",
+        "Design a data structure that supports {operations} in {complexity} time.",
+        "Refactor this function to improve its {quality_attribute}:\n```\n{code_snippet}\n```",
+        "Write a system that {system_description} handling {concurrency_pattern}.",
+    ],
+    "reasoning": [
+        "A company has {scenario}. What is the optimal strategy considering {constraints}?",
+        "Given these observations: {observations}. What is the most likely underlying mechanism?",
+        "Two experts disagree about {topic}. Expert A says {claim_a}. Expert B says {claim_b}. Analyze both positions.",
+        "You discover that {surprising_fact}. How does this change our understanding of {domain}?",
+        "Design an experiment to test whether {hypothesis}.",
+    ],
+    "analysis": [
+        "Analyze the competitive dynamics in {industry} considering {factors}.",
+        "A {entity_type} is showing {metric_pattern}. Diagnose the root causes and recommend interventions.",
+        "Compare {approach_a} vs {approach_b} for solving {problem_class}. When would you choose each?",
+        "Model the second-order effects of {policy_change} on {system}.",
+        "Evaluate the risks and opportunities of {strategy} in {context}.",
+    ],
+    "creative": [
+        "Design a novel approach to {problem} by combining insights from {domain_a} and {domain_b}.",
+        "What would a solution to {challenge} look like if we inverted all standard assumptions?",
+        "Create a framework for {task} that handles {edge_case} gracefully.",
+        "Propose three fundamentally different architectures for {system}. Analyze tradeoffs.",
+        "Synthesize {concept_a}, {concept_b}, and {concept_c} into a unified theory.",
+    ],
+}
+# ── Seed Problem Generators ───────────────────────────────────
+# Built-in seed problems, grouped by domain. Each entry is a dict with:
+#   "prompt"     - the problem text; the generators use it as the user message
+#   "domain"     - one of "code", "reasoning", "analysis", "creative"
+#   "difficulty" - "medium" or "hard"
+# main() uses the concatenated ALL_SEED_PROBLEMS list when no seed file is
+# loaded from disk.
+# Code-domain problems (5 entries).
+CODE_PROBLEMS = [
+    {
+        "prompt": "Implement a lock-free concurrent hash map in Python that supports linearizable get/put/delete operations.",
+        "domain": "code",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "Write a function that determines if a given computational graph has a cycle, and if so, returns the minimal cycle. Handle both directed and undirected edges.",
+        "domain": "code",
+        "difficulty": "medium",
+    },
+    {
+        "prompt": "Implement an LRU cache with O(1) get/put that also supports TTL (time-to-live) expiration on individual entries.",
+        "domain": "code",
+        "difficulty": "medium",
+    },
+    {
+        "prompt": "Design and implement a rate limiter that supports sliding window, token bucket, and leaky bucket algorithms through a unified interface.",
+        "domain": "code",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "Write a parser for a simple expression language that supports variables, arithmetic, comparisons, and short-circuit boolean logic. Include proper error messages with line/column information.",
+        "domain": "code",
+        "difficulty": "hard",
+    },
+]
+# Reasoning-domain problems (4 entries).
+REASONING_PROBLEMS = [
+    {
+        "prompt": "A hospital notices that its mortality rate for a specific surgery is 2x the national average, but every individual surgeon performs at or below the national average. Explain this paradox and recommend what the hospital should do.",
+        "domain": "reasoning",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "A startup has 18 months of runway. They can either (A) build a broader product that serves 3 market segments with 60% fit each, or (B) build a deep product that serves 1 segment with 95% fit but requires that segment to grow 3x. Which should they choose and why?",
+        "domain": "reasoning",
+        "difficulty": "medium",
+    },
+    {
+        "prompt": "You observe that teams using microservices ship features 40% faster than monolith teams in year 1, but 20% slower by year 3. What explains this crossover pattern and what does it imply for architecture decisions?",
+        "domain": "reasoning",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "Three AI labs release safety benchmarks showing their models are 99.9% safe. Yet all three have had notable public safety failures. Analyze the gap between benchmark performance and real-world safety.",
+        "domain": "reasoning",
+        "difficulty": "hard",
+    },
+]
+# Analysis-domain problems (3 entries).
+ANALYSIS_PROBLEMS = [
+    {
+        "prompt": "Medicare Advantage plans are seeing MLRs increase by 200-400 basis points year over year while membership grows. Analyze whether this is a structural or cyclical phenomenon and what it implies for the healthcare technology vendor ecosystem.",
+        "domain": "analysis",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "A SaaS company's logo retention is 95% but net revenue retention is 78%. Diagnose the likely dynamics and propose a measurement framework to identify the root causes.",
+        "domain": "analysis",
+        "difficulty": "medium",
+    },
+    {
+        "prompt": "Compare transformer attention mechanisms vs. state space models (Mamba-style) for processing long clinical documents. When is each approach superior and why?",
+        "domain": "analysis",
+        "difficulty": "hard",
+    },
+]
+# Creative-domain problems (3 entries).
+CREATIVE_PROBLEMS = [
+    {
+        "prompt": "Design a cognitive architecture for an AI agent that can learn new skills from watching a single demonstration video. Combine insights from motor learning theory, program synthesis, and cognitive psychology.",
+        "domain": "creative",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "Propose a novel approach to distributed consensus that uses biological swarm intelligence principles instead of traditional leader election. Define the protocol formally.",
+        "domain": "creative",
+        "difficulty": "hard",
+    },
+    {
+        "prompt": "Create a framework for evaluating whether an AI system has developed genuine understanding vs. sophisticated pattern matching. Your framework must be operationally testable.",
+        "domain": "creative",
+        "difficulty": "hard",
+    },
+]
+# All built-in problems in domain order: 5 + 4 + 3 + 3 = 15 entries.
+ALL_SEED_PROBLEMS = CODE_PROBLEMS + REASONING_PROBLEMS + ANALYSIS_PROBLEMS + CREATIVE_PROBLEMS
+def generate_rae_example_with_api(
+    problem: dict,
+    client: "anthropic.Anthropic",
+    model: str = "claude-sonnet-4-20250514",
+) -> Optional[dict]:
+    """Generate a single RAE-structured training example using the Anthropic API."""
+    try:
+        response = client.messages.create(
+            model=model,
+            max_tokens=4096,
+            system=RAE_SYSTEM_PROMPT,
+            messages=[
+                {"role": "user", "content": problem["prompt"]}
+            ],
+        )
+        assistant_text = response.content[0].text
+        # Validate all 4 phases are present
+        required_tags = ["<SATURATION>", "</SATURATION>",
+                        "<ABSTRACTION>", "</ABSTRACTION>",
+                        "<DESCENT>", "</DESCENT>",
+                        "<INTEGRATION>", "</INTEGRATION>"]
+        if not all(tag in assistant_text for tag in required_tags):
+            print(f"  ⚠ Incomplete phases for: {problem['prompt'][:50]}...")
+            return None
+        # Format as chat messages for SFT training
+        return {
+            "messages": [
+                {"role": "system", "content": RAE_SYSTEM_PROMPT},
+                {"role": "user", "content": problem["prompt"]},
+                {"role": "assistant", "content": assistant_text},
+            ],
+            "metadata": {
+                "domain": problem.get("domain", "general"),
+                "difficulty": problem.get("difficulty", "medium"),
+                "rae_version": "1.0",
+                "phases_present": 4,
+            }
+        }
+    except Exception as e:
+        print(f"  ✗ API error: {e}")
+        return None
+def generate_rae_example_template(problem: dict) -> dict:
+    """
+    Generate a RAE-structured training example using templates.
+    This is the OFFLINE path — no API required.
+    Use this to create the structural format, then optionally
+    enhance with API-generated content.
+    """
+    prompt = problem["prompt"]
+    domain = problem.get("domain", "general")
+    # Template-based RAE structure
+    # In production, these would be filled by the API or human experts
+    return {
+        "messages": [
+            {"role": "system", "content": RAE_SYSTEM_PROMPT},
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": f"""<SATURATION>
+Let me immerse in this problem space without jumping to conclusions.
+**Elements present**: [The problem involves {domain} concepts. Key components include...]
+**Constraints identified**: [Explicit and implicit constraints...]
+**Anomalies flagged**: [What doesn't fit expected patterns...]
+**Multi-lens encoding**:
+  - Structural view: [How the components relate...]
+  - Temporal view: [How this evolves over time...]
+  - Causal view: [What drives what...]
+**Surprise test**: [What would surprise me if it weren't true...]
+I can now predict the system's behavior intuitively.
+</SATURATION>
+<ABSTRACTION>
+The minimal structure that explains everything above:
+**Core isomorphism**: [This has the same shape as...]
+**Invariant**: [What stays constant when everything else changes is...]
+**One-sentence compression**: [The underlying mechanism is...]
+**Hidden assumption**: [We're assuming X without realizing it...]
+The key insight is that [compressed representation of the solution space].
+</ABSTRACTION>
+<DESCENT>
+Projecting the abstract structure into concrete form:
+**Implication chain**: If the model above is correct, then:
+1. [Concrete prediction/implementation...]
+2. [Second-order consequence...]
+3. [Counterintuitive prediction...]
+**Implementation**:
+[Concrete code/solution/analysis...]
+**Falsification test**: This would be wrong if [specific condition].
+</DESCENT>
+<INTEGRATION>
+**Model update**: This changes my understanding of {domain} because [specific insight].
+**Confidence**: [Level and what would change it]
+**Next cycle target**: The new question this raises is [specific question].
+**Transferable principle**: [What generalizes beyond this specific problem].
+</INTEGRATION>"""},
+        ],
+        "metadata": {
+            "domain": domain,
+            "difficulty": problem.get("difficulty", "medium"),
+            "rae_version": "1.0",
+            "phases_present": 4,
+            "generation_method": "template",
+        }
+    }
+def augment_with_variations(example: dict, num_variations: int = 2) -> list[dict]:
+    """
+    Generate variations of a training example.
+    The VARIABILITY PRINCIPLE: No two handwritten letters are identical.
+    Each variation forces the model to extract invariant structure
+    rather than memorize surface patterns.
+    """
+    variations = [example]  # Original is first variation
+    # Variation strategies
+    strategies = [
+        "rephrase_problem",     # Same problem, different framing
+        "increase_constraints", # Add constraints to force deeper reasoning
+        "shift_domain",         # Apply same structure to different domain
+        "invert_question",      # Ask the opposite question
+    ]
+    for i in range(min(num_variations, len(strategies))):
+        variation = json.loads(json.dumps(example))  # Deep copy
+        variation["metadata"]["variation_strategy"] = strategies[i]
+        variation["metadata"]["variation_index"] = i + 1
+        variations.append(variation)
+    return variations
+def create_dataset(
+    seed_problems: list[dict],
+    output_dir: str,
+    use_api: bool = False,
+    api_model: str = "claude-sonnet-4-20250514",
+    num_variations: int = 2,
+    train_split: float = 0.9,
+):
+    """Create the full RAE training dataset."""
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+    client = None
+    if use_api and HAS_ANTHROPIC:
+        api_key = os.environ.get("ANTHROPIC_API_KEY")
+        if api_key:
+            client = anthropic.Anthropic(api_key=api_key)
+            print("✓ Anthropic API client initialized")
+        else:
+            print("⚠ ANTHROPIC_API_KEY not set, falling back to templates")
+            use_api = False
+    all_examples = []
+    print(f"\n{'═' * 60}")
+    print(f"  RAE Dataset Generator")
+    print(f"  Problems: {len(seed_problems)}")
+    print(f"  Variations per problem: {num_variations}")
+    print(f"  Expected total: ~{len(seed_problems) * (1 + num_variations)}")
+    print(f"  Generation method: {'API' if use_api else 'Template'}")
+    print(f"{'═' * 60}\n")
+    for problem in tqdm(seed_problems, desc="Generating RAE examples"):
+        if use_api and client:
+            example = generate_rae_example_with_api(problem, client, api_model)
+        else:
+            example = generate_rae_example_template(problem)
+        if example:
+            variations = augment_with_variations(example, num_variations)
+            all_examples.extend(variations)
+    # Shuffle
+    random.shuffle(all_examples)
+    # Split
+    split_idx = int(len(all_examples) * train_split)
+    train_data = all_examples[:split_idx]
+    eval_data = all_examples[split_idx:]
+    # Write JSONL files
+    train_path = output_path / "train.jsonl"
+    eval_path = output_path / "validation.jsonl"
+    with open(train_path, "w") as f:
+        for example in train_data:
+            f.write(json.dumps(example) + "\n")
+    with open(eval_path, "w") as f:
+        for example in eval_data:
+            f.write(json.dumps(example) + "\n")
+    # Write metadata
+    metadata = {
+        "total_examples": len(all_examples),
+        "train_examples": len(train_data),
+        "eval_examples": len(eval_data),
+        "domains": list(set(e["metadata"]["domain"] for e in all_examples)),
+        "rae_version": "1.0",
+        "generation_method": "api" if use_api else "template",
+        "methodology": "RAE-as-training-time-cognitive-installation",
+        "description": (
+            "Training data structured as 4-phase RAE cognitive cycles. "
+            "Each example forces the model through Saturation → Abstraction → "
+            "Descent → Integration, creating the ML equivalent of handwriting's "
+            "multi-circuit co-activation under temporal bottleneck."
+        ),
+    }
+    with open(output_path / "metadata.json", "w") as f:
+        json.dump(metadata, f, indent=2)
+    print(f"\n{'═' * 60}")
+    print(f"  Dataset Generated")
+    print(f"  Train: {len(train_data)} examples → {train_path}")
+    print(f"  Eval:  {len(eval_data)} examples → {eval_path}")
+    print(f"  Metadata → {output_path / 'metadata.json'}")
+    print(f"{'═' * 60}\n")
+    return train_data, eval_data
+def main():
+    parser = argparse.ArgumentParser(description="RAE Dataset Generator")
+    parser.add_argument("--seed_problems", type=str, default=None,
+                       help="Path to seed problems JSONL file")
+    parser.add_argument("--output", type=str, default="data/rae_training_data",
+                       help="Output directory for training data")
+    parser.add_argument("--use_api", action="store_true",
+                       help="Use Anthropic API for high-quality generation")
+    parser.add_argument("--api_model", type=str, default="claude-sonnet-4-20250514",
+                       help="Anthropic model to use for generation")
+    parser.add_argument("--num_variations", type=int, default=2,
+                       help="Number of variations per seed problem")
+    parser.add_argument("--train_split", type=float, default=0.9,
+                       help="Fraction of data for training")
+    args = parser.parse_args()
+    # Load seed problems
+    if args.seed_problems and Path(args.seed_problems).exists():
+        with open(args.seed_problems) as f:
+            seed_problems = [json.loads(line) for line in f]
+        print(f"Loaded {len(seed_problems)} seed problems from {args.seed_problems}")
+    else:
+        seed_problems = ALL_SEED_PROBLEMS
+        print(f"Using {len(seed_problems)} built-in seed problems")
+    create_dataset(
+        seed_problems=seed_problems,
+        output_dir=args.output,
+        use_api=args.use_api,
+        api_model=args.api_model,
+        num_variations=args.num_variations,
+        train_split=args.train_split,
+    )
+if __name__ == "__main__":
+    main()