File size: 20,427 Bytes
6970bcf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 | """
RAE Dataset Generator
───────────────────────────────────────────────────────────────
Generates training data structured as RAE cognitive cycles.
The core innovation: instead of flat Q→A pairs, each training
example forces the model through 4-phase generative reconstruction:
SATURATION → ABSTRACTION → DESCENT → INTEGRATION
This is the ML equivalent of handwriting — forced multi-modal
sequential reconstruction under temporal bottleneck.
Usage:
python dataset_generator.py \
--seed_problems data/seed_problems.jsonl \
--output data/rae_training_data \
--num_examples 1000 \
--domains code,reasoning,analysis,creative
───────────────────────────────────────────────────────────────
"""
import argparse
import copy
import json
import os
import random
from pathlib import Path
from typing import Optional

from tqdm import tqdm

try:
    import anthropic
    HAS_ANTHROPIC = True
except ImportError:
    HAS_ANTHROPIC = False
# ── RAE System Prompt ──────────────────────────────────────────
# System prompt embedded in every generated training example (both the
# API and template generation paths include it as the "system" message).
# It defines the mandatory 4-phase protocol; generate_rae_example_with_api()
# rejects any response missing one of the four phase tag pairs.
RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, you must
work through all four phases of the Recursive Abstraction Engine. Each phase
serves a distinct cognitive function β you cannot skip phases or collapse them.
## Phase Protocol
<SATURATION>
Immerse in the problem space. Observe everything without categorizing.
- What are all the elements, constraints, relationships?
- What doesn't fit expected patterns? Flag anomalies.
- Encode the problem through multiple lenses (structural, temporal, causal).
- What would surprise you if it weren't true?
Terminate when you can "predict system behavior without conscious reasoning."
</SATURATION>
<ABSTRACTION>
Extract the minimal structure that explains your saturated understanding.
- What is the isomorphic structure across domains? ("What else has this shape?")
- What invariant is preserved under transformation?
- Compress: explain the underlying mechanism in one sentence.
- What assumption are we making that we don't realize?
This phase produces the CORE INSIGHT β the compressed representation.
</ABSTRACTION>
<DESCENT>
Project the abstract structure into concrete instantiations.
- If this model is correct, what must also be true?
- What's the most counterintuitive prediction?
- Build the simplest implementation that tests the core assumption.
- What would prove this wrong?
This phase produces CONCRETE OUTPUT β code, solutions, predictions.
</DESCENT>
<INTEGRATION>
Incorporate results and prepare the knowledge update.
- What did we learn that changes our prior understanding?
- What's the confidence level and what would change it?
- Where should we look more deeply next?
- What's the new question this raises?
This phase produces META-KNOWLEDGE β transferable understanding.
</INTEGRATION>
CRITICAL RULES:
1. NEVER skip a phase. Each phase's output feeds the next.
2. Saturation must be genuinely exploratory β not a restatement of the question.
3. Abstraction must COMPRESS β it should be shorter than Saturation.
4. Descent must produce concrete, testable output.
5. Integration must identify what was LEARNED, not just summarize.
"""
# ── Domain-Specific Problem Templates ──────────────────────────
# Parameterized prompt templates (str.format-style {placeholders}) grouped
# by domain. NOTE(review): this constant is not referenced anywhere else in
# this module — presumably intended for a future template-filling step that
# expands placeholders into concrete seed problems; confirm before removing.
DOMAIN_TEMPLATES: dict[str, list[str]] = {
    "code": [
        "Implement {algorithm} in Python. Consider edge cases, performance characteristics, and alternative approaches.",
        "Debug the following code that has a subtle error in its {concept} logic:\n```\n{code_snippet}\n```",
        "Design a data structure that supports {operations} in {complexity} time.",
        "Refactor this function to improve its {quality_attribute}:\n```\n{code_snippet}\n```",
        "Write a system that {system_description} handling {concurrency_pattern}.",
    ],
    "reasoning": [
        "A company has {scenario}. What is the optimal strategy considering {constraints}?",
        "Given these observations: {observations}. What is the most likely underlying mechanism?",
        "Two experts disagree about {topic}. Expert A says {claim_a}. Expert B says {claim_b}. Analyze both positions.",
        "You discover that {surprising_fact}. How does this change our understanding of {domain}?",
        "Design an experiment to test whether {hypothesis}.",
    ],
    "analysis": [
        "Analyze the competitive dynamics in {industry} considering {factors}.",
        "A {entity_type} is showing {metric_pattern}. Diagnose the root causes and recommend interventions.",
        "Compare {approach_a} vs {approach_b} for solving {problem_class}. When would you choose each?",
        "Model the second-order effects of {policy_change} on {system}.",
        "Evaluate the risks and opportunities of {strategy} in {context}.",
    ],
    "creative": [
        "Design a novel approach to {problem} by combining insights from {domain_a} and {domain_b}.",
        "What would a solution to {challenge} look like if we inverted all standard assumptions?",
        "Create a framework for {task} that handles {edge_case} gracefully.",
        "Propose three fundamentally different architectures for {system}. Analyze tradeoffs.",
        "Synthesize {concept_a}, {concept_b}, and {concept_c} into a unified theory.",
    ],
}
# ── Seed Problem Generators ────────────────────────────────────
# Built-in seed problems used when no --seed_problems file is supplied.
# Each entry is a dict with keys "prompt", "domain", "difficulty" — the
# same schema expected of records in an external seed-problems JSONL file.

# Programming / algorithm-design problems.
CODE_PROBLEMS = [
    {
        "prompt": "Implement a lock-free concurrent hash map in Python that supports linearizable get/put/delete operations.",
        "domain": "code",
        "difficulty": "hard",
    },
    {
        "prompt": "Write a function that determines if a given computational graph has a cycle, and if so, returns the minimal cycle. Handle both directed and undirected edges.",
        "domain": "code",
        "difficulty": "medium",
    },
    {
        "prompt": "Implement an LRU cache with O(1) get/put that also supports TTL (time-to-live) expiration on individual entries.",
        "domain": "code",
        "difficulty": "medium",
    },
    {
        "prompt": "Design and implement a rate limiter that supports sliding window, token bucket, and leaky bucket algorithms through a unified interface.",
        "domain": "code",
        "difficulty": "hard",
    },
    {
        "prompt": "Write a parser for a simple expression language that supports variables, arithmetic, comparisons, and short-circuit boolean logic. Include proper error messages with line/column information.",
        "domain": "code",
        "difficulty": "hard",
    },
]

# Open-ended reasoning / paradox / decision problems.
REASONING_PROBLEMS = [
    {
        "prompt": "A hospital notices that its mortality rate for a specific surgery is 2x the national average, but every individual surgeon performs at or below the national average. Explain this paradox and recommend what the hospital should do.",
        "domain": "reasoning",
        "difficulty": "hard",
    },
    {
        "prompt": "A startup has 18 months of runway. They can either (A) build a broader product that serves 3 market segments with 60% fit each, or (B) build a deep product that serves 1 segment with 95% fit but requires that segment to grow 3x. Which should they choose and why?",
        "domain": "reasoning",
        "difficulty": "medium",
    },
    {
        "prompt": "You observe that teams using microservices ship features 40% faster than monolith teams in year 1, but 20% slower by year 3. What explains this crossover pattern and what does it imply for architecture decisions?",
        "domain": "reasoning",
        "difficulty": "hard",
    },
    {
        "prompt": "Three AI labs release safety benchmarks showing their models are 99.9% safe. Yet all three have had notable public safety failures. Analyze the gap between benchmark performance and real-world safety.",
        "domain": "reasoning",
        "difficulty": "hard",
    },
]

# Business / systems analysis problems.
ANALYSIS_PROBLEMS = [
    {
        "prompt": "Medicare Advantage plans are seeing MLRs increase by 200-400 basis points year over year while membership grows. Analyze whether this is a structural or cyclical phenomenon and what it implies for the healthcare technology vendor ecosystem.",
        "domain": "analysis",
        "difficulty": "hard",
    },
    {
        "prompt": "A SaaS company's logo retention is 95% but net revenue retention is 78%. Diagnose the likely dynamics and propose a measurement framework to identify the root causes.",
        "domain": "analysis",
        "difficulty": "medium",
    },
    {
        "prompt": "Compare transformer attention mechanisms vs. state space models (Mamba-style) for processing long clinical documents. When is each approach superior and why?",
        "domain": "analysis",
        "difficulty": "hard",
    },
]

# Cross-domain synthesis / design problems.
CREATIVE_PROBLEMS = [
    {
        "prompt": "Design a cognitive architecture for an AI agent that can learn new skills from watching a single demonstration video. Combine insights from motor learning theory, program synthesis, and cognitive psychology.",
        "domain": "creative",
        "difficulty": "hard",
    },
    {
        "prompt": "Propose a novel approach to distributed consensus that uses biological swarm intelligence principles instead of traditional leader election. Define the protocol formally.",
        "domain": "creative",
        "difficulty": "hard",
    },
    {
        "prompt": "Create a framework for evaluating whether an AI system has developed genuine understanding vs. sophisticated pattern matching. Your framework must be operationally testable.",
        "domain": "creative",
        "difficulty": "hard",
    },
]

# Default seed set used by main() when --seed_problems is absent/missing.
ALL_SEED_PROBLEMS = CODE_PROBLEMS + REASONING_PROBLEMS + ANALYSIS_PROBLEMS + CREATIVE_PROBLEMS
def generate_rae_example_with_api(
    problem: dict,
    client: "anthropic.Anthropic",
    model: str = "claude-sonnet-4-20250514",
) -> Optional[dict]:
    """Generate a single RAE-structured training example using the Anthropic API.

    Args:
        problem: Seed problem dict; must contain "prompt" and may contain
            "domain" and "difficulty".
        client: Initialized Anthropic client.
        model: Anthropic model identifier to generate with.

    Returns:
        A chat-formatted SFT example dict (messages + metadata), or None when
        the API call fails or the response is missing any of the 4 phase tags.
    """
    try:
        response = client.messages.create(
            model=model,
            max_tokens=4096,
            system=RAE_SYSTEM_PROMPT,
            messages=[
                {"role": "user", "content": problem["prompt"]}
            ],
        )
        # Concatenate every text content block instead of assuming the first
        # block is text (response.content is a list of typed blocks; a
        # non-text block first would previously raise AttributeError and be
        # swallowed by the broad except below).
        assistant_text = "".join(
            getattr(block, "text", "") for block in response.content
        )
        # Validate all 4 phases are present; incomplete cycles are rejected.
        required_tags = ["<SATURATION>", "</SATURATION>",
                         "<ABSTRACTION>", "</ABSTRACTION>",
                         "<DESCENT>", "</DESCENT>",
                         "<INTEGRATION>", "</INTEGRATION>"]
        if not all(tag in assistant_text for tag in required_tags):
            print(f" β Incomplete phases for: {problem['prompt'][:50]}...")
            return None
        # Format as chat messages for SFT training.
        return {
            "messages": [
                {"role": "system", "content": RAE_SYSTEM_PROMPT},
                {"role": "user", "content": problem["prompt"]},
                {"role": "assistant", "content": assistant_text},
            ],
            "metadata": {
                "domain": problem.get("domain", "general"),
                "difficulty": problem.get("difficulty", "medium"),
                "rae_version": "1.0",
                "phases_present": 4,
                # Consistency with the template path, which records its
                # generation method in metadata.
                "generation_method": "api",
            }
        }
    except Exception as e:
        # Best-effort generation: log and skip this problem on any failure.
        print(f" β API error: {e}")
        return None
def generate_rae_example_template(problem: dict) -> dict:
    """
    Generate a RAE-structured training example using templates.

    This is the OFFLINE path — no API required. Use this to create the
    structural format, then optionally enhance with API-generated content.
    The assistant turn is a scaffold: every phase is present, but the
    bracketed sections are placeholders to be filled by the API or experts.

    Args:
        problem: Seed problem dict; must contain "prompt" and may contain
            "domain" and "difficulty".

    Returns:
        A chat-formatted SFT example dict (messages + metadata) whose
        assistant content is the 4-phase placeholder scaffold.
    """
    prompt = problem["prompt"]
    domain = problem.get("domain", "general")
    # Template-based RAE structure.
    # In production, the bracketed placeholders below would be filled by the
    # API or human experts; only {domain} is interpolated here.
    # NOTE: the f-string body is intentionally left-aligned — its lines are
    # literal string content, not code indentation.
    return {
        "messages": [
            {"role": "system", "content": RAE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": f"""<SATURATION>
Let me immerse in this problem space without jumping to conclusions.
**Elements present**: [The problem involves {domain} concepts. Key components include...]
**Constraints identified**: [Explicit and implicit constraints...]
**Anomalies flagged**: [What doesn't fit expected patterns...]
**Multi-lens encoding**:
- Structural view: [How the components relate...]
- Temporal view: [How this evolves over time...]
- Causal view: [What drives what...]
**Surprise test**: [What would surprise me if it weren't true...]
I can now predict the system's behavior intuitively.
</SATURATION>
<ABSTRACTION>
The minimal structure that explains everything above:
**Core isomorphism**: [This has the same shape as...]
**Invariant**: [What stays constant when everything else changes is...]
**One-sentence compression**: [The underlying mechanism is...]
**Hidden assumption**: [We're assuming X without realizing it...]
The key insight is that [compressed representation of the solution space].
</ABSTRACTION>
<DESCENT>
Projecting the abstract structure into concrete form:
**Implication chain**: If the model above is correct, then:
1. [Concrete prediction/implementation...]
2. [Second-order consequence...]
3. [Counterintuitive prediction...]
**Implementation**:
[Concrete code/solution/analysis...]
**Falsification test**: This would be wrong if [specific condition].
</DESCENT>
<INTEGRATION>
**Model update**: This changes my understanding of {domain} because [specific insight].
**Confidence**: [Level and what would change it]
**Next cycle target**: The new question this raises is [specific question].
**Transferable principle**: [What generalizes beyond this specific problem].
</INTEGRATION>"""},
        ],
        "metadata": {
            "domain": domain,
            "difficulty": problem.get("difficulty", "medium"),
            "rae_version": "1.0",
            "phases_present": 4,
            "generation_method": "template",
        }
    }
def augment_with_variations(example: dict, num_variations: int = 2) -> list[dict]:
    """Generate variations of a training example.

    The VARIABILITY PRINCIPLE: No two handwritten letters are identical.
    Each variation forces the model to extract invariant structure
    rather than memorize surface patterns.

    Args:
        example: A training example dict with a "metadata" sub-dict.
        num_variations: Number of variations to add, capped at the number of
            available strategies; values <= 0 add none.

    Returns:
        List whose first element is the original `example` (not copied),
        followed by deep-copied variations tagged with their strategy.
    """
    # Variation strategies; each variation is a deep copy labeled with one.
    # NOTE(review): strategies are currently labels only — the problem text
    # itself is not yet transformed; presumably a later pipeline stage
    # applies the actual rewriting.
    strategies = [
        "rephrase_problem",      # Same problem, different framing
        "increase_constraints",  # Add constraints to force deeper reasoning
        "shift_domain",          # Apply same structure to different domain
        "invert_question",       # Ask the opposite question
    ]
    variations = [example]  # Original is first variation
    # copy.deepcopy instead of a json round-trip: it preserves non-JSON
    # values (e.g. tuples would silently become lists via json).
    # max(0, ...) keeps negative counts from slicing off the tail.
    for idx, strategy in enumerate(strategies[:max(0, num_variations)], start=1):
        variation = copy.deepcopy(example)
        variation["metadata"]["variation_strategy"] = strategy
        variation["metadata"]["variation_index"] = idx
        variations.append(variation)
    return variations
def _write_jsonl(path: Path, examples: list[dict]) -> None:
    """Write examples to `path`, one JSON object per line (UTF-8)."""
    with open(path, "w", encoding="utf-8") as f:
        for example in examples:
            f.write(json.dumps(example) + "\n")


def create_dataset(
    seed_problems: list[dict],
    output_dir: str,
    use_api: bool = False,
    api_model: str = "claude-sonnet-4-20250514",
    num_variations: int = 2,
    train_split: float = 0.9,
):
    """Create the full RAE training dataset.

    Generates one RAE example per seed problem (via the Anthropic API when
    requested and available, otherwise from templates), augments each with
    variations, shuffles, splits train/eval, and writes train.jsonl,
    validation.jsonl, and metadata.json into `output_dir`.

    Args:
        seed_problems: Seed problem dicts ("prompt", "domain", "difficulty").
        output_dir: Directory to create and write dataset files into.
        use_api: Use the Anthropic API for generation when possible.
        api_model: Anthropic model identifier for API generation.
        num_variations: Variations created per seed problem.
        train_split: Fraction of shuffled examples assigned to train.

    Returns:
        Tuple (train_data, eval_data) of example-dict lists.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    client = None
    # BUGFIX: previously, when use_api was True but the anthropic package was
    # not installed, use_api silently stayed True — the loop fell back to
    # templates (client is None) but metadata.json wrongly recorded
    # generation_method="api". Demote the flag explicitly here.
    if use_api and not HAS_ANTHROPIC:
        print("anthropic package not installed, falling back to templates")
        use_api = False
    if use_api:
        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if api_key:
            client = anthropic.Anthropic(api_key=api_key)
            print("β Anthropic API client initialized")
        else:
            print("β ANTHROPIC_API_KEY not set, falling back to templates")
            use_api = False
    all_examples = []
    print(f"\n{'β' * 60}")
    print(f" RAE Dataset Generator")
    print(f" Problems: {len(seed_problems)}")
    print(f" Variations per problem: {num_variations}")
    print(f" Expected total: ~{len(seed_problems) * (1 + num_variations)}")
    print(f" Generation method: {'API' if use_api else 'Template'}")
    print(f"{'β' * 60}\n")
    for problem in tqdm(seed_problems, desc="Generating RAE examples"):
        if use_api and client:
            # May return None on API failure or incomplete phases.
            example = generate_rae_example_with_api(problem, client, api_model)
        else:
            example = generate_rae_example_template(problem)
        if example:
            all_examples.extend(augment_with_variations(example, num_variations))
    # Shuffle before splitting so train/eval draw from all domains.
    random.shuffle(all_examples)
    split_idx = int(len(all_examples) * train_split)
    train_data = all_examples[:split_idx]
    eval_data = all_examples[split_idx:]
    # Write JSONL files.
    train_path = output_path / "train.jsonl"
    eval_path = output_path / "validation.jsonl"
    _write_jsonl(train_path, train_data)
    _write_jsonl(eval_path, eval_data)
    # Write dataset-level metadata.
    metadata = {
        "total_examples": len(all_examples),
        "train_examples": len(train_data),
        "eval_examples": len(eval_data),
        "domains": list(set(e["metadata"]["domain"] for e in all_examples)),
        "rae_version": "1.0",
        "generation_method": "api" if use_api else "template",
        "methodology": "RAE-as-training-time-cognitive-installation",
        "description": (
            "Training data structured as 4-phase RAE cognitive cycles. "
            "Each example forces the model through Saturation β Abstraction β "
            "Descent β Integration, creating the ML equivalent of handwriting's "
            "multi-circuit co-activation under temporal bottleneck."
        ),
    }
    with open(output_path / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)
    print(f"\n{'β' * 60}")
    print(f" Dataset Generated")
    print(f" Train: {len(train_data)} examples β {train_path}")
    print(f" Eval: {len(eval_data)} examples β {eval_path}")
    print(f" Metadata β {output_path / 'metadata.json'}")
    print(f"{'β' * 60}\n")
    return train_data, eval_data
def main():
    """CLI entry point: parse args, load seed problems, build the dataset."""
    parser = argparse.ArgumentParser(description="RAE Dataset Generator")
    parser.add_argument("--seed_problems", type=str, default=None,
                        help="Path to seed problems JSONL file")
    parser.add_argument("--output", type=str, default="data/rae_training_data",
                        help="Output directory for training data")
    parser.add_argument("--use_api", action="store_true",
                        help="Use Anthropic API for high-quality generation")
    parser.add_argument("--api_model", type=str, default="claude-sonnet-4-20250514",
                        help="Anthropic model to use for generation")
    parser.add_argument("--num_variations", type=int, default=2,
                        help="Number of variations per seed problem")
    parser.add_argument("--train_split", type=float, default=0.9,
                        help="Fraction of data for training")
    args = parser.parse_args()
    # Load seed problems from file when given, else fall back to built-ins.
    if args.seed_problems and Path(args.seed_problems).exists():
        with open(args.seed_problems, encoding="utf-8") as f:
            # Skip blank lines so a trailing newline (common in JSONL files)
            # doesn't crash json.loads on an empty string.
            seed_problems = [json.loads(line) for line in f if line.strip()]
        print(f"Loaded {len(seed_problems)} seed problems from {args.seed_problems}")
    else:
        seed_problems = ALL_SEED_PROBLEMS
        print(f"Using {len(seed_problems)} built-in seed problems")
    create_dataset(
        seed_problems=seed_problems,
        output_dir=args.output,
        use_api=args.use_api,
        api_model=args.api_model,
        num_variations=args.num_variations,
        train_split=args.train_split,
    )


if __name__ == "__main__":
    main()
|