"""
RAE Evaluation Harness
═══════════════════════════════════════════════════════════════
Evaluates the training effect by comparing a base model against an
RAE-trained model on structured reasoning tasks.

Key metrics:
1. Phase Completeness: Does the model produce all 4 RAE phases?
2. Compression Ratio: Is Abstraction shorter than Saturation?
3. Prediction Quality: Are Descent predictions accurate?
4. Integration Depth: Does Integration produce novel insight?
5. Task Accuracy: Does the final answer match ground truth?
6. Inference Speed: Is the trained model faster (handwriting hypothesis)?
═══════════════════════════════════════════════════════════════
"""
import json
import time
import re
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
@dataclass
class RAEEvalResult:
problem_id: str
model_name: str
# Phase completeness (0-4)
phases_present: int
saturation_present: bool
abstraction_present: bool
descent_present: bool
integration_present: bool
# Phase quality metrics
saturation_length: int
abstraction_length: int
descent_length: int
integration_length: int
compression_ratio: float # abstraction_len / saturation_len
# Performance
inference_time_seconds: float
total_tokens_generated: int
tokens_per_second: float
# Quality (requires ground truth or human eval)
task_accuracy: Optional[float] = None
def to_dict(self):
return self.__dict__
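
# Illustrative shape of a response the evaluator can parse (exact wording varies by model):
#   <saturation> ...raw observations... </saturation>
#   <abstraction> ...compressed core structure... </abstraction>
#   <descent> ...concrete predictions / implementation... </descent>
#   <integration> ...updated model, confidence, next questions... </integration>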
class RAEEvaluator:
"""Evaluates RAE training effectiveness."""
    PHASE_PATTERNS = {
        "saturation": (r"<saturation>(.*?)</saturation>", re.DOTALL),
        "abstraction": (r"<abstraction>(.*?)</abstraction>", re.DOTALL),
        "descent": (r"<descent>(.*?)</descent>", re.DOTALL),
        "integration": (r"<integration>(.*?)</integration>", re.DOTALL),
    }
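    # Non-greedy matches keep each phase's text inside its own tag pair; a phase
    # whose tags are missing simply parses to an empty string in evaluate_single.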
    RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, work through all four phases:
<saturation> Immerse in the problem. Observe without categorizing. Flag anomalies. Multi-lens encoding. </saturation>
<abstraction> Extract minimal structure. Find isomorphisms. Compress to core insight. </abstraction>
<descent> Project into concrete form. Generate predictions. Build implementation. Identify falsification. </descent>
<integration> Update model. Assess confidence. Identify next questions. Extract transferable principles. </integration>
Always produce all 4 phases with their XML tags."""
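    # The tag names in this prompt must match PHASE_PATTERNS above; if they drift
    # apart, phase extraction silently reports empty phases.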
def __init__(self, model_path: str, device: str = "auto"):
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map=device,
trust_remote_code=True,
)
self.model.eval()
def evaluate_single(self, problem: dict) -> RAEEvalResult:
"""Evaluate a single problem."""
prompt = problem["prompt"]
problem_id = problem.get("id", "unknown")
# Format as chat
messages = [
{"role": "system", "content": self.RAE_SYSTEM_PROMPT},
{"role": "user", "content": prompt},
]
text = self.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
input_len = inputs["input_ids"].shape[1]
# Generate with timing
start_time = time.time()
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=3072,
temperature=0.7,
top_p=0.9,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
)
elapsed = time.time() - start_time
# Decode
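        # Only tokens generated past the prompt are decoded and counted toward speed.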
generated_ids = outputs[0][input_len:]
response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
total_tokens = len(generated_ids)
# Extract phases
phases = {}
for phase_name, (pattern, flags) in self.PHASE_PATTERNS.items():
match = re.search(pattern, response, flags)
phases[phase_name] = match.group(1).strip() if match else ""
# Compute metrics
sat_len = len(phases["saturation"].split())
abs_len = len(phases["abstraction"].split())
desc_len = len(phases["descent"].split())
int_len = len(phases["integration"].split())
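        # A ratio below 1.0 means Abstraction is shorter than Saturation, i.e. the
        # model actually compressed instead of restating the immersion phase.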
compression_ratio = abs_len / max(sat_len, 1)
return RAEEvalResult(
problem_id=problem_id,
model_name=self.model_path,
phases_present=sum(1 for v in phases.values() if v),
saturation_present=bool(phases["saturation"]),
abstraction_present=bool(phases["abstraction"]),
descent_present=bool(phases["descent"]),
integration_present=bool(phases["integration"]),
saturation_length=sat_len,
abstraction_length=abs_len,
descent_length=desc_len,
integration_length=int_len,
compression_ratio=compression_ratio,
inference_time_seconds=elapsed,
total_tokens_generated=total_tokens,
tokens_per_second=total_tokens / max(elapsed, 0.001),
)
    def evaluate_benchmark(self, benchmark_path: str, output_path: Optional[str] = None) -> list[RAEEvalResult]:
"""Evaluate a full benchmark suite."""
with open(benchmark_path) as f:
problems = json.load(f)
results = []
for i, problem in enumerate(problems):
print(f" [{i+1}/{len(problems)}] {problem.get('id', 'unknown')}...", end=" ")
result = self.evaluate_single(problem)
results.append(result)
print(f"phases={result.phases_present}/4, "
f"compression={result.compression_ratio:.2f}, "
f"{result.tokens_per_second:.1f} tok/s")
# Summary
avg_phases = sum(r.phases_present for r in results) / len(results)
avg_compression = sum(r.compression_ratio for r in results) / len(results)
avg_speed = sum(r.tokens_per_second for r in results) / len(results)
complete_rate = sum(1 for r in results if r.phases_present == 4) / len(results)
print(f"\n{'═' * 50}")
print(f" EVALUATION SUMMARY — {self.model_path}")
print(f"{'═' * 50}")
print(f" Avg phases present: {avg_phases:.2f}/4")
print(f" Complete rate (4/4): {complete_rate:.1%}")
print(f" Avg compression: {avg_compression:.2f}")
print(f" Avg speed: {avg_speed:.1f} tok/s")
print(f"{'═' * 50}")
# Save results
        if output_path:
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, "w") as f:
json.dump(
{
"model": self.model_path,
"summary": {
"avg_phases": avg_phases,
"complete_rate": complete_rate,
"avg_compression": avg_compression,
"avg_speed": avg_speed,
},
"results": [r.to_dict() for r in results],
},
f,
indent=2,
)
print(f" Results saved: {output_path}")
return results
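
# Minimal usage sketch for evaluating one model (paths here are illustrative):
#   evaluator = RAEEvaluator("path/to/model")
#   evaluator.evaluate_benchmark("evaluation/benchmark.json", "evaluation/results.json")
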
def compare_models(base_path: str, trained_path: str, benchmark_path: str):
"""
Compare base model vs RAE-trained model.
This is the handwriting hypothesis test:
Does training with RAE-structured data produce better reasoning?
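    Both models see the same benchmark and identical decoding settings, so the
    summary deltas reflect the training run rather than prompt differences
    (modulo sampling noise from temperature > 0).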
"""
print("=" * 60)
print(" RAE TRAINING EFFECT COMPARISON")
print(" Base vs. RAE-Trained")
print("=" * 60)
# Evaluate base model
print(f"\n▶ Evaluating BASE model: {base_path}")
base_eval = RAEEvaluator(base_path)
base_results = base_eval.evaluate_benchmark(
benchmark_path, "evaluation/base_results.json"
)
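    # The two models are evaluated sequentially so a single GPU only ever holds
    # one checkpoint; the base evaluator is dropped before the trained model loads.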
del base_eval # Free GPU memory
torch.cuda.empty_cache()
# Evaluate trained model
print(f"\n▶ Evaluating RAE-TRAINED model: {trained_path}")
trained_eval = RAEEvaluator(trained_path)
trained_results = trained_eval.evaluate_benchmark(
benchmark_path, "evaluation/trained_results.json"
)
# Comparison
def avg(results, attr):
return sum(getattr(r, attr) for r in results) / len(results)
print(f"\n{'═' * 60}")
print(f" COMPARISON: BASE vs RAE-TRAINED")
print(f"{'═' * 60}")
print(f" {'Metric':<25} {'Base':>10} {'Trained':>10} {'Delta':>10}")
print(f" {'-'*55}")
metrics = [
("Avg Phases (of 4)", "phases_present"),
("Compression Ratio", "compression_ratio"),
("Tokens/sec", "tokens_per_second"),
]
for name, attr in metrics:
base_val = avg(base_results, attr)
trained_val = avg(trained_results, attr)
delta = trained_val - base_val
sign = "+" if delta > 0 else ""
print(f" {name:<25} {base_val:>10.2f} {trained_val:>10.2f} {sign}{delta:>9.2f}")
# Complete rate
base_cr = sum(1 for r in base_results if r.phases_present == 4) / len(base_results)
trained_cr = sum(1 for r in trained_results if r.phases_present == 4) / len(trained_results)
delta_cr = trained_cr - base_cr
sign = "+" if delta_cr > 0 else ""
print(f" {'4/4 Complete Rate':<25} {base_cr:>9.1%} {trained_cr:>9.1%} {sign}{delta_cr:>8.1%}")
print(f"{'═' * 60}")
if __name__ == "__main__":
import sys
if len(sys.argv) >= 4:
compare_models(sys.argv[1], sys.argv[2], sys.argv[3])
elif len(sys.argv) >= 3:
evaluator = RAEEvaluator(sys.argv[1])
evaluator.evaluate_benchmark(sys.argv[2], "evaluation/results.json")
else:
print("Usage:")
        print("  Compare: python eval_rae_model.py <base_model> <trained_model> <benchmark.json>")
        print("  Single:  python eval_rae_model.py <model_path> <benchmark.json>")