""" RAE Evaluation Harness ═══════════════════════════════════════════════════════════════ Evaluates the training effect by comparing base vs. RAE-trained model on structured reasoning tasks. Key metrics: 1. Phase Completeness: Does the model produce all 4 RAE phases? 2. Compression Ratio: Is Abstraction shorter than Saturation? 3. Prediction Quality: Are Descent predictions accurate? 4. Integration Depth: Does Integration produce novel insight? 5. Task Accuracy: Does the final answer match ground truth? 6. Inference Speed: Is the trained model faster (handwriting hypothesis)? ═══════════════════════════════════════════════════════════════ """ import json import time import re from pathlib import Path from typing import Optional from dataclasses import dataclass import torch from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline @dataclass class RAEEvalResult: problem_id: str model_name: str # Phase completeness (0-4) phases_present: int saturation_present: bool abstraction_present: bool descent_present: bool integration_present: bool # Phase quality metrics saturation_length: int abstraction_length: int descent_length: int integration_length: int compression_ratio: float # abstraction_len / saturation_len # Performance inference_time_seconds: float total_tokens_generated: int tokens_per_second: float # Quality (requires ground truth or human eval) task_accuracy: Optional[float] = None def to_dict(self): return self.__dict__ class RAEEvaluator: """Evaluates RAE training effectiveness.""" PHASE_PATTERNS = { "saturation": (r"(.*?)", re.DOTALL), "abstraction": (r"(.*?)", re.DOTALL), "descent": (r"(.*?)", re.DOTALL), "integration": (r"(.*?)", re.DOTALL), } RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, work through all four phases: Immerse in the problem. Observe without categorizing. Flag anomalies. Multi-lens encoding. Extract minimal structure. Find isomorphisms. Compress to core insight. Project into concrete form. Generate predictions. Build implementation. Identify falsification. Update model. Assess confidence. Identify next questions. Extract transferable principles. 

Always produce all 4 phases with their XML tags."""

    def __init__(self, model_path: str, device: str = "auto"):
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map=device,
            trust_remote_code=True,
        )
        self.model.eval()

    def evaluate_single(self, problem: dict) -> RAEEvalResult:
        """Evaluate a single problem."""
        prompt = problem["prompt"]
        problem_id = problem.get("id", "unknown")

        # Format as chat
        messages = [
            {"role": "system", "content": self.RAE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        input_len = inputs["input_ids"].shape[1]

        # Generate with timing
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=3072,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        elapsed = time.time() - start_time

        # Decode only the newly generated tokens
        generated_ids = outputs[0][input_len:]
        response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        total_tokens = len(generated_ids)

        # Extract phases
        phases = {}
        for phase_name, (pattern, flags) in self.PHASE_PATTERNS.items():
            match = re.search(pattern, response, flags)
            phases[phase_name] = match.group(1).strip() if match else ""

        # Compute metrics (phase lengths in words)
        sat_len = len(phases["saturation"].split())
        abs_len = len(phases["abstraction"].split())
        desc_len = len(phases["descent"].split())
        int_len = len(phases["integration"].split())
        compression_ratio = abs_len / max(sat_len, 1)

        return RAEEvalResult(
            problem_id=problem_id,
            model_name=self.model_path,
            phases_present=sum(1 for v in phases.values() if v),
            saturation_present=bool(phases["saturation"]),
            abstraction_present=bool(phases["abstraction"]),
            descent_present=bool(phases["descent"]),
            integration_present=bool(phases["integration"]),
            saturation_length=sat_len,
            abstraction_length=abs_len,
            descent_length=desc_len,
            integration_length=int_len,
            compression_ratio=compression_ratio,
            inference_time_seconds=elapsed,
            total_tokens_generated=total_tokens,
            tokens_per_second=total_tokens / max(elapsed, 0.001),
        )

    def evaluate_benchmark(
        self, benchmark_path: str, output_path: Optional[str] = None
    ) -> list[RAEEvalResult]:
        """Evaluate a full benchmark suite."""
        with open(benchmark_path) as f:
            problems = json.load(f)

        results = []
        for i, problem in enumerate(problems):
            print(f" [{i+1}/{len(problems)}] {problem.get('id', 'unknown')}...", end=" ")
            result = self.evaluate_single(problem)
            results.append(result)
            print(f"phases={result.phases_present}/4, "
                  f"compression={result.compression_ratio:.2f}, "
                  f"{result.tokens_per_second:.1f} tok/s")

        # Summary
        avg_phases = sum(r.phases_present for r in results) / len(results)
        avg_compression = sum(r.compression_ratio for r in results) / len(results)
        avg_speed = sum(r.tokens_per_second for r in results) / len(results)
        complete_rate = sum(1 for r in results if r.phases_present == 4) / len(results)

        print(f"\n{'═' * 50}")
        print(f" EVALUATION SUMMARY — {self.model_path}")
        print(f"{'═' * 50}")
        print(f" Avg phases present: {avg_phases:.2f}/4")
        print(f" Complete rate (4/4): {complete_rate:.1%}")
        print(f" Avg compression: {avg_compression:.2f}")
        print(f" Avg speed: {avg_speed:.1f} tok/s")
        print(f"{'═' * 50}")

        # Save results
        if output_path:
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, "w") as f:
                json.dump(
                    {
                        "model": self.model_path,
                        "summary": {
"avg_phases": avg_phases, "complete_rate": complete_rate, "avg_compression": avg_compression, "avg_speed": avg_speed, }, "results": [r.to_dict() for r in results], }, f, indent=2, ) print(f" Results saved: {output_path}") return results def compare_models(base_path: str, trained_path: str, benchmark_path: str): """ Compare base model vs RAE-trained model. This is the handwriting hypothesis test: Does training with RAE-structured data produce better reasoning? """ print("=" * 60) print(" RAE TRAINING EFFECT COMPARISON") print(" Base vs. RAE-Trained") print("=" * 60) # Evaluate base model print(f"\n▶ Evaluating BASE model: {base_path}") base_eval = RAEEvaluator(base_path) base_results = base_eval.evaluate_benchmark( benchmark_path, "evaluation/base_results.json" ) del base_eval # Free GPU memory torch.cuda.empty_cache() # Evaluate trained model print(f"\n▶ Evaluating RAE-TRAINED model: {trained_path}") trained_eval = RAEEvaluator(trained_path) trained_results = trained_eval.evaluate_benchmark( benchmark_path, "evaluation/trained_results.json" ) # Comparison def avg(results, attr): return sum(getattr(r, attr) for r in results) / len(results) print(f"\n{'═' * 60}") print(f" COMPARISON: BASE vs RAE-TRAINED") print(f"{'═' * 60}") print(f" {'Metric':<25} {'Base':>10} {'Trained':>10} {'Delta':>10}") print(f" {'-'*55}") metrics = [ ("Avg Phases (of 4)", "phases_present"), ("Compression Ratio", "compression_ratio"), ("Tokens/sec", "tokens_per_second"), ] for name, attr in metrics: base_val = avg(base_results, attr) trained_val = avg(trained_results, attr) delta = trained_val - base_val sign = "+" if delta > 0 else "" print(f" {name:<25} {base_val:>10.2f} {trained_val:>10.2f} {sign}{delta:>9.2f}") # Complete rate base_cr = sum(1 for r in base_results if r.phases_present == 4) / len(base_results) trained_cr = sum(1 for r in trained_results if r.phases_present == 4) / len(trained_results) delta_cr = trained_cr - base_cr sign = "+" if delta_cr > 0 else "" print(f" {'4/4 Complete Rate':<25} {base_cr:>9.1%} {trained_cr:>9.1%} {sign}{delta_cr:>8.1%}") print(f"{'═' * 60}") if __name__ == "__main__": import sys if len(sys.argv) >= 4: compare_models(sys.argv[1], sys.argv[2], sys.argv[3]) elif len(sys.argv) >= 3: evaluator = RAEEvaluator(sys.argv[1]) evaluator.evaluate_benchmark(sys.argv[2], "evaluation/results.json") else: print("Usage:") print(" Compare: python eval_rae_model.py ") print(" Single: python eval_rae_model.py ")