| """ |
| RAE Evaluation Harness |
───────────────────────────────────────────────────────────────
| Evaluates the training effect by comparing base vs. RAE-trained |
| model on structured reasoning tasks. |
| |
| Key metrics: |
| 1. Phase Completeness: Does the model produce all 4 RAE phases? |
| 2. Compression Ratio: Is Abstraction shorter than Saturation? |
| 3. Prediction Quality: Are Descent predictions accurate? |
| 4. Integration Depth: Does Integration produce novel insight? |
| 5. Task Accuracy: Does the final answer match ground truth? |
| 6. Inference Speed: Is the trained model faster (handwriting hypothesis)? |
───────────────────────────────────────────────────────────────
| """ |
|
|
| import json |
| import time |
| import re |
| from pathlib import Path |
| from typing import Optional |
| from dataclasses import dataclass |
|
|
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
|
|
@dataclass
class RAEEvalResult:
    """Metrics collected from evaluating one problem with one model."""

    problem_id: str
    model_name: str

    # Phase structure: which of the four RAE phases appeared in the output.
    phases_present: int
    saturation_present: bool
    abstraction_present: bool
    descent_present: bool
    integration_present: bool

    # Phase sizes (whitespace-separated word counts) and the
    # abstraction/saturation compression ratio derived from them.
    saturation_length: int
    abstraction_length: int
    descent_length: int
    integration_length: int
    compression_ratio: float

    # Generation performance.
    inference_time_seconds: float
    total_tokens_generated: int
    tokens_per_second: float

    # Ground-truth accuracy; None when no reference answer is available.
    task_accuracy: Optional[float] = None

    def to_dict(self) -> dict:
        """Return the result's fields as a plain dict.

        Returns a shallow copy rather than ``self.__dict__`` itself, so a
        caller that mutates the returned dict (e.g. before JSON dumping)
        cannot accidentally alter this instance.
        """
        return dict(vars(self))
|
|
|
|
class RAEEvaluator:
    """Evaluates RAE training effectiveness.

    Loads a causal LM once at construction, then generates a response for
    each benchmark problem and scores it for the presence, size, and speed
    of the four RAE phase blocks.
    """

    # Regex + flags for extracting each phase body from the response.
    # re.DOTALL lets the captured group span newlines inside the tags.
    PHASE_PATTERNS = {
        "saturation": (r"<SATURATION>(.*?)</SATURATION>", re.DOTALL),
        "abstraction": (r"<ABSTRACTION>(.*?)</ABSTRACTION>", re.DOTALL),
        "descent": (r"<DESCENT>(.*?)</DESCENT>", re.DOTALL),
        "integration": (r"<INTEGRATION>(.*?)</INTEGRATION>", re.DOTALL),
    }

    RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, work through all four phases:

<SATURATION>Immerse in the problem. Observe without categorizing. Flag anomalies. Multi-lens encoding.</SATURATION>
<ABSTRACTION>Extract minimal structure. Find isomorphisms. Compress to core insight.</ABSTRACTION>
<DESCENT>Project into concrete form. Generate predictions. Build implementation. Identify falsification.</DESCENT>
<INTEGRATION>Update model. Assess confidence. Identify next questions. Extract transferable principles.</INTEGRATION>

Always produce all 4 phases with their XML tags."""

    def __init__(self, model_path: str, device: str = "auto"):
        """Load tokenizer and model from *model_path*.

        Args:
            model_path: HF hub id or local path of the model to evaluate.
            device: passed to ``device_map`` (default "auto" shards across
                available devices).
        """
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map=device,
            trust_remote_code=True,
        )
        self.model.eval()

    def evaluate_single(self, problem: dict) -> RAEEvalResult:
        """Generate a response for one problem and score its RAE structure.

        Args:
            problem: dict with a "prompt" key and an optional "id" key.

        Returns:
            RAEEvalResult with phase presence/size and throughput metrics.
        """
        prompt = problem["prompt"]
        problem_id = problem.get("id", "unknown")

        messages = [
            {"role": "system", "content": self.RAE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]

        text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        input_len = inputs["input_ids"].shape[1]

        # perf_counter is monotonic and high-resolution; time.time() can
        # jump (NTP/clock adjustments) and skew duration measurements.
        start_time = time.perf_counter()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=3072,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        elapsed = time.perf_counter() - start_time

        # Strip the prompt tokens; only score what the model generated.
        generated_ids = outputs[0][input_len:]
        response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        total_tokens = len(generated_ids)

        # Extract each phase body; missing phases become "".
        phases = {}
        for phase_name, (pattern, flags) in self.PHASE_PATTERNS.items():
            match = re.search(pattern, response, flags)
            phases[phase_name] = match.group(1).strip() if match else ""

        # Phase "length" is a simple whitespace word count.
        sat_len = len(phases["saturation"].split())
        abs_len = len(phases["abstraction"].split())
        desc_len = len(phases["descent"].split())
        int_len = len(phases["integration"].split())

        # max(..., 1) guards against a missing/empty Saturation phase.
        compression_ratio = abs_len / max(sat_len, 1)

        return RAEEvalResult(
            problem_id=problem_id,
            model_name=self.model_path,
            phases_present=sum(1 for v in phases.values() if v),
            saturation_present=bool(phases["saturation"]),
            abstraction_present=bool(phases["abstraction"]),
            descent_present=bool(phases["descent"]),
            integration_present=bool(phases["integration"]),
            saturation_length=sat_len,
            abstraction_length=abs_len,
            descent_length=desc_len,
            integration_length=int_len,
            compression_ratio=compression_ratio,
            inference_time_seconds=elapsed,
            total_tokens_generated=total_tokens,
            # Floor elapsed at 1ms to avoid division by ~0 on trivial runs.
            tokens_per_second=total_tokens / max(elapsed, 0.001),
        )

    def evaluate_benchmark(
        self, benchmark_path: str, output_path: Optional[str] = None
    ) -> list[RAEEvalResult]:
        """Evaluate a full benchmark suite.

        Args:
            benchmark_path: JSON file containing a list of problem dicts.
            output_path: optional JSON file to write per-problem results
                and the summary to (parent directories are created).

        Returns:
            One RAEEvalResult per problem (empty list for an empty suite).
        """
        with open(benchmark_path, encoding="utf-8") as f:
            problems = json.load(f)

        # Guard: an empty suite would make every average below divide by 0.
        if not problems:
            print(f" No problems found in {benchmark_path}; nothing to evaluate.")
            return []

        results = []
        for i, problem in enumerate(problems):
            print(f" [{i+1}/{len(problems)}] {problem.get('id', 'unknown')}...", end=" ")
            result = self.evaluate_single(problem)
            results.append(result)
            print(f"phases={result.phases_present}/4, "
                  f"compression={result.compression_ratio:.2f}, "
                  f"{result.tokens_per_second:.1f} tok/s")

        avg_phases = sum(r.phases_present for r in results) / len(results)
        avg_compression = sum(r.compression_ratio for r in results) / len(results)
        avg_speed = sum(r.tokens_per_second for r in results) / len(results)
        complete_rate = sum(1 for r in results if r.phases_present == 4) / len(results)

        print(f"\n{'─' * 50}")
        print(f" EVALUATION SUMMARY — {self.model_path}")
        print(f"{'─' * 50}")
        print(f" Avg phases present: {avg_phases:.2f}/4")
        print(f" Complete rate (4/4): {complete_rate:.1%}")
        print(f" Avg compression: {avg_compression:.2f}")
        print(f" Avg speed: {avg_speed:.1f} tok/s")
        print(f"{'─' * 50}")

        if output_path:
            out = Path(output_path)
            # Create the target directory so hard-coded paths like
            # "evaluation/results.json" don't fail with FileNotFoundError.
            out.parent.mkdir(parents=True, exist_ok=True)
            with out.open("w", encoding="utf-8") as f:
                json.dump(
                    {
                        "model": self.model_path,
                        "summary": {
                            "avg_phases": avg_phases,
                            "complete_rate": complete_rate,
                            "avg_compression": avg_compression,
                            "avg_speed": avg_speed,
                        },
                        "results": [r.to_dict() for r in results],
                    },
                    f,
                    indent=2,
                    ensure_ascii=False,
                )
            print(f" Results saved: {output_path}")

        return results
|
|
|
|
def compare_models(base_path: str, trained_path: str, benchmark_path: str):
    """
    Compare base model vs RAE-trained model.

    This is the handwriting hypothesis test:
    Does training with RAE-structured data produce better reasoning?

    Per-model results are written to evaluation/base_results.json and
    evaluation/trained_results.json; a metric comparison table is printed.
    """
    # The output paths below are hard-coded into "evaluation/"; create the
    # directory up front so the JSON writes can't FileNotFoundError.
    Path("evaluation").mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print(" RAE TRAINING EFFECT COMPARISON")
    print(" Base vs. RAE-Trained")
    print("=" * 60)

    print(f"\n▶ Evaluating BASE model: {base_path}")
    base_eval = RAEEvaluator(base_path)
    base_results = base_eval.evaluate_benchmark(
        benchmark_path, "evaluation/base_results.json"
    )
    # Free the base model before loading the second one to avoid holding
    # two full models in GPU memory at once.
    del base_eval
    torch.cuda.empty_cache()

    print(f"\n▶ Evaluating RAE-TRAINED model: {trained_path}")
    trained_eval = RAEEvaluator(trained_path)
    trained_results = trained_eval.evaluate_benchmark(
        benchmark_path, "evaluation/trained_results.json"
    )
    # Release the trained model too (symmetric with the base model above).
    del trained_eval
    torch.cuda.empty_cache()

    def avg(results, attr):
        # Mean of one numeric attribute across a list of RAEEvalResult.
        return sum(getattr(r, attr) for r in results) / len(results)

    print(f"\n{'─' * 60}")
    print(f" COMPARISON: BASE vs RAE-TRAINED")
    print(f"{'─' * 60}")
    print(f" {'Metric':<25} {'Base':>10} {'Trained':>10} {'Delta':>10}")
    print(f" {'-'*55}")

    metrics = [
        ("Avg Phases (of 4)", "phases_present"),
        ("Compression Ratio", "compression_ratio"),
        ("Tokens/sec", "tokens_per_second"),
    ]

    for name, attr in metrics:
        base_val = avg(base_results, attr)
        trained_val = avg(trained_results, attr)
        delta = trained_val - base_val
        # Negative deltas already carry "-"; prefix "+" for improvements.
        sign = "+" if delta > 0 else ""
        print(f" {name:<25} {base_val:>10.2f} {trained_val:>10.2f} {sign}{delta:>9.2f}")

    # Complete rate (all 4 phases present) is reported as a percentage.
    base_cr = sum(1 for r in base_results if r.phases_present == 4) / len(base_results)
    trained_cr = sum(1 for r in trained_results if r.phases_present == 4) / len(trained_results)
    delta_cr = trained_cr - base_cr
    sign = "+" if delta_cr > 0 else ""
    print(f" {'4/4 Complete Rate':<25} {base_cr:>9.1%} {trained_cr:>9.1%} {sign}{delta_cr:>8.1%}")

    print(f"{'─' * 60}")
|
|
|
|
if __name__ == "__main__":
    import sys

    # CLI dispatch: 3 args -> base-vs-trained comparison,
    # 2 args -> single-model benchmark run, otherwise usage.
    if len(sys.argv) >= 4:
        compare_models(sys.argv[1], sys.argv[2], sys.argv[3])
    elif len(sys.argv) >= 3:
        evaluator = RAEEvaluator(sys.argv[1])
        evaluator.evaluate_benchmark(sys.argv[2], "evaluation/results.json")
    else:
        # Usage errors go to stderr and exit non-zero so shell scripts
        # and CI can detect the failure.
        print("Usage:", file=sys.stderr)
        print(" Compare: python eval_rae_model.py <base_model> <trained_model> <benchmark.json>", file=sys.stderr)
        print(" Single: python eval_rae_model.py <model_path> <benchmark.json>", file=sys.stderr)
        sys.exit(1)
|