# evaluation/eval_rae_model.py
"""
RAE Evaluation Harness
═══════════════════════════════════════════════════════════════
Evaluates the training effect by comparing base vs. RAE-trained
model on structured reasoning tasks.
Key metrics:
1. Phase Completeness: Does the model produce all 4 RAE phases?
2. Compression Ratio: Is Abstraction shorter than Saturation?
3. Prediction Quality: Are Descent predictions accurate?
4. Integration Depth: Does Integration produce novel insight?
5. Task Accuracy: Does the final answer match ground truth?
6. Inference Speed: Is the trained model faster (handwriting hypothesis)?
═══════════════════════════════════════════════════════════════
"""
import json
import time
import re
from pathlib import Path
from typing import Optional
from dataclasses import dataclass
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
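
# generate() below samples with do_sample=True, so phase metrics, token counts,
# and timings vary from run to run. A minimal seeding helper for repeatable
# comparisons (an addition sketched here, not part of the original harness):
def set_seed(seed: int = 0) -> None:
    """Seed the Python and torch RNGs so sampled generations are repeatable."""
    import random

    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
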
@dataclass
class RAEEvalResult:
    problem_id: str
    model_name: str

    # Phase completeness (0-4)
    phases_present: int
    saturation_present: bool
    abstraction_present: bool
    descent_present: bool
    integration_present: bool

    # Phase quality metrics (lengths in whitespace-separated words)
    saturation_length: int
    abstraction_length: int
    descent_length: int
    integration_length: int
    compression_ratio: float  # abstraction_length / saturation_length

    # Performance
    inference_time_seconds: float
    total_tokens_generated: int
    tokens_per_second: float

    # Quality (requires ground truth or human eval)
    task_accuracy: Optional[float] = None

    def to_dict(self):
        return self.__dict__
class RAEEvaluator:
    """Evaluates RAE training effectiveness."""

    PHASE_PATTERNS = {
        "saturation": (r"<SATURATION>(.*?)</SATURATION>", re.DOTALL),
        "abstraction": (r"<ABSTRACTION>(.*?)</ABSTRACTION>", re.DOTALL),
        "descent": (r"<DESCENT>(.*?)</DESCENT>", re.DOTALL),
        "integration": (r"<INTEGRATION>(.*?)</INTEGRATION>", re.DOTALL),
    }

    RAE_SYSTEM_PROMPT = """You are an RAE-trained cognitive reasoner. For EVERY problem, work through all four phases:
<SATURATION>Immerse in the problem. Observe without categorizing. Flag anomalies. Multi-lens encoding.</SATURATION>
<ABSTRACTION>Extract minimal structure. Find isomorphisms. Compress to core insight.</ABSTRACTION>
<DESCENT>Project into concrete form. Generate predictions. Build implementation. Identify falsification.</DESCENT>
<INTEGRATION>Update model. Assess confidence. Identify next questions. Extract transferable principles.</INTEGRATION>
Always produce all 4 phases with their XML tags."""

    def __init__(self, model_path: str, device: str = "auto"):
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map=device,
            trust_remote_code=True,
        )
        self.model.eval()
    def evaluate_single(self, problem: dict) -> RAEEvalResult:
        """Evaluate a single problem."""
        prompt = problem["prompt"]
        problem_id = problem.get("id", "unknown")

        # Format as chat
        messages = [
            {"role": "system", "content": self.RAE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        text = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        input_len = inputs["input_ids"].shape[1]

        # Generate with timing (sampling is stochastic, so timings and token
        # counts vary between runs unless the RNGs are seeded)
        start_time = time.time()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=3072,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        elapsed = time.time() - start_time

        # Decode only the newly generated tokens
        generated_ids = outputs[0][input_len:]
        response = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        total_tokens = len(generated_ids)

        # Extract phases
        phases = {}
        for phase_name, (pattern, flags) in self.PHASE_PATTERNS.items():
            match = re.search(pattern, response, flags)
            phases[phase_name] = match.group(1).strip() if match else ""

        # Compute metrics (phase lengths in whitespace-separated words)
        sat_len = len(phases["saturation"].split())
        abs_len = len(phases["abstraction"].split())
        desc_len = len(phases["descent"].split())
        int_len = len(phases["integration"].split())
        compression_ratio = abs_len / max(sat_len, 1)

        return RAEEvalResult(
            problem_id=problem_id,
            model_name=self.model_path,
            phases_present=sum(1 for v in phases.values() if v),
            saturation_present=bool(phases["saturation"]),
            abstraction_present=bool(phases["abstraction"]),
            descent_present=bool(phases["descent"]),
            integration_present=bool(phases["integration"]),
            saturation_length=sat_len,
            abstraction_length=abs_len,
            descent_length=desc_len,
            integration_length=int_len,
            compression_ratio=compression_ratio,
            inference_time_seconds=elapsed,
            total_tokens_generated=total_tokens,
            tokens_per_second=total_tokens / max(elapsed, 0.001),
        )
    def evaluate_benchmark(self, benchmark_path: str, output_path: Optional[str] = None) -> list[RAEEvalResult]:
        """Evaluate a full benchmark suite."""
        with open(benchmark_path) as f:
            problems = json.load(f)

        results = []
        for i, problem in enumerate(problems):
            print(f" [{i+1}/{len(problems)}] {problem.get('id', 'unknown')}...", end=" ")
            result = self.evaluate_single(problem)
            results.append(result)
            print(
                f"phases={result.phases_present}/4, "
                f"compression={result.compression_ratio:.2f}, "
                f"{result.tokens_per_second:.1f} tok/s"
            )

        # Summary
        avg_phases = sum(r.phases_present for r in results) / len(results)
        avg_compression = sum(r.compression_ratio for r in results) / len(results)
        avg_speed = sum(r.tokens_per_second for r in results) / len(results)
        complete_rate = sum(1 for r in results if r.phases_present == 4) / len(results)

        print(f"\n{'═' * 50}")
        print(f" EVALUATION SUMMARY — {self.model_path}")
        print(f"{'═' * 50}")
        print(f" Avg phases present: {avg_phases:.2f}/4")
        print(f" Complete rate (4/4): {complete_rate:.1%}")
        print(f" Avg compression: {avg_compression:.2f}")
        print(f" Avg speed: {avg_speed:.1f} tok/s")
        print(f"{'═' * 50}")

        # Save results
        if output_path:
            with open(output_path, "w") as f:
                json.dump(
                    {
                        "model": self.model_path,
                        "summary": {
                            "avg_phases": avg_phases,
                            "complete_rate": complete_rate,
                            "avg_compression": avg_compression,
                            "avg_speed": avg_speed,
                        },
                        "results": [r.to_dict() for r in results],
                    },
                    f,
                    indent=2,
                )
            print(f" Results saved: {output_path}")

        return results
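
# The RAEEvalResult.task_accuracy field (metric 5 in the module docstring) is
# never populated above because it needs ground truth. A minimal exact-match
# scorer is sketched below, assuming benchmark problems carry a hypothetical
# "answer" field; the function name and field name are illustrative only:
def score_task_accuracy(response: str, expected_answer: str) -> float:
    """Return 1.0 if the expected answer appears in the response, else 0.0."""
    return 1.0 if expected_answer.strip().lower() in response.lower() else 0.0
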
def compare_models(base_path: str, trained_path: str, benchmark_path: str):
    """
    Compare base model vs. RAE-trained model.

    This is the handwriting hypothesis test:
    does training with RAE-structured data produce better reasoning?
    """
    print("=" * 60)
    print(" RAE TRAINING EFFECT COMPARISON")
    print(" Base vs. RAE-Trained")
    print("=" * 60)

    # Evaluate base model
    print(f"\n▶ Evaluating BASE model: {base_path}")
    base_eval = RAEEvaluator(base_path)
    base_results = base_eval.evaluate_benchmark(
        benchmark_path, "evaluation/base_results.json"
    )
    del base_eval  # Free GPU memory before loading the second model
    torch.cuda.empty_cache()

    # Evaluate trained model
    print(f"\n▶ Evaluating RAE-TRAINED model: {trained_path}")
    trained_eval = RAEEvaluator(trained_path)
    trained_results = trained_eval.evaluate_benchmark(
        benchmark_path, "evaluation/trained_results.json"
    )

    # Comparison
    def avg(results, attr):
        return sum(getattr(r, attr) for r in results) / len(results)

    print(f"\n{'═' * 60}")
    print(" COMPARISON: BASE vs RAE-TRAINED")
    print(f"{'═' * 60}")
    print(f" {'Metric':<25} {'Base':>10} {'Trained':>10} {'Delta':>10}")
    print(f" {'-' * 55}")

    metrics = [
        ("Avg Phases (of 4)", "phases_present"),
        ("Compression Ratio", "compression_ratio"),
        ("Tokens/sec", "tokens_per_second"),
    ]
    for name, attr in metrics:
        base_val = avg(base_results, attr)
        trained_val = avg(trained_results, attr)
        delta = trained_val - base_val
        sign = "+" if delta > 0 else ""
        print(f" {name:<25} {base_val:>10.2f} {trained_val:>10.2f} {sign}{delta:>9.2f}")

    # 4/4 complete rate
    base_cr = sum(1 for r in base_results if r.phases_present == 4) / len(base_results)
    trained_cr = sum(1 for r in trained_results if r.phases_present == 4) / len(trained_results)
    delta_cr = trained_cr - base_cr
    sign = "+" if delta_cr > 0 else ""
    print(f" {'4/4 Complete Rate':<25} {base_cr:>9.1%} {trained_cr:>9.1%} {sign}{delta_cr:>8.1%}")
    print(f"{'═' * 60}")
if __name__ == "__main__":
    import sys

    if len(sys.argv) >= 4:
        compare_models(sys.argv[1], sys.argv[2], sys.argv[3])
    elif len(sys.argv) >= 3:
        evaluator = RAEEvaluator(sys.argv[1])
        evaluator.evaluate_benchmark(sys.argv[2], "evaluation/results.json")
    else:
        print("Usage:")
        print("  Compare: python eval_rae_model.py <base_model> <trained_model> <benchmark.json>")
        print("  Single:  python eval_rae_model.py <model_path> <benchmark.json>")