# Codette-Reasoning / evaluation / run_evaluation_sprint.py
# Provenance: uploaded by Raiff1982 ("Upload 120 files", commit ed1b365 verified).
# NOTE(review): the lines above were Hugging Face page chrome scraped into the
# source; commented out so the module parses.
"""
Evaluation Sprint Runner
Executes the evaluation harness against all 4 conditions:
1. Baseline (plain Llama)
2. Phase 1-5 (debate without semantic tension)
3. Phase 6 Full (with semantic tension, specialization, preflight)
4. Phase 6 -PreFlight (without preflight prediction)
Usage:
python run_evaluation_sprint.py --questions 25 --output results.json
"""
import sys
import argparse
import json
from datetime import datetime
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge'))
sys.path.insert(0, str(Path(__file__).parent.parent / 'inference'))
from test_suite_evaluation import (
EvaluationHarness,
EvaluationAnalyzer,
EVALUATION_TEST_SUITE,
)
def run_evaluation_sprint(
    num_questions: int = 10,
    output_json: str = "evaluation_results.json",
    output_report: str = "evaluation_report.txt",
) -> bool:
    """
    Run the complete evaluation sprint across all four conditions.

    Loads the ForgeEngine (Phase 6), builds the evaluation harness, runs
    the test suite, then analyzes the results and writes both a JSON file
    and a plain-text report.

    Args:
        num_questions: How many test questions to run (clamped to the
            size of EVALUATION_TEST_SUITE).
        output_json: Path for the JSON results file.
        output_report: Path for the plain-text report file.

    Returns:
        True on success, False if any stage failed.
    """
    import traceback  # shared by the error paths below

    # Hoist the clamped question count and suite size; both were
    # previously recomputed (and the suite size hard-coded as 25).
    suite_size = len(EVALUATION_TEST_SUITE)
    n_questions = min(num_questions, suite_size)

    print("\n" + "=" * 80)
    print("CODETTE PHASE 6 EVALUATION SPRINT")
    print("=" * 80)
    print(f"Test Date: {datetime.now().isoformat()}")
    print(f"Questions to Run: {n_questions}/{suite_size}")
    print(f"Output: {output_json}, {output_report}")
    print("=" * 80 + "\n")

    # [1/4] Load ForgeEngine with Phase 6 components.
    print("[1/4] Loading ForgeEngine with Phase 6...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine
        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)
        print(" OK: ForgeEngine loaded")
        print(f" - semantic_tension_engine: {'READY' if forge.semantic_tension_engine else 'MISSING'}")
        print(f" - specialization tracker: {'READY' if forge.specialization else 'MISSING'}")
        print(f" - preflight_predictor: {'READY' if forge.preflight_predictor else 'MISSING'}")
        # Check GPU status from orchestrator (only reported when present).
        if forge.newton.orchestrator:
            print(f" - GPU acceleration: ✓ ENABLED ({forge.newton.orchestrator.n_gpu_layers} layers)")
    except Exception as e:
        print(f" ERROR: {e}")
        return False

    # [2/4] Wrap the loaded engine in the evaluation harness.
    print("\n[2/4] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print(" OK: Harness created")
    except Exception as e:
        print(f" ERROR: {e}")
        return False

    # [3/4] Run all four conditions over the selected questions.
    print(f"\n[3/4] Running evaluation on {n_questions} questions...")
    print(" This will take several minutes...\n")
    try:
        test_questions = EVALUATION_TEST_SUITE[:n_questions]
        results = harness.run_evaluation_suite(test_questions)
        print("\n OK: Evaluation complete")
        print(f" - Baseline: {len(results['baseline_llama'])} results")
        print(f" - Phase 1-5: {len(results['phase_1_5'])} results")
        print(f" - Phase 6 Full: {len(results['phase_6_full'])} results")
        print(f" - Phase 6 -PreFlight: {len(results['phase_6_no_preflight'])} results")
    except Exception as e:
        print(f" ERROR during evaluation: {e}")
        traceback.print_exc()
        return False

    # [4/4] Analyze the results and persist both output artifacts.
    print("\n[4/4] Analyzing results...")
    try:
        analyzer = EvaluationAnalyzer(results)
        report = analyzer.report()
        # JSON results are exported by the harness's own serializer.
        harness.export_results(output_json)
        # Save text report with UTF-8 encoding so Unicode characters
        # (e.g. Γ) survive regardless of the platform default codec.
        with open(output_report, 'w', encoding='utf-8') as f:
            f.write(report)
        print(" OK: Analysis complete")
        print(f" - JSON saved: {output_json}")
        print(f" - Report saved: {output_report}")
        # Echo the report to the console; some Windows terminals cannot
        # encode its Unicode, in which case only note where it was saved.
        try:
            print("\n" + report)
        except UnicodeEncodeError:
            print(" - Full report saved to file (Unicode summary unavailable in terminal)")
        return True
    except Exception as e:
        print(f" ERROR during analysis: {e}")
        traceback.print_exc()
        return False
def main() -> int:
    """CLI entry point: parse arguments, validate them, run the sprint.

    Returns:
        Process exit code: 0 on success, 1 on failure or invalid input.
    """
    # Derive the upper bound from the suite itself instead of the
    # previously hard-coded 25, so validation and help text stay in
    # sync if the suite grows or shrinks.
    max_questions = len(EVALUATION_TEST_SUITE)

    parser = argparse.ArgumentParser(
        description="Run Codette Phase 6 evaluation sprint"
    )
    parser.add_argument(
        "--questions",
        type=int,
        default=5,
        help=f"Number of test questions to run (1-{max_questions}, default 5)",
    )
    parser.add_argument(
        "--output-json",
        default="evaluation_results.json",
        help="Output JSON file for results",
    )
    parser.add_argument(
        "--output-report",
        default="evaluation_report.txt",
        help="Output text file for report",
    )
    args = parser.parse_args()

    # Validate --questions before doing any expensive model loading.
    if not 1 <= args.questions <= max_questions:
        print(f"ERROR: --questions must be between 1 and {max_questions}")
        return 1

    # Run the sprint; translate its bool result into an exit code.
    success = run_evaluation_sprint(
        num_questions=args.questions,
        output_json=args.output_json,
        output_report=args.output_report,
    )
    return 0 if success else 1
# Script entry point: propagate main()'s return value as the exit code.
if __name__ == "__main__":
    sys.exit(main())