| """ |
| Evaluation Sprint Runner |
| |
| Executes the evaluation harness against all 4 conditions: |
| 1. Baseline (plain Llama) |
| 2. Phase 1-5 (debate without semantic tension) |
| 3. Phase 6 Full (with semantic tension, specialization, preflight) |
| 4. Phase 6 -PreFlight (without preflight prediction) |
| |
| Usage: |
| python run_evaluation_sprint.py --questions 25 --output results.json |
| """ |
|
|
import argparse
import json
import sys
import traceback
from datetime import datetime
from pathlib import Path
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge')) |
| sys.path.insert(0, str(Path(__file__).parent.parent / 'inference')) |
|
|
| from test_suite_evaluation import ( |
| EvaluationHarness, |
| EvaluationAnalyzer, |
| EVALUATION_TEST_SUITE, |
| ) |
|
|
|
|
def run_evaluation_sprint(
    num_questions: int = 10,
    output_json: str = "evaluation_results.json",
    output_report: str = "evaluation_report.txt",
) -> bool:
    """
    Run the complete evaluation sprint.

    Loads the ForgeEngine, builds the evaluation harness, runs the test
    suite across all four conditions, then analyzes and persists results.

    Args:
        num_questions: How many test questions to run (1-25)
        output_json: Where to save JSON results
        output_report: Where to save text report

    Returns:
        True if all four stages completed, False if any stage failed.
    """
    # Clamp once so the banner, the progress line, and the actual slice
    # all agree on how many questions will run.
    n_to_run = min(num_questions, len(EVALUATION_TEST_SUITE))

    print("\n" + "=" * 80)
    print("CODETTE PHASE 6 EVALUATION SPRINT")
    print("=" * 80)
    print(f"Test Date: {datetime.now().isoformat()}")
    print(f"Questions to Run: {n_to_run}/{len(EVALUATION_TEST_SUITE)}")
    print(f"Output: {output_json}, {output_report}")
    print("=" * 80 + "\n")

    # --- Stage 1: load the engine -------------------------------------
    print("[1/4] Loading ForgeEngine with Phase 6...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine

        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)

        print(" OK: ForgeEngine loaded")
        print(f" - semantic_tension_engine: {'READY' if forge.semantic_tension_engine else 'MISSING'}")
        print(f" - specialization tracker: {'READY' if forge.specialization else 'MISSING'}")
        print(f" - preflight_predictor: {'READY' if forge.preflight_predictor else 'MISSING'}")

        # GPU status is informational only; its absence is not a failure.
        if forge.newton.orchestrator:
            print(f" - GPU acceleration: ✓ ENABLED ({forge.newton.orchestrator.n_gpu_layers} layers)")
    except Exception as e:
        print(f" ERROR: {e}")
        return False

    # --- Stage 2: build the harness -----------------------------------
    print("\n[2/4] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print(" OK: Harness created")
    except Exception as e:
        print(f" ERROR: {e}")
        return False

    # --- Stage 3: run the suite ---------------------------------------
    print(f"\n[3/4] Running evaluation on {n_to_run} questions...")
    print(" This will take several minutes...\n")

    try:
        test_questions = EVALUATION_TEST_SUITE[:n_to_run]
        results = harness.run_evaluation_suite(test_questions)
        print("\n OK: Evaluation complete")
        print(f" - Baseline: {len(results['baseline_llama'])} results")
        print(f" - Phase 1-5: {len(results['phase_1_5'])} results")
        print(f" - Phase 6 Full: {len(results['phase_6_full'])} results")
        print(f" - Phase 6 -PreFlight: {len(results['phase_6_no_preflight'])} results")
    except Exception as e:
        print(f" ERROR during evaluation: {e}")
        traceback.print_exc()
        return False

    # --- Stage 4: analyze and persist ---------------------------------
    print("\n[4/4] Analyzing results...")
    try:
        analyzer = EvaluationAnalyzer(results)
        report = analyzer.report()

        harness.export_results(output_json)

        with open(output_report, 'w', encoding='utf-8') as f:
            f.write(report)

        print(" OK: Analysis complete")
        print(f" - JSON saved: {output_json}")
        print(f" - Report saved: {output_report}")

        # Some terminals (notably legacy Windows consoles) cannot encode
        # the report's Unicode glyphs; the file written above is the
        # authoritative copy either way.
        try:
            print("\n" + report)
        except UnicodeEncodeError:
            print(" - Full report saved to file (Unicode summary unavailable in terminal)")

        return True

    except Exception as e:
        print(f" ERROR during analysis: {e}")
        traceback.print_exc()
        return False
|
|
|
|
def main() -> int:
    """Parse command-line arguments, validate them, and run the sprint.

    Returns:
        Process exit code: 0 on success, 1 on failure or invalid input.
    """
    parser = argparse.ArgumentParser(description="Run Codette Phase 6 evaluation sprint")
    parser.add_argument("--questions", type=int, default=5,
                        help="Number of test questions to run (1-25, default 5)")
    parser.add_argument("--output-json", default="evaluation_results.json",
                        help="Output JSON file for results")
    parser.add_argument("--output-report", default="evaluation_report.txt",
                        help="Output text file for report")
    args = parser.parse_args()

    # Guard clause: reject an out-of-range question count before any work.
    if not 1 <= args.questions <= 25:
        print("ERROR: --questions must be between 1 and 25")
        return 1

    ok = run_evaluation_sprint(
        num_questions=args.questions,
        output_json=args.output_json,
        output_report=args.output_report,
    )
    return 0 if ok else 1
|
|
|
|
# Script entry point: the process exit status mirrors main()'s return code.
if __name__ == "__main__":
    sys.exit(main())
|
|