File size: 5,463 Bytes
ed1b365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Evaluation Sprint Runner

Executes the evaluation harness against all 4 conditions:
1. Baseline (plain Llama)
2. Phase 1-5 (debate without semantic tension)
3. Phase 6 Full (with semantic tension, specialization, preflight)
4. Phase 6 -PreFlight (without preflight prediction)

Usage:
    python run_evaluation_sprint.py --questions 25 --output results.json
"""

import sys
import argparse
import json
from datetime import datetime
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge'))
sys.path.insert(0, str(Path(__file__).parent.parent / 'inference'))

from test_suite_evaluation import (
    EvaluationHarness,
    EvaluationAnalyzer,
    EVALUATION_TEST_SUITE,
)


def run_evaluation_sprint(
    num_questions: int = 10,
    output_json: str = "evaluation_results.json",
    output_report: str = "evaluation_report.txt",
) -> bool:
    """
    Run the complete evaluation sprint.

    Loads the Phase 6 ForgeEngine, builds the evaluation harness, runs the
    test suite across all four conditions, then analyzes and saves results.

    Args:
        num_questions: How many test questions to run (1-25)
        output_json: Where to save JSON results
        output_report: Where to save text report

    Returns:
        True if every stage completed, False on any failure (errors are
        printed to the console rather than raised, since this is a CLI
        script boundary).
    """
    # Local import so a traceback can be printed from every failure path.
    import traceback

    print("\n" + "=" * 80)
    print("CODETTE PHASE 6 EVALUATION SPRINT")
    print("=" * 80)
    print(f"Test Date: {datetime.now().isoformat()}")
    print(f"Questions to Run: {min(num_questions, len(EVALUATION_TEST_SUITE))}/25")
    print(f"Output: {output_json}, {output_report}")
    print("=" * 80 + "\n")

    # [1/4] Load ForgeEngine with Phase 6 components enabled.
    print("[1/4] Loading ForgeEngine with Phase 6...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine

        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)

        print("  OK: ForgeEngine loaded")
        # Report presence of each Phase 6 subsystem so missing wiring is
        # visible before the (slow) evaluation starts.
        print(f"  - semantic_tension_engine: {'READY' if forge.semantic_tension_engine else 'MISSING'}")
        print(f"  - specialization tracker: {'READY' if forge.specialization else 'MISSING'}")
        print(f"  - preflight_predictor: {'READY' if forge.preflight_predictor else 'MISSING'}")

        # Check GPU status from orchestrator (only printed when present).
        if forge.newton.orchestrator:
            print(f"  - GPU acceleration: ✓ ENABLED ({forge.newton.orchestrator.n_gpu_layers} layers)")

    except Exception as e:
        print(f"  ERROR: {e}")
        traceback.print_exc()
        return False

    # [2/4] Wrap the engine in the evaluation harness.
    print("\n[2/4] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print("  OK: Harness created")
    except Exception as e:
        print(f"  ERROR: {e}")
        traceback.print_exc()
        return False

    # [3/4] Run the evaluation suite over the requested question count.
    print(f"\n[3/4] Running evaluation on {min(num_questions, len(EVALUATION_TEST_SUITE))} questions...")
    print("  This will take several minutes...\n")

    try:
        # Slicing past the end is safe: it simply yields the full suite.
        test_questions = EVALUATION_TEST_SUITE[:num_questions]
        results = harness.run_evaluation_suite(test_questions)
        print("\n  OK: Evaluation complete")
        print(f"    - Baseline: {len(results['baseline_llama'])} results")
        print(f"    - Phase 1-5: {len(results['phase_1_5'])} results")
        print(f"    - Phase 6 Full: {len(results['phase_6_full'])} results")
        print(f"    - Phase 6 -PreFlight: {len(results['phase_6_no_preflight'])} results")
    except Exception as e:
        print(f"  ERROR during evaluation: {e}")
        traceback.print_exc()
        return False

    # [4/4] Analyze results and persist both JSON and the text report.
    print("\n[4/4] Analyzing results...")
    try:
        analyzer = EvaluationAnalyzer(results)
        report = analyzer.report()

        # Save JSON results
        harness.export_results(output_json)

        # Save text report (with UTF-8 encoding for Unicode characters like Γ)
        with open(output_report, 'w', encoding='utf-8') as f:
            f.write(report)

        print("  OK: Analysis complete")
        print(f"    - JSON saved: {output_json}")
        print(f"    - Report saved: {output_report}")

        # Print summary to console (skip full report due to Unicode encoding)
        try:
            # Try to print the report
            print("\n" + report)
        except UnicodeEncodeError:
            # Windows terminal encoding issue—just note that report was saved
            print("    - Full report saved to file (Unicode summary unavailable in terminal)")

        return True

    except Exception as e:
        print(f"  ERROR during analysis: {e}")
        traceback.print_exc()
        return False


def main():
    """CLI entry point: parse arguments, validate, and run the sprint.

    Returns a process exit code: 0 on success, 1 on failure or bad input.
    """
    arg_parser = argparse.ArgumentParser(
        description="Run Codette Phase 6 evaluation sprint"
    )
    arg_parser.add_argument(
        "--questions",
        type=int,
        default=5,
        help="Number of test questions to run (1-25, default 5)",
    )
    arg_parser.add_argument(
        "--output-json",
        default="evaluation_results.json",
        help="Output JSON file for results",
    )
    arg_parser.add_argument(
        "--output-report",
        default="evaluation_report.txt",
        help="Output text file for report",
    )
    opts = arg_parser.parse_args()

    # Guard clause: reject question counts outside the supported range.
    if not (1 <= opts.questions <= 25):
        print("ERROR: --questions must be between 1 and 25")
        return 1

    # Run the sprint and translate its boolean result into an exit code.
    succeeded = run_evaluation_sprint(
        num_questions=opts.questions,
        output_json=opts.output_json,
        output_report=opts.output_report,
    )
    return 0 if succeeded else 1


# Script entry: propagate main()'s exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())