File size: 5,463 Bytes
ed1b365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Evaluation Sprint Runner

Executes the evaluation harness against all 4 conditions:
1. Baseline (plain Llama)
2. Phase 1-5 (debate without semantic tension)
3. Phase 6 Full (with semantic tension, specialization, preflight)
4. Phase 6 -PreFlight (without preflight prediction)

Usage:
    python run_evaluation_sprint.py --questions 25 --output results.json
"""

import sys
import argparse
import json
from datetime import datetime
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / 'reasoning_forge'))
sys.path.insert(0, str(Path(__file__).parent.parent / 'inference'))

from test_suite_evaluation import (
    EvaluationHarness,
    EvaluationAnalyzer,
    EVALUATION_TEST_SUITE,
)


def run_evaluation_sprint(
    num_questions: int = 10,
    output_json: str = "evaluation_results.json",
    output_report: str = "evaluation_report.txt",
) -> bool:
    """
    Run the complete evaluation sprint.

    Loads the Phase 6 ForgeEngine, builds the evaluation harness, runs the
    test suite across all four conditions, then analyzes and saves results.

    Args:
        num_questions: How many test questions to run (1-25)
        output_json: Where to save JSON results
        output_report: Where to save text report

    Returns:
        True if every stage completed, False on any failure (errors are
        printed to the console rather than raised, since this is a CLI
        script boundary).
    """
    # Local import so a traceback can be printed from every failure path.
    import traceback

    print("\n" + "=" * 80)
    print("CODETTE PHASE 6 EVALUATION SPRINT")
    print("=" * 80)
    print(f"Test Date: {datetime.now().isoformat()}")
    print(f"Questions to Run: {min(num_questions, len(EVALUATION_TEST_SUITE))}/25")
    print(f"Output: {output_json}, {output_report}")
    print("=" * 80 + "\n")

    # [1/4] Load ForgeEngine with Phase 6 components enabled.
    print("[1/4] Loading ForgeEngine with Phase 6...")
    try:
        from reasoning_forge.forge_engine import ForgeEngine

        forge = ForgeEngine(living_memory=None, enable_memory_weighting=False)

        print("  OK: ForgeEngine loaded")
        # Report presence of each Phase 6 subsystem so missing wiring is
        # visible before the (slow) evaluation starts.
        print(f"  - semantic_tension_engine: {'READY' if forge.semantic_tension_engine else 'MISSING'}")
        print(f"  - specialization tracker: {'READY' if forge.specialization else 'MISSING'}")
        print(f"  - preflight_predictor: {'READY' if forge.preflight_predictor else 'MISSING'}")

        # Check GPU status from orchestrator (only printed when present).
        if forge.newton.orchestrator:
            print(f"  - GPU acceleration: ✓ ENABLED ({forge.newton.orchestrator.n_gpu_layers} layers)")

    except Exception as e:
        print(f"  ERROR: {e}")
        traceback.print_exc()
        return False

    # [2/4] Wrap the engine in the evaluation harness.
    print("\n[2/4] Creating evaluation harness...")
    try:
        harness = EvaluationHarness(forge)
        print("  OK: Harness created")
    except Exception as e:
        print(f"  ERROR: {e}")
        traceback.print_exc()
        return False

    # [3/4] Run the evaluation suite over the requested question count.
    print(f"\n[3/4] Running evaluation on {min(num_questions, len(EVALUATION_TEST_SUITE))} questions...")
    print("  This will take several minutes...\n")

    try:
        # Slicing past the end is safe: it simply yields the full suite.
        test_questions = EVALUATION_TEST_SUITE[:num_questions]
        results = harness.run_evaluation_suite(test_questions)
        print("\n  OK: Evaluation complete")
        print(f"    - Baseline: {len(results['baseline_llama'])} results")
        print(f"    - Phase 1-5: {len(results['phase_1_5'])} results")
        print(f"    - Phase 6 Full: {len(results['phase_6_full'])} results")
        print(f"    - Phase 6 -PreFlight: {len(results['phase_6_no_preflight'])} results")
    except Exception as e:
        print(f"  ERROR during evaluation: {e}")
        traceback.print_exc()
        return False

    # [4/4] Analyze results and persist both JSON and the text report.
    print("\n[4/4] Analyzing results...")
    try:
        analyzer = EvaluationAnalyzer(results)
        report = analyzer.report()

        # Save JSON results
        harness.export_results(output_json)

        # Save text report (with UTF-8 encoding for Unicode characters like Γ)
        with open(output_report, 'w', encoding='utf-8') as f:
            f.write(report)

        print("  OK: Analysis complete")
        print(f"    - JSON saved: {output_json}")
        print(f"    - Report saved: {output_report}")

        # Print summary to console (skip full report due to Unicode encoding)
        try:
            # Try to print the report
            print("\n" + report)
        except UnicodeEncodeError:
            # Windows terminal encoding issue—just note that report was saved
            print("    - Full report saved to file (Unicode summary unavailable in terminal)")

        return True

    except Exception as e:
        print(f"  ERROR during analysis: {e}")
        traceback.print_exc()
        return False


def main():
    """CLI entry point: parse arguments, validate, and run the sprint.

    Returns a process exit code: 0 on success, 1 on failure or bad input.
    """
    arg_parser = argparse.ArgumentParser(
        description="Run Codette Phase 6 evaluation sprint"
    )
    arg_parser.add_argument(
        "--questions",
        type=int,
        default=5,
        help="Number of test questions to run (1-25, default 5)",
    )
    arg_parser.add_argument(
        "--output-json",
        default="evaluation_results.json",
        help="Output JSON file for results",
    )
    arg_parser.add_argument(
        "--output-report",
        default="evaluation_report.txt",
        help="Output text file for report",
    )
    opts = arg_parser.parse_args()

    # Guard clause: reject question counts outside the supported range.
    if not (1 <= opts.questions <= 25):
        print("ERROR: --questions must be between 1 and 25")
        return 1

    # Run the sprint and translate its boolean result into an exit code.
    succeeded = run_evaluation_sprint(
        num_questions=opts.questions,
        output_json=opts.output_json,
        output_report=opts.output_report,
    )
    return 0 if succeeded else 1


# Script entry: propagate main()'s exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())