File size: 4,253 Bytes
1bb4678
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Path: QAgents-workflos/tests/final_eval.py
# Final evaluation - NAKED vs BLACKBOARD on all difficulties
"""Final mode evaluation: NAKED vs fixed BLACKBOARD."""

import sys
import os
import time
from datetime import datetime
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.absolute()))

api_key = "$env:GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY'] = api_key

from tests.test_problems import ALL_PROBLEMS
from orchestrators import create_orchestrator
from config import set_api_key
import re

set_api_key(api_key)


def extract_gates(qasm):
    if not qasm:
        return 0
    gate_pattern = r'\b(h|x|y|z|s|t|cx|cz|swap|ccx|rz|rx|ry|cp)\b'
    return len(re.findall(gate_pattern, qasm, re.IGNORECASE))


def test_problem(problem, mode):
    start = time.perf_counter()
    
    try:
        orch = create_orchestrator(mode)
        result = orch.run(problem.prompt)
        
        llm = 1 if mode == "naked" else len(result.agent_results) if result.agent_results else 0
        
        return {
            "success": result.success, 
            "time_ms": (time.perf_counter()-start)*1000,
            "llm": llm, 
            "gates": extract_gates(result.final_output),
            "error": "; ".join(result.errors[:2]) if result.errors else None
        }
            
    except Exception as e:
        return {
            "success": False, 
            "time_ms": (time.perf_counter()-start)*1000, 
            "llm": 0, 
            "gates": 0, 
            "error": str(e)[:60]
        }


print("=" * 80)
print("FINAL MODE EVALUATION: NAKED vs BLACKBOARD")
print("=" * 80)
print(f"Date: {datetime.now().isoformat()}")
print(f"Problems: {len(ALL_PROBLEMS)}")
print()

modes = ["naked", "blackboard"]
results_by_difficulty = {"easy": {}, "medium": {}, "hard": {}, "very_hard": {}}

for problem in ALL_PROBLEMS:
    diff = problem.difficulty.value
    print(f"\n{diff.upper()}: {problem.name}")
    
    if diff not in results_by_difficulty:
        results_by_difficulty[diff] = {}
    
    for mode in modes:
        print(f"  {mode:12}", end=" ", flush=True)
        result = test_problem(problem, mode)
        
        if mode not in results_by_difficulty[diff]:
            results_by_difficulty[diff][mode] = []
        results_by_difficulty[diff][mode].append(result)
        
        status = "✅" if result["success"] else "❌"
        print(f"{status} {result['time_ms']:5.0f}ms LLM:{result['llm']} Gates:{result['gates']}")
        
        if result["error"] and not result["success"]:
            print(f"             ⚠️ {result['error'][:50]}...")
        
        time.sleep(4)

# Summary
print("\n\n" + "=" * 80)
print("FINAL SUMMARY")
print("=" * 80)

for mode in modes:
    print(f"\n{mode.upper()}")
    print("-" * 40)
    
    total_success = 0
    total_problems = 0
    total_time = 0
    total_llm = 0
    
    for diff in ["easy", "medium", "hard", "very_hard"]:
        if diff in results_by_difficulty and mode in results_by_difficulty[diff]:
            results = results_by_difficulty[diff][mode]
            successes = sum(1 for r in results if r["success"])
            total_success += successes
            total_problems += len(results)
            total_time += sum(r["time_ms"] for r in results)
            total_llm += sum(r["llm"] for r in results)
            
            print(f"  {diff:10}: {successes}/{len(results)}")
    
    print(f"\n  TOTAL: {total_success}/{total_problems} ({100*total_success/total_problems:.0f}%)")
    print(f"  Time: {total_time:.0f}ms total ({total_time/total_problems:.0f}ms avg)")
    print(f"  LLM calls: {total_llm}")

print("\n" + "=" * 80)
print("WINNER DETERMINATION")
print("=" * 80)

for diff in ["easy", "medium", "hard", "very_hard"]:
    if diff not in results_by_difficulty:
        continue
        
    print(f"\n{diff.upper()}:")
    for mode in modes:
        if mode in results_by_difficulty[diff]:
            results = results_by_difficulty[diff][mode]
            successes = sum(1 for r in results if r["success"])
            avg_time = sum(r["time_ms"] for r in results) / len(results)
            print(f"  {mode}: {successes}/{len(results)} ({avg_time:.0f}ms avg)")

print("\n" + "=" * 80)
print("DONE")