# Path: QAgents-workflos/tests/comprehensive_test_v2.py
# Relations: Uses orchestrators, test_problems, client/mcp_client
# Description: Full diagnostic test comparing all 5 modes including QUASAR and HYBRID
"""
Comprehensive Test V2: Compare all orchestration modes

Modes tested:
1. NAKED - Direct LLM (baseline)
2. GUIDED - Multi-agent pipeline  
3. BLACKBOARD - Event-driven agents
4. QUASAR - Tool-augmented LLM with hierarchical validation
5. HYBRID - NAKED first, QUASAR fallback

Problems:
- 3 EASY
- 3 MEDIUM  
- 3 HARD
- 4 VERY_HARD (new; added to probe the limits of NAKED mode)
"""

import sys
import os
import json
import re
import time
from datetime import datetime
from pathlib import Path

# Setup paths
sys.path.insert(0, str(Path(__file__).parent.parent.absolute()))

# Set API key BEFORE any project imports (downstream modules read it at import time)
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
    sys.exit("GOOGLE_API_KEY is not set; export it before running "
             "(PowerShell: $env:GOOGLE_API_KEY = '<your key>').")

from tests.test_problems import (
    ALL_PROBLEMS, EASY_PROBLEMS, MEDIUM_PROBLEMS, 
    HARD_PROBLEMS, VERY_HARD_PROBLEMS,
    ProblemDifficulty
)
from orchestrators import create_orchestrator
from orchestrators.quasar_orchestrator import QuasarOrchestrator, HybridOrchestrator
from config import reset_cost_tracking, get_cost_summary, set_api_key
from client.mcp_client import get_client

# Set API key in config
set_api_key(api_key)


def extract_qasm_metrics(qasm: str) -> dict:
    """Extract metrics from QASM code."""
    if not qasm:
        return {"gate_count": 0, "depth": 0, "qubits": 0}
    
    # Count qubits
    qreg_match = re.search(r'qreg\s+\w+\[(\d+)\]', qasm)
    qubits = int(qreg_match.group(1)) if qreg_match else 0
    
    # Count gates (excluding declarations and measurements)
    gate_pattern = r'\b(h|x|y|z|s|t|sdg|tdg|cx|cz|cy|swap|ccx|rz|rx|ry|u1|u2|u3|p|cp)\b'
    gates = re.findall(gate_pattern, qasm, re.IGNORECASE)
    
    # Estimate depth as the number of gate lines (a crude upper bound, not true parallel depth)
    lines = [l.strip() for l in qasm.split('\n') if l.strip() and not l.strip().startswith(('OPENQASM', 'include', 'qreg', 'creg', '//'))]
    depth = len([l for l in lines if any(g in l.lower() for g in ['h ', 'x ', 'y ', 'z ', 'cx', 'cz', 'swap', 'rx', 'ry', 'rz', 'ccx'])])
    
    return {"gate_count": len(gates), "depth": depth, "qubits": qubits}
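
# Illustrative example (values follow the simplified heuristics above): for a
# Bell-state circuit with 'h q[0];' and 'cx q[0],q[1];' on separate lines over
# 'qreg q[2];', this returns {"gate_count": 2, "depth": 2, "qubits": 2}.
# Note that "depth" here is a count of gate lines, not the true parallel depth.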


def run_test(problem, mode: str) -> dict:
    """Run a single test and return results."""
    result = {
        "problem_id": problem.id,
        "problem_name": problem.name,
        "difficulty": problem.difficulty.value,
        "category": problem.category.value,
        "mode": mode,
        "success": False,
        "qasm_valid": False,
        "time_ms": 0,
        "llm_calls": 0,
        "tokens": 0,
        "gate_count": 0,
        "depth": 0,
        "qasm": None,
        "error": None,
        "tiers_passed": [],
        "iterations": 0
    }
    
    start = time.perf_counter()
    reset_cost_tracking()
    
    try:
        if mode in ["quasar", "hybrid"]:
            # Use new orchestrators with expected values
            if mode == "quasar":
                orchestrator = QuasarOrchestrator(max_iterations=3)
            else:
                orchestrator = HybridOrchestrator()
            
            quasar_result = orchestrator.run(
                goal=problem.prompt,
                expected_qubits=problem.expected.min_qubits,
                expected_states=problem.expected.expected_states or None,
                max_depth=problem.expected.max_depth
            )
            
            result["success"] = quasar_result.success
            result["qasm"] = quasar_result.final_qasm
            result["llm_calls"] = quasar_result.llm_calls
            result["tokens"] = quasar_result.tokens_used
            result["tiers_passed"] = quasar_result.tiers_passed
            result["iterations"] = quasar_result.iterations
            
            if quasar_result.final_qasm:
                result["qasm_valid"] = True
                metrics = extract_qasm_metrics(quasar_result.final_qasm)
                result["gate_count"] = metrics["gate_count"]
                result["depth"] = metrics["depth"]
            
            if quasar_result.errors:
                result["error"] = "; ".join(quasar_result.errors)
                
        else:
            # Use standard orchestrators
            orchestrator = create_orchestrator(mode)
            orch_result = orchestrator.run(problem.prompt)
            
            result["success"] = orch_result.success
            result["qasm"] = orch_result.final_output
            
            # Get LLM stats from the global cost tracker (reset at the start of this run)
            cost = get_cost_summary()
            result["llm_calls"] = cost.get("llm_requests", 0)
            result["tokens"] = cost.get("total_tokens", 0)
            
            if orch_result.final_output:
                result["qasm_valid"] = True
                metrics = extract_qasm_metrics(orch_result.final_output)
                result["gate_count"] = metrics["gate_count"]
                result["depth"] = metrics["depth"]
            
            if orch_result.errors:
                result["error"] = "; ".join(orch_result.errors)
                
    except Exception as e:
        result["error"] = str(e)
        
    result["time_ms"] = (time.perf_counter() - start) * 1000
    return result
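
# Standalone usage sketch (hedged; assumes the imports above and, for the
# validating modes, a reachable MCP server):
#   r = run_test(EASY_PROBLEMS[0], "naked")
#   print(r["success"], f"{r['time_ms']:.0f}ms", r["gate_count"])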


def main():
    print("=" * 100)
    print("COMPREHENSIVE TEST V2 - ALL MODES INCLUDING QUASAR & HYBRID")
    print("=" * 100)
    print(f"Date: {datetime.now().isoformat()}")
    print(f"Problems: {len(ALL_PROBLEMS)} total")
    print(f"  - Easy: {len(EASY_PROBLEMS)}")
    print(f"  - Medium: {len(MEDIUM_PROBLEMS)}")
    print(f"  - Hard: {len(HARD_PROBLEMS)}")
    print(f"  - Very Hard: {len(VERY_HARD_PROBLEMS)}")
    print(f"Modes: naked, guided, blackboard, quasar, hybrid")
    print("=" * 100)
    
    # Check MCP server
    try:
        client = get_client()
        if client.health_check():
            print("✅ MCP Server connected")
        else:
            print("⚠️ MCP Server not responding - some validations may use fallback")
    except Exception:
        print("⚠️ MCP Server not available")
    
    all_results = []
    modes = ["naked", "quasar", "hybrid", "guided", "blackboard"]  # Order: fastest to slowest
    
    # Group problems by difficulty
    problem_groups = [
        ("EASY", EASY_PROBLEMS),
        ("MEDIUM", MEDIUM_PROBLEMS),
        ("HARD", HARD_PROBLEMS),
        ("VERY_HARD", VERY_HARD_PROBLEMS)
    ]
    
    for diff_name, problems in problem_groups:
        print(f"\n{'='*100}")
        print(f"DIFFICULTY: {diff_name}")
        print("=" * 100)
        
        for problem in problems:
            print(f"\n--- Problem: {problem.id} - {problem.name} ---")
            
            for mode in modes:
                print(f"  Testing {mode}...", end=" ", flush=True)
                
                result = run_test(problem, mode)
                all_results.append(result)
                
                status = "✅" if result["success"] else "❌"
                time_str = f"{result['time_ms']:.0f}ms"
                llm_str = f"LLM:{result['llm_calls']}"
                gates_str = f"Gates:{result['gate_count']}"
                
                extra = ""
                if mode in ["quasar", "hybrid"]:
                    tiers = result.get("tiers_passed", [])
                    extra = f" Tiers:{tiers}"
                
                print(f"{status} {time_str} {llm_str} {gates_str}{extra}")
                
                if result["error"] and not result["success"]:
                    err = result["error"]
                    print(f"    Error: {err[:80]}{'...' if len(err) > 80 else ''}")
                
                # Rate limiting: pause between runs to stay under API quotas
                time.sleep(5)
    
    # Summary
    print("\n\n" + "=" * 100)
    print("FINAL SUMMARY BY MODE")
    print("=" * 100)
    
    for mode in modes:
        mode_results = [r for r in all_results if r["mode"] == mode]
        successes = sum(1 for r in mode_results if r["success"])
        total = len(mode_results)
        total_time = sum(r["time_ms"] for r in mode_results)
        total_llm = sum(r["llm_calls"] for r in mode_results)
        avg_gates = sum(r["gate_count"] for r in mode_results if r["success"]) / max(successes, 1)
        
        print(f"\n{mode.upper()}:")
        print(f"  Success: {successes}/{total} ({100*successes/total:.1f}%)")
        print(f"  Total Time: {total_time:.0f}ms ({total_time/total:.0f}ms avg)")
        print(f"  LLM Calls: {total_llm} ({total_llm/total:.1f} avg)")
        print(f"  Avg Gates (success): {avg_gates:.1f}")
        
        # Per difficulty
        for diff in ["easy", "medium", "hard", "very_hard"]:
            diff_results = [r for r in mode_results if r["difficulty"] == diff]
            if diff_results:
                diff_success = sum(1 for r in diff_results if r["success"])
                print(f"    {diff}: {diff_success}/{len(diff_results)}")
    
    # Efficiency comparison
    print("\n" + "=" * 100)
    print("EFFICIENCY COMPARISON (Success per LLM call)")
    print("=" * 100)
    
    for mode in modes:
        mode_results = [r for r in all_results if r["mode"] == mode]
        successes = sum(1 for r in mode_results if r["success"])
        total_llm = sum(r["llm_calls"] for r in mode_results)
        efficiency = successes / max(total_llm, 1)
        print(f"  {mode}: {efficiency:.3f} successes per LLM call")
    
    # Winner determination
    print("\n" + "=" * 100)
    print("WINNER BY DIFFICULTY")
    print("=" * 100)
    
    for diff in ["easy", "medium", "hard", "very_hard"]:
        print(f"\n{diff.upper()}:")
        best_mode = None
        best_success = -1
        best_efficiency = -1
        
        for mode in modes:
            mode_results = [r for r in all_results if r["mode"] == mode and r["difficulty"] == diff]
            if mode_results:
                successes = sum(1 for r in mode_results if r["success"])
                total_llm = sum(r["llm_calls"] for r in mode_results)
                efficiency = successes / max(total_llm, 1)
                
                if successes > best_success or (successes == best_success and efficiency > best_efficiency):
                    best_success = successes
                    best_efficiency = efficiency
                    best_mode = mode
        
        if best_mode:
            print(f"  🏆 Winner: {best_mode.upper()} ({best_success} successes)")
    
    # Save results
    output_path = Path(__file__).parent.parent / "research" / f"comprehensive_test_v2_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    with open(output_path, 'w') as f:
        json.dump(all_results, f, indent=2)
    
    print(f"\n\nResults saved to: {output_path}")
    print("=" * 100)


if __name__ == "__main__":
    main()
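
# To run the full suite (assumes GOOGLE_API_KEY is exported and the MCP
# server is running; see the health check in main()):
#   python tests/comprehensive_test_v2.py
# Results are written to research/comprehensive_test_v2_<timestamp>.json.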