# Path: QAgents-workflos/tests/comprehensive_test_v2.py
# Relations: Uses orchestrators, test_problems, client/mcp_client
# Description: Full diagnostic test comparing all 5 modes including QUASAR and HYBRID
"""
Comprehensive Test V2: Compare all orchestration modes

Modes tested:
1. NAKED      - Direct LLM (baseline)
2. GUIDED     - Multi-agent pipeline
3. BLACKBOARD - Event-driven agents
4. QUASAR     - Tool-augmented LLM with hierarchical validation
5. HYBRID     - NAKED first, QUASAR fallback

Problems:
- 3 EASY
- 3 MEDIUM
- 3 HARD
- 4 VERY_HARD (new - to find NAKED's limits)
"""

import sys
import os
import json
import time
from datetime import datetime
from pathlib import Path

# Setup paths
sys.path.insert(0, str(Path(__file__).parent.parent.absolute()))

# Set API key BEFORE any imports
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
    # Placeholder left from development; set GOOGLE_API_KEY in the environment instead.
    api_key = "$env:GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY'] = api_key

from tests.test_problems import (
    ALL_PROBLEMS, EASY_PROBLEMS, MEDIUM_PROBLEMS, HARD_PROBLEMS,
    VERY_HARD_PROBLEMS, ProblemDifficulty
)
from orchestrators import create_orchestrator
from orchestrators.quasar_orchestrator import QuasarOrchestrator, HybridOrchestrator
from config import reset_cost_tracking, get_cost_summary, set_api_key
from client.mcp_client import get_client

# Set API key in config
set_api_key(api_key)


def extract_qasm_metrics(qasm: str) -> dict:
    """Extract gate count, estimated depth, and qubit count from QASM code."""
    if not qasm:
        return {"gate_count": 0, "depth": 0, "qubits": 0}

    import re

    # Count qubits from the qreg declaration
    qreg_match = re.search(r'qreg\s+\w+\[(\d+)\]', qasm)
    qubits = int(qreg_match.group(1)) if qreg_match else 0

    # Count gates (excluding declarations and measurements)
    gate_pattern = r'\b(h|x|y|z|s|t|sdg|tdg|cx|cz|cy|swap|ccx|rz|rx|ry|u1|u2|u3|p|cp)\b'
    gates = re.findall(gate_pattern, qasm, re.IGNORECASE)

    # Estimate depth (simplified: one level per gate line, ignoring gate parallelism)
    lines = [
        l.strip() for l in qasm.split('\n')
        if l.strip() and not l.strip().startswith(('OPENQASM', 'include', 'qreg', 'creg', '//'))
    ]
    depth = len([
        l for l in lines
        if any(g in l.lower() for g in ['h ', 'x ', 'y ', 'z ', 'cx', 'cz', 'swap', 'rx', 'ry', 'rz', 'ccx'])
    ])

    return {"gate_count": len(gates), "depth": depth, "qubits": qubits}
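
# Illustrative example (kept as a comment so nothing runs at import time): for a
# hypothetical Bell-state circuit, the metrics come out as below. Note the depth
# is a rough line-based estimate, not a true circuit depth.
#
#   qasm = '''OPENQASM 2.0;
#   include "qelib1.inc";
#   qreg q[2];
#   creg c[2];
#   h q[0];
#   cx q[0],q[1];'''
#   extract_qasm_metrics(qasm)
#   # -> {"gate_count": 2, "depth": 2, "qubits": 2}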


def run_test(problem, mode: str) -> dict:
    """Run a single test and return results."""
    result = {
        "problem_id": problem.id,
        "problem_name": problem.name,
        "difficulty": problem.difficulty.value,
        "category": problem.category.value,
        "mode": mode,
        "success": False,
        "qasm_valid": False,
        "time_ms": 0,
        "llm_calls": 0,
        "tokens": 0,
        "gate_count": 0,
        "depth": 0,
        "qasm": None,
        "error": None,
        "tiers_passed": [],
        "iterations": 0
    }

    start = time.perf_counter()
    reset_cost_tracking()

    try:
        if mode in ["quasar", "hybrid"]:
            # Use new orchestrators with expected values
            if mode == "quasar":
                orchestrator = QuasarOrchestrator(max_iterations=3)
            else:
                orchestrator = HybridOrchestrator()

            quasar_result = orchestrator.run(
                goal=problem.prompt,
                expected_qubits=problem.expected.min_qubits,
                expected_states=problem.expected.expected_states if problem.expected.expected_states else None,
                max_depth=problem.expected.max_depth
            )

            result["success"] = quasar_result.success
            result["qasm"] = quasar_result.final_qasm
            result["llm_calls"] = quasar_result.llm_calls
            result["tokens"] = quasar_result.tokens_used
            result["tiers_passed"] = quasar_result.tiers_passed
            result["iterations"] = quasar_result.iterations

            if quasar_result.final_qasm:
                result["qasm_valid"] = True
                metrics = extract_qasm_metrics(quasar_result.final_qasm)
                result["gate_count"] = metrics["gate_count"]
                result["depth"] = metrics["depth"]

            if quasar_result.errors:
                result["error"] = "; ".join(quasar_result.errors)
        else:
            # Use standard orchestrators
            orchestrator = create_orchestrator(mode)
            orch_result = orchestrator.run(problem.prompt)

            result["success"] = orch_result.success
            result["qasm"] = orch_result.final_output

            # Get LLM stats
            cost = get_cost_summary()
            result["llm_calls"] = cost.get("llm_requests", 0)
            result["tokens"] = cost.get("total_tokens", 0)

            if orch_result.final_output:
                result["qasm_valid"] = True
                metrics = extract_qasm_metrics(orch_result.final_output)
                result["gate_count"] = metrics["gate_count"]
                result["depth"] = metrics["depth"]

            if orch_result.errors:
                result["error"] = "; ".join(orch_result.errors)

    except Exception as e:
        result["error"] = str(e)

    result["time_ms"] = (time.perf_counter() - start) * 1000
    return result
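
# Shape of a run_test() record, with illustrative values for a hypothetical run
# (field names match the dict built above; "easy_01" is an invented problem id):
#
#   {
#       "problem_id": "easy_01", "mode": "quasar",
#       "success": True, "qasm_valid": True,
#       "time_ms": 4200.0, "llm_calls": 2, "tokens": 1850,
#       "gate_count": 3, "depth": 3,
#       "tiers_passed": [1, 2, 3], "iterations": 1,
#       "qasm": "OPENQASM 2.0; ...", "error": None,
#       ...
#   }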


def main():
    print("=" * 100)
    print("COMPREHENSIVE TEST V2 - ALL MODES INCLUDING QUASAR & HYBRID")
    print("=" * 100)
    print(f"Date: {datetime.now().isoformat()}")
    print(f"Problems: {len(ALL_PROBLEMS)} total")
    print(f"  - Easy: {len(EASY_PROBLEMS)}")
    print(f"  - Medium: {len(MEDIUM_PROBLEMS)}")
    print(f"  - Hard: {len(HARD_PROBLEMS)}")
    print(f"  - Very Hard: {len(VERY_HARD_PROBLEMS)}")
    print("Modes: naked, guided, blackboard, quasar, hybrid")
    print("=" * 100)

    # Check MCP server
    try:
        client = get_client()
        if client.health_check():
            print("✅ MCP Server connected")
        else:
            print("⚠️ MCP Server not responding - some validations may use fallback")
    except Exception:
        print("⚠️ MCP Server not available")

    all_results = []
    modes = ["naked", "quasar", "hybrid", "guided", "blackboard"]  # Order: fastest to slowest

    # Group problems by difficulty
    problem_groups = [
        ("EASY", EASY_PROBLEMS),
        ("MEDIUM", MEDIUM_PROBLEMS),
        ("HARD", HARD_PROBLEMS),
        ("VERY_HARD", VERY_HARD_PROBLEMS)
    ]

    for diff_name, problems in problem_groups:
        print(f"\n{'='*100}")
        print(f"DIFFICULTY: {diff_name}")
        print("=" * 100)

        for problem in problems:
            print(f"\n--- Problem: {problem.id} - {problem.name} ---")

            for mode in modes:
                print(f"  Testing {mode}...", end=" ", flush=True)
                result = run_test(problem, mode)
                all_results.append(result)

                status = "✅" if result["success"] else "❌"
                time_str = f"{result['time_ms']:.0f}ms"
                llm_str = f"LLM:{result['llm_calls']}"
                gates_str = f"Gates:{result['gate_count']}"
                extra = ""
                if mode in ["quasar", "hybrid"]:
                    tiers = result.get("tiers_passed", [])
                    extra = f" Tiers:{tiers}"

                print(f"{status} {time_str} {llm_str} {gates_str}{extra}")
                if result["error"] and not result["success"]:
                    print(f"    Error: {result['error'][:80]}...")

                # Rate limiting
                time.sleep(5)

    # Summary
    print("\n\n" + "=" * 100)
    print("FINAL SUMMARY BY MODE")
    print("=" * 100)

    for mode in modes:
        mode_results = [r for r in all_results if r["mode"] == mode]
        successes = sum(1 for r in mode_results if r["success"])
        total = len(mode_results)
        total_time = sum(r["time_ms"] for r in mode_results)
        total_llm = sum(r["llm_calls"] for r in mode_results)
        avg_gates = sum(r["gate_count"] for r in mode_results if r["success"]) / max(successes, 1)

        print(f"\n{mode.upper()}:")
        print(f"  Success: {successes}/{total} ({100*successes/total:.1f}%)")
        print(f"  Total Time: {total_time:.0f}ms ({total_time/total:.0f}ms avg)")
        print(f"  LLM Calls: {total_llm} ({total_llm/total:.1f} avg)")
        print(f"  Avg Gates (success): {avg_gates:.1f}")

        # Per difficulty
        for diff in ["easy", "medium", "hard", "very_hard"]:
            diff_results = [r for r in mode_results if r["difficulty"] == diff]
            if diff_results:
                diff_success = sum(1 for r in diff_results if r["success"])
                print(f"    {diff}: {diff_success}/{len(diff_results)}")

    # Efficiency comparison
    print("\n" + "=" * 100)
    print("EFFICIENCY COMPARISON (Success per LLM call)")
    print("=" * 100)

    for mode in modes:
        mode_results = [r for r in all_results if r["mode"] == mode]
        successes = sum(1 for r in mode_results if r["success"])
        total_llm = sum(r["llm_calls"] for r in mode_results)
        efficiency = successes / max(total_llm, 1)
        print(f"  {mode}: {efficiency:.3f} successes per LLM call")

    # Winner determination
    print("\n" + "=" * 100)
    print("WINNER BY DIFFICULTY")
    print("=" * 100)

    for diff in ["easy", "medium", "hard", "very_hard"]:
        print(f"\n{diff.upper()}:")
        best_mode = None
        best_success = -1
        best_efficiency = -1

        for mode in modes:
            mode_results = [r for r in all_results if r["mode"] == mode and r["difficulty"] == diff]
            if mode_results:
                successes = sum(1 for r in mode_results if r["success"])
                total_llm = sum(r["llm_calls"] for r in mode_results)
                efficiency = successes / max(total_llm, 1)
                # Prefer raw success count; break ties on efficiency
                if successes > best_success or (successes == best_success and efficiency > best_efficiency):
                    best_success = successes
                    best_efficiency = efficiency
                    best_mode = mode

        if best_mode:
            print(f"  🏆 Winner: {best_mode.upper()} ({best_success} successes)")

    # Save results
    output_path = Path(__file__).parent.parent / "research" / f"comprehensive_test_v2_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(all_results, f, indent=2)

    print(f"\n\nResults saved to: {output_path}")
    print("=" * 100)


if __name__ == "__main__":
    main()
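
# Usage (a minimal sketch; assumes the repo layout above and a valid API key):
#   $ export GOOGLE_API_KEY=...    # PowerShell: $env:GOOGLE_API_KEY = "..."
#   $ python tests/comprehensive_test_v2.py
# Results are written to research/comprehensive_test_v2_<timestamp>.json.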