# Path: QAgents-workflos/tests/final_eval.py # Final evaluation - NAKED vs BLACKBOARD on all difficulties """Final mode evaluation: NAKED vs fixed BLACKBOARD.""" import sys import os import time from datetime import datetime from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.absolute())) api_key = "$env:GOOGLE_API_KEY" os.environ['GOOGLE_API_KEY'] = api_key from tests.test_problems import ALL_PROBLEMS from orchestrators import create_orchestrator from config import set_api_key import re set_api_key(api_key) def extract_gates(qasm): if not qasm: return 0 gate_pattern = r'\b(h|x|y|z|s|t|cx|cz|swap|ccx|rz|rx|ry|cp)\b' return len(re.findall(gate_pattern, qasm, re.IGNORECASE)) def test_problem(problem, mode): start = time.perf_counter() try: orch = create_orchestrator(mode) result = orch.run(problem.prompt) llm = 1 if mode == "naked" else len(result.agent_results) if result.agent_results else 0 return { "success": result.success, "time_ms": (time.perf_counter()-start)*1000, "llm": llm, "gates": extract_gates(result.final_output), "error": "; ".join(result.errors[:2]) if result.errors else None } except Exception as e: return { "success": False, "time_ms": (time.perf_counter()-start)*1000, "llm": 0, "gates": 0, "error": str(e)[:60] } print("=" * 80) print("FINAL MODE EVALUATION: NAKED vs BLACKBOARD") print("=" * 80) print(f"Date: {datetime.now().isoformat()}") print(f"Problems: {len(ALL_PROBLEMS)}") print() modes = ["naked", "blackboard"] results_by_difficulty = {"easy": {}, "medium": {}, "hard": {}, "very_hard": {}} for problem in ALL_PROBLEMS: diff = problem.difficulty.value print(f"\n{diff.upper()}: {problem.name}") if diff not in results_by_difficulty: results_by_difficulty[diff] = {} for mode in modes: print(f" {mode:12}", end=" ", flush=True) result = test_problem(problem, mode) if mode not in results_by_difficulty[diff]: results_by_difficulty[diff][mode] = [] results_by_difficulty[diff][mode].append(result) status = "✅" if result["success"] else "❌" print(f"{status} {result['time_ms']:5.0f}ms LLM:{result['llm']} Gates:{result['gates']}") if result["error"] and not result["success"]: print(f" ⚠️ {result['error'][:50]}...") time.sleep(4) # Summary print("\n\n" + "=" * 80) print("FINAL SUMMARY") print("=" * 80) for mode in modes: print(f"\n{mode.upper()}") print("-" * 40) total_success = 0 total_problems = 0 total_time = 0 total_llm = 0 for diff in ["easy", "medium", "hard", "very_hard"]: if diff in results_by_difficulty and mode in results_by_difficulty[diff]: results = results_by_difficulty[diff][mode] successes = sum(1 for r in results if r["success"]) total_success += successes total_problems += len(results) total_time += sum(r["time_ms"] for r in results) total_llm += sum(r["llm"] for r in results) print(f" {diff:10}: {successes}/{len(results)}") print(f"\n TOTAL: {total_success}/{total_problems} ({100*total_success/total_problems:.0f}%)") print(f" Time: {total_time:.0f}ms total ({total_time/total_problems:.0f}ms avg)") print(f" LLM calls: {total_llm}") print("\n" + "=" * 80) print("WINNER DETERMINATION") print("=" * 80) for diff in ["easy", "medium", "hard", "very_hard"]: if diff not in results_by_difficulty: continue print(f"\n{diff.upper()}:") for mode in modes: if mode in results_by_difficulty[diff]: results = results_by_difficulty[diff][mode] successes = sum(1 for r in results if r["success"]) avg_time = sum(r["time_ms"] for r in results) / len(results) print(f" {mode}: {successes}/{len(results)} ({avg_time:.0f}ms avg)") print("\n" + "=" * 80) print("DONE")