#!/usr/bin/env python
"""
QAgents-Workflows: Main Evaluation Runner

Runs comparative tests between Blackboard, Guided, and Naked modes.

Usage:
    python run_evaluation.py                     # Run all tests
    python run_evaluation.py --mode naked        # Test a specific mode
    python run_evaluation.py --problem easy_001  # Test a specific problem
    python run_evaluation.py --quick             # Quick sanity test (single naked-mode run)
"""

import argparse
import logging
import sys
from pathlib import Path

# Add this script's directory to the path so sibling modules import cleanly
sys.path.insert(0, str(Path(__file__).parent))

from config import config, set_mode
from client import get_client
from tests import (
    EvaluationHarness,
    ALL_PROBLEMS,
    EASY_PROBLEMS,
    get_problem,
)


def setup_logging(verbose: bool = True):
    """Configure logging."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
        datefmt="%H:%M:%S",
    )


def check_mcp_server():
    """Check whether the MCP server is running."""
    client = get_client()
    if not client.health_check():
        print("\nāŒ ERROR: QuantumArchitect-MCP server is not running!")
        print("\nPlease start it with:")
        print("  cd D:\\teach\\quantum-circuits")
        print("  & .venv\\Scripts\\Activate.ps1")
        print("  python QuantumArchitect-MCP\\app.py")
        print()
        return False
    print("āœ… MCP server is running")
    return True


def run_quick_test():
    """Run a quick sanity test: a single naked-mode pass on the Bell state problem."""
    print("\nRunning Quick Test (Naked mode, Bell State)")
    print("-" * 50)

    from orchestrators import create_orchestrator
    from tests import BELL_STATE_PROBLEM

    orchestrator = create_orchestrator("naked")
    result = orchestrator.run(BELL_STATE_PROBLEM.goal)

    print(f"Success: {result.success}")
    print(f"Time: {result.execution_time_ms:.1f}ms")
    print(f"Steps: {result.steps_completed}")
    if result.final_output:
        print("\nGenerated Circuit:")
        print(result.final_output[:500])  # slicing caps long output at 500 chars
    if result.errors:
        print(f"\nErrors: {result.errors}")

    return result.success


def run_full_evaluation(problems=None, modes=None, num_runs=3):
    """Run the full comparative evaluation across problems and modes."""
    print("\nStarting Full Evaluation")
    print("=" * 60)

    if problems is None:
        problems = EASY_PROBLEMS  # Start with easy problems
    if modes is None:
        modes = ["blackboard", "guided", "naked"]

    print(f"Problems: {len(problems)}")
    print(f"Modes: {modes}")
    print(f"Runs per problem: {num_runs}")
    print()

    harness = EvaluationHarness(num_runs=num_runs)

    try:
        harness.evaluate_all(problems=problems, modes=modes)

        # Generate and print report
        report = harness.generate_report()
        print("\n" + report)

        # Save report to file
        report_path = Path(__file__).parent / "evaluation_report.txt"
        report_path.write_text(report)
        print(f"\nReport saved to: {report_path}")

        # Export CSV for research
        csv_path = harness.export_csv()
        print(f"CSV exported to: {csv_path}")

        # Print summary stats
        stats = harness.get_summary_stats()
        print("\nSummary Statistics:")
        for mode, mode_stats in stats.get('modes', {}).items():
            print(f"  {mode}: {mode_stats['success_rate']*100:.1f}% success, "
                  f"{mode_stats['total_llm_requests']} LLM calls, "
                  f"{mode_stats['total_tokens']} tokens")

        return True
    except Exception:
        logging.exception("Evaluation failed")
        return False
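

# Illustrative sketch only, not part of the evaluation pipeline: shows how a
# single problem/mode pair could be exercised programmatically, reusing only
# APIs already called in this file (get_problem, create_orchestrator, and the
# result fields printed by run_quick_test). The helper name run_single is
# ours, not from the codebase.
# Usage (problem ID taken from the module docstring): run_single("easy_001")
def run_single(problem_id: str, mode: str = "naked"):
    """Run one orchestrator pass for one problem (illustrative helper)."""
    from orchestrators import create_orchestrator

    problem = get_problem(problem_id)
    if not problem:
        raise ValueError(f"Unknown problem: {problem_id}")

    orchestrator = create_orchestrator(mode)
    result = orchestrator.run(problem.goal)
    print(f"{problem_id} [{mode}]: success={result.success}, "
          f"time={result.execution_time_ms:.1f}ms, steps={result.steps_completed}")
    return result
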

def main():
    parser = argparse.ArgumentParser(
        description="QAgents Comparative Evaluation Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python run_evaluation.py               # Full evaluation
    python run_evaluation.py --quick       # Quick sanity test
    python run_evaluation.py --mode naked  # Test naked mode only
    python run_evaluation.py --easy        # Only easy problems
    python run_evaluation.py --runs 10     # 10 runs per problem
"""
    )
    parser.add_argument("--quick", action="store_true",
                        help="Run quick sanity test only")
    parser.add_argument("--mode", choices=["blackboard", "guided", "naked"],
                        help="Test specific mode only")
    parser.add_argument("--problem", type=str,
                        help="Test specific problem by ID")
    parser.add_argument("--easy", action="store_true",
                        help="Only easy problems")
    parser.add_argument("--runs", type=int, default=3,
                        help="Number of runs per problem (default: 3)")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Verbose output")

    args = parser.parse_args()
    setup_logging(args.verbose)

    print("=" * 60)
    print("[EVALUATION] QAgents-Workflows Comparative Evaluation")
    print("=" * 60)

    # Check MCP server
    if not check_mcp_server():
        sys.exit(1)

    # Quick test mode
    if args.quick:
        success = run_quick_test()
        sys.exit(0 if success else 1)

    # Determine problems to run
    if args.problem:
        problem = get_problem(args.problem)
        if not problem:
            print(f"āŒ Unknown problem: {args.problem}")
            sys.exit(1)
        problems = [problem]
    elif args.easy:
        problems = EASY_PROBLEMS
    else:
        problems = ALL_PROBLEMS

    # Determine modes to test
    modes = [args.mode] if args.mode else None

    # Run evaluation
    success = run_full_evaluation(
        problems=problems,
        modes=modes,
        num_runs=args.runs
    )
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()