#!/usr/bin/env python
"""
QAgents-Workflows: Main Evaluation Runner

Runs comparative tests between Blackboard, Guided, and Naked modes.

Usage:
    python run_evaluation.py                     # Run all tests
    python run_evaluation.py --mode naked        # Test specific mode
    python run_evaluation.py --problem easy_001  # Test specific problem
    python run_evaluation.py --quick             # Quick test (1 run per problem)
"""
import argparse
import logging
import sys
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from config import config, set_mode
from client import get_client
from tests import (
    EvaluationHarness,
    ALL_PROBLEMS,
    EASY_PROBLEMS,
    get_problem
)
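
# NOTE: config, client, tests, and orchestrators are project-local modules
# expected to sit next to this script (hence the sys.path insert above); the
# exact contents of EvaluationHarness, ALL_PROBLEMS, etc. are assumed from
# this repository rather than from any external library.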


def setup_logging(verbose: bool = True):
    """Configure logging."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
        datefmt="%H:%M:%S"
    )


def check_mcp_server():
    """Check if the MCP server is running."""
    client = get_client()
    if not client.health_check():
        print("\n❌ ERROR: QuantumArchitect-MCP server is not running!")
        print("\nPlease start it with:")
        print("  cd D:\\teach\\quantum-circuits")
        print("  & .venv\\Scripts\\Activate.ps1")
        print("  python QuantumArchitect-MCP\\app.py")
        print()
        return False
    print("✅ MCP server is running")
    return True
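
# NOTE: the health check assumes the project-local client exposes a boolean
# health_check() method, and the startup instructions printed on failure
# reflect the author's Windows layout (D:\teach\quantum-circuits); adjust the
# paths for your own checkout.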


def run_quick_test():
    """Run a quick sanity test."""
    print("\n Running Quick Test (Naked mode, Bell State)")
    print("-" * 50)

    from orchestrators import create_orchestrator
    from tests import BELL_STATE_PROBLEM

    orchestrator = create_orchestrator("naked")
    result = orchestrator.run(BELL_STATE_PROBLEM.goal)

    print(f"Success: {result.success}")
    print(f"Time: {result.execution_time_ms:.1f}ms")
    print(f"Steps: {result.steps_completed}")

    if result.final_output:
        print("\nGenerated Circuit:")
        print(result.final_output[:500])

    if result.errors:
        print(f"\nErrors: {result.errors}")

    return result.success
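
# NOTE: the quick test assumes the orchestrator's result object exposes
# success, execution_time_ms, steps_completed, final_output and errors;
# these field names are this project's own convention, not an external API.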


def run_full_evaluation(problems=None, modes=None, num_runs=3):
    """Run full comparative evaluation."""
    print("\n Starting Full Evaluation")
    print("=" * 60)

    if problems is None:
        problems = EASY_PROBLEMS  # Start with easy problems
    if modes is None:
        modes = ["blackboard", "guided", "naked"]

    print(f"Problems: {len(problems)}")
    print(f"Modes: {modes}")
    print(f"Runs per problem: {num_runs}")
    print()

    harness = EvaluationHarness(num_runs=num_runs)

    try:
        results = harness.evaluate_all(problems=problems, modes=modes)

        # Generate and print report
        report = harness.generate_report()
        print("\n" + report)

        # Save report to file
        report_path = Path(__file__).parent / "evaluation_report.txt"
        report_path.write_text(report)
        print(f"\n Report saved to: {report_path}")

        # Export CSV for research
        csv_path = harness.export_csv()
        print(f" CSV exported to: {csv_path}")

        # Print summary stats
        stats = harness.get_summary_stats()
        print("\n Summary Statistics:")
        for mode, mode_stats in stats.get('modes', {}).items():
            print(f" {mode}: {mode_stats['success_rate']*100:.1f}% success, "
                  f"{mode_stats['total_llm_requests']} LLM calls, "
                  f"{mode_stats['total_tokens']} tokens")

        return True
    except Exception as e:
        logging.exception(f"Evaluation failed: {e}")
        return False
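
# Assumed shape of harness.get_summary_stats(), inferred from the loop above
# (a project-local convention, not verified against the tests module):
#   {"modes": {"naked": {"success_rate": 0.9,
#                        "total_llm_requests": 42,
#                        "total_tokens": 12345},
#              ...}}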


def main():
    parser = argparse.ArgumentParser(
        description="QAgents Comparative Evaluation Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_evaluation.py                 # Full evaluation
  python run_evaluation.py --quick         # Quick sanity test
  python run_evaluation.py --mode naked    # Test naked mode only
  python run_evaluation.py --easy          # Only easy problems
  python run_evaluation.py --runs 10       # 10 runs per problem
"""
    )
    parser.add_argument("--quick", action="store_true",
                        help="Run quick sanity test only")
    parser.add_argument("--mode", choices=["blackboard", "guided", "naked"],
                        help="Test specific mode only")
    parser.add_argument("--problem", type=str,
                        help="Test specific problem by ID")
    parser.add_argument("--easy", action="store_true",
                        help="Only easy problems")
    parser.add_argument("--runs", type=int, default=3,
                        help="Number of runs per problem (default: 3)")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Verbose output")
    args = parser.parse_args()

    setup_logging(args.verbose)

    print("=" * 60)
    print("[EVALUATION] QAgents-Workflows Comparative Evaluation")
    print("=" * 60)

    # Check MCP server
    if not check_mcp_server():
        sys.exit(1)

    # Quick test mode
    if args.quick:
        success = run_quick_test()
        sys.exit(0 if success else 1)

    # Determine problems to run
    if args.problem:
        problem = get_problem(args.problem)
        if not problem:
            print(f"❌ Unknown problem: {args.problem}")
            sys.exit(1)
        problems = [problem]
    elif args.easy:
        problems = EASY_PROBLEMS
    else:
        problems = ALL_PROBLEMS

    # Determine modes to test
    modes = [args.mode] if args.mode else None

    # Run evaluation
    success = run_full_evaluation(
        problems=problems,
        modes=modes,
        num_runs=args.runs
    )
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()