#!/usr/bin/env python
"""
QAgents-Workflows: Main Evaluation Runner

Runs comparative tests between Blackboard, Guided, and Naked modes.

Usage:
    python run_evaluation.py                     # Run all tests
    python run_evaluation.py --mode naked        # Test specific mode
    python run_evaluation.py --problem easy_001  # Test specific problem
    python run_evaluation.py --quick             # Quick test (1 run per problem)
"""
import argparse
import logging
import sys
from pathlib import Path

# Add parent to path for imports
sys.path.insert(0, str(Path(__file__).parent))

from config import config, set_mode
from client import get_client
from tests import (
    EvaluationHarness,
    ALL_PROBLEMS,
    EASY_PROBLEMS,
    get_problem
)
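
# NOTE: config, client, tests, and orchestrators are project-local modules
# expected to sit next to this script (hence the sys.path insert above); the
# exact contents of EvaluationHarness, ALL_PROBLEMS, etc. are assumed from
# this repository rather than from any external library.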


def setup_logging(verbose: bool = True):
    """Configure logging."""
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
        datefmt="%H:%M:%S"
    )


def check_mcp_server():
    """Check if the MCP server is running."""
    client = get_client()
    if not client.health_check():
        print("\n❌ ERROR: QuantumArchitect-MCP server is not running!")
        print("\nPlease start it with:")
        print("  cd D:\\teach\\quantum-circuits")
        print("  & .venv\\Scripts\\Activate.ps1")
        print("  python QuantumArchitect-MCP\\app.py")
        print()
        return False
    print("✅ MCP server is running")
    return True
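
# NOTE: the health check assumes the project-local client exposes a boolean
# health_check() method, and the startup instructions printed on failure
# reflect the author's Windows layout (D:\teach\quantum-circuits); adjust the
# paths for your own checkout.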


def run_quick_test():
    """Run a quick sanity test."""
    print("\n Running Quick Test (Naked mode, Bell State)")
    print("-" * 50)

    from orchestrators import create_orchestrator
    from tests import BELL_STATE_PROBLEM

    orchestrator = create_orchestrator("naked")
    result = orchestrator.run(BELL_STATE_PROBLEM.goal)

    print(f"Success: {result.success}")
    print(f"Time: {result.execution_time_ms:.1f}ms")
    print(f"Steps: {result.steps_completed}")

    if result.final_output:
        print("\nGenerated Circuit:")
        print(result.final_output[:500])

    if result.errors:
        print(f"\nErrors: {result.errors}")

    return result.success
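
# NOTE: the quick test assumes the orchestrator's result object exposes
# success, execution_time_ms, steps_completed, final_output and errors;
# these field names are this project's own convention, not an external API.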


def run_full_evaluation(problems=None, modes=None, num_runs=3):
    """Run full comparative evaluation."""
    print("\n Starting Full Evaluation")
    print("=" * 60)

    if problems is None:
        problems = EASY_PROBLEMS  # Start with easy problems
    if modes is None:
        modes = ["blackboard", "guided", "naked"]

    print(f"Problems: {len(problems)}")
    print(f"Modes: {modes}")
    print(f"Runs per problem: {num_runs}")
    print()

    harness = EvaluationHarness(num_runs=num_runs)

    try:
        results = harness.evaluate_all(problems=problems, modes=modes)

        # Generate and print report
        report = harness.generate_report()
        print("\n" + report)

        # Save report to file
        report_path = Path(__file__).parent / "evaluation_report.txt"
        report_path.write_text(report)
        print(f"\n Report saved to: {report_path}")

        # Export CSV for research
        csv_path = harness.export_csv()
        print(f" CSV exported to: {csv_path}")

        # Print summary stats
        stats = harness.get_summary_stats()
        print("\n Summary Statistics:")
        for mode, mode_stats in stats.get('modes', {}).items():
            print(f" {mode}: {mode_stats['success_rate']*100:.1f}% success, "
                  f"{mode_stats['total_llm_requests']} LLM calls, "
                  f"{mode_stats['total_tokens']} tokens")

        return True
    except Exception as e:
        logging.exception(f"Evaluation failed: {e}")
        return False
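
# Assumed shape of harness.get_summary_stats(), inferred from the loop above
# (a project-local convention, not verified against the tests module):
#   {"modes": {"naked": {"success_rate": 0.9,
#                        "total_llm_requests": 42,
#                        "total_tokens": 12345},
#              ...}}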


def main():
    parser = argparse.ArgumentParser(
        description="QAgents Comparative Evaluation Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_evaluation.py                 # Full evaluation
  python run_evaluation.py --quick         # Quick sanity test
  python run_evaluation.py --mode naked    # Test naked mode only
  python run_evaluation.py --easy          # Only easy problems
  python run_evaluation.py --runs 10       # 10 runs per problem
"""
    )
    parser.add_argument("--quick", action="store_true",
                        help="Run quick sanity test only")
    parser.add_argument("--mode", choices=["blackboard", "guided", "naked"],
                        help="Test specific mode only")
    parser.add_argument("--problem", type=str,
                        help="Test specific problem by ID")
    parser.add_argument("--easy", action="store_true",
                        help="Only easy problems")
    parser.add_argument("--runs", type=int, default=3,
                        help="Number of runs per problem (default: 3)")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Verbose output")
    args = parser.parse_args()

    setup_logging(args.verbose)

    print("=" * 60)
    print("[EVALUATION] QAgents-Workflows Comparative Evaluation")
    print("=" * 60)

    # Check MCP server
    if not check_mcp_server():
        sys.exit(1)

    # Quick test mode
    if args.quick:
        success = run_quick_test()
        sys.exit(0 if success else 1)

    # Determine problems to run
    if args.problem:
        problem = get_problem(args.problem)
        if not problem:
            print(f"❌ Unknown problem: {args.problem}")
            sys.exit(1)
        problems = [problem]
    elif args.easy:
        problems = EASY_PROBLEMS
    else:
        problems = ALL_PROBLEMS

    # Determine modes to test
    modes = [args.mode] if args.mode else None

    # Run evaluation
    success = run_full_evaluation(
        problems=problems,
        modes=modes,
        num_runs=args.runs
    )
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()