# Path: QAgents-workflos/run_quality_eval.py
# Relations: Uses tests/quality_evaluation_harness.py, database/circuit_quality_db.py
# Description: CLI entry point for quality-focused evaluation
# Run with: python run_quality_eval.py --mode all --difficulty easy
# Generates quality comparison report with actual QASM circuits
"""
Quality Evaluation Runner: CLI entry point for circuit quality comparison.

Usage:
    python run_quality_eval.py --mode all --difficulty easy
    python run_quality_eval.py --mode naked --problem easy_001
    python run_quality_eval.py --report RUN_ID
"""
import argparse
import logging
import os
import sqlite3
import sys
import traceback
from pathlib import Path

# Add project root to path so local packages resolve when run as a script
sys.path.insert(0, str(Path(__file__).parent))

# Ensure the API key is set BEFORE importing config, which may read it at import time
api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GENAI_API_KEY")
if api_key:
    os.environ["GOOGLE_API_KEY"] = api_key

from tests.quality_evaluation_harness import QualityEvaluationHarness, run_quick_quality_test
from tests.test_problems import get_problem
from database.circuit_quality_db import get_quality_db
from config import set_api_key
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Explicitly set the API key in config once logging is ready
if api_key:
    set_api_key(api_key)
    logger.info(f"API Key configured: {api_key[:10]}...")
else:
    logger.warning("No GOOGLE_API_KEY or GENAI_API_KEY found in environment")


def run_evaluation(args):
    """Run quality evaluation based on CLI arguments."""
    harness = QualityEvaluationHarness()

    # Expand 'all' into the concrete mode list
    if args.mode == 'all':
        modes = ['naked', 'guided', 'blackboard']
    else:
        modes = [args.mode]

    # Expand 'all' into the concrete difficulty list
    if args.difficulty == 'all':
        difficulties = ['easy', 'medium', 'hard']
    else:
        difficulties = [args.difficulty]

    if args.problem:
        # Single-problem run: evaluate one problem across the selected modes
        problem = get_problem(args.problem)
        if not problem:
            print(f"ERROR: Problem not found: {args.problem}")
            return

        print(f"\n{'='*60}")
        print(f"Running quality evaluation for: {args.problem}")
        print(f"Modes: {modes}")
        print(f"{'='*60}\n")

        results = harness.evaluate_problem_all_modes(problem, modes)

        # Print per-mode results
        for mode, result in results.items():
            print(f"\n{mode.upper()}:")
            print(f"  Success: {'✅' if result.success else '❌'}")
            print(f"  Quality Score: {result.quality_metrics.overall_score()}/100")
            print(f"  Depth: {result.quality_metrics.depth}")
            print(f"  Gates: {result.quality_metrics.gate_count}")
            print(f"  CX: {result.quality_metrics.cx_count}")
            print(f"  Time: {result.execution_time_ms:.0f}ms")
            print(f"  LLM Calls: {result.llm_requests}")
            if result.qasm_code:
                # Show at most the first 10 lines of the generated QASM
                print(f"  QASM ({len(result.qasm_code)} chars):")
                qasm_lines = result.qasm_code.split('\n')
                for line in qasm_lines[:10]:
                    print(f"    {line}")
                if len(qasm_lines) > 10:
                    print("    ...")
    else:
        # Full evaluation across all selected difficulties and modes
        print(f"\n{'='*60}")
        print("Running full quality evaluation")
        print(f"Difficulties: {difficulties}")
        print(f"Modes: {modes}")
        print(f"Max problems: {args.max_problems or 'all'}")
        print(f"{'='*60}\n")

        run_id = harness.run_full_evaluation(
            difficulties=difficulties,
            modes=modes,
            max_problems=args.max_problems
        )

        # Print summary, then write the markdown report next to this script
        harness.print_summary(run_id)
        report = harness.generate_report(run_id)
        report_path = Path(__file__).parent / f"QUALITY_REPORT_{run_id}.md"
        report_path.write_text(report, encoding='utf-8')
        print(f"\nFull report saved to: {report_path}")
        print(f"\nRun ID: {run_id}")
        print("Use --report <run_id> to regenerate the report later")


def show_report(run_id: str):
    """Regenerate and print the report for a previous run."""
    harness = QualityEvaluationHarness()
    harness.run_id = run_id  # Point the harness at the existing run
    report = harness.generate_report(run_id)
    print(report)


def list_runs():
    """List the 20 most recent evaluation runs."""
    db = get_quality_db()
    query = (
        "SELECT run_id, timestamp, description, num_problems "
        "FROM comparison_runs ORDER BY timestamp DESC LIMIT 20"
    )
    with sqlite3.connect(db.db_file) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(query).fetchall()

    if not rows:
        print("No evaluation runs found.")
        return

    print("\nRecent Evaluation Runs:")
    print("-" * 80)
    for row in rows:
        print(f"{row['run_id']} | {row['timestamp']} | {row['num_problems']} problems | {row['description'] or 'N/A'}")
    print("-" * 80)


def quick_test(args):
    """Run a quick single test (defaults: 'naked' mode, problem 'easy_001')."""
    mode = args.mode if args.mode != 'all' else 'naked'
    problem_id = args.problem or 'easy_001'
    print(f"\nQuick test: {problem_id} with {mode} mode")
    print("-" * 40)
    try:
        result = run_quick_quality_test(mode, problem_id)
        print(f"Success: {'✅' if result.success else '❌'}")
        print(f"Quality Score: {result.quality_metrics.overall_score()}/100")
        print(f"Depth: {result.quality_metrics.depth}")
        print(f"Gates: {result.quality_metrics.gate_count}")
        if result.qasm_code:
            # Show at most the first 500 characters of the generated QASM
            print(f"\nQASM:\n{result.qasm_code[:500]}")
        if result.errors:
            print(f"\nErrors: {result.errors}")
    except Exception as e:
        print(f"ERROR: {e}")
        traceback.print_exc()


def main():
    parser = argparse.ArgumentParser(
        description="Quality-focused quantum circuit evaluation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_quality_eval.py --quick                          # Quick test
  python run_quality_eval.py --mode all --difficulty easy
  python run_quality_eval.py --problem easy_001 --mode all
  python run_quality_eval.py --list                           # List previous runs
  python run_quality_eval.py --report quality_20241128_120000
"""
    )
    parser.add_argument('--mode', choices=['naked', 'guided', 'blackboard', 'all'],
                        default='all', help='Orchestration mode(s) to test')
    parser.add_argument('--difficulty', choices=['easy', 'medium', 'hard', 'all'],
                        default='easy', help='Problem difficulty level(s)')
    parser.add_argument('--problem', type=str, help='Specific problem ID to test')
    parser.add_argument('--max-problems', type=int, help='Maximum number of problems to test')
    parser.add_argument('--quick', action='store_true', help='Run a quick single test')
    parser.add_argument('--report', type=str, help='Generate report for the given run ID')
    parser.add_argument('--list', action='store_true', help='List previous runs')
    args = parser.parse_args()

    if args.list:
        list_runs()
    elif args.report:
        show_report(args.report)
    elif args.quick:
        quick_test(args)
    else:
        run_evaluation(args)


if __name__ == "__main__":
    main()
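
# Programmatic use (a minimal sketch, not an official entry point; it assumes
# the harness methods invoked above keep the signatures used in this script):
#
#   from tests.quality_evaluation_harness import QualityEvaluationHarness
#   harness = QualityEvaluationHarness()
#   run_id = harness.run_full_evaluation(difficulties=['easy'], modes=['naked'],
#                                        max_problems=1)
#   print(harness.generate_report(run_id))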