# Path: QAgents-workflos/tests/quality_evaluation_harness.py
# Relations: Uses orchestrators/, tests/circuit_quality_analyzer.py, database/circuit_quality_db.py
# Description: Quality-focused evaluation harness that stores QASM circuits
#              Runs all 3 modes, measures quality via MCP, stores in database
#              Generates comparison reports with actual circuit outputs
"""
Quality Evaluation Harness: Run evaluations focused on CIRCUIT QUALITY.

Key difference from regular harness: stores actual QASM and measures quality.
"""

import time
import json
import logging
import uuid
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from .test_problems import (
    TestProblem, ALL_PROBLEMS, get_problem, get_problems_by_difficulty, ProblemDifficulty
)
from .circuit_quality_analyzer import CircuitQualityAnalyzer, AnalysisResult
from database.circuit_quality_db import (
    CircuitQualityDB, CircuitEvaluation, QualityMetrics, get_quality_db
)

logger = logging.getLogger(__name__)


class QualityEvaluationHarness:
    """
    Runs quality-focused evaluations across all orchestration modes.

    PRIMARY FOCUS: Circuit quality, not just success rate.
    STORES: Full QASM code in database for later analysis.
    """

    def __init__(self, mcp_url: str = "http://127.0.0.1:7861"):
        self.mcp_url = mcp_url
        self.analyzer = CircuitQualityAnalyzer(mcp_url)
        self.db = get_quality_db()
        self.run_id = f"quality_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    def evaluate_single(self, problem: TestProblem, mode: str) -> CircuitEvaluation:
        """
        Run a single evaluation and return full CircuitEvaluation with QASM.

        Args:
            problem: The test problem to solve
            mode: 'naked', 'guided', or 'blackboard'

        Returns:
            CircuitEvaluation with full QASM and quality metrics
        """
        from orchestrators import create_orchestrator

        logger.info(f"Evaluating {problem.id} with {mode} mode")

        # Reset cost tracking
        try:
            from config import reset_cost_tracking, get_cost_summary
            reset_cost_tracking()
        except ImportError:
            get_cost_summary = lambda: {}

        # Initialize result
        eval_result = CircuitEvaluation(
            run_id=self.run_id,
            timestamp=datetime.now().isoformat(),
            problem_id=problem.id,
            problem_goal=problem.goal,
            mode=mode
        )

        start_time = time.perf_counter()

        try:
            # Create and run orchestrator
            orchestrator = create_orchestrator(mode)
            result = orchestrator.run(problem.goal)

            elapsed_ms = (time.perf_counter() - start_time) * 1000
            eval_result.execution_time_ms = elapsed_ms

            # Extract QASM
            qasm = result.final_output
            if isinstance(qasm, list):
                qasm = qasm[0] if qasm else None
            if qasm is not None and not isinstance(qasm, str):
                qasm = str(qasm)

            eval_result.qasm_code = qasm or ""
            eval_result.success = result.success and bool(qasm)

            if not eval_result.success:
                eval_result.errors = result.errors

        except Exception as e:
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            eval_result.execution_time_ms = elapsed_ms
            eval_result.success = False
            eval_result.errors = [str(e)]
            logger.error(f"Evaluation failed for {problem.id}/{mode}: {e}")

        # Get cost metrics
        try:
            cost = get_cost_summary()
            eval_result.llm_requests = cost.get('total_requests', 0)
            eval_result.tokens_used = cost.get('total_tokens', 0)
        except Exception:
            pass

        # Analyze quality if we have QASM
        if eval_result.qasm_code:
            expected = problem.expected.expected_states if problem.expected else None
            analysis = self.analyzer.analyze_circuit(eval_result.qasm_code, expected)

            eval_result.quality_metrics = QualityMetrics(
                depth=analysis.depth,
                gate_count=analysis.gate_count,
                cx_count=analysis.cx_count,
                single_qubit_count=analysis.single_qubit_count,
                hardware_fitness=analysis.hardware_fitness,
                syntax_valid=analysis.syntax_valid,
                state_correctness=analysis.state_correctness,
                complexity_score=analysis.complexity_score,
                noise_estimate=analysis.noise_estimate
            )

            if analysis.errors:
                eval_result.errors.extend(analysis.errors)

        # Store in database
        eval_id = self.db.save_evaluation(eval_result)
        eval_result.id = eval_id

        # Guard the score in the log line: failed runs may have no quality metrics
        score = eval_result.quality_metrics.overall_score() if eval_result.quality_metrics else None
        logger.info(f"Stored evaluation {eval_id}: {problem.id}/{mode} - "
                    f"success={eval_result.success}, score={score}")

        return eval_result

    def evaluate_problem_all_modes(self, problem: TestProblem,
                                   modes: Optional[List[str]] = None) -> Dict[str, CircuitEvaluation]:
        """Evaluate a single problem with all modes."""
        if modes is None:
            modes = ['naked', 'guided', 'blackboard']

        results = {}
        for mode in modes:
            results[mode] = self.evaluate_single(problem, mode)

        return results

    def run_full_evaluation(self,
                            difficulties: Optional[List[str]] = None,
                            modes: Optional[List[str]] = None,
                            max_problems: Optional[int] = None) -> str:
        """
        Run full evaluation across problems and modes.

        Args:
            difficulties: List of difficulties to test ('easy', 'medium', 'hard')
            modes: List of modes to test ('naked', 'guided', 'blackboard')
            max_problems: Maximum number of problems to test (for quick runs)

        Returns:
            run_id for this evaluation run
        """
        if difficulties is None:
            difficulties = ['easy', 'medium', 'hard']
        if modes is None:
            modes = ['naked', 'guided', 'blackboard']

        # Gather problems
        all_probs = []
        for diff in difficulties:
            # Convert string to enum if needed
            if isinstance(diff, str):
                try:
                    diff_enum = ProblemDifficulty(diff)
                except ValueError:
                    logger.warning(f"Invalid difficulty: {diff}")
                    continue
            else:
                diff_enum = diff
            probs = get_problems_by_difficulty(diff_enum)
            all_probs.extend(probs)

        if max_problems:
            all_probs = all_probs[:max_problems]

        logger.info(f"Starting quality evaluation run {self.run_id}")
        logger.info(f"Problems: {len(all_probs)}, Modes: {modes}")

        # Run evaluations
        total = len(all_probs) * len(modes)
        completed = 0

        for problem in all_probs:
            for mode in modes:
                try:
                    self.evaluate_single(problem, mode)
                    completed += 1
                    logger.info(f"Progress: {completed}/{total}")
                except Exception as e:
                    logger.error(f"Failed {problem.id}/{mode}: {e}")
                    completed += 1

        # Save run summary
        summary = self.db.get_quality_summary(self.run_id)
        self.db.save_comparison_run(
            run_id=self.run_id,
            description=f"Quality evaluation: {len(all_probs)} problems, {modes}",
            num_problems=len(all_probs),
            modes=modes,
            summary=summary
        )

        return self.run_id

    def generate_report(self, run_id: Optional[str] = None) -> str:
        """Generate a comprehensive quality comparison report."""
        if run_id is None:
            run_id = self.run_id

        # Get summary
        summary = self.db.get_quality_summary(run_id)

        # Get full circuit export
        circuits_md = self.db.export_circuits_markdown(run_id)

        # Build report
        report = []
        report.append("# CIRCUIT QUALITY EVALUATION REPORT\n")
        report.append(f"Run ID: {run_id}\n")
        report.append(f"Generated: {datetime.now().isoformat()}\n\n")

        report.append("## EXECUTIVE SUMMARY\n\n")

        # Summary table
        report.append("| Mode | Success Rate | Quality Score | Avg Depth | Avg Gates | Avg CX | HW Fitness | LLM Calls |\n")
        report.append("|------|--------------|---------------|-----------|-----------|--------|------------|-----------|\n")

        for mode in ['naked', 'guided', 'blackboard']:
            if mode in summary.get('modes', {}):
                m = summary['modes'][mode]
                report.append(
                    f"| {mode.upper()} | {m['success_rate']*100:.0f}% | "
                    f"{m['avg_quality_score']:.1f}/100 | {m['avg_depth']:.1f} | "
f"{m['avg_gates']:.1f} | {m['avg_cx_count']:.1f} | " f"{m['avg_hardware_fitness']:.3f} | {m['total_llm_requests']} |\n" ) report.append("\n## KEY FINDINGS\n\n") # Determine winner modes_data = summary.get('modes', {}) if modes_data: best_quality = max(modes_data.items(), key=lambda x: x[1].get('avg_quality_score', 0)) best_success = max(modes_data.items(), key=lambda x: x[1].get('success_rate', 0)) lowest_cost = min(modes_data.items(), key=lambda x: x[1].get('total_llm_requests', float('inf'))) report.append(f"- **Best Quality**: {best_quality[0].upper()} ({best_quality[1]['avg_quality_score']:.1f}/100)\n") report.append(f"- **Best Success Rate**: {best_success[0].upper()} ({best_success[1]['success_rate']*100:.0f}%)\n") report.append(f"- **Lowest Cost**: {lowest_cost[0].upper()} ({lowest_cost[1]['total_llm_requests']} LLM calls)\n") # Quality per LLM call report.append("\n### Quality Efficiency (Quality Score per LLM Call)\n\n") for mode, data in modes_data.items(): llm_calls = data.get('total_llm_requests', 1) or 1 quality = data.get('avg_quality_score', 0) efficiency = quality / llm_calls report.append(f"- {mode.upper()}: {efficiency:.2f} quality points per LLM call\n") report.append("\n---\n") report.append("\n## DETAILED CIRCUIT COMPARISONS\n") report.append(circuits_md) return "".join(report) def print_summary(self, run_id: Optional[str] = None): """Print a quick summary to console.""" if run_id is None: run_id = self.run_id summary = self.db.get_quality_summary(run_id) print("\n" + "="*70) print("QUALITY EVALUATION SUMMARY") print("="*70) modes = summary.get('modes', {}) for mode in ['naked', 'guided', 'blackboard']: if mode in modes: m = modes[mode] print(f"\n{mode.upper()}:") print(f" Success Rate: {m['success_rate']*100:.0f}%") print(f" Quality Score: {m['avg_quality_score']:.1f}/100") print(f" Avg Depth: {m['avg_depth']:.1f}") print(f" Avg Gates: {m['avg_gates']:.1f}") print(f" Avg CX Count: {m['avg_cx_count']:.1f}") print(f" HW Fitness: {m['avg_hardware_fitness']:.3f}") print(f" LLM Requests: {m['total_llm_requests']}") print("\n" + "="*70) def run_quick_quality_test(mode: str = 'naked', problem_id: str = 'bell_state') -> CircuitEvaluation: """Quick test function to verify system works.""" problem = get_problem(problem_id) if not problem: raise ValueError(f"Problem not found: {problem_id}") harness = QualityEvaluationHarness() return harness.evaluate_single(problem, mode)