""" Evaluation metrics for the quantum finance feasibility framework. Metrics defined per plan.md: 1. Expert Alignment: Agreement with literature/expert consensus 2. Hallucination Rate: Factual accuracy of agent outputs 3. Computational Efficiency: Agent rounds and time per evaluation """ import re import time from dataclasses import dataclass, field from typing import Optional from datetime import datetime @dataclass class EvaluationMetrics: """Container for evaluation metrics of a single run.""" idea: str start_time: datetime = field(default_factory=datetime.now) end_time: Optional[datetime] = None # Computational efficiency metrics total_agent_rounds: int = 0 total_tokens_used: int = 0 wall_clock_time_seconds: float = 0.0 # Quality metrics (to be assessed manually or via comparison) expert_alignment_score: Optional[float] = None # 0-1 scale hallucination_count: int = 0 factual_claims_count: int = 0 # Output scores from agents quantum_feasibility_score: Optional[float] = None hardware_readiness_score: Optional[float] = None classical_competitiveness_score: Optional[float] = None business_viability_score: Optional[float] = None overall_score: Optional[float] = None def complete(self): """Mark evaluation as complete and calculate duration.""" self.end_time = datetime.now() self.wall_clock_time_seconds = (self.end_time - self.start_time).total_seconds() @property def hallucination_rate(self) -> Optional[float]: """Calculate hallucination rate as fraction of factual claims.""" if self.factual_claims_count == 0: return None return self.hallucination_count / self.factual_claims_count def to_dict(self) -> dict: """Convert to dictionary for serialization.""" return { 'idea': self.idea, 'start_time': self.start_time.isoformat(), 'end_time': self.end_time.isoformat() if self.end_time else None, 'total_agent_rounds': self.total_agent_rounds, 'total_tokens_used': self.total_tokens_used, 'wall_clock_time_seconds': self.wall_clock_time_seconds, 'expert_alignment_score': self.expert_alignment_score, 'hallucination_rate': self.hallucination_rate, 'quantum_feasibility_score': self.quantum_feasibility_score, 'hardware_readiness_score': self.hardware_readiness_score, 'classical_competitiveness_score': self.classical_competitiveness_score, 'business_viability_score': self.business_viability_score, 'overall_score': self.overall_score } class MetricsCollector: """Collect and aggregate metrics across multiple evaluations.""" def __init__(self): self.evaluations: list[EvaluationMetrics] = [] def start_evaluation(self, idea: str) -> EvaluationMetrics: """Start tracking a new evaluation.""" metrics = EvaluationMetrics(idea=idea) self.evaluations.append(metrics) return metrics def get_summary_statistics(self) -> dict: """Calculate summary statistics across all evaluations.""" if not self.evaluations: return {'error': 'No evaluations recorded'} completed = [e for e in self.evaluations if e.end_time is not None] if not completed: return {'error': 'No completed evaluations'} times = [e.wall_clock_time_seconds for e in completed] rounds = [e.total_agent_rounds for e in completed] alignment_scores = [e.expert_alignment_score for e in completed if e.expert_alignment_score is not None] overall_scores = [e.overall_score for e in completed if e.overall_score is not None] import numpy as np return { 'total_evaluations': len(completed), 'timing': { 'mean_seconds': np.mean(times), 'std_seconds': np.std(times), 'min_seconds': np.min(times), 'max_seconds': np.max(times) }, 'agent_rounds': { 'mean': np.mean(rounds), 'std': 


# Ground truth for validation against literature consensus
LITERATURE_CONSENSUS = {
    'portfolio_optimization_qaoa': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, but no advantage
        'bottleneck': 'noise_and_classical_competition',
        'recommended_approach': 'hybrid',
        'references': [
            'Egger et al. (2020) - Quantum Computing for Finance',
            'Herman et al. (2022) - Portfolio Optimization Survey'
        ]
    },
    'option_pricing_amplitude_estimation': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,  # Requires error correction
        'bottleneck': 'circuit_depth_and_error_correction',
        'recommended_approach': 'classical_monte_carlo',
        'references': [
            'Stamatopoulos et al. (2020) - Option Pricing using QC',
            'Chakrabarti et al. (2021) - Threshold for Quantum Speedup'
        ]
    },
    'fraud_detection_quantum_ml': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, limited scale
        'bottleneck': 'data_loading_and_classical_ml_strength',
        'recommended_approach': 'classical_ml',
        'references': [
            'Schuld & Petruccione (2021) - ML with Quantum Computers',
            'Tang (2019) - Quantum-inspired classical algorithms'
        ]
    },
    'risk_analysis_quantum_monte_carlo': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,
        'bottleneck': 'quadratic_speedup_insufficient_with_noise',
        'recommended_approach': 'gpu_accelerated_classical',
        'references': [
            'Woerner & Egger (2019) - Quantum Risk Analysis',
            'Miyamoto & Shiohara (2022) - Bermudan Option Pricing'
        ]
    },
    'credit_scoring_vqc': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,
        'bottleneck': 'barren_plateaus_and_expressibility',
        'recommended_approach': 'classical_ensemble_methods',
        'references': [
            'McClean et al. (2018) - Barren Plateaus',
            'Cerezo et al. (2021) - Variational Quantum Algorithms'
        ]
    }
}
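

# Illustrative shape of the structured `agent_output` dict consumed by
# calculate_expert_alignment() below. Only the two boolean fields shown are
# checked against LITERATURE_CONSENSUS; the values here are made up:
#
#     agent_output = {
#         'quantum_advantage': False,  # agent's near-term quantum advantage verdict
#         'feasible_on_nisq': True,    # agent's NISQ feasibility verdict
#     }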


def calculate_expert_alignment(
    agent_output: dict,
    use_case_key: str
) -> dict:
    """
    Calculate alignment between agent output and literature consensus.

    Args:
        agent_output: Structured output from the agent crew
        use_case_key: Key into LITERATURE_CONSENSUS dict

    Returns:
        Alignment analysis with score
    """
    if use_case_key not in LITERATURE_CONSENSUS:
        return {
            'error': f'Unknown use case: {use_case_key}',
            'available_cases': list(LITERATURE_CONSENSUS.keys())
        }

    consensus = LITERATURE_CONSENSUS[use_case_key]
    alignment_checks = []

    # Check quantum advantage assessment
    if 'quantum_advantage' in agent_output:
        agent_says_advantage = agent_output.get('quantum_advantage', False)
        consensus_advantage = consensus['quantum_advantage_near_term']
        alignment_checks.append({
            'criterion': 'quantum_advantage_assessment',
            'agent': agent_says_advantage,
            'consensus': consensus_advantage,
            'aligned': agent_says_advantage == consensus_advantage
        })

    # Check feasibility assessment
    if 'feasible_on_nisq' in agent_output:
        agent_feasible = agent_output.get('feasible_on_nisq', False)
        consensus_feasible = consensus['feasible_on_nisq']
        alignment_checks.append({
            'criterion': 'nisq_feasibility',
            'agent': agent_feasible,
            'consensus': consensus_feasible,
            'aligned': agent_feasible == consensus_feasible
        })

    # Calculate alignment score
    if alignment_checks:
        aligned_count = sum(1 for c in alignment_checks if c['aligned'])
        alignment_score = aligned_count / len(alignment_checks)
    else:
        alignment_score = None

    return {
        'use_case': use_case_key,
        'alignment_checks': alignment_checks,
        'alignment_score': alignment_score,
        'consensus_references': consensus['references']
    }


def extract_scores_from_text(text: str) -> dict:
    """
    Extract numerical scores from agent output text.

    Looks for patterns like:
    - "overall score: 0.7"
    - "feasibility score (0-1): 0.65"
    - "Rating: 7/10"
    """
    scores = {}

    # Pattern: "X score: 0.Y" or "X score (0-1): 0.Y"
    # The number group excludes trailing periods so sentence-final scores
    # ("score: 0.65.") do not break float().
    score_pattern = r'(\w+(?:\s+\w+)?)\s+score\s*(?:\([^)]+\))?\s*[:=]\s*(\d+(?:\.\d+)?)'
    for match in re.finditer(score_pattern, text, re.IGNORECASE):
        label = match.group(1).lower().replace(' ', '_')
        value = float(match.group(2))
        if 0 <= value <= 1:
            scores[f'{label}_score'] = value
        elif 0 <= value <= 10:
            scores[f'{label}_score'] = value / 10

    # Pattern: "Rating: X/10"
    rating_pattern = r'rating\s*[:=]\s*(\d+(?:\.\d+)?)\s*/\s*10'
    for match in re.finditer(rating_pattern, text, re.IGNORECASE):
        value = float(match.group(1))
        scores['rating'] = value / 10

    return scores
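

# Illustrative example of the score extraction above (input text is made up):
#
#     extract_scores_from_text(
#         "Quantum feasibility score (0-1): 0.65. Overall rating: 7/10."
#     )
#     # -> {'quantum_feasibility_score': 0.65, 'rating': 0.7}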


class HallucinationChecker:
    """
    Check agent outputs for potential hallucinations.

    Compares claims against known facts about quantum computing
    and finance applications.
    """

    # Known facts for validation
    KNOWN_FACTS = {
        # Hardware facts (2024-2026)
        'ibm_qubit_count': (433, 1121),  # Range: Osprey to Condor
        'ionq_qubit_count': (32, 36),
        'google_qubit_count': (72, 100),
        # Algorithm facts
        'grover_speedup': 'quadratic',  # sqrt(N)
        'shor_speedup': 'exponential',
        'qaoa_proven_advantage': False,
        'vqe_proven_advantage': False,
        # Finance facts
        'hft_latency_requirement_us': 1,  # microseconds
        'typical_portfolio_size': (10, 10000),
    }

    HALLUCINATION_PATTERNS = [
        # Claims of proven quantum advantage for NISQ
        r'proven\s+quantum\s+advantage\s+(?:for|in|on)\s+(?:NISQ|near-term)',
        r'demonstrat(?:ed|es)\s+quantum\s+supremacy\s+(?:for|in)\s+finance',
        # Unrealistic hardware claims
        r'(?:current|available)\s+(?:quantum\s+)?computers?\s+(?:with|have)\s+(?:\d{4,}|\d+,\d{3,})\s+qubits',
        # Impossible speedup claims
        r'exponential\s+speedup\s+(?:for|in|using)\s+(?:QAOA|VQE)',
    ]

    def check_output(self, text: str) -> dict:
        """
        Check text for potential hallucinations.

        Args:
            text: Agent output text to check

        Returns:
            Dictionary with hallucination analysis
        """
        findings = []

        # Check against hallucination patterns
        for pattern in self.HALLUCINATION_PATTERNS:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                findings.append({
                    'type': 'suspicious_claim',
                    'pattern': pattern,
                    'matches': matches
                })

        # Check qubit count claims
        qubit_claims = re.findall(r'(\d+)\s*qubits?', text, re.IGNORECASE)
        for claim in qubit_claims:
            count = int(claim)
            if count > 2000:  # No current hardware has this many
                findings.append({
                    'type': 'unrealistic_qubit_count',
                    'claimed': count,
                    'realistic_max': 1121
                })

        return {
            'potential_hallucinations': len(findings),
            'findings': findings,
            'text_length': len(text)
        }


if __name__ == "__main__":
    # Demo metrics collection
    collector = MetricsCollector()

    # Simulate an evaluation
    metrics = collector.start_evaluation(
        "Quantum portfolio optimization using QAOA"
    )
    metrics.total_agent_rounds = 5
    metrics.quantum_feasibility_score = 0.6
    metrics.hardware_readiness_score = 0.4
    metrics.classical_competitiveness_score = 0.8
    metrics.business_viability_score = 0.5
    metrics.overall_score = 0.3
    metrics.expert_alignment_score = 0.85

    time.sleep(0.1)  # Simulate some work
    metrics.complete()

    print("Evaluation Metrics Demo")
    print("=" * 50)
    print(metrics.to_dict())

    print("\nSummary Statistics:")
    print(collector.get_summary_statistics())
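
    # Illustrative extension of the demo: exercise the alignment, score-extraction
    # and hallucination checks defined above. The sample text below is fabricated
    # purely for demonstration; it is not real agent output.
    sample_output_text = (
        "Quantum feasibility score (0-1): 0.6. Classical competitiveness score: 0.8. "
        "Current quantum computers have 100000 qubits and offer exponential "
        "speedup using QAOA."
    )

    print("\nExpert Alignment Check:")
    print(calculate_expert_alignment(
        {'quantum_advantage': False, 'feasible_on_nisq': True},
        'portfolio_optimization_qaoa'
    ))

    print("\nExtracted Scores:")
    print(extract_scores_from_text(sample_output_text))

    print("\nHallucination Check:")
    print(HallucinationChecker().check_output(sample_output_text))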