Spaces:
Sleeping
Sleeping
| """ | |
| Evaluation metrics for the quantum finance feasibility framework. | |
| Metrics defined per plan.md: | |
| 1. Expert Alignment: Agreement with literature/expert consensus | |
| 2. Hallucination Rate: Factual accuracy of agent outputs | |
| 3. Computational Efficiency: Agent rounds and time per evaluation | |
| """ | |
| import re | |
| import time | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| from datetime import datetime | |
@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics of a single run.

    BUG FIX: the @dataclass decorator was missing, so the annotated fields
    and field(default_factory=...) sentinels were inert class attributes and
    EvaluationMetrics(idea=...) had no generated __init__.
    """

    idea: str
    start_time: datetime = field(default_factory=datetime.now)
    end_time: Optional[datetime] = None

    # Computational efficiency metrics
    total_agent_rounds: int = 0
    total_tokens_used: int = 0
    wall_clock_time_seconds: float = 0.0

    # Quality metrics (to be assessed manually or via comparison)
    expert_alignment_score: Optional[float] = None  # 0-1 scale
    hallucination_count: int = 0
    factual_claims_count: int = 0

    # Output scores from agents
    quantum_feasibility_score: Optional[float] = None
    hardware_readiness_score: Optional[float] = None
    classical_competitiveness_score: Optional[float] = None
    business_viability_score: Optional[float] = None
    overall_score: Optional[float] = None

    def complete(self) -> None:
        """Mark evaluation as complete and record wall-clock duration."""
        self.end_time = datetime.now()
        self.wall_clock_time_seconds = (self.end_time - self.start_time).total_seconds()

    def hallucination_rate(self) -> Optional[float]:
        """Return hallucinations as a fraction of factual claims.

        Returns None when no factual claims were counted (rate undefined).
        """
        if self.factual_claims_count == 0:
            return None
        return self.hallucination_count / self.factual_claims_count

    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dictionary."""
        return {
            'idea': self.idea,
            'start_time': self.start_time.isoformat(),
            'end_time': self.end_time.isoformat() if self.end_time else None,
            'total_agent_rounds': self.total_agent_rounds,
            'total_tokens_used': self.total_tokens_used,
            'wall_clock_time_seconds': self.wall_clock_time_seconds,
            'expert_alignment_score': self.expert_alignment_score,
            # BUG FIX: call the method — previously the bound method object
            # itself was stored, which is not serializable and never the rate.
            'hallucination_rate': self.hallucination_rate(),
            'quantum_feasibility_score': self.quantum_feasibility_score,
            'hardware_readiness_score': self.hardware_readiness_score,
            'classical_competitiveness_score': self.classical_competitiveness_score,
            'business_viability_score': self.business_viability_score,
            'overall_score': self.overall_score
        }
| class MetricsCollector: | |
| """Collect and aggregate metrics across multiple evaluations.""" | |
| def __init__(self): | |
| self.evaluations: list[EvaluationMetrics] = [] | |
| def start_evaluation(self, idea: str) -> EvaluationMetrics: | |
| """Start tracking a new evaluation.""" | |
| metrics = EvaluationMetrics(idea=idea) | |
| self.evaluations.append(metrics) | |
| return metrics | |
| def get_summary_statistics(self) -> dict: | |
| """Calculate summary statistics across all evaluations.""" | |
| if not self.evaluations: | |
| return {'error': 'No evaluations recorded'} | |
| completed = [e for e in self.evaluations if e.end_time is not None] | |
| if not completed: | |
| return {'error': 'No completed evaluations'} | |
| times = [e.wall_clock_time_seconds for e in completed] | |
| rounds = [e.total_agent_rounds for e in completed] | |
| alignment_scores = [e.expert_alignment_score for e in completed | |
| if e.expert_alignment_score is not None] | |
| overall_scores = [e.overall_score for e in completed | |
| if e.overall_score is not None] | |
| import numpy as np | |
| return { | |
| 'total_evaluations': len(completed), | |
| 'timing': { | |
| 'mean_seconds': np.mean(times), | |
| 'std_seconds': np.std(times), | |
| 'min_seconds': np.min(times), | |
| 'max_seconds': np.max(times) | |
| }, | |
| 'agent_rounds': { | |
| 'mean': np.mean(rounds), | |
| 'std': np.std(rounds), | |
| 'min': np.min(rounds), | |
| 'max': np.max(rounds) | |
| }, | |
| 'expert_alignment': { | |
| 'mean': np.mean(alignment_scores) if alignment_scores else None, | |
| 'n_rated': len(alignment_scores) | |
| }, | |
| 'overall_scores': { | |
| 'mean': np.mean(overall_scores) if overall_scores else None, | |
| 'std': np.std(overall_scores) if overall_scores else None | |
| } | |
| } | |
# Ground truth for validation against literature consensus.
# Each entry keys a finance use case to the published consensus on near-term
# quantum advantage, NISQ feasibility, the main bottleneck, a recommended
# approach, and the supporting references. calculate_expert_alignment()
# compares agent outputs against these entries.
LITERATURE_CONSENSUS = {
    'portfolio_optimization_qaoa': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, but no advantage
        'bottleneck': 'noise_and_classical_competition',
        'recommended_approach': 'hybrid',
        'references': [
            'Egger et al. (2020) - Quantum Computing for Finance',
            'Herman et al. (2022) - Portfolio Optimization Survey'
        ]
    },
    'option_pricing_amplitude_estimation': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,  # Requires error correction
        'bottleneck': 'circuit_depth_and_error_correction',
        'recommended_approach': 'classical_monte_carlo',
        'references': [
            'Stamatopoulos et al. (2020) - Option Pricing using QC',
            'Chakrabarti et al. (2021) - Threshold for Quantum Speedup'
        ]
    },
    'fraud_detection_quantum_ml': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, limited scale
        'bottleneck': 'data_loading_and_classical_ml_strength',
        'recommended_approach': 'classical_ml',
        'references': [
            'Schuld & Petruccione (2021) - ML with Quantum Computers',
            'Tang (2019) - Quantum-inspired classical algorithms'
        ]
    },
    'risk_analysis_quantum_monte_carlo': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,
        'bottleneck': 'quadratic_speedup_insufficient_with_noise',
        'recommended_approach': 'gpu_accelerated_classical',
        'references': [
            'Woerner & Egger (2019) - Quantum Risk Analysis',
            'Miyamoto & Shiohara (2022) - Bermudan Option Pricing'
        ]
    },
    'credit_scoring_vqc': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,
        'bottleneck': 'barren_plateaus_and_expressibility',
        'recommended_approach': 'classical_ensemble_methods',
        'references': [
            'McClean et al. (2018) - Barren Plateaus',
            'Cerezo et al. (2021) - Variational Quantum Algorithms'
        ]
    }
}
def calculate_expert_alignment(
    agent_output: dict,
    use_case_key: str,
    consensus_db: Optional[dict] = None
) -> dict:
    """
    Calculate alignment between agent output and literature consensus.

    Args:
        agent_output: Structured output from the agent crew
        use_case_key: Key into the consensus database
        consensus_db: Optional mapping of use-case keys to consensus entries.
            Defaults to the module-level LITERATURE_CONSENSUS, preserving
            the original behavior; passing a custom dict makes the function
            reusable (and testable) against other ground-truth sets.

    Returns:
        Alignment analysis with score (alignment_score is None when the
        agent output contains no checkable criteria), or an error dict
        when the use case is unknown.
    """
    if consensus_db is None:
        consensus_db = LITERATURE_CONSENSUS
    if use_case_key not in consensus_db:
        return {
            'error': f'Unknown use case: {use_case_key}',
            'available_cases': list(consensus_db.keys())
        }

    consensus = consensus_db[use_case_key]
    alignment_checks = []

    # Check quantum advantage assessment
    if 'quantum_advantage' in agent_output:
        # Membership already verified above, so index directly instead of
        # a redundant .get(..., False) second lookup.
        agent_says_advantage = agent_output['quantum_advantage']
        consensus_advantage = consensus['quantum_advantage_near_term']
        alignment_checks.append({
            'criterion': 'quantum_advantage_assessment',
            'agent': agent_says_advantage,
            'consensus': consensus_advantage,
            'aligned': agent_says_advantage == consensus_advantage
        })

    # Check feasibility assessment
    if 'feasible_on_nisq' in agent_output:
        agent_feasible = agent_output['feasible_on_nisq']
        consensus_feasible = consensus['feasible_on_nisq']
        alignment_checks.append({
            'criterion': 'nisq_feasibility',
            'agent': agent_feasible,
            'consensus': consensus_feasible,
            'aligned': agent_feasible == consensus_feasible
        })

    # Calculate alignment score as the fraction of aligned criteria
    if alignment_checks:
        aligned_count = sum(1 for c in alignment_checks if c['aligned'])
        alignment_score = aligned_count / len(alignment_checks)
    else:
        alignment_score = None

    return {
        'use_case': use_case_key,
        'alignment_checks': alignment_checks,
        'alignment_score': alignment_score,
        'consensus_references': consensus['references']
    }
def extract_scores_from_text(text: str) -> dict:
    """
    Extract numerical scores from agent output text.

    Looks for patterns like:
    - "score: 0.7"
    - "feasibility score (0-1): 0.65"
    - "Rating: 7/10"

    Values on a 0-10 scale are normalized to 0-1. Malformed numbers
    (e.g. "1.2.3", matched by the permissive [\\d.]+ character class)
    are skipped instead of raising ValueError, and values above 10
    are ignored as before.

    Args:
        text: Free-form agent output.

    Returns:
        Mapping of '<label>_score' (and 'rating') to floats in [0, 1].
    """
    scores = {}

    # Pattern: "X score: 0.Y" or "X score (0-1): 0.Y"
    score_pattern = r'(\w+(?:\s+\w+)?)\s+score\s*(?:\([^)]+\))?\s*[:=]\s*([\d.]+)'
    for match in re.finditer(score_pattern, text, re.IGNORECASE):
        label = match.group(1).lower().replace(' ', '_')
        try:
            value = float(match.group(2))
        except ValueError:
            # BUG FIX: [\d.]+ can match non-numbers like "1.2.3"; previously
            # float() raised and aborted extraction for the whole text.
            continue
        if 0 <= value <= 1:
            scores[f'{label}_score'] = value
        elif 0 <= value <= 10:
            scores[f'{label}_score'] = value / 10

    # Pattern: "Rating: X/10"
    rating_pattern = r'rating\s*[:=]\s*([\d.]+)\s*/\s*10'
    for match in re.finditer(rating_pattern, text, re.IGNORECASE):
        try:
            value = float(match.group(1))
        except ValueError:
            continue
        scores['rating'] = value / 10

    return scores
class HallucinationChecker:
    """
    Check agent outputs for potential hallucinations.

    Compares claims against known facts about quantum computing
    and finance applications.
    """

    # Known facts for validation
    KNOWN_FACTS = {
        # Hardware facts (2024-2026)
        'ibm_qubit_count': (433, 1121),  # Range: Osprey to Condor
        'ionq_qubit_count': (32, 36),
        'google_qubit_count': (72, 100),
        # Algorithm facts
        'grover_speedup': 'quadratic',  # sqrt(N)
        'shor_speedup': 'exponential',
        'qaoa_proven_advantage': False,
        'vqe_proven_advantage': False,
        # Finance facts
        'hft_latency_requirement_us': 1,  # microseconds
        'typical_portfolio_size': (10, 10000),
    }

    HALLUCINATION_PATTERNS = [
        # Claims of proven quantum advantage for NISQ
        r'proven\s+quantum\s+advantage\s+(?:for|in|on)\s+(?:NISQ|near-term)',
        r'demonstrat(?:ed|es)\s+quantum\s+supremacy\s+(?:for|in)\s+finance',
        # Unrealistic hardware claims
        r'(?:current|available)\s+(?:quantum\s+)?computers?\s+(?:with|have)\s+(?:\d{4,}|\d+,\d{3,})\s+qubits',
        # Impossible speedup claims
        r'exponential\s+speedup\s+(?:for|in|using)\s+(?:QAOA|VQE)',
    ]

    def check_output(self, text: str) -> dict:
        """
        Check text for potential hallucinations.

        Args:
            text: Agent output text to check

        Returns:
            Dictionary with hallucination analysis
        """
        findings = []

        # Scan for known suspicious claim patterns (case-insensitive).
        for suspicious in self.HALLUCINATION_PATTERNS:
            hits = re.findall(suspicious, text, re.IGNORECASE)
            if hits:
                findings.append({
                    'type': 'suspicious_claim',
                    'pattern': suspicious,
                    'matches': hits
                })

        # Flag qubit counts beyond any current hardware.
        for raw_count in re.findall(r'(\d+)\s*qubits?', text, re.IGNORECASE):
            claimed = int(raw_count)
            if claimed > 2000:  # No current hardware has this many
                findings.append({
                    'type': 'unrealistic_qubit_count',
                    'claimed': claimed,
                    'realistic_max': 1121
                })

        return {
            'potential_hallucinations': len(findings),
            'findings': findings,
            'text_length': len(text)
        }
if __name__ == "__main__":
    # Demo: simulate one full evaluation and print the collected metrics.
    collector = MetricsCollector()

    # Simulate an evaluation
    metrics = collector.start_evaluation(
        "Quantum portfolio optimization using QAOA"
    )
    metrics.total_agent_rounds = 5
    metrics.quantum_feasibility_score = 0.6
    metrics.hardware_readiness_score = 0.4
    metrics.classical_competitiveness_score = 0.8
    metrics.business_viability_score = 0.5
    metrics.overall_score = 0.3
    metrics.expert_alignment_score = 0.85

    # FIX: removed the redundant local `import time` — the module already
    # imports time at the top of the file.
    time.sleep(0.1)  # Simulate some work
    metrics.complete()

    print("Evaluation Metrics Demo")
    print("=" * 50)
    print(metrics.to_dict())

    print("\nSummary Statistics:")
    print(collector.get_summary_statistics())