"""
Evaluation metrics for the quantum finance feasibility framework.
Metrics defined per plan.md:
1. Expert Alignment: Agreement with literature/expert consensus
2. Hallucination Rate: Factual accuracy of agent outputs
3. Computational Efficiency: Agent rounds and time per evaluation
"""
import re
import time
from dataclasses import dataclass, field
from typing import Optional
from datetime import datetime
@dataclass
class EvaluationMetrics:
"""Container for evaluation metrics of a single run."""
idea: str
start_time: datetime = field(default_factory=datetime.now)
end_time: Optional[datetime] = None
# Computational efficiency metrics
total_agent_rounds: int = 0
total_tokens_used: int = 0
wall_clock_time_seconds: float = 0.0
# Quality metrics (to be assessed manually or via comparison)
expert_alignment_score: Optional[float] = None # 0-1 scale
hallucination_count: int = 0
factual_claims_count: int = 0
# Output scores from agents
quantum_feasibility_score: Optional[float] = None
hardware_readiness_score: Optional[float] = None
classical_competitiveness_score: Optional[float] = None
business_viability_score: Optional[float] = None
overall_score: Optional[float] = None
def complete(self):
"""Mark evaluation as complete and calculate duration."""
self.end_time = datetime.now()
self.wall_clock_time_seconds = (self.end_time - self.start_time).total_seconds()
@property
def hallucination_rate(self) -> Optional[float]:
"""Calculate hallucination rate as fraction of factual claims."""
if self.factual_claims_count == 0:
return None
return self.hallucination_count / self.factual_claims_count
def to_dict(self) -> dict:
"""Convert to dictionary for serialization."""
return {
'idea': self.idea,
'start_time': self.start_time.isoformat(),
'end_time': self.end_time.isoformat() if self.end_time else None,
'total_agent_rounds': self.total_agent_rounds,
'total_tokens_used': self.total_tokens_used,
'wall_clock_time_seconds': self.wall_clock_time_seconds,
'expert_alignment_score': self.expert_alignment_score,
'hallucination_rate': self.hallucination_rate,
'quantum_feasibility_score': self.quantum_feasibility_score,
'hardware_readiness_score': self.hardware_readiness_score,
'classical_competitiveness_score': self.classical_competitiveness_score,
'business_viability_score': self.business_viability_score,
'overall_score': self.overall_score
}
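
# Illustrative lifecycle sketch for EvaluationMetrics (all values hypothetical):
#
#   m = EvaluationMetrics(idea="QAOA portfolio optimization")
#   m.total_agent_rounds = 3
#   m.factual_claims_count = 20
#   m.hallucination_count = 1
#   m.complete()
#   m.hallucination_rate  # -> 0.05
#   m.to_dict()           # JSON-serializable summary of the run
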
class MetricsCollector:
"""Collect and aggregate metrics across multiple evaluations."""
def __init__(self):
self.evaluations: list[EvaluationMetrics] = []
def start_evaluation(self, idea: str) -> EvaluationMetrics:
"""Start tracking a new evaluation."""
metrics = EvaluationMetrics(idea=idea)
self.evaluations.append(metrics)
return metrics
def get_summary_statistics(self) -> dict:
"""Calculate summary statistics across all evaluations."""
if not self.evaluations:
return {'error': 'No evaluations recorded'}
completed = [e for e in self.evaluations if e.end_time is not None]
if not completed:
return {'error': 'No completed evaluations'}
times = [e.wall_clock_time_seconds for e in completed]
rounds = [e.total_agent_rounds for e in completed]
alignment_scores = [e.expert_alignment_score for e in completed
if e.expert_alignment_score is not None]
overall_scores = [e.overall_score for e in completed
if e.overall_score is not None]
        import numpy as np  # Local import keeps numpy an optional dependency
        # Cast numpy scalars to built-in types so the summary is JSON-serializable
        return {
            'total_evaluations': len(completed),
            'timing': {
                'mean_seconds': float(np.mean(times)),
                'std_seconds': float(np.std(times)),
                'min_seconds': float(np.min(times)),
                'max_seconds': float(np.max(times))
            },
            'agent_rounds': {
                'mean': float(np.mean(rounds)),
                'std': float(np.std(rounds)),
                'min': int(np.min(rounds)),
                'max': int(np.max(rounds))
            },
            'expert_alignment': {
                'mean': float(np.mean(alignment_scores)) if alignment_scores else None,
                'n_rated': len(alignment_scores)
            },
            'overall_scores': {
                'mean': float(np.mean(overall_scores)) if overall_scores else None,
                'std': float(np.std(overall_scores)) if overall_scores else None
            }
        }
# Ground truth for validation against literature consensus
LITERATURE_CONSENSUS = {
'portfolio_optimization_qaoa': {
'quantum_advantage_near_term': False,
'feasible_on_nisq': True, # Can run, but no advantage
'bottleneck': 'noise_and_classical_competition',
'recommended_approach': 'hybrid',
'references': [
'Egger et al. (2020) - Quantum Computing for Finance',
'Herman et al. (2022) - Portfolio Optimization Survey'
]
},
'option_pricing_amplitude_estimation': {
'quantum_advantage_near_term': False,
'feasible_on_nisq': False, # Requires error correction
'bottleneck': 'circuit_depth_and_error_correction',
'recommended_approach': 'classical_monte_carlo',
'references': [
'Stamatopoulos et al. (2020) - Option Pricing using QC',
'Chakrabarti et al. (2021) - Threshold for Quantum Speedup'
]
},
'fraud_detection_quantum_ml': {
'quantum_advantage_near_term': False,
'feasible_on_nisq': True, # Can run, limited scale
'bottleneck': 'data_loading_and_classical_ml_strength',
'recommended_approach': 'classical_ml',
'references': [
'Schuld & Petruccione (2021) - ML with Quantum Computers',
'Tang (2019) - Quantum-inspired classical algorithms'
]
},
'risk_analysis_quantum_monte_carlo': {
'quantum_advantage_near_term': False,
'feasible_on_nisq': False,
'bottleneck': 'quadratic_speedup_insufficient_with_noise',
'recommended_approach': 'gpu_accelerated_classical',
'references': [
'Woerner & Egger (2019) - Quantum Risk Analysis',
'Miyamoto & Shiohara (2022) - Bermudan Option Pricing'
]
},
'credit_scoring_vqc': {
'quantum_advantage_near_term': False,
'feasible_on_nisq': True,
'bottleneck': 'barren_plateaus_and_expressibility',
'recommended_approach': 'classical_ensemble_methods',
'references': [
'McClean et al. (2018) - Barren Plateaus',
'Cerezo et al. (2021) - Variational Quantum Algorithms'
]
}
}
def calculate_expert_alignment(
agent_output: dict,
use_case_key: str
) -> dict:
"""
Calculate alignment between agent output and literature consensus.
Args:
agent_output: Structured output from the agent crew
use_case_key: Key into LITERATURE_CONSENSUS dict
Returns:
Alignment analysis with score
"""
if use_case_key not in LITERATURE_CONSENSUS:
return {
'error': f'Unknown use case: {use_case_key}',
'available_cases': list(LITERATURE_CONSENSUS.keys())
}
consensus = LITERATURE_CONSENSUS[use_case_key]
alignment_checks = []
# Check quantum advantage assessment
if 'quantum_advantage' in agent_output:
agent_says_advantage = agent_output.get('quantum_advantage', False)
consensus_advantage = consensus['quantum_advantage_near_term']
alignment_checks.append({
'criterion': 'quantum_advantage_assessment',
'agent': agent_says_advantage,
'consensus': consensus_advantage,
'aligned': agent_says_advantage == consensus_advantage
})
# Check feasibility assessment
if 'feasible_on_nisq' in agent_output:
agent_feasible = agent_output.get('feasible_on_nisq', False)
consensus_feasible = consensus['feasible_on_nisq']
alignment_checks.append({
'criterion': 'nisq_feasibility',
'agent': agent_feasible,
'consensus': consensus_feasible,
'aligned': agent_feasible == consensus_feasible
})
# Calculate alignment score
if alignment_checks:
aligned_count = sum(1 for c in alignment_checks if c['aligned'])
alignment_score = aligned_count / len(alignment_checks)
else:
alignment_score = None
return {
'use_case': use_case_key,
'alignment_checks': alignment_checks,
'alignment_score': alignment_score,
'consensus_references': consensus['references']
}
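
# Usage sketch; the agent_output dict here is a hypothetical example, not the
# output of a real crew run:
#
#   result = calculate_expert_alignment(
#       {'quantum_advantage': False, 'feasible_on_nisq': True},
#       'portfolio_optimization_qaoa',
#   )
#   result['alignment_score']  # -> 1.0 (both assessments match consensus)
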
def extract_scores_from_text(text: str) -> dict:
"""
Extract numerical scores from agent output text.
Looks for patterns like:
- "score: 0.7"
- "feasibility score (0-1): 0.65"
- "Rating: 7/10"
"""
    scores = {}
    # Pattern: "X score: 0.Y" or "X score (0-1): 0.Y"
    # (\d+(?:\.\d+)?) avoids float() errors on malformed tokens like "1.2.3"
    score_pattern = r'(\w+(?:\s+\w+)?)\s+score\s*(?:\([^)]+\))?\s*[:=]\s*(\d+(?:\.\d+)?)'
    for match in re.finditer(score_pattern, text, re.IGNORECASE):
        label = match.group(1).lower().replace(' ', '_')
        value = float(match.group(2))
        if value <= 1:  # Already on a 0-1 scale
            scores[f'{label}_score'] = value
        elif value <= 10:  # Assume a 0-10 scale and normalize
            scores[f'{label}_score'] = value / 10
    # Pattern: "Rating: X/10"
    rating_pattern = r'rating\s*[:=]\s*(\d+(?:\.\d+)?)\s*/\s*10'
    for match in re.finditer(rating_pattern, text, re.IGNORECASE):
        value = float(match.group(1))
        scores['rating'] = value / 10
    return scores
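
# Extraction sketch on a made-up snippet (illustrates the regex behavior only):
#
#   extract_scores_from_text("Feasibility score: 0.65. Rating: 7/10")
#   # -> {'feasibility_score': 0.65, 'rating': 0.7}
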
class HallucinationChecker:
"""
Check agent outputs for potential hallucinations.
Compares claims against known facts about quantum computing
and finance applications.
"""
    # Known facts for validation (reference data for manual spot-checks;
    # check_output() currently relies on the regex patterns below instead)
    KNOWN_FACTS = {
        # Hardware facts (approximate ranges, circa 2023)
'ibm_qubit_count': (433, 1121), # Range: Osprey to Condor
'ionq_qubit_count': (32, 36),
'google_qubit_count': (72, 100),
# Algorithm facts
'grover_speedup': 'quadratic', # sqrt(N)
'shor_speedup': 'exponential',
'qaoa_proven_advantage': False,
'vqe_proven_advantage': False,
# Finance facts
'hft_latency_requirement_us': 1, # microseconds
'typical_portfolio_size': (10, 10000),
}
HALLUCINATION_PATTERNS = [
# Claims of proven quantum advantage for NISQ
r'proven\s+quantum\s+advantage\s+(?:for|in|on)\s+(?:NISQ|near-term)',
r'demonstrat(?:ed|es)\s+quantum\s+supremacy\s+(?:for|in)\s+finance',
# Unrealistic hardware claims
r'(?:current|available)\s+(?:quantum\s+)?computers?\s+(?:with|have)\s+(?:\d{4,}|\d+,\d{3,})\s+qubits',
# Impossible speedup claims
r'exponential\s+speedup\s+(?:for|in|using)\s+(?:QAOA|VQE)',
]
def check_output(self, text: str) -> dict:
"""
Check text for potential hallucinations.
Args:
text: Agent output text to check
Returns:
Dictionary with hallucination analysis
"""
findings = []
# Check against hallucination patterns
for pattern in self.HALLUCINATION_PATTERNS:
matches = re.findall(pattern, text, re.IGNORECASE)
if matches:
findings.append({
'type': 'suspicious_claim',
'pattern': pattern,
'matches': matches
})
# Check qubit count claims
qubit_claims = re.findall(r'(\d+)\s*qubits?', text, re.IGNORECASE)
for claim in qubit_claims:
count = int(claim)
            if count > 2000:  # No announced gate-based device has this many
                findings.append({
                    'type': 'unrealistic_qubit_count',
                    'claimed': count,
                    'realistic_max': 1121  # IBM Condor
                })
return {
'potential_hallucinations': len(findings),
'findings': findings,
'text_length': len(text)
}
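
# Checker sketch on a deliberately suspicious, invented claim:
#
#   checker = HallucinationChecker()
#   report = checker.check_output(
#       "Our 5000 qubit machine shows exponential speedup using QAOA."
#   )
#   report['potential_hallucinations']  # -> 2 (qubit count + QAOA speedup claim)
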
if __name__ == "__main__":
# Demo metrics collection
collector = MetricsCollector()
# Simulate an evaluation
metrics = collector.start_evaluation(
"Quantum portfolio optimization using QAOA"
)
metrics.total_agent_rounds = 5
metrics.quantum_feasibility_score = 0.6
metrics.hardware_readiness_score = 0.4
metrics.classical_competitiveness_score = 0.8
metrics.business_viability_score = 0.5
metrics.overall_score = 0.3
metrics.expert_alignment_score = 0.85
    time.sleep(0.1)  # Simulate some work (time is imported at module level)
metrics.complete()
print("Evaluation Metrics Demo")
print("=" * 50)
print(metrics.to_dict())
print("\nSummary Statistics:")
print(collector.get_summary_statistics())
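    # Additional demos on hypothetical agent text (illustrative inputs only)
    print("\nScore Extraction Demo")
    print("=" * 50)
    print(extract_scores_from_text(
        "Overall feasibility score: 0.65. Rating: 7/10"
    ))
    print("\nExpert Alignment Demo")
    print("=" * 50)
    print(calculate_expert_alignment(
        {'quantum_advantage': False, 'feasible_on_nisq': False},
        'option_pricing_amplitude_estimation'
    ))
    print("\nHallucination Check Demo")
    print("=" * 50)
    checker = HallucinationChecker()
    print(checker.check_output(
        "Current quantum computers with 5000 qubits show exponential "
        "speedup using QAOA for option pricing."
    ))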