Spaces:
Sleeping
Sleeping
| """ | |
| Evaluation metrics for the quantum finance feasibility framework. | |
| Metrics defined per plan.md: | |
| 1. Expert Alignment: Agreement with literature/expert consensus | |
| 2. Hallucination Rate: Factual accuracy of agent outputs | |
| 3. Computational Efficiency: Agent rounds and time per evaluation | |
| """ | |
| import re | |
| import time | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| from datetime import datetime | |
@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics of a single run.

    BUG FIX: the @dataclass decorator was missing, so the annotated fields
    and field(default_factory=...) sentinels were inert class attributes and
    EvaluationMetrics(idea=...) had no generated __init__.
    """

    idea: str
    start_time: datetime = field(default_factory=datetime.now)
    end_time: Optional[datetime] = None

    # Computational efficiency metrics
    total_agent_rounds: int = 0
    total_tokens_used: int = 0
    wall_clock_time_seconds: float = 0.0

    # Quality metrics (to be assessed manually or via comparison)
    expert_alignment_score: Optional[float] = None  # 0-1 scale
    hallucination_count: int = 0
    factual_claims_count: int = 0

    # Output scores from agents
    quantum_feasibility_score: Optional[float] = None
    hardware_readiness_score: Optional[float] = None
    classical_competitiveness_score: Optional[float] = None
    business_viability_score: Optional[float] = None
    overall_score: Optional[float] = None

    def complete(self) -> None:
        """Mark evaluation as complete and record wall-clock duration."""
        self.end_time = datetime.now()
        self.wall_clock_time_seconds = (self.end_time - self.start_time).total_seconds()

    def hallucination_rate(self) -> Optional[float]:
        """Return hallucinations as a fraction of factual claims.

        Returns None when no factual claims were counted (rate undefined).
        """
        if self.factual_claims_count == 0:
            return None
        return self.hallucination_count / self.factual_claims_count

    def to_dict(self) -> dict:
        """Convert to a JSON-serializable dictionary."""
        return {
            'idea': self.idea,
            'start_time': self.start_time.isoformat(),
            'end_time': self.end_time.isoformat() if self.end_time else None,
            'total_agent_rounds': self.total_agent_rounds,
            'total_tokens_used': self.total_tokens_used,
            'wall_clock_time_seconds': self.wall_clock_time_seconds,
            'expert_alignment_score': self.expert_alignment_score,
            # BUG FIX: call the method — previously the bound method object
            # itself was stored, which is not serializable and never the rate.
            'hallucination_rate': self.hallucination_rate(),
            'quantum_feasibility_score': self.quantum_feasibility_score,
            'hardware_readiness_score': self.hardware_readiness_score,
            'classical_competitiveness_score': self.classical_competitiveness_score,
            'business_viability_score': self.business_viability_score,
            'overall_score': self.overall_score
        }
| class MetricsCollector: | |
| """Collect and aggregate metrics across multiple evaluations.""" | |
| def __init__(self): | |
| self.evaluations: list[EvaluationMetrics] = [] | |
| def start_evaluation(self, idea: str) -> EvaluationMetrics: | |
| """Start tracking a new evaluation.""" | |
| metrics = EvaluationMetrics(idea=idea) | |
| self.evaluations.append(metrics) | |
| return metrics | |
| def get_summary_statistics(self) -> dict: | |
| """Calculate summary statistics across all evaluations.""" | |
| if not self.evaluations: | |
| return {'error': 'No evaluations recorded'} | |
| completed = [e for e in self.evaluations if e.end_time is not None] | |
| if not completed: | |
| return {'error': 'No completed evaluations'} | |
| times = [e.wall_clock_time_seconds for e in completed] | |
| rounds = [e.total_agent_rounds for e in completed] | |
| alignment_scores = [e.expert_alignment_score for e in completed | |
| if e.expert_alignment_score is not None] | |
| overall_scores = [e.overall_score for e in completed | |
| if e.overall_score is not None] | |
| import numpy as np | |
| return { | |
| 'total_evaluations': len(completed), | |
| 'timing': { | |
| 'mean_seconds': np.mean(times), | |
| 'std_seconds': np.std(times), | |
| 'min_seconds': np.min(times), | |
| 'max_seconds': np.max(times) | |
| }, | |
| 'agent_rounds': { | |
| 'mean': np.mean(rounds), | |
| 'std': np.std(rounds), | |
| 'min': np.min(rounds), | |
| 'max': np.max(rounds) | |
| }, | |
| 'expert_alignment': { | |
| 'mean': np.mean(alignment_scores) if alignment_scores else None, | |
| 'n_rated': len(alignment_scores) | |
| }, | |
| 'overall_scores': { | |
| 'mean': np.mean(overall_scores) if overall_scores else None, | |
| 'std': np.std(overall_scores) if overall_scores else None | |
| } | |
| } | |
# Ground truth for validation against literature consensus.
# Each entry keys a finance use case to the published consensus on near-term
# quantum advantage, NISQ feasibility, the main bottleneck, a recommended
# approach, and the supporting references. calculate_expert_alignment()
# compares agent outputs against these entries.
LITERATURE_CONSENSUS = {
    'portfolio_optimization_qaoa': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, but no advantage
        'bottleneck': 'noise_and_classical_competition',
        'recommended_approach': 'hybrid',
        'references': [
            'Egger et al. (2020) - Quantum Computing for Finance',
            'Herman et al. (2022) - Portfolio Optimization Survey'
        ]
    },
    'option_pricing_amplitude_estimation': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,  # Requires error correction
        'bottleneck': 'circuit_depth_and_error_correction',
        'recommended_approach': 'classical_monte_carlo',
        'references': [
            'Stamatopoulos et al. (2020) - Option Pricing using QC',
            'Chakrabarti et al. (2021) - Threshold for Quantum Speedup'
        ]
    },
    'fraud_detection_quantum_ml': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, limited scale
        'bottleneck': 'data_loading_and_classical_ml_strength',
        'recommended_approach': 'classical_ml',
        'references': [
            'Schuld & Petruccione (2021) - ML with Quantum Computers',
            'Tang (2019) - Quantum-inspired classical algorithms'
        ]
    },
    'risk_analysis_quantum_monte_carlo': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,
        'bottleneck': 'quadratic_speedup_insufficient_with_noise',
        'recommended_approach': 'gpu_accelerated_classical',
        'references': [
            'Woerner & Egger (2019) - Quantum Risk Analysis',
            'Miyamoto & Shiohara (2022) - Bermudan Option Pricing'
        ]
    },
    'credit_scoring_vqc': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,
        'bottleneck': 'barren_plateaus_and_expressibility',
        'recommended_approach': 'classical_ensemble_methods',
        'references': [
            'McClean et al. (2018) - Barren Plateaus',
            'Cerezo et al. (2021) - Variational Quantum Algorithms'
        ]
    }
}
def calculate_expert_alignment(
    agent_output: dict,
    use_case_key: str,
    consensus_db: Optional[dict] = None
) -> dict:
    """
    Calculate alignment between agent output and literature consensus.

    Args:
        agent_output: Structured output from the agent crew
        use_case_key: Key into the consensus database
        consensus_db: Optional mapping of use-case keys to consensus entries.
            Defaults to the module-level LITERATURE_CONSENSUS, preserving
            the original behavior; passing a custom dict makes the function
            reusable (and testable) against other ground-truth sets.

    Returns:
        Alignment analysis with score (alignment_score is None when the
        agent output contains no checkable criteria), or an error dict
        when the use case is unknown.
    """
    if consensus_db is None:
        consensus_db = LITERATURE_CONSENSUS
    if use_case_key not in consensus_db:
        return {
            'error': f'Unknown use case: {use_case_key}',
            'available_cases': list(consensus_db.keys())
        }

    consensus = consensus_db[use_case_key]
    alignment_checks = []

    # Check quantum advantage assessment
    if 'quantum_advantage' in agent_output:
        # Membership already verified above, so index directly instead of
        # a redundant .get(..., False) second lookup.
        agent_says_advantage = agent_output['quantum_advantage']
        consensus_advantage = consensus['quantum_advantage_near_term']
        alignment_checks.append({
            'criterion': 'quantum_advantage_assessment',
            'agent': agent_says_advantage,
            'consensus': consensus_advantage,
            'aligned': agent_says_advantage == consensus_advantage
        })

    # Check feasibility assessment
    if 'feasible_on_nisq' in agent_output:
        agent_feasible = agent_output['feasible_on_nisq']
        consensus_feasible = consensus['feasible_on_nisq']
        alignment_checks.append({
            'criterion': 'nisq_feasibility',
            'agent': agent_feasible,
            'consensus': consensus_feasible,
            'aligned': agent_feasible == consensus_feasible
        })

    # Calculate alignment score as the fraction of aligned criteria
    if alignment_checks:
        aligned_count = sum(1 for c in alignment_checks if c['aligned'])
        alignment_score = aligned_count / len(alignment_checks)
    else:
        alignment_score = None

    return {
        'use_case': use_case_key,
        'alignment_checks': alignment_checks,
        'alignment_score': alignment_score,
        'consensus_references': consensus['references']
    }
def extract_scores_from_text(text: str) -> dict:
    """
    Extract numerical scores from agent output text.

    Looks for patterns like:
    - "score: 0.7"
    - "feasibility score (0-1): 0.65"
    - "Rating: 7/10"

    Values on a 0-10 scale are normalized to 0-1. Malformed numbers
    (e.g. "1.2.3", matched by the permissive [\\d.]+ character class)
    are skipped instead of raising ValueError, and values above 10
    are ignored as before.

    Args:
        text: Free-form agent output.

    Returns:
        Mapping of '<label>_score' (and 'rating') to floats in [0, 1].
    """
    scores = {}

    # Pattern: "X score: 0.Y" or "X score (0-1): 0.Y"
    score_pattern = r'(\w+(?:\s+\w+)?)\s+score\s*(?:\([^)]+\))?\s*[:=]\s*([\d.]+)'
    for match in re.finditer(score_pattern, text, re.IGNORECASE):
        label = match.group(1).lower().replace(' ', '_')
        try:
            value = float(match.group(2))
        except ValueError:
            # BUG FIX: [\d.]+ can match non-numbers like "1.2.3"; previously
            # float() raised and aborted extraction for the whole text.
            continue
        if 0 <= value <= 1:
            scores[f'{label}_score'] = value
        elif 0 <= value <= 10:
            scores[f'{label}_score'] = value / 10

    # Pattern: "Rating: X/10"
    rating_pattern = r'rating\s*[:=]\s*([\d.]+)\s*/\s*10'
    for match in re.finditer(rating_pattern, text, re.IGNORECASE):
        try:
            value = float(match.group(1))
        except ValueError:
            continue
        scores['rating'] = value / 10

    return scores
class HallucinationChecker:
    """
    Check agent outputs for potential hallucinations.

    Compares claims against known facts about quantum computing
    and finance applications.
    """

    # Known facts for validation
    KNOWN_FACTS = {
        # Hardware facts (2024-2026)
        'ibm_qubit_count': (433, 1121),  # Range: Osprey to Condor
        'ionq_qubit_count': (32, 36),
        'google_qubit_count': (72, 100),
        # Algorithm facts
        'grover_speedup': 'quadratic',  # sqrt(N)
        'shor_speedup': 'exponential',
        'qaoa_proven_advantage': False,
        'vqe_proven_advantage': False,
        # Finance facts
        'hft_latency_requirement_us': 1,  # microseconds
        'typical_portfolio_size': (10, 10000),
    }

    HALLUCINATION_PATTERNS = [
        # Claims of proven quantum advantage for NISQ
        r'proven\s+quantum\s+advantage\s+(?:for|in|on)\s+(?:NISQ|near-term)',
        r'demonstrat(?:ed|es)\s+quantum\s+supremacy\s+(?:for|in)\s+finance',
        # Unrealistic hardware claims
        r'(?:current|available)\s+(?:quantum\s+)?computers?\s+(?:with|have)\s+(?:\d{4,}|\d+,\d{3,})\s+qubits',
        # Impossible speedup claims
        r'exponential\s+speedup\s+(?:for|in|using)\s+(?:QAOA|VQE)',
    ]

    def check_output(self, text: str) -> dict:
        """
        Check text for potential hallucinations.

        Args:
            text: Agent output text to check

        Returns:
            Dictionary with hallucination analysis
        """
        findings = []

        # Scan for known suspicious claim patterns (case-insensitive).
        for suspicious in self.HALLUCINATION_PATTERNS:
            hits = re.findall(suspicious, text, re.IGNORECASE)
            if hits:
                findings.append({
                    'type': 'suspicious_claim',
                    'pattern': suspicious,
                    'matches': hits
                })

        # Flag qubit counts beyond any current hardware.
        for raw_count in re.findall(r'(\d+)\s*qubits?', text, re.IGNORECASE):
            claimed = int(raw_count)
            if claimed > 2000:  # No current hardware has this many
                findings.append({
                    'type': 'unrealistic_qubit_count',
                    'claimed': claimed,
                    'realistic_max': 1121
                })

        return {
            'potential_hallucinations': len(findings),
            'findings': findings,
            'text_length': len(text)
        }
if __name__ == "__main__":
    # Demo: simulate one full evaluation and print the collected metrics.
    collector = MetricsCollector()

    # Simulate an evaluation
    metrics = collector.start_evaluation(
        "Quantum portfolio optimization using QAOA"
    )
    metrics.total_agent_rounds = 5
    metrics.quantum_feasibility_score = 0.6
    metrics.hardware_readiness_score = 0.4
    metrics.classical_competitiveness_score = 0.8
    metrics.business_viability_score = 0.5
    metrics.overall_score = 0.3
    metrics.expert_alignment_score = 0.85

    # FIX: removed the redundant local `import time` — the module already
    # imports time at the top of the file.
    time.sleep(0.1)  # Simulate some work
    metrics.complete()

    print("Evaluation Metrics Demo")
    print("=" * 50)
    print(metrics.to_dict())

    print("\nSummary Statistics:")
    print(collector.get_summary_statistics())