"""
Evaluation metrics for the quantum finance feasibility framework.
Metrics defined per plan.md:
1. Expert Alignment: Agreement with literature/expert consensus
2. Hallucination Rate: Factual accuracy of agent outputs
3. Computational Efficiency: Agent rounds and time per evaluation
"""
import re
import time
from dataclasses import dataclass, field
from typing import Optional
from datetime import datetime
@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics of a single run."""
    idea: str
    start_time: datetime = field(default_factory=datetime.now)
    end_time: Optional[datetime] = None

    # Computational efficiency metrics
    total_agent_rounds: int = 0
    total_tokens_used: int = 0
    wall_clock_time_seconds: float = 0.0

    # Quality metrics (to be assessed manually or via comparison)
    expert_alignment_score: Optional[float] = None  # 0-1 scale
    hallucination_count: int = 0
    factual_claims_count: int = 0

    # Output scores from agents
    quantum_feasibility_score: Optional[float] = None
    hardware_readiness_score: Optional[float] = None
    classical_competitiveness_score: Optional[float] = None
    business_viability_score: Optional[float] = None
    overall_score: Optional[float] = None

    def complete(self):
        """Mark evaluation as complete and calculate duration."""
        self.end_time = datetime.now()
        self.wall_clock_time_seconds = (self.end_time - self.start_time).total_seconds()

    @property
    def hallucination_rate(self) -> Optional[float]:
        """Hallucination rate as a fraction of factual claims; None if no claims were counted."""
        if self.factual_claims_count == 0:
            return None
        return self.hallucination_count / self.factual_claims_count

    def to_dict(self) -> dict:
        """Convert to a dictionary for serialization."""
        return {
            'idea': self.idea,
            'start_time': self.start_time.isoformat(),
            'end_time': self.end_time.isoformat() if self.end_time else None,
            'total_agent_rounds': self.total_agent_rounds,
            'total_tokens_used': self.total_tokens_used,
            'wall_clock_time_seconds': self.wall_clock_time_seconds,
            'expert_alignment_score': self.expert_alignment_score,
            'hallucination_rate': self.hallucination_rate,
            'quantum_feasibility_score': self.quantum_feasibility_score,
            'hardware_readiness_score': self.hardware_readiness_score,
            'classical_competitiveness_score': self.classical_competitiveness_score,
            'business_viability_score': self.business_viability_score,
            'overall_score': self.overall_score,
        }
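# Note on hallucination_rate (illustrative, not executed): with
# factual_claims_count = 20 and hallucination_count = 1 the property
# evaluates to 0.05, while factual_claims_count = 0 yields None rather
# than 0.0, so "no claims were checked" stays distinguishable from
# "no hallucinations were found".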
class MetricsCollector:
    """Collect and aggregate metrics across multiple evaluations."""

    def __init__(self):
        self.evaluations: list[EvaluationMetrics] = []

    def start_evaluation(self, idea: str) -> EvaluationMetrics:
        """Start tracking a new evaluation."""
        metrics = EvaluationMetrics(idea=idea)
        self.evaluations.append(metrics)
        return metrics

    def get_summary_statistics(self) -> dict:
        """Calculate summary statistics across all completed evaluations."""
        if not self.evaluations:
            return {'error': 'No evaluations recorded'}
        completed = [e for e in self.evaluations if e.end_time is not None]
        if not completed:
            return {'error': 'No completed evaluations'}

        # Imported lazily so the rest of the module works without numpy.
        import numpy as np

        times = [e.wall_clock_time_seconds for e in completed]
        rounds = [e.total_agent_rounds for e in completed]
        alignment_scores = [e.expert_alignment_score for e in completed
                            if e.expert_alignment_score is not None]
        overall_scores = [e.overall_score for e in completed
                          if e.overall_score is not None]

        # Cast numpy scalars to plain Python numbers so the summary stays
        # JSON-serializable.
        return {
            'total_evaluations': len(completed),
            'timing': {
                'mean_seconds': float(np.mean(times)),
                'std_seconds': float(np.std(times)),
                'min_seconds': float(np.min(times)),
                'max_seconds': float(np.max(times)),
            },
            'agent_rounds': {
                'mean': float(np.mean(rounds)),
                'std': float(np.std(rounds)),
                'min': int(np.min(rounds)),
                'max': int(np.max(rounds)),
            },
            'expert_alignment': {
                'mean': float(np.mean(alignment_scores)) if alignment_scores else None,
                'n_rated': len(alignment_scores),
            },
            'overall_scores': {
                'mean': float(np.mean(overall_scores)) if overall_scores else None,
                'std': float(np.std(overall_scores)) if overall_scores else None,
            },
        }
# Ground truth for validation against literature consensus
LITERATURE_CONSENSUS = {
    'portfolio_optimization_qaoa': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, but no advantage
        'bottleneck': 'noise_and_classical_competition',
        'recommended_approach': 'hybrid',
        'references': [
            'Egger et al. (2020) - Quantum Computing for Finance',
            'Herman et al. (2022) - Portfolio Optimization Survey'
        ]
    },
    'option_pricing_amplitude_estimation': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,  # Requires error correction
        'bottleneck': 'circuit_depth_and_error_correction',
        'recommended_approach': 'classical_monte_carlo',
        'references': [
            'Stamatopoulos et al. (2020) - Option Pricing using QC',
            'Chakrabarti et al. (2021) - Threshold for Quantum Speedup'
        ]
    },
    'fraud_detection_quantum_ml': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,  # Can run, limited scale
        'bottleneck': 'data_loading_and_classical_ml_strength',
        'recommended_approach': 'classical_ml',
        'references': [
            'Schuld & Petruccione (2021) - ML with Quantum Computers',
            'Tang (2019) - Quantum-inspired classical algorithms'
        ]
    },
    'risk_analysis_quantum_monte_carlo': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': False,
        'bottleneck': 'quadratic_speedup_insufficient_with_noise',
        'recommended_approach': 'gpu_accelerated_classical',
        'references': [
            'Woerner & Egger (2019) - Quantum Risk Analysis',
            'Miyamoto & Shiohara (2022) - Bermudan Option Pricing'
        ]
    },
    'credit_scoring_vqc': {
        'quantum_advantage_near_term': False,
        'feasible_on_nisq': True,
        'bottleneck': 'barren_plateaus_and_expressibility',
        'recommended_approach': 'classical_ensemble_methods',
        'references': [
            'McClean et al. (2018) - Barren Plateaus',
            'Cerezo et al. (2021) - Variational Quantum Algorithms'
        ]
    }
}
def calculate_expert_alignment(
    agent_output: dict,
    use_case_key: str
) -> dict:
    """
    Calculate alignment between agent output and literature consensus.

    Args:
        agent_output: Structured output from the agent crew
        use_case_key: Key into the LITERATURE_CONSENSUS dict

    Returns:
        Alignment analysis with score
    """
    if use_case_key not in LITERATURE_CONSENSUS:
        return {
            'error': f'Unknown use case: {use_case_key}',
            'available_cases': list(LITERATURE_CONSENSUS.keys())
        }
    consensus = LITERATURE_CONSENSUS[use_case_key]
    alignment_checks = []

    # Check quantum advantage assessment
    if 'quantum_advantage' in agent_output:
        agent_says_advantage = agent_output.get('quantum_advantage', False)
        consensus_advantage = consensus['quantum_advantage_near_term']
        alignment_checks.append({
            'criterion': 'quantum_advantage_assessment',
            'agent': agent_says_advantage,
            'consensus': consensus_advantage,
            'aligned': agent_says_advantage == consensus_advantage
        })

    # Check feasibility assessment
    if 'feasible_on_nisq' in agent_output:
        agent_feasible = agent_output.get('feasible_on_nisq', False)
        consensus_feasible = consensus['feasible_on_nisq']
        alignment_checks.append({
            'criterion': 'nisq_feasibility',
            'agent': agent_feasible,
            'consensus': consensus_feasible,
            'aligned': agent_feasible == consensus_feasible
        })

    # Calculate alignment score
    if alignment_checks:
        aligned_count = sum(1 for c in alignment_checks if c['aligned'])
        alignment_score = aligned_count / len(alignment_checks)
    else:
        alignment_score = None

    return {
        'use_case': use_case_key,
        'alignment_checks': alignment_checks,
        'alignment_score': alignment_score,
        'consensus_references': consensus['references']
    }
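# Illustrative usage sketch (not executed at import time); the agent_output
# dict below is hypothetical:
#
#   result = calculate_expert_alignment(
#       {'quantum_advantage': False, 'feasible_on_nisq': True},
#       'portfolio_optimization_qaoa',
#   )
#   # Both checks agree with the literature consensus for this use case,
#   # so result['alignment_score'] == 1.0.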
def extract_scores_from_text(text: str) -> dict:
    """
    Extract numerical scores from agent output text.

    Looks for patterns like:
    - "feasibility score: 0.7"
    - "feasibility score (0-1): 0.65"
    - "Rating: 7/10"
    """
    scores = {}

    # Pattern: "X score: 0.Y" or "X score (0-1): 0.Y". The value group is
    # \d+(?:\.\d+)? rather than [\d.]+ so that a sentence-ending period
    # ("score: 0.7.") is not captured and passed to float().
    score_pattern = r'(\w+(?:\s+\w+)?)\s+score\s*(?:\([^)]+\))?\s*[:=]\s*(\d+(?:\.\d+)?)'
    for match in re.finditer(score_pattern, text, re.IGNORECASE):
        label = match.group(1).lower().replace(' ', '_')
        value = float(match.group(2))
        if 0 <= value <= 1:
            scores[f'{label}_score'] = value
        elif 0 <= value <= 10:
            scores[f'{label}_score'] = value / 10

    # Pattern: "Rating: X/10"
    rating_pattern = r'rating\s*[:=]\s*(\d+(?:\.\d+)?)\s*/\s*10'
    for match in re.finditer(rating_pattern, text, re.IGNORECASE):
        value = float(match.group(1))
        scores['rating'] = value / 10
    return scores
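# Illustrative usage sketch (not executed at import time):
#
#   extract_scores_from_text(
#       "Quantum feasibility score: 0.7. Business viability score (0-1): "
#       "0.65. Overall rating: 7/10."
#   )
#   # -> {'quantum_feasibility_score': 0.7,
#   #     'business_viability_score': 0.65,
#   #     'rating': 0.7}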
class HallucinationChecker:
    """
    Check agent outputs for potential hallucinations.

    Compares claims against known facts about quantum computing
    and finance applications.
    """

    # Known reference facts (2024-2026 window). check_output() currently
    # relies only on the regex patterns and qubit-count bound below; these
    # values are kept for reference and future checks.
    KNOWN_FACTS = {
        # Hardware facts (2024-2026)
        'ibm_qubit_count': (433, 1121),  # Range: Osprey to Condor
        'ionq_qubit_count': (32, 36),
        'google_qubit_count': (72, 100),
        # Algorithm facts
        'grover_speedup': 'quadratic',  # sqrt(N)
        'shor_speedup': 'exponential',
        'qaoa_proven_advantage': False,
        'vqe_proven_advantage': False,
        # Finance facts
        'hft_latency_requirement_us': 1,  # microseconds
        'typical_portfolio_size': (10, 10000),
    }

    HALLUCINATION_PATTERNS = [
        # Claims of proven quantum advantage for NISQ
        r'proven\s+quantum\s+advantage\s+(?:for|in|on)\s+(?:NISQ|near-term)',
        r'demonstrat(?:ed|es)\s+quantum\s+supremacy\s+(?:for|in)\s+finance',
        # Unrealistic hardware claims
        r'(?:current|available)\s+(?:quantum\s+)?computers?\s+(?:with|have)\s+(?:\d{4,}|\d+,\d{3,})\s+qubits',
        # Impossible speedup claims
        r'exponential\s+speedup\s+(?:for|in|using)\s+(?:QAOA|VQE)',
    ]

    def check_output(self, text: str) -> dict:
        """
        Check text for potential hallucinations.

        Args:
            text: Agent output text to check

        Returns:
            Dictionary with hallucination analysis
        """
        findings = []

        # Check against hallucination patterns
        for pattern in self.HALLUCINATION_PATTERNS:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                findings.append({
                    'type': 'suspicious_claim',
                    'pattern': pattern,
                    'matches': matches
                })

        # Check qubit count claims
        qubit_claims = re.findall(r'(\d+)\s*qubits?', text, re.IGNORECASE)
        for claim in qubit_claims:
            count = int(claim)
            if count > 2000:  # No current hardware has this many
                findings.append({
                    'type': 'unrealistic_qubit_count',
                    'claimed': count,
                    'realistic_max': 1121
                })

        return {
            'potential_hallucinations': len(findings),
            'findings': findings,
            'text_length': len(text)
        }
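# Illustrative usage sketch (not executed at import time); the claim text
# is hypothetical:
#
#   checker = HallucinationChecker()
#   checker.check_output(
#       "We have proven quantum advantage for NISQ devices on a "
#       "5000 qubit machine."
#   )
#   # -> two findings: a 'suspicious_claim' hit on the proven-advantage
#   #    pattern, plus an 'unrealistic_qubit_count' for 5000 qubits.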
if __name__ == "__main__":
    # Demo metrics collection
    collector = MetricsCollector()

    # Simulate an evaluation
    metrics = collector.start_evaluation(
        "Quantum portfolio optimization using QAOA"
    )
    metrics.total_agent_rounds = 5
    metrics.quantum_feasibility_score = 0.6
    metrics.hardware_readiness_score = 0.4
    metrics.classical_competitiveness_score = 0.8
    metrics.business_viability_score = 0.5
    metrics.overall_score = 0.3
    metrics.expert_alignment_score = 0.85

    time.sleep(0.1)  # Simulate some work (time is imported at module level)
    metrics.complete()

    print("Evaluation Metrics Demo")
    print("=" * 50)
    print(metrics.to_dict())
    print("\nSummary Statistics:")
    print(collector.get_summary_statistics())
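    # Extended demo: exercise the text utilities on a hypothetical agent
    # answer. The sample string is made up for illustration.
    sample_answer = (
        "Quantum feasibility score: 0.6. Overall rating: 3/10. "
        "Current 433 qubit hardware cannot run circuits deep enough "
        "for an advantage."
    )
    print("\nExtracted Scores:")
    print(extract_scores_from_text(sample_answer))

    checker = HallucinationChecker()
    report = checker.check_output(sample_answer)
    print("\nHallucination Check:")
    print(report)

    # Feed the check back into the metrics object so hallucination_rate
    # is populated (the claim count here is a hand-set placeholder).
    metrics.factual_claims_count = 3
    metrics.hallucination_count = report['potential_hallucinations']
    print("\nHallucination Rate:", metrics.hallucination_rate)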