""" Core evaluation data structures and metrics definitions. Implements the EvaluationMetrics class from Issue #27 with comprehensive RAG system performance measurement capabilities. """ from dataclasses import dataclass from typing import Any, Dict, List, Optional @dataclass class EvaluationMetrics: """Core evaluation dimensions as specified in Issue #27.""" # Retrieval Quality precision_at_k: float = 0.0 # Precision@K for retrieved documents recall_at_k: float = 0.0 # Recall@K for retrieved documents mrr: float = 0.0 # Mean Reciprocal Rank ndcg: float = 0.0 # Normalized Discounted Cumulative Gain # Response Quality bleu_score: float = 0.0 # BLEU score vs reference answers rouge_scores: Optional[Dict[str, float]] = None # ROUGE-1, ROUGE-2, ROUGE-L scores bert_score: float = 0.0 # BERTScore semantic similarity faithfulness: float = 0.0 # Response alignment to sources # System Performance latency_p50: float = 0.0 # 50th percentile response time latency_p95: float = 0.0 # 95th percentile response time throughput: float = 0.0 # Requests per second error_rate: float = 0.0 # Percentage of failed requests # User Experience user_satisfaction: float = 0.0 # Average user rating task_completion: float = 0.0 # Task success rate source_citation_accuracy: float = 0.0 # Accuracy of source citations # Initialize nested metrics containers retrieval_metrics: Optional[Dict[str, float]] = None generation_metrics: Optional[Dict[str, float]] = None system_metrics: Optional[Dict[str, float]] = None user_metrics: Optional[Dict[str, float]] = None def __post_init__(self): """Initialize nested metrics containers if not provided.""" if self.rouge_scores is None: self.rouge_scores = {} if self.retrieval_metrics is None: self.retrieval_metrics = {} if self.generation_metrics is None: self.generation_metrics = {} if self.system_metrics is None: self.system_metrics = {} if self.user_metrics is None: self.user_metrics = {} @dataclass class EvaluationResult: """Single evaluation result for a question-answer pair.""" query_id: str query: str metrics: EvaluationMetrics timestamp: float generated_answer: Optional[str] = None reference_answer: Optional[str] = None retrieved_sources: Optional[List[Dict[str, Any]]] = None expected_sources: Optional[List[str]] = None error_message: Optional[str] = None @dataclass class BenchmarkResults: """Comprehensive benchmark results for comparison.""" total_queries: int = 0 avg_retrieval_metrics: Optional[Dict[str, float]] = None avg_generation_metrics: Optional[Dict[str, float]] = None system_performance: Optional[Dict[str, float]] = None user_experience: Optional[Dict[str, float]] = None timestamp: float = 0.0 evaluation_time: float = 0.0 baseline_comparison: Optional[Dict[str, float]] = None def __post_init__(self): """Initialize metrics containers if not provided.""" if self.avg_retrieval_metrics is None: self.avg_retrieval_metrics = {} if self.avg_generation_metrics is None: self.avg_generation_metrics = {} if self.system_performance is None: self.system_performance = {} if self.user_experience is None: self.user_experience = {} @dataclass class ComparisonReport: """Performance comparison against baseline metrics.""" current_metrics: EvaluationMetrics baseline_metrics: EvaluationMetrics improvements: Dict[str, float] regressions: Dict[str, float] overall_score: float recommendations: List[str]