| """ | |
| Core evaluation data structures and metrics definitions. | |
| Implements the EvaluationMetrics class from Issue #27 with comprehensive | |
| RAG system performance measurement capabilities. | |
| """ | |
| from dataclasses import dataclass | |
| from typing import Any, Dict, List, Optional | |

@dataclass
class EvaluationMetrics:
    """Core evaluation dimensions as specified in Issue #27."""

    # Retrieval Quality
    precision_at_k: float = 0.0  # Precision@K for retrieved documents
    recall_at_k: float = 0.0  # Recall@K for retrieved documents
    mrr: float = 0.0  # Mean Reciprocal Rank
    ndcg: float = 0.0  # Normalized Discounted Cumulative Gain

    # Response Quality
    bleu_score: float = 0.0  # BLEU score vs. reference answers
    rouge_scores: Optional[Dict[str, float]] = None  # ROUGE-1, ROUGE-2, ROUGE-L scores
    bert_score: float = 0.0  # BERTScore semantic similarity
    faithfulness: float = 0.0  # Response alignment to sources

    # System Performance
    latency_p50: float = 0.0  # 50th percentile response time
    latency_p95: float = 0.0  # 95th percentile response time
    throughput: float = 0.0  # Requests per second
    error_rate: float = 0.0  # Percentage of failed requests

    # User Experience
    user_satisfaction: float = 0.0  # Average user rating
    task_completion: float = 0.0  # Task success rate
    source_citation_accuracy: float = 0.0  # Accuracy of source citations

    # Nested metrics containers (filled in by __post_init__ when not provided)
    retrieval_metrics: Optional[Dict[str, float]] = None
    generation_metrics: Optional[Dict[str, float]] = None
    system_metrics: Optional[Dict[str, float]] = None
    user_metrics: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Initialize nested metrics containers if not provided."""
        if self.rouge_scores is None:
            self.rouge_scores = {}
        if self.retrieval_metrics is None:
            self.retrieval_metrics = {}
        if self.generation_metrics is None:
            self.generation_metrics = {}
        if self.system_metrics is None:
            self.system_metrics = {}
        if self.user_metrics is None:
            self.user_metrics = {}
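
# Usage sketch (illustrative values, not fixtures from the Issue #27 spec):
#
#     m = EvaluationMetrics(precision_at_k=0.8, recall_at_k=0.6, mrr=0.7)
#     m.rouge_scores["rouge-l"] = 0.42  # __post_init__ guarantees a dict here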

@dataclass
class EvaluationResult:
    """Single evaluation result for a question-answer pair."""

    query_id: str
    query: str
    metrics: EvaluationMetrics
    timestamp: float
    generated_answer: Optional[str] = None
    reference_answer: Optional[str] = None
    retrieved_sources: Optional[List[Dict[str, Any]]] = None
    expected_sources: Optional[List[str]] = None
    error_message: Optional[str] = None

@dataclass
class BenchmarkResults:
    """Comprehensive benchmark results for comparison."""

    total_queries: int = 0
    avg_retrieval_metrics: Optional[Dict[str, float]] = None
    avg_generation_metrics: Optional[Dict[str, float]] = None
    system_performance: Optional[Dict[str, float]] = None
    user_experience: Optional[Dict[str, float]] = None
    timestamp: float = 0.0
    evaluation_time: float = 0.0
    baseline_comparison: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Initialize metrics containers if not provided."""
        if self.avg_retrieval_metrics is None:
            self.avg_retrieval_metrics = {}
        if self.avg_generation_metrics is None:
            self.avg_generation_metrics = {}
        if self.system_performance is None:
            self.system_performance = {}
        if self.user_experience is None:
            self.user_experience = {}
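
# Illustrative roll-up sketch; `aggregate_results` is an assumed helper, not
# part of the Issue #27 spec. It shows one plausible way per-query
# EvaluationResult records map onto a BenchmarkResults summary; the
# generation/system/user dicts would follow the same pattern.
def aggregate_results(results: List[EvaluationResult]) -> BenchmarkResults:
    """Average per-query retrieval metrics into a benchmark summary (sketch)."""
    import time

    summary = BenchmarkResults(total_queries=len(results), timestamp=time.time())
    if not results:
        return summary
    # Collect every key seen across the per-query retrieval_metrics dicts.
    keys = {k for r in results for k in (r.metrics.retrieval_metrics or {})}
    for key in keys:
        values = [(r.metrics.retrieval_metrics or {}).get(key, 0.0) for r in results]
        summary.avg_retrieval_metrics[key] = sum(values) / len(values)
    return summary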

@dataclass
class ComparisonReport:
    """Performance comparison against baseline metrics."""

    current_metrics: EvaluationMetrics
    baseline_metrics: EvaluationMetrics
    improvements: Dict[str, float]
    regressions: Dict[str, float]
    overall_score: float
    recommendations: List[str]
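
# Minimal smoke test (the placeholder query and scores are assumptions for
# illustration, not data from the Issue #27 spec).
if __name__ == "__main__":
    import time

    metrics = EvaluationMetrics(precision_at_k=0.75, bleu_score=0.31)
    result = EvaluationResult(
        query_id="q-001",
        query="What does Precision@K measure?",
        metrics=metrics,
        timestamp=time.time(),
        generated_answer="The fraction of the top-K retrieved documents that are relevant.",
    )
    print(result.query_id, result.metrics.precision_at_k, result.metrics.rouge_scores)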