"""
Core evaluation data structures and metrics definitions.
Implements the EvaluationMetrics class from Issue #27 with comprehensive
RAG system performance measurement capabilities.
"""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class EvaluationMetrics:
"""Core evaluation dimensions as specified in Issue #27."""
# Retrieval Quality
precision_at_k: float = 0.0 # Precision@K for retrieved documents
recall_at_k: float = 0.0 # Recall@K for retrieved documents
mrr: float = 0.0 # Mean Reciprocal Rank
ndcg: float = 0.0 # Normalized Discounted Cumulative Gain
# Response Quality
bleu_score: float = 0.0 # BLEU score vs reference answers
rouge_scores: Optional[Dict[str, float]] = None # ROUGE-1, ROUGE-2, ROUGE-L scores
bert_score: float = 0.0 # BERTScore semantic similarity
faithfulness: float = 0.0 # Response alignment to sources
# System Performance
latency_p50: float = 0.0 # 50th percentile response time
latency_p95: float = 0.0 # 95th percentile response time
throughput: float = 0.0 # Requests per second
error_rate: float = 0.0 # Percentage of failed requests
# User Experience
user_satisfaction: float = 0.0 # Average user rating
task_completion: float = 0.0 # Task success rate
source_citation_accuracy: float = 0.0 # Accuracy of source citations
# Initialize nested metrics containers
retrieval_metrics: Optional[Dict[str, float]] = None
generation_metrics: Optional[Dict[str, float]] = None
system_metrics: Optional[Dict[str, float]] = None
user_metrics: Optional[Dict[str, float]] = None
def __post_init__(self):
"""Initialize nested metrics containers if not provided."""
if self.rouge_scores is None:
self.rouge_scores = {}
if self.retrieval_metrics is None:
self.retrieval_metrics = {}
if self.generation_metrics is None:
self.generation_metrics = {}
if self.system_metrics is None:
self.system_metrics = {}
if self.user_metrics is None:
self.user_metrics = {}
@dataclass
class EvaluationResult:
    """Single evaluation result for a question-answer pair."""

    query_id: str
    query: str
    metrics: EvaluationMetrics
    timestamp: float
    generated_answer: Optional[str] = None
    reference_answer: Optional[str] = None
    retrieved_sources: Optional[List[Dict[str, Any]]] = None
    expected_sources: Optional[List[str]] = None
    error_message: Optional[str] = None


@dataclass
class BenchmarkResults:
    """Comprehensive benchmark results for comparison."""

    total_queries: int = 0
    avg_retrieval_metrics: Optional[Dict[str, float]] = None
    avg_generation_metrics: Optional[Dict[str, float]] = None
    system_performance: Optional[Dict[str, float]] = None
    user_experience: Optional[Dict[str, float]] = None
    timestamp: float = 0.0
    evaluation_time: float = 0.0
    baseline_comparison: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Initialize metrics containers if not provided."""
        if self.avg_retrieval_metrics is None:
            self.avg_retrieval_metrics = {}
        if self.avg_generation_metrics is None:
            self.avg_generation_metrics = {}
        if self.system_performance is None:
            self.system_performance = {}
        if self.user_experience is None:
            self.user_experience = {}


@dataclass
class ComparisonReport:
    """Performance comparison against baseline metrics."""

    current_metrics: EvaluationMetrics
    baseline_metrics: EvaluationMetrics
    improvements: Dict[str, float]
    regressions: Dict[str, float]
    overall_score: float
    recommendations: List[str]
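

if __name__ == "__main__":
    # Illustrative sketch only, not part of the Issue #27 specification: shows one
    # way the dataclasses above might be populated. All query text, answers, and
    # metric values below are placeholders, not real measurements.
    import time

    metrics = EvaluationMetrics(
        precision_at_k=0.8,
        recall_at_k=0.6,
        mrr=0.7,
        bleu_score=0.35,
        faithfulness=0.9,
    )
    # Nested containers are initialized to empty dicts by __post_init__, so they
    # can be filled with ad-hoc metrics after construction.
    metrics.retrieval_metrics["hit_rate"] = 0.85

    result = EvaluationResult(
        query_id="q-001",
        query="What is the refund policy?",
        metrics=metrics,
        timestamp=time.time(),
        generated_answer="Refunds are issued within 30 days.",
    )
    print(result.query_id, result.metrics.precision_at_k)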