Spaces:
Sleeping
Sleeping
File size: 3,803 Bytes
f884e6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
"""
Core evaluation data structures and metrics definitions.
Implements the EvaluationMetrics class from Issue #27 with comprehensive
RAG system performance measurement capabilities.
"""
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@dataclass
class EvaluationMetrics:
    """Core evaluation dimensions as specified in Issue #27.

    Groups RAG-system measurements into four areas: retrieval quality,
    response quality, system performance, and user experience. All scalar
    metrics default to 0.0; dict containers default to fresh empty dicts
    via ``field(default_factory=dict)``.
    """

    # --- Retrieval quality ---
    precision_at_k: float = 0.0  # Precision@K for retrieved documents
    recall_at_k: float = 0.0     # Recall@K for retrieved documents
    mrr: float = 0.0             # Mean Reciprocal Rank
    ndcg: float = 0.0            # Normalized Discounted Cumulative Gain

    # --- Response quality ---
    bleu_score: float = 0.0  # BLEU score vs reference answers
    # ROUGE-1 / ROUGE-2 / ROUGE-L scores keyed by variant name.
    rouge_scores: Dict[str, float] = field(default_factory=dict)
    bert_score: float = 0.0    # BERTScore semantic similarity
    faithfulness: float = 0.0  # Response alignment to sources

    # --- System performance ---
    latency_p50: float = 0.0  # 50th percentile response time
    latency_p95: float = 0.0  # 95th percentile response time
    throughput: float = 0.0   # Requests per second
    error_rate: float = 0.0   # Percentage of failed requests

    # --- User experience ---
    user_satisfaction: float = 0.0         # Average user rating
    task_completion: float = 0.0           # Task success rate
    source_citation_accuracy: float = 0.0  # Accuracy of source citations

    # Nested per-category metric containers (populated by evaluators).
    retrieval_metrics: Dict[str, float] = field(default_factory=dict)
    generation_metrics: Dict[str, float] = field(default_factory=dict)
    system_metrics: Dict[str, float] = field(default_factory=dict)
    user_metrics: Dict[str, float] = field(default_factory=dict)

    def __post_init__(self):
        """Normalize explicitly-passed ``None`` containers to empty dicts.

        ``default_factory`` handles the omitted-argument case; this keeps
        backward compatibility for callers that still pass ``None``.
        """
        if self.rouge_scores is None:
            self.rouge_scores = {}
        if self.retrieval_metrics is None:
            self.retrieval_metrics = {}
        if self.generation_metrics is None:
            self.generation_metrics = {}
        if self.system_metrics is None:
            self.system_metrics = {}
        if self.user_metrics is None:
            self.user_metrics = {}
@dataclass
class EvaluationResult:
    """Single evaluation result for a question-answer pair.

    Bundles one query with the metrics computed for it. Optional fields
    are None when the corresponding data was unavailable or when the
    evaluation failed (see ``error_message``).
    """

    query_id: str  # Unique identifier for the evaluated query
    query: str  # The question text that was evaluated
    metrics: EvaluationMetrics  # Per-query metric values
    # NOTE(review): units not shown here — presumably epoch seconds; confirm.
    timestamp: float
    generated_answer: Optional[str] = None  # Answer produced by the system
    reference_answer: Optional[str] = None  # Gold/reference answer, if available
    # Documents returned by retrieval; presumably each dict is one source
    # record — verify schema against the evaluator that populates it.
    retrieved_sources: Optional[List[Dict[str, Any]]] = None
    expected_sources: Optional[List[str]] = None  # Ground-truth source identifiers
    error_message: Optional[str] = None  # Set when evaluating this query failed
@dataclass
class BenchmarkResults:
    """Comprehensive benchmark results for comparison.

    Aggregates per-query evaluation results into category-level averages
    plus timing metadata. Dict containers default to fresh empty dicts
    via ``field(default_factory=dict)``; ``baseline_comparison`` remains
    ``None`` when no baseline was supplied.
    """

    total_queries: int = 0  # Number of queries covered by this run
    # Category averages, keyed by metric name.
    avg_retrieval_metrics: Dict[str, float] = field(default_factory=dict)
    avg_generation_metrics: Dict[str, float] = field(default_factory=dict)
    system_performance: Dict[str, float] = field(default_factory=dict)
    user_experience: Dict[str, float] = field(default_factory=dict)
    # NOTE(review): units not shown here — presumably epoch seconds; confirm.
    timestamp: float = 0.0
    evaluation_time: float = 0.0  # Total time spent evaluating
    # Deltas vs a baseline run; deliberately NOT normalized to {} so that
    # "no baseline" stays distinguishable as None (matches original behavior).
    baseline_comparison: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Normalize explicitly-passed ``None`` containers to empty dicts.

        ``default_factory`` handles the omitted-argument case; this keeps
        backward compatibility for callers that still pass ``None``.
        """
        if self.avg_retrieval_metrics is None:
            self.avg_retrieval_metrics = {}
        if self.avg_generation_metrics is None:
            self.avg_generation_metrics = {}
        if self.system_performance is None:
            self.system_performance = {}
        if self.user_experience is None:
            self.user_experience = {}
@dataclass
class ComparisonReport:
    """Performance comparison against baseline metrics.

    All fields are required; this report is presumably produced by a
    comparison routine elsewhere in the project — confirm field semantics
    against that producer.
    """

    current_metrics: EvaluationMetrics  # Metrics from the current run
    baseline_metrics: EvaluationMetrics  # Metrics being compared against
    # Presumably metric name -> delta where current beat baseline; verify.
    improvements: Dict[str, float]
    # Presumably metric name -> delta where current fell behind; verify.
    regressions: Dict[str, float]
    overall_score: float  # Aggregate comparison score
    recommendations: List[str]  # Human-readable follow-up suggestions
|