"""
Core evaluation data structures and metrics definitions.

Implements the EvaluationMetrics class from Issue #27, along with the
result, benchmark, and comparison containers used to measure RAG system
performance.
"""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class EvaluationMetrics:
    """Core evaluation dimensions as specified in Issue #27."""

    # Retrieval Quality
    precision_at_k: float = 0.0  # Precision@K for retrieved documents
    recall_at_k: float = 0.0  # Recall@K for retrieved documents
    mrr: float = 0.0  # Mean Reciprocal Rank
    ndcg: float = 0.0  # Normalized Discounted Cumulative Gain

    # Response Quality
    bleu_score: float = 0.0  # BLEU score vs reference answers
    rouge_scores: Optional[Dict[str, float]] = None  # ROUGE-1, ROUGE-2, ROUGE-L scores
    bert_score: float = 0.0  # BERTScore semantic similarity
    faithfulness: float = 0.0  # Response alignment to sources

    # System Performance
    latency_p50: float = 0.0  # 50th percentile response time
    latency_p95: float = 0.0  # 95th percentile response time
    throughput: float = 0.0  # Requests per second
    error_rate: float = 0.0  # Percentage of failed requests

    # User Experience
    user_satisfaction: float = 0.0  # Average user rating
    task_completion: float = 0.0  # Task success rate
    source_citation_accuracy: float = 0.0  # Accuracy of source citations

    # Nested metrics containers (populated in __post_init__ if not supplied)
    retrieval_metrics: Optional[Dict[str, float]] = None
    generation_metrics: Optional[Dict[str, float]] = None
    system_metrics: Optional[Dict[str, float]] = None
    user_metrics: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Initialize nested metrics containers if not provided."""
        if self.rouge_scores is None:
            self.rouge_scores = {}
        if self.retrieval_metrics is None:
            self.retrieval_metrics = {}
        if self.generation_metrics is None:
            self.generation_metrics = {}
        if self.system_metrics is None:
            self.system_metrics = {}
        if self.user_metrics is None:
            self.user_metrics = {}
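

# A minimal construction sketch (hypothetical values, not from a real benchmark
# run), showing that the nested containers default to empty dicts and can be
# mutated safely after __post_init__:
def _example_metrics() -> EvaluationMetrics:
    """Illustrative only: build a partially populated EvaluationMetrics."""
    metrics = EvaluationMetrics(precision_at_k=0.80, recall_at_k=0.64, mrr=0.71)
    metrics.rouge_scores["rouge-l"] = 0.45  # initialized to {} in __post_init__
    return metrics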


@dataclass
class EvaluationResult:
    """Single evaluation result for a question-answer pair."""

    query_id: str
    query: str
    metrics: EvaluationMetrics
    timestamp: float
    generated_answer: Optional[str] = None
    reference_answer: Optional[str] = None
    retrieved_sources: Optional[List[Dict[str, Any]]] = None
    expected_sources: Optional[List[str]] = None
    error_message: Optional[str] = None
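

# A hypothetical sketch of wrapping one query's metrics in an EvaluationResult;
# the query ID, answer text, and source entries below are made up for
# illustration and are not part of the Issue #27 spec:
def _example_result() -> EvaluationResult:
    """Illustrative only: record a single question-answer evaluation."""
    import time

    return EvaluationResult(
        query_id="q-001",
        query="What does Issue #27 specify?",
        metrics=_example_metrics(),
        timestamp=time.time(),
        generated_answer="It specifies the core EvaluationMetrics dimensions.",
        retrieved_sources=[{"doc_id": "issue-27", "score": 0.92}],
    )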


@dataclass
class BenchmarkResults:
    """Comprehensive benchmark results for comparison."""

    total_queries: int = 0
    avg_retrieval_metrics: Optional[Dict[str, float]] = None
    avg_generation_metrics: Optional[Dict[str, float]] = None
    system_performance: Optional[Dict[str, float]] = None
    user_experience: Optional[Dict[str, float]] = None
    timestamp: float = 0.0
    evaluation_time: float = 0.0
    baseline_comparison: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Initialize metrics containers if not provided."""
        if self.avg_retrieval_metrics is None:
            self.avg_retrieval_metrics = {}
        if self.avg_generation_metrics is None:
            self.avg_generation_metrics = {}
        if self.system_performance is None:
            self.system_performance = {}
        if self.user_experience is None:
            self.user_experience = {}


@dataclass
class ComparisonReport:
    """Performance comparison against baseline metrics."""

    current_metrics: EvaluationMetrics
    baseline_metrics: EvaluationMetrics
    improvements: Dict[str, float]
    regressions: Dict[str, float]
    overall_score: float
    recommendations: List[str]
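

# Smoke-test sketch (assumed usage, not defined by the source): exercises all
# four dataclasses end to end when the module is run directly.
if __name__ == "__main__":
    import time

    result = _example_result()
    benchmark = BenchmarkResults(
        total_queries=1,
        avg_retrieval_metrics={"precision_at_k": result.metrics.precision_at_k},
        timestamp=time.time(),
    )
    report = ComparisonReport(
        current_metrics=result.metrics,
        baseline_metrics=EvaluationMetrics(),  # all-zero baseline for the demo
        improvements={"mrr": result.metrics.mrr},
        regressions={},
        overall_score=result.metrics.mrr,
        recommendations=["Collect more labeled queries before comparing."],
    )
    print(result.query_id, benchmark.total_queries, report.overall_score)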