"""

RAG Evaluation Module

Comprehensive evaluation system for RAG-based Q&A

"""
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
from typing import Dict, List, Optional
from datetime import datetime

# Download required NLTK data ('punkt_tab' is the tokenizer resource name on newer NLTK releases)
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception:
    # The data may already be present or the machine may be offline; proceed either way.
    pass


class ComprehensiveRAGEvaluator:
    """Comprehensive evaluation system for RAG-based car manual Q&A"""
    
    def __init__(self, rag_system, client):
        self.rag_system = rag_system
        self.client = client
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
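        # Filled with per-question metric records; the grouped report expects each
        # record to also carry 'question_type' and 'difficulty' fields.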
        self.evaluation_results = {}
    
    def evaluate_answer_quality(self, question: str, generated_answer: str,
                                expected_answer: str, retrieved_contexts: List[str]) -> Dict:
        """
        Comprehensive answer quality evaluation

        Args:
            question: The question asked
            generated_answer: Answer generated by the RAG system
            expected_answer: Expected correct answer
            retrieved_contexts: Contexts retrieved for the answer

        Returns:
            Dictionary of quality metrics
        """
        metrics = {}
        
        # 1. Semantic Similarity to Expected Answer
        gen_embedding = self.sentence_model.encode([generated_answer])
        exp_embedding = self.sentence_model.encode([expected_answer])
        metrics['semantic_similarity'] = cosine_similarity(gen_embedding, exp_embedding)[0][0]
        
        # 2. Answer Relevance to Question
        q_embedding = self.sentence_model.encode([question])
        a_embedding = self.sentence_model.encode([generated_answer])
        metrics['answer_relevance'] = cosine_similarity(q_embedding, a_embedding)[0][0]
        
        # 3. Faithfulness (grounding in retrieved context)
        metrics['faithfulness'] = self._calculate_faithfulness(generated_answer, retrieved_contexts)
        
        # 4. Completeness Assessment
        metrics['completeness'] = self._assess_completeness(question, generated_answer, expected_answer)
        
        # 5. Safety Appropriateness
        metrics['safety_appropriateness'] = self._check_safety_appropriateness(question, generated_answer)
        
        # 6. Technical Accuracy
        metrics['technical_accuracy'] = self._assess_technical_accuracy(generated_answer, retrieved_contexts)
        
        # 7. Clarity and Actionability
        metrics['clarity'] = self._assess_clarity(generated_answer)
        metrics['actionability'] = self._assess_actionability(question, generated_answer)
        
        return metrics
    
    def _calculate_faithfulness(self, answer: str, contexts: List[str]) -> float:
        """Calculate how well the answer is grounded in the retrieved contexts"""
        if not contexts:
            return 0.0
        
        answer_sentences = nltk.sent_tokenize(answer)
        supported_sentences = 0
        
        for sentence in answer_sentences:
            sentence_embedding = self.sentence_model.encode([sentence])
            max_similarity = 0
            
            for context in contexts:
                context_embedding = self.sentence_model.encode([context])
                similarity = cosine_similarity(sentence_embedding, context_embedding)[0][0]
                max_similarity = max(max_similarity, similarity)
            
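            # Heuristic cutoff: a sentence counts as supported if any retrieved context
            # is at least 0.7 cosine-similar to it.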
            if max_similarity > 0.7:
                supported_sentences += 1
        
        return supported_sentences / len(answer_sentences) if answer_sentences else 0.0
    
    def _assess_completeness(self, question: str, generated_answer: str, expected_answer: str) -> float:
        """Assess if the generated answer covers all aspects of the expected answer"""
        expected_words = set(expected_answer.lower().split())
        generated_words = set(generated_answer.lower().split())
        
        stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
        expected_words -= stop_words
        generated_words -= stop_words
        
        if not expected_words:
            return 1.0
        
        overlap = len(expected_words.intersection(generated_words))
        return overlap / len(expected_words)
    
    def _check_safety_appropriateness(self, question: str, answer: str) -> float:
        """Check if safety-critical information is handled appropriately"""
        safety_keywords = ['brake', 'airbag', 'emergency', 'warning', 'danger', 'caution', 'safety', 'speed', 'steering']
        
        question_lower = question.lower()
        answer_lower = answer.lower()
        
        is_safety_related = any(keyword in question_lower for keyword in safety_keywords)
        
        if not is_safety_related:
            return 1.0
        
        safety_indicators = ['warning', 'caution', 'important', 'ensure', 'never', 'always', 'must']
        has_safety_language = any(indicator in answer_lower for indicator in safety_indicators)
        
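        # Safety-related answers that lack cautionary language get partial credit rather than zero.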
        return 1.0 if has_safety_language else 0.5
    
    def _assess_technical_accuracy(self, answer: str, contexts: List[str]) -> float:
        """Assess technical accuracy based on context alignment"""
        if not contexts:
            return 0.5
        
        answer_embedding = self.sentence_model.encode([answer])
        context_embeddings = self.sentence_model.encode(contexts)
        
        similarities = cosine_similarity(answer_embedding, context_embeddings)[0]
        return np.mean(similarities)
    
    def _assess_clarity(self, answer: str) -> float:
        """Assess clarity of the answer"""
        sentences = nltk.sent_tokenize(answer)
        
        if not sentences:
            return 0.0
        
        avg_sentence_length = np.mean([len(sentence.split()) for sentence in sentences])
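        # Favor concise sentences: roughly 15 words or fewer per sentence earns the full length score.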
        length_score = min(1.0, 15.0 / avg_sentence_length) if avg_sentence_length > 0 else 0.0
        
        structure_indicators = ['step', 'first', 'second', 'then', 'next', 'finally', '1.', '2.']
        has_structure = any(indicator in answer.lower() for indicator in structure_indicators)
        structure_score = 1.0 if has_structure else 0.7
        
        return (length_score + structure_score) / 2
    
    def _assess_actionability(self, question: str, answer: str) -> float:
        """Assess if the answer provides actionable information"""
        question_lower = question.lower()
        answer_lower = answer.lower()
        
        if 'how to' in question_lower or 'how do' in question_lower:
            action_indicators = ['press', 'turn', 'select', 'push', 'pull', 'set', 'adjust', 'follow', 'ensure']
            has_actions = any(indicator in answer_lower for indicator in action_indicators)
            return 1.0 if has_actions else 0.3
        
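        # Non-procedural questions are not expected to contain action verbs, so give a moderate default.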
        return 0.8
    
    def generate_evaluation_report(self) -> str:
        """Generate comprehensive evaluation report"""
        if not self.evaluation_results:
            return "No evaluation results available. Run evaluation first."
        
        df = pd.DataFrame(self.evaluation_results)
        
        # Overall metrics
        overall_metrics = {
            'semantic_similarity': df['semantic_similarity'].mean(),
            'answer_relevance': df['answer_relevance'].mean(),
            'faithfulness': df['faithfulness'].mean(),
            'completeness': df['completeness'].mean(),
            'safety_appropriateness': df['safety_appropriateness'].mean(),
            'technical_accuracy': df['technical_accuracy'].mean(),
            'clarity': df['clarity'].mean(),
            'actionability': df['actionability'].mean()
        }
        
        # Performance by question type
        type_performance = df.groupby('question_type')[list(overall_metrics.keys())].mean()
        
        # Performance by difficulty
        difficulty_performance = df.groupby('difficulty')[list(overall_metrics.keys())].mean()
        
        # Generate report
        report = f"""
# RAG System Comprehensive Evaluation Report
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Overall Performance Metrics
{'-' * 65}
{'Metric':<25} {'Score':<10} {'Interpretation':<30}
{'-' * 65}
{'Semantic Similarity':<25} {overall_metrics['semantic_similarity']:.3f}      {'Answer matches expected content'}
{'Answer Relevance':<25} {overall_metrics['answer_relevance']:.3f}      {'Answer addresses the question'}
{'Faithfulness':<25} {overall_metrics['faithfulness']:.3f}      {'Answer is grounded in context'}
{'Completeness':<25} {overall_metrics['completeness']:.3f}      {'Answer covers all aspects'}
{'Safety Appropriateness':<25} {overall_metrics['safety_appropriateness']:.3f}      {'Safety info handled properly'}
{'Technical Accuracy':<25} {overall_metrics['technical_accuracy']:.3f}      {'Technically correct information'}
{'Clarity':<25} {overall_metrics['clarity']:.3f}      {'Clear and understandable'}
{'Actionability':<25} {overall_metrics['actionability']:.3f}      {'Provides actionable guidance'}
{'-' * 65}

## Performance by Question Type
{type_performance.round(3)}

## Performance by Difficulty Level
{difficulty_performance.round(3)}
"""
        
        return report
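

# --- Usage sketch (illustrative) --------------------------------------------
# Minimal end-to-end example of driving the evaluator. The question, answers,
# contexts, and the 'question_type'/'difficulty' labels below are hypothetical;
# the methods shown in this module never touch rag_system or client, so None
# placeholders are sufficient here.
if __name__ == "__main__":
    evaluator = ComprehensiveRAGEvaluator(rag_system=None, client=None)

    question = "How do I turn on the rear window defroster?"
    generated = "Press the rear defroster button on the climate control panel; it switches off automatically after about 15 minutes."
    expected = "Press the rear window defroster button on the climate panel."
    contexts = [
        "To clear the rear window, press the rear defroster button on the climate control panel. "
        "The defroster turns off automatically after approximately 15 minutes."
    ]

    record = evaluator.evaluate_answer_quality(question, generated, expected, contexts)
    # The report groups results by these two fields, so each record needs them.
    record.update({"question_type": "how_to", "difficulty": "easy"})
    evaluator.evaluation_results = [record]

    print(evaluator.generate_evaluation_report())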