""" RAG Evaluation Module Comprehensive evaluation system for RAG-based Q&A """ import pandas as pd import numpy as np from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity import nltk import re from typing import Dict, List, Optional from datetime import datetime # Download required NLTK data try: nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) except: pass class ComprehensiveRAGEvaluator: """Comprehensive evaluation system for RAG-based car manual Q&A""" def __init__(self, rag_system, client): self.rag_system = rag_system self.client = client self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2') self.evaluation_results = {} def evaluate_answer_quality(self, question: str, generated_answer: str, expected_answer: str, retrieved_contexts: List[str]) -> Dict: """ Comprehensive answer quality evaluation Args: question: The question asked generated_answer: Answer generated by RAG system expected_answer: Expected correct answer retrieved_contexts: Contexts retrieved for the answer Returns: Dictionary of quality metrics """ metrics = {} # 1. Semantic Similarity to Expected Answer gen_embedding = self.sentence_model.encode([generated_answer]) exp_embedding = self.sentence_model.encode([expected_answer]) metrics['semantic_similarity'] = cosine_similarity(gen_embedding, exp_embedding)[0][0] # 2. Answer Relevance to Question q_embedding = self.sentence_model.encode([question]) a_embedding = self.sentence_model.encode([generated_answer]) metrics['answer_relevance'] = cosine_similarity(q_embedding, a_embedding)[0][0] # 3. Faithfulness (grounding in retrieved context) metrics['faithfulness'] = self._calculate_faithfulness(generated_answer, retrieved_contexts) # 4. Completeness Assessment metrics['completeness'] = self._assess_completeness(question, generated_answer, expected_answer) # 5. Safety Appropriateness metrics['safety_appropriateness'] = self._check_safety_appropriateness(question, generated_answer) # 6. Technical Accuracy metrics['technical_accuracy'] = self._assess_technical_accuracy(generated_answer, retrieved_contexts) # 7. Clarity and Actionability metrics['clarity'] = self._assess_clarity(generated_answer) metrics['actionability'] = self._assess_actionability(question, generated_answer) return metrics def _calculate_faithfulness(self, answer: str, contexts: List[str]) -> float: """Calculate how well the answer is grounded in the retrieved contexts""" if not contexts: return 0.0 answer_sentences = nltk.sent_tokenize(answer) supported_sentences = 0 for sentence in answer_sentences: sentence_embedding = self.sentence_model.encode([sentence]) max_similarity = 0 for context in contexts: context_embedding = self.sentence_model.encode([context]) similarity = cosine_similarity(sentence_embedding, context_embedding)[0][0] max_similarity = max(max_similarity, similarity) if max_similarity > 0.7: supported_sentences += 1 return supported_sentences / len(answer_sentences) if answer_sentences else 0.0 def _assess_completeness(self, question: str, generated_answer: str, expected_answer: str) -> float: """Assess if the generated answer covers all aspects of the expected answer""" expected_words = set(expected_answer.lower().split()) generated_words = set(generated_answer.lower().split()) stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']) expected_words -= stop_words generated_words -= stop_words if not expected_words: return 1.0 overlap = len(expected_words.intersection(generated_words)) return overlap / len(expected_words) def _check_safety_appropriateness(self, question: str, answer: str) -> float: """Check if safety-critical information is handled appropriately""" safety_keywords = ['brake', 'airbag', 'emergency', 'warning', 'danger', 'caution', 'safety', 'speed', 'steering'] question_lower = question.lower() answer_lower = answer.lower() is_safety_related = any(keyword in question_lower for keyword in safety_keywords) if not is_safety_related: return 1.0 safety_indicators = ['warning', 'caution', 'important', 'ensure', 'never', 'always', 'must'] has_safety_language = any(indicator in answer_lower for indicator in safety_indicators) return 1.0 if has_safety_language else 0.5 def _assess_technical_accuracy(self, answer: str, contexts: List[str]) -> float: """Assess technical accuracy based on context alignment""" if not contexts: return 0.5 answer_embedding = self.sentence_model.encode([answer]) context_embeddings = self.sentence_model.encode(contexts) similarities = cosine_similarity(answer_embedding, context_embeddings)[0] return np.mean(similarities) def _assess_clarity(self, answer: str) -> float: """Assess clarity of the answer""" sentences = nltk.sent_tokenize(answer) if not sentences: return 0.0 avg_sentence_length = np.mean([len(sentence.split()) for sentence in sentences]) length_score = min(1.0, 15.0 / avg_sentence_length) if avg_sentence_length > 0 else 0.0 structure_indicators = ['step', 'first', 'second', 'then', 'next', 'finally', '1.', '2.'] has_structure = any(indicator in answer.lower() for indicator in structure_indicators) structure_score = 1.0 if has_structure else 0.7 return (length_score + structure_score) / 2 def _assess_actionability(self, question: str, answer: str) -> float: """Assess if the answer provides actionable information""" question_lower = question.lower() answer_lower = answer.lower() if 'how to' in question_lower or 'how do' in question_lower: action_indicators = ['press', 'turn', 'select', 'push', 'pull', 'set', 'adjust', 'follow', 'ensure'] has_actions = any(indicator in answer_lower for indicator in action_indicators) return 1.0 if has_actions else 0.3 return 0.8 def generate_evaluation_report(self) -> str: """Generate comprehensive evaluation report""" if not self.evaluation_results: return "No evaluation results available. Run evaluation first." df = pd.DataFrame(self.evaluation_results) # Overall metrics overall_metrics = { 'semantic_similarity': df['semantic_similarity'].mean(), 'answer_relevance': df['answer_relevance'].mean(), 'faithfulness': df['faithfulness'].mean(), 'completeness': df['completeness'].mean(), 'safety_appropriateness': df['safety_appropriateness'].mean(), 'technical_accuracy': df['technical_accuracy'].mean(), 'clarity': df['clarity'].mean(), 'actionability': df['actionability'].mean() } # Performance by question type type_performance = df.groupby('question_type')[list(overall_metrics.keys())].mean() # Performance by difficulty difficulty_performance = df.groupby('difficulty')[list(overall_metrics.keys())].mean() # Generate report report = f""" # RAG System Comprehensive Evaluation Report Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ## Overall Performance Metrics {'-' * 40} {'Metric':<25} {'Score':<10} {'Interpretation':<30} {'-' * 40} {'Semantic Similarity':<25} {overall_metrics['semantic_similarity']:.3f} {'Answer matches expected content'} {'Answer Relevance':<25} {overall_metrics['answer_relevance']:.3f} {'Answer addresses the question'} {'Faithfulness':<25} {overall_metrics['faithfulness']:.3f} {'Answer is grounded in context'} {'Completeness':<25} {overall_metrics['completeness']:.3f} {'Answer covers all aspects'} {'Safety Appropriateness':<25} {overall_metrics['safety_appropriateness']:.3f} {'Safety info handled properly'} {'Technical Accuracy':<25} {overall_metrics['technical_accuracy']:.3f} {'Technically correct information'} {'Clarity':<25} {overall_metrics['clarity']:.3f} {'Clear and understandable'} {'Actionability':<25} {overall_metrics['actionability']:.3f} {'Provides actionable guidance'} {'-' * 40} ## Performance by Question Type {type_performance.round(3)} ## Performance by Difficulty Level {difficulty_performance.round(3)} """ return report