| """ | |
| RAG Evaluation Module | |
| Comprehensive evaluation system for RAG-based Q&A | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import nltk | |
| import re | |
| from typing import Dict, List, Optional | |
| from datetime import datetime | |
| # Download required NLTK data | |
| try: | |
| nltk.download('punkt', quiet=True) | |
| nltk.download('stopwords', quiet=True) | |
| except: | |
| pass | |
class ComprehensiveRAGEvaluator:
    """Comprehensive evaluation system for RAG-based car manual Q&A"""

    def __init__(self, rag_system, client):
        self.rag_system = rag_system
        self.client = client
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.evaluation_results = {}

    def evaluate_answer_quality(self, question: str, generated_answer: str,
                                expected_answer: str, retrieved_contexts: List[str]) -> Dict:
        """
        Comprehensive answer quality evaluation

        Args:
            question: The question asked
            generated_answer: Answer generated by RAG system
            expected_answer: Expected correct answer
            retrieved_contexts: Contexts retrieved for the answer

        Returns:
            Dictionary of quality metrics
        """
        metrics = {}

        # 1. Semantic similarity to the expected answer
        gen_embedding = self.sentence_model.encode([generated_answer])
        exp_embedding = self.sentence_model.encode([expected_answer])
        metrics['semantic_similarity'] = cosine_similarity(gen_embedding, exp_embedding)[0][0]

        # 2. Answer relevance to the question
        q_embedding = self.sentence_model.encode([question])
        a_embedding = self.sentence_model.encode([generated_answer])
        metrics['answer_relevance'] = cosine_similarity(q_embedding, a_embedding)[0][0]

        # 3. Faithfulness (grounding in the retrieved contexts)
        metrics['faithfulness'] = self._calculate_faithfulness(generated_answer, retrieved_contexts)

        # 4. Completeness assessment
        metrics['completeness'] = self._assess_completeness(question, generated_answer, expected_answer)

        # 5. Safety appropriateness
        metrics['safety_appropriateness'] = self._check_safety_appropriateness(question, generated_answer)

        # 6. Technical accuracy
        metrics['technical_accuracy'] = self._assess_technical_accuracy(generated_answer, retrieved_contexts)

        # 7. Clarity and actionability
        metrics['clarity'] = self._assess_clarity(generated_answer)
        metrics['actionability'] = self._assess_actionability(question, generated_answer)

        return metrics

    def _calculate_faithfulness(self, answer: str, contexts: List[str]) -> float:
        """Calculate how well the answer is grounded in the retrieved contexts"""
        if not contexts:
            return 0.0
        answer_sentences = nltk.sent_tokenize(answer)
        if not answer_sentences:
            return 0.0
        # Encode all contexts once rather than re-encoding them for every sentence
        context_embeddings = self.sentence_model.encode(contexts)
        supported_sentences = 0
        for sentence in answer_sentences:
            sentence_embedding = self.sentence_model.encode([sentence])
            max_similarity = float(cosine_similarity(sentence_embedding, context_embeddings)[0].max())
            if max_similarity > 0.7:
                supported_sentences += 1
        return supported_sentences / len(answer_sentences)

    def _assess_completeness(self, question: str, generated_answer: str, expected_answer: str) -> float:
        """Assess completeness as the share of the expected answer's content words covered by the generated answer"""
        expected_words = set(expected_answer.lower().split())
        generated_words = set(generated_answer.lower().split())
        stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
        expected_words -= stop_words
        generated_words -= stop_words
        if not expected_words:
            return 1.0
        overlap = len(expected_words.intersection(generated_words))
        return overlap / len(expected_words)

    def _check_safety_appropriateness(self, question: str, answer: str) -> float:
        """Check whether safety-critical information is handled appropriately"""
        safety_keywords = ['brake', 'airbag', 'emergency', 'warning', 'danger', 'caution', 'safety', 'speed', 'steering']
        question_lower = question.lower()
        answer_lower = answer.lower()
        is_safety_related = any(keyword in question_lower for keyword in safety_keywords)
        if not is_safety_related:
            return 1.0
        safety_indicators = ['warning', 'caution', 'important', 'ensure', 'never', 'always', 'must']
        has_safety_language = any(indicator in answer_lower for indicator in safety_indicators)
        return 1.0 if has_safety_language else 0.5

    def _assess_technical_accuracy(self, answer: str, contexts: List[str]) -> float:
        """Assess technical accuracy based on alignment with the retrieved contexts"""
        if not contexts:
            return 0.5
        answer_embedding = self.sentence_model.encode([answer])
        context_embeddings = self.sentence_model.encode(contexts)
        similarities = cosine_similarity(answer_embedding, context_embeddings)[0]
        return float(np.mean(similarities))

    def _assess_clarity(self, answer: str) -> float:
        """Assess clarity of the answer"""
        sentences = nltk.sent_tokenize(answer)
        if not sentences:
            return 0.0
        avg_sentence_length = np.mean([len(sentence.split()) for sentence in sentences])
        length_score = min(1.0, 15.0 / avg_sentence_length) if avg_sentence_length > 0 else 0.0
        structure_indicators = ['step', 'first', 'second', 'then', 'next', 'finally', '1.', '2.']
        has_structure = any(indicator in answer.lower() for indicator in structure_indicators)
        structure_score = 1.0 if has_structure else 0.7
        return (length_score + structure_score) / 2

    def _assess_actionability(self, question: str, answer: str) -> float:
        """Assess whether the answer provides actionable information"""
        question_lower = question.lower()
        answer_lower = answer.lower()
        if 'how to' in question_lower or 'how do' in question_lower:
            action_indicators = ['press', 'turn', 'select', 'push', 'pull', 'set', 'adjust', 'follow', 'ensure']
            has_actions = any(indicator in answer_lower for indicator in action_indicators)
            return 1.0 if has_actions else 0.3
        return 0.8

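    # --- Illustrative batch-evaluation helper (not in the original module) ---
    # generate_evaluation_report() below expects self.evaluation_results to hold
    # per-question records with 'question_type' and 'difficulty' fields, but no
    # populating routine is shown here. This is a minimal sketch, assuming each
    # test case is a dict with 'question', 'expected_answer', 'question_type' and
    # 'difficulty' keys, and that rag_system.answer(question) returns a tuple of
    # (answer, retrieved_contexts); both interfaces are assumptions to adapt.
    def run_evaluation(self, test_cases: List[Dict]) -> None:
        """Populate self.evaluation_results by scoring each test case (sketch)."""
        records = []
        for case in test_cases:
            answer, contexts = self.rag_system.answer(case['question'])
            metrics = self.evaluate_answer_quality(
                case['question'], answer, case['expected_answer'], contexts
            )
            metrics['question_type'] = case.get('question_type', 'unknown')
            metrics['difficulty'] = case.get('difficulty', 'unknown')
            records.append(metrics)
        # Store as a dict of columns so pd.DataFrame(self.evaluation_results) works
        self.evaluation_results = (
            {key: [record[key] for record in records] for key in records[0]}
            if records else {}
        )
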
    def generate_evaluation_report(self) -> str:
        """Generate a comprehensive evaluation report"""
        if not self.evaluation_results:
            return "No evaluation results available. Run evaluation first."

        df = pd.DataFrame(self.evaluation_results)

        # Overall metrics
        overall_metrics = {
            'semantic_similarity': df['semantic_similarity'].mean(),
            'answer_relevance': df['answer_relevance'].mean(),
            'faithfulness': df['faithfulness'].mean(),
            'completeness': df['completeness'].mean(),
            'safety_appropriateness': df['safety_appropriateness'].mean(),
            'technical_accuracy': df['technical_accuracy'].mean(),
            'clarity': df['clarity'].mean(),
            'actionability': df['actionability'].mean()
        }

        # Performance by question type
        type_performance = df.groupby('question_type')[list(overall_metrics.keys())].mean()

        # Performance by difficulty
        difficulty_performance = df.groupby('difficulty')[list(overall_metrics.keys())].mean()

        # Generate report
        report = f"""
# RAG System Comprehensive Evaluation Report
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Overall Performance Metrics
{'-' * 65}
{'Metric':<25} {'Score':<10} {'Interpretation':<30}
{'-' * 65}
{'Semantic Similarity':<25} {overall_metrics['semantic_similarity']:<10.3f} {'Answer matches expected content'}
{'Answer Relevance':<25} {overall_metrics['answer_relevance']:<10.3f} {'Answer addresses the question'}
{'Faithfulness':<25} {overall_metrics['faithfulness']:<10.3f} {'Answer is grounded in context'}
{'Completeness':<25} {overall_metrics['completeness']:<10.3f} {'Answer covers all aspects'}
{'Safety Appropriateness':<25} {overall_metrics['safety_appropriateness']:<10.3f} {'Safety info handled properly'}
{'Technical Accuracy':<25} {overall_metrics['technical_accuracy']:<10.3f} {'Technically correct information'}
{'Clarity':<25} {overall_metrics['clarity']:<10.3f} {'Clear and understandable'}
{'Actionability':<25} {overall_metrics['actionability']:<10.3f} {'Provides actionable guidance'}
{'-' * 65}

## Performance by Question Type
{type_performance.round(3)}

## Performance by Difficulty Level
{difficulty_performance.round(3)}
"""
        return report
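

# Example usage (a minimal sketch): the quality metrics in
# evaluate_answer_quality only rely on the sentence-transformers model, so
# passing None for rag_system and client is enough to exercise them; the
# question and answers below are illustrative placeholders, not project data.
if __name__ == "__main__":
    evaluator = ComprehensiveRAGEvaluator(rag_system=None, client=None)
    scores = evaluator.evaluate_answer_quality(
        question="How do I turn on the rear fog light?",
        generated_answer="Press the fog light switch on the lighting stalk; "
                         "the warning lamp in the cluster confirms it is on.",
        expected_answer="Activate the rear fog light with the switch on the lighting control stalk.",
        retrieved_contexts=["The rear fog light is switched on with the control on the lighting stalk."],
    )
    for name, value in scores.items():
        print(f"{name}: {value:.3f}")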