# src/evaluation.py
"""
RAG Evaluation Module
Comprehensive evaluation system for RAG-based Q&A
"""
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
from typing import Dict, List, Optional
from datetime import datetime
# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)  # sentence tokenizer tables used by newer NLTK releases
    nltk.download('stopwords', quiet=True)
except Exception:
    # Tolerate offline environments; tokenization will fail later if the data is truly missing.
    pass
class ComprehensiveRAGEvaluator:
"""Comprehensive evaluation system for RAG-based car manual Q&A"""
def __init__(self, rag_system, client):
self.rag_system = rag_system
self.client = client
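        # Small, general-purpose sentence-embedding model used for every similarity metric below.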
self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
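        # Holds one record per evaluated question (the metrics dict plus 'question_type'
        # and 'difficulty' labels); must be populated before generate_evaluation_report().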
self.evaluation_results = {}
def evaluate_answer_quality(self, question: str, generated_answer: str,
expected_answer: str, retrieved_contexts: List[str]) -> Dict:
"""
Comprehensive answer quality evaluation
Args:
question: The question asked
generated_answer: Answer generated by RAG system
expected_answer: Expected correct answer
retrieved_contexts: Contexts retrieved for the answer
Returns:
Dictionary of quality metrics
"""
metrics = {}
# 1. Semantic Similarity to Expected Answer
gen_embedding = self.sentence_model.encode([generated_answer])
exp_embedding = self.sentence_model.encode([expected_answer])
metrics['semantic_similarity'] = cosine_similarity(gen_embedding, exp_embedding)[0][0]
# 2. Answer Relevance to Question
q_embedding = self.sentence_model.encode([question])
a_embedding = self.sentence_model.encode([generated_answer])
metrics['answer_relevance'] = cosine_similarity(q_embedding, a_embedding)[0][0]
# 3. Faithfulness (grounding in retrieved context)
metrics['faithfulness'] = self._calculate_faithfulness(generated_answer, retrieved_contexts)
# 4. Completeness Assessment
metrics['completeness'] = self._assess_completeness(question, generated_answer, expected_answer)
# 5. Safety Appropriateness
metrics['safety_appropriateness'] = self._check_safety_appropriateness(question, generated_answer)
# 6. Technical Accuracy
metrics['technical_accuracy'] = self._assess_technical_accuracy(generated_answer, retrieved_contexts)
# 7. Clarity and Actionability
metrics['clarity'] = self._assess_clarity(generated_answer)
metrics['actionability'] = self._assess_actionability(question, generated_answer)
return metrics
def _calculate_faithfulness(self, answer: str, contexts: List[str]) -> float:
"""Calculate how well the answer is grounded in the retrieved contexts"""
if not contexts:
return 0.0
        answer_sentences = nltk.sent_tokenize(answer)
        if not answer_sentences:
            return 0.0
        # Encode every context once instead of re-encoding them for each sentence.
        context_embeddings = self.sentence_model.encode(contexts)
        supported_sentences = 0
        for sentence in answer_sentences:
            sentence_embedding = self.sentence_model.encode([sentence])
            similarities = cosine_similarity(sentence_embedding, context_embeddings)[0]
            # A sentence counts as supported if it is sufficiently close to any context.
            if np.max(similarities) > 0.7:
                supported_sentences += 1
        return supported_sentences / len(answer_sentences)
def _assess_completeness(self, question: str, generated_answer: str, expected_answer: str) -> float:
"""Assess if the generated answer covers all aspects of the expected answer"""
        # Tokenize on word characters so punctuation does not break the overlap check.
        expected_words = set(re.findall(r'\w+', expected_answer.lower()))
        generated_words = set(re.findall(r'\w+', generated_answer.lower()))
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
expected_words -= stop_words
generated_words -= stop_words
if not expected_words:
return 1.0
overlap = len(expected_words.intersection(generated_words))
return overlap / len(expected_words)
def _check_safety_appropriateness(self, question: str, answer: str) -> float:
"""Check if safety-critical information is handled appropriately"""
safety_keywords = ['brake', 'airbag', 'emergency', 'warning', 'danger', 'caution', 'safety', 'speed', 'steering']
question_lower = question.lower()
answer_lower = answer.lower()
is_safety_related = any(keyword in question_lower for keyword in safety_keywords)
if not is_safety_related:
return 1.0
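        # Safety-related questions get full credit only when the answer uses cautionary
        # language; otherwise they receive partial credit.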
safety_indicators = ['warning', 'caution', 'important', 'ensure', 'never', 'always', 'must']
has_safety_language = any(indicator in answer_lower for indicator in safety_indicators)
return 1.0 if has_safety_language else 0.5
def _assess_technical_accuracy(self, answer: str, contexts: List[str]) -> float:
"""Assess technical accuracy based on context alignment"""
        if not contexts:
            # Nothing was retrieved, so fall back to a neutral score.
            return 0.5
answer_embedding = self.sentence_model.encode([answer])
context_embeddings = self.sentence_model.encode(contexts)
similarities = cosine_similarity(answer_embedding, context_embeddings)[0]
return np.mean(similarities)
def _assess_clarity(self, answer: str) -> float:
"""Assess clarity of the answer"""
sentences = nltk.sent_tokenize(answer)
if not sentences:
return 0.0
avg_sentence_length = np.mean([len(sentence.split()) for sentence in sentences])
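        # Roughly 15 words per sentence or fewer earns full marks; longer sentences are penalized.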
length_score = min(1.0, 15.0 / avg_sentence_length) if avg_sentence_length > 0 else 0.0
structure_indicators = ['step', 'first', 'second', 'then', 'next', 'finally', '1.', '2.']
has_structure = any(indicator in answer.lower() for indicator in structure_indicators)
structure_score = 1.0 if has_structure else 0.7
return (length_score + structure_score) / 2
def _assess_actionability(self, question: str, answer: str) -> float:
"""Assess if the answer provides actionable information"""
question_lower = question.lower()
answer_lower = answer.lower()
if 'how to' in question_lower or 'how do' in question_lower:
action_indicators = ['press', 'turn', 'select', 'push', 'pull', 'set', 'adjust', 'follow', 'ensure']
has_actions = any(indicator in answer_lower for indicator in action_indicators)
return 1.0 if has_actions else 0.3
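        # Non-procedural questions are given a moderately high default score.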
return 0.8
def generate_evaluation_report(self) -> str:
"""Generate comprehensive evaluation report"""
if not self.evaluation_results:
return "No evaluation results available. Run evaluation first."
df = pd.DataFrame(self.evaluation_results)
# Overall metrics
overall_metrics = {
'semantic_similarity': df['semantic_similarity'].mean(),
'answer_relevance': df['answer_relevance'].mean(),
'faithfulness': df['faithfulness'].mean(),
'completeness': df['completeness'].mean(),
'safety_appropriateness': df['safety_appropriateness'].mean(),
'technical_accuracy': df['technical_accuracy'].mean(),
'clarity': df['clarity'].mean(),
'actionability': df['actionability'].mean()
}
# Performance by question type
type_performance = df.groupby('question_type')[list(overall_metrics.keys())].mean()
# Performance by difficulty
difficulty_performance = df.groupby('difficulty')[list(overall_metrics.keys())].mean()
# Generate report
report = f"""
# RAG System Comprehensive Evaluation Report
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
## Overall Performance Metrics
{'-' * 72}
{'Metric':<25} {'Score':<10} {'Interpretation':<35}
{'-' * 72}
{'Semantic Similarity':<25} {overall_metrics['semantic_similarity']:<10.3f} {'Answer matches expected content':<35}
{'Answer Relevance':<25} {overall_metrics['answer_relevance']:<10.3f} {'Answer addresses the question':<35}
{'Faithfulness':<25} {overall_metrics['faithfulness']:<10.3f} {'Answer is grounded in context':<35}
{'Completeness':<25} {overall_metrics['completeness']:<10.3f} {'Answer covers all aspects':<35}
{'Safety Appropriateness':<25} {overall_metrics['safety_appropriateness']:<10.3f} {'Safety info handled properly':<35}
{'Technical Accuracy':<25} {overall_metrics['technical_accuracy']:<10.3f} {'Technically correct information':<35}
{'Clarity':<25} {overall_metrics['clarity']:<10.3f} {'Clear and understandable':<35}
{'Actionability':<25} {overall_metrics['actionability']:<10.3f} {'Provides actionable guidance':<35}
{'-' * 72}
## Performance by Question Type
{type_performance.round(3)}
## Performance by Difficulty Level
{difficulty_performance.round(3)}
"""
return report
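
# ---------------------------------------------------------------------------
# Example usage (illustrative sketch only). The rag_system.answer() call and
# the test-case fields ('question', 'expected_answer', 'question_type',
# 'difficulty') are assumptions about the surrounding project, not something
# this module defines.
# ---------------------------------------------------------------------------
def run_example_evaluation(rag_system, client, test_cases: List[Dict]) -> str:
    """Evaluate a list of test cases and return the formatted report."""
    evaluator = ComprehensiveRAGEvaluator(rag_system, client)
    records = []
    for case in test_cases:
        # Hypothetical RAG interface: returns the generated answer plus the contexts it used.
        answer, contexts = rag_system.answer(case['question'])
        metrics = evaluator.evaluate_answer_quality(
            case['question'], answer, case['expected_answer'], contexts
        )
        metrics['question_type'] = case['question_type']
        metrics['difficulty'] = case['difficulty']
        records.append(metrics)
    evaluator.evaluation_results = records  # list of dicts -> DataFrame rows
    return evaluator.generate_evaluation_report()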