"""Quality evaluation for responses."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Any
from urllib.parse import urlparse  # hoisted: was imported inside a loop


@dataclass
class QualityScore:
    """Quality evaluation scores."""

    relevance: float  # 0-1: How relevant is the answer
    completeness: float  # 0-1: How complete is the answer
    accuracy: float  # 0-1: Estimated accuracy
    clarity: float  # 0-1: How clear is the answer
    sourcing: float  # 0-1: Quality of source citations
    overall: float  # 0-1: Overall quality score
    feedback: list[str]  # Specific feedback items


class QualityEvaluator:
    """Evaluates the quality of generated responses.

    All sub-scores are heuristic (keyword overlap, length ratios,
    source counts) and bounded to [0, 1]; the overall score is a
    fixed weighted average of the five sub-scores.
    """

    def __init__(self, min_quality_threshold: float = 0.6):
        """Initialize the evaluator.

        Args:
            min_quality_threshold: Minimum acceptable quality score
        """
        self.min_quality_threshold = min_quality_threshold

    def evaluate(
        self,
        query: str,
        answer: str,
        sources: list[dict[str, str]] | None = None,
        reasoning_steps: list[str] | None = None,
    ) -> QualityScore:
        """Evaluate the quality of a response.

        Args:
            query: Original user query
            answer: Generated answer
            sources: List of source citations
            reasoning_steps: Reasoning steps taken

        Returns:
            QualityScore with detailed evaluation
        """
        feedback: list[str] = []

        # Evaluate relevance
        relevance = self._evaluate_relevance(query, answer)
        if relevance < 0.5:
            feedback.append("Answer may not be relevant to the question")

        # Evaluate completeness
        completeness = self._evaluate_completeness(query, answer)
        if completeness < 0.5:
            feedback.append("Answer appears incomplete")

        # Evaluate accuracy (based on source count and reasoning)
        accuracy = self._evaluate_accuracy(sources, reasoning_steps)
        if accuracy < 0.5:
            feedback.append("Accuracy could not be verified with sources")

        # Evaluate clarity
        clarity = self._evaluate_clarity(answer)
        if clarity < 0.5:
            feedback.append("Answer could be clearer")

        # Evaluate sourcing
        sourcing = self._evaluate_sourcing(answer, sources)
        if sourcing < 0.5:
            feedback.append("More sources would improve credibility")

        # Calculate overall score (weighted average; weights sum to 1.0)
        overall = (
            relevance * 0.25
            + completeness * 0.2
            + accuracy * 0.25
            + clarity * 0.15
            + sourcing * 0.15
        )

        if overall >= self.min_quality_threshold:
            feedback.insert(0, "Response meets quality standards")
        else:
            feedback.insert(0, "Response may need refinement")

        return QualityScore(
            relevance=relevance,
            completeness=completeness,
            accuracy=accuracy,
            clarity=clarity,
            sourcing=sourcing,
            overall=overall,
            feedback=feedback,
        )

    def is_acceptable(self, score: QualityScore) -> bool:
        """Check if quality score is acceptable.

        Args:
            score: Quality score to check

        Returns:
            True if acceptable
        """
        return score.overall >= self.min_quality_threshold

    def _evaluate_relevance(self, query: str, answer: str) -> float:
        """Evaluate answer relevance to query.

        Uses simple keyword overlap between query and answer after
        stopword removal; purely lexical, no semantic matching.

        Args:
            query: User query
            answer: Generated answer

        Returns:
            Relevance score (0-1)
        """
        if not answer:
            return 0.0

        # Simple keyword matching
        query_words = set(query.lower().split())
        answer_words = set(answer.lower().split())

        # Remove common words
        stopwords = {"the", "a", "an", "is", "are", "was", "were", "what",
                     "how", "when", "where", "why", "who"}
        query_words -= stopwords
        answer_words -= stopwords

        # Query was entirely stopwords: no signal either way
        if not query_words:
            return 0.5

        overlap = len(query_words & answer_words)
        return min(1.0, overlap / len(query_words) + 0.3)  # Base score + overlap

    def _evaluate_completeness(self, query: str, answer: str) -> float:
        """Evaluate answer completeness.

        Args:
            query: User query
            answer: Generated answer

        Returns:
            Completeness score (0-1)
        """
        if not answer:
            return 0.0

        # Check answer length relative to query complexity
        query_words = len(query.split())
        answer_words = len(answer.split())

        # Longer queries typically need longer answers
        expected_min = max(20, query_words * 3)
        if answer_words < expected_min:
            return answer_words / expected_min

        # Check for explanation patterns
        explanation_markers = ["because", "since", "therefore",
                               "this means", "in other words"]
        has_explanation = any(marker in answer.lower()
                              for marker in explanation_markers)

        score = 0.7
        if has_explanation:
            score += 0.2
        if answer_words > expected_min * 2:
            score += 0.1

        return min(1.0, score)

    def _evaluate_accuracy(
        self,
        sources: list[dict[str, str]] | None,
        reasoning_steps: list[str] | None,
    ) -> float:
        """Evaluate estimated accuracy.

        A proxy only: more sources and more reasoning steps raise the
        estimate; nothing is actually fact-checked.

        Args:
            sources: List of sources
            reasoning_steps: Reasoning steps

        Returns:
            Accuracy score (0-1)
        """
        score = 0.3  # Base score

        # More sources = higher potential accuracy
        if sources:
            score += min(0.3, len(sources) * 0.1)

        # Reasoning steps suggest careful analysis
        if reasoning_steps:
            score += min(0.3, len(reasoning_steps) * 0.1)

        # Cap at 0.9 since we can't truly verify accuracy
        return min(0.9, score)

    def _evaluate_clarity(self, answer: str) -> float:
        """Evaluate answer clarity.

        Args:
            answer: Generated answer

        Returns:
            Clarity score (0-1)
        """
        if not answer:
            return 0.0

        score = 0.5

        # Check sentence structure (average length).
        # Filter empty fragments: a trailing "." would otherwise add a
        # phantom sentence and skew the average downward.
        sentences = [s for s in answer.split(".") if s.strip()]
        if sentences:
            avg_sentence_length = len(answer.split()) / len(sentences)
            # Ideal: 15-25 words per sentence
            if 10 <= avg_sentence_length <= 30:
                score += 0.2

        # Check for structure (paragraphs, lists)
        if "\n" in answer:
            score += 0.1
        # List markers must start a line; a bare "-" substring check
        # would fire on any hyphenated word.
        if any(line.lstrip().startswith(("-", "•", "1.", "2."))
               for line in answer.splitlines()):
            score += 0.1

        # Check for hedge words (too many = less clear).
        # Whole-word match: substring matching would count "mighty"
        # as "might".
        hedge_words = ["might", "perhaps", "maybe", "possibly", "could"]
        answer_word_set = set(answer.lower().split())
        hedge_count = sum(1 for word in hedge_words if word in answer_word_set)
        if hedge_count > 3:
            score -= 0.1

        return min(1.0, max(0.0, score))

    def _evaluate_sourcing(
        self,
        answer: str,
        sources: list[dict[str, str]] | None,
    ) -> float:
        """Evaluate source quality.

        Args:
            answer: Generated answer
            sources: List of sources

        Returns:
            Sourcing score (0-1)
        """
        if not sources:
            return 0.2

        score = 0.3

        # More sources = better
        source_count = len(sources)
        score += min(0.3, source_count * 0.1)

        # Check for diverse domains
        urls = [s.get("url", "") for s in sources]
        domains = set()
        for url in urls:
            if url:
                try:
                    domains.add(urlparse(url).netloc)
                except Exception:
                    # Best-effort: a malformed URL just contributes
                    # nothing to diversity.
                    pass

        # Domain diversity
        if len(domains) > 1:
            score += 0.2

        # Check for reliable domains
        reliable_indicators = [".gov", ".edu", "wikipedia.org"]
        for url in urls:
            if any(ind in url.lower() for ind in reliable_indicators):
                score += 0.1
                break

        return min(1.0, score)