| """Quality scorer for chatbot responses.""" | |
| import re | |
| import logging | |
| from typing import Dict, List | |
| logger = logging.getLogger(__name__) | |


class ResponseQualityScorer:
    """Score the quality of chatbot responses."""

    def __init__(self):
        """Initialize quality scorer."""
        pass

    def score_response(
        self,
        response: str,
        query: str,
        has_rag: bool = False,
        sources: Optional[List[str]] = None,
    ) -> Dict:
| """Score response quality. | |
| Args: | |
| response: The chatbot response text. | |
| query: The user query. | |
| has_rag: Whether RAG was used. | |
| sources: List of source references. | |
| Returns: | |
| Dictionary with scores and overall quality score. | |
| """ | |
        scores = {
            "relevance": self._score_relevance(response, query),
            "completeness": self._score_completeness(response),
            "citation_quality": self._score_citations(response, has_rag, sources),
            "clarity": self._score_clarity(response),
        }

        # Calculate weighted overall score
        weights = {
            "relevance": 0.35,
            "completeness": 0.25,
            "citation_quality": 0.25,
            "clarity": 0.15,
        }
        overall_score = sum(scores[k] * weights[k] for k in scores)

        return {
            "overall_score": round(overall_score, 1),
            "scores": scores,
            "grade": self._get_grade(overall_score),
        }

    def _score_relevance(self, response: str, query: str) -> float:
        """Score relevance to query (0-100).

        Args:
            response: Response text.
            query: User query.

        Returns:
            Relevance score.
        """
        # Extract key terms from query
        query_terms = set(re.findall(r'\b\w+\b', query.lower()))

        # Remove common stop words
        stop_words = {'is', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or'}
        query_terms = query_terms - stop_words

        if not query_terms:
            return 80.0  # Default if no meaningful terms

        # Check how many query terms appear in the response; tokenize the
        # response the same way so substring hits (e.g. "cat" inside
        # "category") do not count as matches
        response_terms = set(re.findall(r'\b\w+\b', response.lower()))
        matched_terms = sum(1 for term in query_terms if term in response_terms)

        # Calculate relevance score
        relevance = (matched_terms / len(query_terms)) * 100
        return min(relevance, 100.0)

    def _score_completeness(self, response: str) -> float:
        """Score completeness of response (0-100).

        Args:
            response: Response text.

        Returns:
            Completeness score.
        """
        score = 0.0

        # Length check (should be substantial)
        word_count = len(response.split())
        if word_count >= 50:
            score += 30
        elif word_count >= 30:
            score += 20
        elif word_count >= 15:
            score += 10

        # Structure check (has sections/paragraphs)
        paragraphs = response.split('\n\n')
        if len(paragraphs) >= 3:
            score += 25
        elif len(paragraphs) >= 2:
            score += 15

        # Contains specific details (numbers, percentages, examples)
        if re.search(r'\d+\.?\d*%', response):  # Percentages
            score += 15
        if re.search(r'\$\d+', response):  # Dollar amounts
            score += 10
        if re.search(r'\d+', response):  # Any numbers
            score += 10

        # Contains actionable information
        action_words = ['should', 'recommend', 'suggest', 'consider', 'analyze', 'check', 'verify']
        if any(word in response.lower() for word in action_words):
            score += 10

        return min(score, 100.0)

    def _score_citations(self, response: str, has_rag: bool, sources: Optional[List[str]] = None) -> float:
        """Score citation quality (0-100).

        Args:
            response: Response text.
            has_rag: Whether RAG was used.
            sources: List of sources.

        Returns:
            Citation quality score.
        """
        if not has_rag:
            return 100.0  # N/A if RAG not used

        score = 0.0

        # Check for inline citations [Source X]
        inline_citations = re.findall(r'\[Source \d+\]', response)
        if inline_citations:
            score += 40

        # Bonus for multiple citations
        if len(inline_citations) >= 3:
            score += 20
        elif len(inline_citations) >= 2:
            score += 10

        # Check for source reference section
        if sources:
            score += 20

        # Check citation distribution (not all at end)
        if inline_citations:
            # Find positions of citations; use finditer so repeated
            # citations map to their actual occurrences rather than all
            # resolving to the first match
            positions = [m.start() for m in re.finditer(r'\[Source \d+\]', response)]
            response_length = len(response)

            # Check if citations are spread throughout
            early_citations = sum(1 for pos in positions if pos < response_length * 0.5)
            if early_citations > 0:
                score += 20

        return min(score, 100.0)

    def _score_clarity(self, response: str) -> float:
        """Score clarity and readability (0-100).

        Args:
            response: Response text.

        Returns:
            Clarity score.
        """
        score = 0.0

        # Sentence structure (not too long)
        sentences = re.split(r'[.!?]+', response)
        sentences = [s.strip() for s in sentences if s.strip()]
        if sentences:
            avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
            if 10 <= avg_sentence_length <= 25:  # Optimal range
                score += 30
            elif 8 <= avg_sentence_length <= 30:
                score += 20
            else:
                score += 10

        # Uses formatting (markdown, lists, etc.)
        if re.search(r'^\s*[-*]\s', response, re.MULTILINE):  # Bullet points
            score += 20
        if re.search(r'\*\*.*?\*\*', response):  # Bold text
            score += 10
        if re.search(r'^#{1,6}\s', response, re.MULTILINE):  # Headers
            score += 10

        # Avoids jargon overload (reasonable technical term usage)
        technical_terms = ['algorithm', 'heuristic', 'methodology', 'paradigm', 'infrastructure']
        tech_count = sum(response.lower().count(term) for term in technical_terms)
        if tech_count <= 3:
            score += 15
        elif tech_count <= 5:
            score += 10

        # Clear structure (at least an intro and a body paragraph)
        if len(response.split('\n\n')) >= 2:
            score += 15

        return min(score, 100.0)

    def _get_grade(self, score: float) -> str:
        """Get letter grade from score.

        Args:
            score: Overall score (0-100).

        Returns:
            Letter grade.
        """
        if score >= 90:
            return "A"
        elif score >= 80:
            return "B"
        elif score >= 70:
            return "C"
        elif score >= 60:
            return "D"
        else:
            return "F"

    def format_score_display(self, score_result: Dict) -> str:
        """Format score for display.

        Args:
            score_result: Score result dictionary.

        Returns:
            Formatted score string.
        """
        overall = score_result["overall_score"]
        grade = score_result["grade"]
        scores = score_result["scores"]

        display = f"\n\n---\n\n**📊 Response Quality Score: {overall}/100 (Grade: {grade})**\n\n"
        display += "**Breakdown:**\n"
        display += f"- Relevance: {scores['relevance']:.1f}/100\n"
        display += f"- Completeness: {scores['completeness']:.1f}/100\n"
        display += f"- Citation Quality: {scores['citation_quality']:.1f}/100\n"
        display += f"- Clarity: {scores['clarity']:.1f}/100\n"

        return display
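

if __name__ == "__main__":
    # Minimal usage sketch for running this module directly. The sample
    # query, response, and source names below are illustrative placeholders,
    # not output from a real chatbot or RAG pipeline.
    scorer = ResponseQualityScorer()
    sample_response = (
        "Revenue grew 12% year over year, reaching $4.2M [Source 1].\n\n"
        "- Gross margin improved to 38% [Source 2]\n"
        "- We recommend monitoring churn closely\n\n"
        "**Summary:** Growth looks healthy, but verify the churn trend."
    )
    result = scorer.score_response(
        sample_response,
        query="How did revenue and margins change this year?",
        has_rag=True,
        sources=["Q3 earnings report", "internal dashboard"],
    )
    print(scorer.format_score_display(result))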