Fraud-Chatbot / src /services /quality_scorer.py
ahmzakif's picture
feat: add new project
fd99b61 verified
"""Quality scorer for chatbot responses."""
import re
import logging
from typing import Dict, List
logger = logging.getLogger(__name__)
class ResponseQualityScorer:
"""Score the quality of chatbot responses."""
def __init__(self):
"""Initialize quality scorer."""
pass
def score_response(
self,
response: str,
query: str,
has_rag: bool = False,
sources: List[str] = None,
) -> Dict:
"""Score response quality.
Args:
response: The chatbot response text.
query: The user query.
has_rag: Whether RAG was used.
sources: List of source references.
Returns:
Dictionary with scores and overall quality score.
"""
scores = {
"relevance": self._score_relevance(response, query),
"completeness": self._score_completeness(response),
"citation_quality": self._score_citations(response, has_rag, sources),
"clarity": self._score_clarity(response),
}
# Calculate weighted overall score
weights = {
"relevance": 0.35,
"completeness": 0.25,
"citation_quality": 0.25,
"clarity": 0.15,
}
overall_score = sum(scores[k] * weights[k] for k in scores.keys())
return {
"overall_score": round(overall_score, 1),
"scores": scores,
"grade": self._get_grade(overall_score),
}
def _score_relevance(self, response: str, query: str) -> float:
"""Score relevance to query (0-100).
Args:
response: Response text.
query: User query.
Returns:
Relevance score.
"""
# Extract key terms from query
query_terms = set(re.findall(r'\b\w+\b', query.lower()))
# Remove common stop words
stop_words = {'is', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or'}
query_terms = query_terms - stop_words
if not query_terms:
return 80.0 # Default if no meaningful terms
# Check how many query terms appear in response
response_lower = response.lower()
matched_terms = sum(1 for term in query_terms if term in response_lower)
# Calculate relevance score
relevance = (matched_terms / len(query_terms)) * 100
return min(relevance, 100.0)
def _score_completeness(self, response: str) -> float:
"""Score completeness of response (0-100).
Args:
response: Response text.
Returns:
Completeness score.
"""
score = 0.0
# Length check (should be substantial)
word_count = len(response.split())
if word_count >= 50:
score += 30
elif word_count >= 30:
score += 20
elif word_count >= 15:
score += 10
# Structure check (has sections/paragraphs)
paragraphs = response.split('\n\n')
if len(paragraphs) >= 3:
score += 25
elif len(paragraphs) >= 2:
score += 15
# Contains specific details (numbers, percentages, examples)
if re.search(r'\d+\.?\d*%', response): # Percentages
score += 15
if re.search(r'\$\d+', response): # Dollar amounts
score += 10
if re.search(r'\d+', response): # Any numbers
score += 10
# Contains actionable information
action_words = ['should', 'recommend', 'suggest', 'consider', 'analyze', 'check', 'verify']
if any(word in response.lower() for word in action_words):
score += 10
return min(score, 100.0)
def _score_citations(self, response: str, has_rag: bool, sources: List[str] = None) -> float:
"""Score citation quality (0-100).
Args:
response: Response text.
has_rag: Whether RAG was used.
sources: List of sources.
Returns:
Citation quality score.
"""
if not has_rag:
return 100.0 # N/A if RAG not used
score = 0.0
# Check for inline citations [Source X]
inline_citations = re.findall(r'\[Source \d+\]', response)
if inline_citations:
score += 40
# Bonus for multiple citations
if len(inline_citations) >= 3:
score += 20
elif len(inline_citations) >= 2:
score += 10
# Check for source reference section
if sources and len(sources) > 0:
score += 20
# Check citation distribution (not all at end)
if inline_citations:
# Find positions of citations
positions = [response.find(cite) for cite in inline_citations]
response_length = len(response)
# Check if citations are spread throughout
early_citations = sum(1 for pos in positions if pos < response_length * 0.5)
if early_citations > 0:
score += 20
return min(score, 100.0)
def _score_clarity(self, response: str) -> float:
"""Score clarity and readability (0-100).
Args:
response: Response text.
Returns:
Clarity score.
"""
score = 0.0
# Sentence structure (not too long)
sentences = re.split(r'[.!?]+', response)
sentences = [s.strip() for s in sentences if s.strip()]
if sentences:
avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
if 10 <= avg_sentence_length <= 25: # Optimal range
score += 30
elif 8 <= avg_sentence_length <= 30:
score += 20
else:
score += 10
# Uses formatting (markdown, lists, etc.)
if re.search(r'^\s*[-*]\s', response, re.MULTILINE): # Bullet points
score += 20
if re.search(r'\*\*.*?\*\*', response): # Bold text
score += 10
if re.search(r'^#{1,6}\s', response, re.MULTILINE): # Headers
score += 10
# Avoids jargon overload (reasonable technical term usage)
technical_terms = ['algorithm', 'heuristic', 'methodology', 'paradigm', 'infrastructure']
tech_count = sum(response.lower().count(term) for term in technical_terms)
if tech_count <= 3:
score += 15
elif tech_count <= 5:
score += 10
# Clear structure (has intro/body/conclusion)
if len(response.split('\n\n')) >= 2:
score += 15
return min(score, 100.0)
def _get_grade(self, score: float) -> str:
"""Get letter grade from score.
Args:
score: Overall score (0-100).
Returns:
Letter grade.
"""
if score >= 90:
return "A"
elif score >= 80:
return "B"
elif score >= 70:
return "C"
elif score >= 60:
return "D"
else:
return "F"
def format_score_display(self, score_result: Dict) -> str:
"""Format score for display.
Args:
score_result: Score result dictionary.
Returns:
Formatted score string.
"""
overall = score_result["overall_score"]
grade = score_result["grade"]
scores = score_result["scores"]
display = f"\n\n---\n\n**📊 Response Quality Score: {overall}/100 (Grade: {grade})**\n\n"
display += "**Breakdown:**\n"
display += f"- Relevance: {scores['relevance']:.1f}/100\n"
display += f"- Completeness: {scores['completeness']:.1f}/100\n"
display += f"- Citation Quality: {scores['citation_quality']:.1f}/100\n"
display += f"- Clarity: {scores['clarity']:.1f}/100\n"
return display