| """Quality scorer for chatbot responses.""" | |
| import re | |
| import logging | |
| from typing import Dict, List | |
| logger = logging.getLogger(__name__) | |


class ResponseQualityScorer:
    """Score the quality of chatbot responses."""

    def __init__(self):
        """Initialize quality scorer."""
        pass

    def score_response(
        self,
        response: str,
        query: str,
        has_rag: bool = False,
        sources: Optional[List[str]] = None,
    ) -> Dict:
| """Score response quality. | |
| Args: | |
| response: The chatbot response text. | |
| query: The user query. | |
| has_rag: Whether RAG was used. | |
| sources: List of source references. | |
| Returns: | |
| Dictionary with scores and overall quality score. | |
| """ | |
        scores = {
            "relevance": self._score_relevance(response, query),
            "completeness": self._score_completeness(response),
            "citation_quality": self._score_citations(response, has_rag, sources),
            "clarity": self._score_clarity(response),
        }

        # Calculate weighted overall score
        weights = {
            "relevance": 0.35,
            "completeness": 0.25,
            "citation_quality": 0.25,
            "clarity": 0.15,
        }
        overall_score = sum(scores[k] * weights[k] for k in scores)

        return {
            "overall_score": round(overall_score, 1),
            "scores": scores,
            "grade": self._get_grade(overall_score),
        }

    def _score_relevance(self, response: str, query: str) -> float:
        """Score relevance to query (0-100).

        Args:
            response: Response text.
            query: User query.

        Returns:
            Relevance score.
        """
        # Extract key terms from query
        query_terms = set(re.findall(r'\b\w+\b', query.lower()))

        # Remove common stop words
        stop_words = {'is', 'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'and', 'or'}
        query_terms = query_terms - stop_words

        if not query_terms:
            return 80.0  # Default if no meaningful terms

        # Check how many query terms appear in the response; tokenize the
        # response the same way so substring hits (e.g. "cat" inside
        # "category") do not count as matches
        response_terms = set(re.findall(r'\b\w+\b', response.lower()))
        matched_terms = sum(1 for term in query_terms if term in response_terms)

        # Calculate relevance score
        relevance = (matched_terms / len(query_terms)) * 100
        return min(relevance, 100.0)

    def _score_completeness(self, response: str) -> float:
        """Score completeness of response (0-100).

        Args:
            response: Response text.

        Returns:
            Completeness score.
        """
        score = 0.0

        # Length check (should be substantial)
        word_count = len(response.split())
        if word_count >= 50:
            score += 30
        elif word_count >= 30:
            score += 20
        elif word_count >= 15:
            score += 10

        # Structure check (has sections/paragraphs)
        paragraphs = response.split('\n\n')
        if len(paragraphs) >= 3:
            score += 25
        elif len(paragraphs) >= 2:
            score += 15

        # Contains specific details (numbers, percentages, examples)
        if re.search(r'\d+\.?\d*%', response):  # Percentages
            score += 15
        if re.search(r'\$\d+', response):  # Dollar amounts
            score += 10
        if re.search(r'\d+', response):  # Any numbers
            score += 10

        # Contains actionable information
        action_words = ['should', 'recommend', 'suggest', 'consider', 'analyze', 'check', 'verify']
        if any(word in response.lower() for word in action_words):
            score += 10

        return min(score, 100.0)

    def _score_citations(self, response: str, has_rag: bool, sources: Optional[List[str]] = None) -> float:
        """Score citation quality (0-100).

        Args:
            response: Response text.
            has_rag: Whether RAG was used.
            sources: List of sources.

        Returns:
            Citation quality score.
        """
        if not has_rag:
            return 100.0  # N/A if RAG not used

        score = 0.0

        # Check for inline citations [Source X]
        inline_citations = re.findall(r'\[Source \d+\]', response)
        if inline_citations:
            score += 40

        # Bonus for multiple citations
        if len(inline_citations) >= 3:
            score += 20
        elif len(inline_citations) >= 2:
            score += 10

        # Check for source reference section
        if sources:
            score += 20

        # Check citation distribution (not all at end)
        if inline_citations:
            # Find positions of citations; use finditer so repeated
            # citations map to their actual occurrences rather than all
            # resolving to the first match
            positions = [m.start() for m in re.finditer(r'\[Source \d+\]', response)]
            response_length = len(response)

            # Check if citations are spread throughout
            early_citations = sum(1 for pos in positions if pos < response_length * 0.5)
            if early_citations > 0:
                score += 20

        return min(score, 100.0)

    def _score_clarity(self, response: str) -> float:
        """Score clarity and readability (0-100).

        Args:
            response: Response text.

        Returns:
            Clarity score.
        """
        score = 0.0

        # Sentence structure (not too long)
        sentences = re.split(r'[.!?]+', response)
        sentences = [s.strip() for s in sentences if s.strip()]
        if sentences:
            avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences)
            if 10 <= avg_sentence_length <= 25:  # Optimal range
                score += 30
            elif 8 <= avg_sentence_length <= 30:
                score += 20
            else:
                score += 10

        # Uses formatting (markdown, lists, etc.)
        if re.search(r'^\s*[-*]\s', response, re.MULTILINE):  # Bullet points
            score += 20
        if re.search(r'\*\*.*?\*\*', response):  # Bold text
            score += 10
        if re.search(r'^#{1,6}\s', response, re.MULTILINE):  # Headers
            score += 10

        # Avoids jargon overload (reasonable technical term usage)
        technical_terms = ['algorithm', 'heuristic', 'methodology', 'paradigm', 'infrastructure']
        tech_count = sum(response.lower().count(term) for term in technical_terms)
        if tech_count <= 3:
            score += 15
        elif tech_count <= 5:
            score += 10

        # Clear structure (at least an intro and a body paragraph)
        if len(response.split('\n\n')) >= 2:
            score += 15

        return min(score, 100.0)

    def _get_grade(self, score: float) -> str:
        """Get letter grade from score.

        Args:
            score: Overall score (0-100).

        Returns:
            Letter grade.
        """
        if score >= 90:
            return "A"
        elif score >= 80:
            return "B"
        elif score >= 70:
            return "C"
        elif score >= 60:
            return "D"
        else:
            return "F"

    def format_score_display(self, score_result: Dict) -> str:
        """Format score for display.

        Args:
            score_result: Score result dictionary.

        Returns:
            Formatted score string.
        """
        overall = score_result["overall_score"]
        grade = score_result["grade"]
        scores = score_result["scores"]

        display = f"\n\n---\n\n**📊 Response Quality Score: {overall}/100 (Grade: {grade})**\n\n"
        display += "**Breakdown:**\n"
        display += f"- Relevance: {scores['relevance']:.1f}/100\n"
        display += f"- Completeness: {scores['completeness']:.1f}/100\n"
        display += f"- Citation Quality: {scores['citation_quality']:.1f}/100\n"
        display += f"- Clarity: {scores['clarity']:.1f}/100\n"

        return display
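

if __name__ == "__main__":
    # Minimal usage sketch for running this module directly. The sample
    # query, response, and source names below are illustrative placeholders,
    # not output from a real chatbot or RAG pipeline.
    scorer = ResponseQualityScorer()
    sample_response = (
        "Revenue grew 12% year over year, reaching $4.2M [Source 1].\n\n"
        "- Gross margin improved to 38% [Source 2]\n"
        "- We recommend monitoring churn closely\n\n"
        "**Summary:** Growth looks healthy, but verify the churn trend."
    )
    result = scorer.score_response(
        sample_response,
        query="How did revenue and margins change this year?",
        has_rag=True,
        sources=["Q3 earnings report", "internal dashboard"],
    )
    print(scorer.format_score_display(result))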