Spaces:

ajaymauryabbn
/

telecom-rag

Running

App Files Files Community

telecom-rag / src /evaluation.py

ajaymauryabbn

feat: Telecom RAG System - Production Ready

eb731f7 4 months ago

raw

history blame contribute delete

14.2 kB

	"""Telecom RAG - Evaluation Module

	Implements RAGAS-style evaluation metrics for hallucination detection:
	- Faithfulness scoring (is answer grounded in context?)
	- Answer relevancy (does answer address the question?)
	- Abstention logic (refuse when confidence is low)
	"""

	import re
	from typing import List, Dict, Any, Optional, Tuple
	from dataclasses import dataclass

	from .config import (
	LLM_PROVIDER,
	OPENAI_API_KEY,
	GOOGLE_API_KEY,
	OPENAI_MODEL,
	GEMINI_MODEL
	)


	@dataclass
	class EvaluationResult:
	    """Container for the metrics computed for a single RAG response.

	    Required fields (positional order is part of the interface):
	        faithfulness_score: 0-1, how grounded the answer is in the context.
	        relevancy_score: 0-1, how relevant the answer is to the question.
	        confidence_score: 0-1, combined confidence.
	        should_abstain: True if confidence is too low.
	        abstention_reason: Why the system abstains (if applicable).
	        claims: Claims extracted from the answer.
	        supported_claims: Number of claims supported by the context.
	        total_claims: Total number of claims in the answer.

	    Optional fields:
	        context_precision: Relevant chunks / total chunks (target: >0.70),
	            per architecture doc Section 5.1.
	        context_recall: Covered claims / total claims (target: >0.85),
	            per architecture doc Section 5.1.
	        trust_score: Combined reliability metric (TLM, Section 5.2).
	        consistency_score: Self-consistency agreement, 0-1 (TLM, Section 5.2).
	    """

	    # Core scores and the abstention decision
	    faithfulness_score: float
	    relevancy_score: float
	    confidence_score: float
	    should_abstain: bool
	    abstention_reason: str

	    # Claim bookkeeping
	    claims: List[str]
	    supported_claims: int
	    total_claims: int

	    # Context quality metrics
	    context_precision: float = 0.0
	    context_recall: float = 0.0

	    # TLM trust metrics
	    trust_score: float = 0.0
	    consistency_score: float = 0.0


	class RAGEvaluator:
	    """
	    Evaluates RAG responses for faithfulness and relevancy.
	    Implements abstention logic for low-confidence answers.

	    Two scoring modes are available: cheap keyword/regex heuristics
	    (default) and LLM-based scoring (``use_llm=True`` in ``evaluate``),
	    which is only used when an LLM client was successfully created.
	    """

	    # Thresholds tuned for telecom domain with quality built-in KB
	    FAITHFULNESS_THRESHOLD = 0.8  # Flag for review if below (not used inside this class)
	    ABSTENTION_THRESHOLD = 0.3  # Refuse only for very low confidence
	    MIN_SIMILARITY_THRESHOLD = 0.2  # Allow lower similarity since domain-specific

	    # Score used when the LLM reply cannot be parsed or the call fails.
	    _LLM_SCORE_FALLBACK = 0.5

	    # A sentence counts as "factual" if it contains a digit, an acronym
	    # (2+ capitals) or one of a few definitional keywords.
	    _FACTUAL_PATTERN = re.compile(
	        r'\d|[A-Z]{2,}|specifically|defined|means|refers to'
	    )

	    # First standalone number in [0, 1]: "0", "0.85", "1", "1.0", "1.00".
	    # The lookarounds reject digits that are part of a larger number
	    # ("10", "1.5", "20.0") while still accepting a trailing full stop.
	    _SCORE_PATTERN = re.compile(
	        r'(?<![\d.])(?:0(?:\.\d+)?|1(?:\.0+)?)(?!\.?\d)'
	    )

	    def __init__(self):
	        """Create the evaluator and, if an API key is configured, an LLM client.

	        ``llm_available`` reflects configuration only; ``llm`` stays ``None``
	        when the client cannot be constructed, in which case LLM-based
	        scoring is skipped by ``evaluate``.
	        """
	        self.llm_available = self._check_llm()
	        self.llm = None
	        if self.llm_available:
	            try:
	                # Imported lazily so the heuristic mode works without the LLM module.
	                from .llm import TelecomLLM
	                self.llm = TelecomLLM()
	            except Exception as e:
	                # Best effort: evaluation falls back to heuristics.
	                print(f"⚠️ Failed to init LLM for eval: {e}")

	    def _check_llm(self) -> bool:
	        """Check if LLM is available for evaluation.

	        Returns True when the configured provider has a non-empty API key
	        that is not the template placeholder value.
	        """
	        if LLM_PROVIDER == "openai" and OPENAI_API_KEY and OPENAI_API_KEY != "your_openai_api_key_here":
	            return True
	        if LLM_PROVIDER == "gemini" and GOOGLE_API_KEY and GOOGLE_API_KEY != "your_google_api_key_here":
	            return True
	        return False

	    @classmethod
	    def _parse_score(cls, response: str) -> float:
	        """Extract the first 0-1 score from an LLM reply, or the fallback score."""
	        match = cls._SCORE_PATTERN.search(response)
	        if match is None:
	            return cls._LLM_SCORE_FALLBACK
	        return float(match.group())

	    def extract_claims(self, answer: str) -> List[str]:
	        """
	        Extract factual claims from an answer.
	        Simple heuristic: split by sentences and filter.

	        A sentence is kept when it is longer than 20 characters, does not
	        start with 'I ', 'We ' or 'You ', and contains a digit, an acronym
	        or a definitional keyword. Note that splitting on '.', '!' and '?'
	        also splits at decimal points.

	        Args:
	            answer: Generated answer text.

	        Returns:
	            The retained sentences, stripped, without terminating punctuation.
	        """
	        claims = []
	        for sent in re.split(r'[.!?]+', answer):
	            sent = sent.strip()
	            # Filter out very short or non-factual sentences
	            if len(sent) <= 20 or sent.startswith(('I ', 'We ', 'You ')):
	                continue
	            # Fixed: the alternation previously used escaped pipes ('\|'),
	            # which match a literal '|' and therefore never matched.
	            if self._FACTUAL_PATTERN.search(sent):
	                claims.append(sent)
	        return claims

	    def check_claim_support(self, claim: str, context: str) -> bool:
	        """
	        Check if a claim is supported by the context.
	        Uses simple keyword/phrase overlap heuristic.

	        Lower-case words of 3+ letters are matched case-insensitively as
	        substrings of the context; acronyms (2-6 capitals) are matched
	        case-sensitively and weighted double.

	        Returns:
	            True when the weighted overlap ratio exceeds 0.3, or when the
	            claim contains no terms to verify.
	        """
	        claim_lower = claim.lower()
	        context_lower = context.lower()

	        # Extract key terms from claim
	        terms = re.findall(r'\b[a-z]{3,}\b', claim_lower)
	        technical_terms = re.findall(r'\b[A-Z]{2,6}\b', claim)

	        total_terms = len(terms) + len(technical_terms)
	        if total_terms == 0:
	            return True  # No specific claims to verify

	        # Count term overlap
	        term_matches = sum(1 for t in terms if t in context_lower)
	        tech_matches = sum(1 for t in technical_terms if t in context)

	        # Technical terms count twice in both numerator and denominator.
	        support_ratio = (term_matches + tech_matches * 2) / (total_terms + len(technical_terms))

	        return support_ratio > 0.3

	    def calculate_faithfulness(
	        self,
	        answer: str,
	        context: str
	    ) -> Tuple[float, List[str], int, int]:
	        """
	        Calculate faithfulness score (Heuristic).
	        Measures how grounded the answer is in the provided context.

	        Returns:
	            Tuple of (score, claims, supported_count, total_count). An answer
	            with no extractable claims scores 1.0 with zero counts.
	        """
	        claims = self.extract_claims(answer)

	        if not claims:
	            return 1.0, [], 0, 0  # No claims = faithful by default

	        supported = sum(
	            1 for claim in claims if self.check_claim_support(claim, context)
	        )
	        return supported / len(claims), claims, supported, len(claims)

	    def calculate_llm_faithfulness(self, answer: str, context: str) -> float:
	        """
	        Calculate faithfulness using LLM (More accurate).

	        Only the first 2000 characters of the context are sent.

	        Returns:
	            The score parsed from the LLM reply; 0.0 when no LLM client is
	            set; 0.5 when the reply has no parseable score or the call fails.
	        """
	        if not self.llm:
	            return 0.0

	        prompt = f"""Rate the faithfulness of the answer to the context on a scale of 0.0 to 1.0.
	Faithfulness measures if the answer is derived solely from the context given.
	Return ONLY the float score.

	Context:
	{context[:2000]}...

	Answer:
	{answer}

	Score:"""
	        try:
	            response = self.llm.simple_generate(prompt).strip()
	            return self._parse_score(response)
	        except Exception as e:
	            # Any client failure degrades to the neutral fallback score.
	            print(f"⚠️ LLM Faithfulness failed: {e}")
	            return self._LLM_SCORE_FALLBACK

	    def calculate_relevancy(self, question: str, answer: str) -> float:
	        """
	        Calculate how relevant the answer is to the question.
	        Uses keyword overlap heuristic.

	        Returns:
	            Weighted share of question terms that also occur in the answer,
	            capped at 1.0. Acronyms count double. 1.0 when the question has
	            no terms left after removing common words.
	        """
	        question_terms = set(re.findall(r'\b[a-z]{3,}\b', question.lower()))
	        question_tech = set(re.findall(r'\b[A-Z]{2,6}\b', question))

	        answer_terms = set(re.findall(r'\b[a-z]{3,}\b', answer.lower()))
	        answer_tech = set(re.findall(r'\b[A-Z]{2,6}\b', answer))

	        # Remove common words
	        common_words = {'what', 'how', 'why', 'when', 'where', 'which', 'the', 'and', 'for'}
	        question_terms -= common_words

	        total_question = len(question_terms) + len(question_tech)
	        if total_question == 0:
	            return 1.0

	        # Calculate overlap, weighting technical terms higher
	        term_overlap = len(question_terms & answer_terms)
	        tech_overlap = len(question_tech & answer_tech)
	        overlap = term_overlap + tech_overlap * 2

	        return min(1.0, overlap / total_question)

	    def calculate_llm_relevancy(self, question: str, answer: str) -> float:
	        """
	        Calculate relevancy using LLM.

	        Returns:
	            The score parsed from the LLM reply; 0.0 when no LLM client is
	            set; 0.5 when the reply has no parseable score or the call fails.
	        """
	        if not self.llm:
	            return 0.0

	        prompt = f"""Rate the relevancy of the answer to the question on a scale of 0.0 to 1.0.
	Relevancy measures if the answer actually answers the question asked.
	Return ONLY the float score.

	Question:
	{question}

	Answer:
	{answer}

	Score:"""
	        try:
	            response = self.llm.simple_generate(prompt).strip()
	            return self._parse_score(response)
	        except Exception as e:
	            # Any client failure degrades to the neutral fallback score.
	            print(f"⚠️ LLM Relevancy failed: {e}")
	            return self._LLM_SCORE_FALLBACK

	    def calculate_retrieval_confidence(
	        self,
	        similarity_scores: List[float]
	    ) -> float:
	        """
	        Calculate confidence based on retrieval quality.
	        Uses average of top similarity scores.

	        Args:
	            similarity_scores: Similarity scores from retrieval, any order.

	        Returns:
	            Mean of the three highest scores, or 0.0 for an empty list.
	        """
	        if not similarity_scores:
	            return 0.0

	        # Use top 3 scores
	        top_scores = sorted(similarity_scores, reverse=True)[:3]

	        # Fixed: a best score below MIN_SIMILARITY_THRESHOLD used to be
	        # replaced by a flat 0.3, which is above that threshold, so the
	        # weakest retrievals bypassed the low-relevance abstention in
	        # evaluate(). The plain average is necessarily below the threshold
	        # in that case, so no special case is needed.
	        return sum(top_scores) / len(top_scores)

	    def evaluate(
	        self,
	        question: str,
	        answer: str,
	        context: str,
	        similarity_scores: List[float],
	        use_llm: bool = False
	    ) -> "EvaluationResult":
	        """
	        Full evaluation of a RAG response.

	        Args:
	            question: User's question
	            answer: Generated answer
	            context: Retrieved context used for generation
	            similarity_scores: Similarity scores from retrieval
	            use_llm: Whether to use LLM for evaluation (slower, more accurate)

	        Returns:
	            EvaluationResult with all metrics
	        """
	        llm_mode = bool(use_llm and self.llm)

	        # The heuristic pass always runs: its claims and counts are reported
	        # (and used for context recall) even when the LLM provides the score.
	        heuristic_faithfulness, claims, supported, total = self.calculate_faithfulness(answer, context)

	        if llm_mode:
	            faithfulness = self.calculate_llm_faithfulness(answer, context)
	            relevancy = self.calculate_llm_relevancy(question, answer)
	        else:
	            faithfulness = heuristic_faithfulness
	            relevancy = self.calculate_relevancy(question, answer)

	        # Calculate retrieval confidence
	        retrieval_confidence = self.calculate_retrieval_confidence(similarity_scores)

	        # Combined confidence score (weighted average)
	        confidence = (
	            faithfulness * 0.4 +
	            relevancy * 0.3 +
	            retrieval_confidence * 0.3
	        )

	        # Determine abstention; the first failing check supplies the reason.
	        should_abstain = True
	        if retrieval_confidence < self.MIN_SIMILARITY_THRESHOLD:
	            abstention_reason = "Retrieved documents have low relevance to the question"
	        elif faithfulness < self.ABSTENTION_THRESHOLD:
	            abstention_reason = "Answer may not be fully grounded in available information"
	        elif confidence < self.ABSTENTION_THRESHOLD:
	            abstention_reason = "Insufficient confidence to provide a reliable answer"
	        else:
	            should_abstain = False
	            abstention_reason = ""

	        # Calculate context precision (relevant chunks / total)
	        # Using similarity scores as proxy for relevance
	        high_relevance_count = sum(1 for s in similarity_scores if s > 0.5)
	        context_precision = high_relevance_count / len(similarity_scores) if similarity_scores else 0.0

	        # Calculate context recall (supported claims / total claims)
	        # Using heuristic supported count even in LLM mode for now as proxy
	        context_recall = supported / total if total > 0 else 1.0

	        # Calculate Reliability/Trust Score (Section 5.2 TLM)
	        # Weighted average of key metrics:
	        # - Faithfulness (40%): Is it true?
	        # - Relevancy (30%): Is it useful?
	        # - Ctx Precision (20%): Was retrieval good?
	        # - Confidence (10%): Does model feel sure?
	        trust_score = (faithfulness * 0.4) + (relevancy * 0.3) + (context_precision * 0.2) + (confidence * 0.1)

	        return EvaluationResult(
	            faithfulness_score=faithfulness,
	            relevancy_score=relevancy,
	            confidence_score=confidence,
	            should_abstain=should_abstain,
	            abstention_reason=abstention_reason,
	            claims=claims,
	            supported_claims=supported,
	            total_claims=total,
	            context_precision=context_precision,
	            context_recall=context_recall,
	            trust_score=trust_score,
	            consistency_score=1.0  # Placeholder: Requires multi-generation logic
	        )

	    def get_abstention_message(self, reason: str) -> str:
	        """Generate a polite abstention message that embeds ``reason``."""
	        return f"""⚠️ Unable to provide a confident answer

	{reason}

	What you can do:
	- Try rephrasing your question with more specific terms
	- Check if the topic is covered in the knowledge base
	- Consult official 3GPP documentation for authoritative information

	This response was withheld because the system could not verify the accuracy of the answer based on available sources."""


	# Module-wide evaluator, built on first request
	_evaluator = None


	def get_evaluator() -> RAGEvaluator:
	    """Return the shared RAGEvaluator, constructing it on the first call."""
	    global _evaluator
	    if _evaluator is not None:
	        return _evaluator
	    _evaluator = RAGEvaluator()
	    return _evaluator


	if __name__ == "__main__":
	    # Manual smoke test: score one hand-written HARQ example with the
	    # heuristic evaluator and print the headline metrics.
	    sample_question = "What is HARQ in 5G NR?"
	    sample_answer = "HARQ (Hybrid Automatic Repeat Request) is a error correction mechanism in 5G NR that combines forward error correction with retransmission. It uses soft combining to improve reliability."
	    sample_context = "HARQ (Hybrid Automatic Repeat Request) is a combination of high-rate forward error correction (FEC) and ARQ error-control. In 5G NR, HARQ provides reliable data transmission by using incremental redundancy."

	    report = RAGEvaluator().evaluate(
	        question=sample_question,
	        answer=sample_answer,
	        context=sample_context,
	        similarity_scores=[0.85, 0.72, 0.65],
	    )

	    print("\n📊 Evaluation Results:")
	    print(f" Faithfulness: {report.faithfulness_score:.2f}")
	    print(f" Relevancy: {report.relevancy_score:.2f}")
	    print(f" Confidence: {report.confidence_score:.2f}")
	    print(f" Should Abstain: {report.should_abstain}")
	    print(f" Claims: {report.total_claims} total, {report.supported_claims} supported")