telecom-rag / src /evaluation.py
ajaymauryabbn's picture
feat: Telecom RAG System - Production Ready
eb731f7
"""Telecom RAG - Evaluation Module
Implements RAGAS-style evaluation metrics for hallucination detection:
- Faithfulness scoring (is answer grounded in context?)
- Answer relevancy (does answer address the question?)
- Abstention logic (refuse when confidence is low)
"""
import re
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from .config import (
LLM_PROVIDER,
OPENAI_API_KEY,
GOOGLE_API_KEY,
OPENAI_MODEL,
GEMINI_MODEL
)
@dataclass
class EvaluationResult:
    """Scores and abstention decision produced for a single RAG response.

    The first eight fields are required and positional; the remaining
    context-quality and trust fields default to 0.0.
    """

    # --- Core scores (each on a 0-1 scale) ---
    faithfulness_score: float  # degree to which the answer is grounded in the context
    relevancy_score: float  # degree to which the answer addresses the question
    confidence_score: float  # combined confidence

    # --- Abstention decision ---
    should_abstain: bool  # True when confidence is too low to answer
    abstention_reason: str  # explanation for abstaining (if applicable)

    # --- Claim bookkeeping ---
    claims: List[str]  # claims extracted from the answer
    supported_claims: int  # how many of those claims the context supports
    total_claims: int  # how many claims the answer contains

    # --- Context quality metrics (per architecture doc Section 5.1) ---
    context_precision: float = 0.0  # relevant chunks / total chunks (target: >0.70)
    context_recall: float = 0.0  # covered claims / total claims (target: >0.85)

    # --- TLM trust metrics (Section 5.2) ---
    trust_score: float = 0.0  # combined reliability metric
    consistency_score: float = 0.0  # self-consistency agreement (0-1)
class RAGEvaluator:
    """Score RAG responses for faithfulness and relevancy, and decide abstention.

    Heuristic (regex / keyword-overlap) scorers are always available. The
    LLM-based scorers are used by ``evaluate`` only when ``use_llm=True`` is
    passed and an LLM client was successfully created in ``__init__``.
    """

    # Thresholds tuned for telecom domain with quality built-in KB
    FAITHFULNESS_THRESHOLD = 0.8  # Flag for review if below
    ABSTENTION_THRESHOLD = 0.3  # Refuse only for very low confidence
    MIN_SIMILARITY_THRESHOLD = 0.2  # Allow lower similarity since domain-specific

    # Sentence terminators only count when followed by whitespace or the end
    # of the text, so decimals and spec numbers ("3.5 GHz", "TS 38.331") are
    # not cut in half.
    _SENTENCE_END = re.compile(r'[.!?]+(?=\s|$)')
    # Markers that a sentence carries factual content (numbers, acronyms, ...).
    _FACTUAL_RE = re.compile(r'\d|[A-Z]{2,}|specifically|defined|means|refers to')
    # Lower-case words of 3+ letters / upper-case acronyms of 2-6 letters.
    _WORD_RE = re.compile(r'\b[a-z]{3,}\b')
    _ACRONYM_RE = re.compile(r'\b[A-Z]{2,6}\b')
    # A standalone number in [0, 1]: "0", "0.85", ".8", "1", "1.0". The
    # look-arounds reject digits that are part of a larger token ("10", "1.5").
    _SCORE_PATTERN = re.compile(r'(?<![\w.])(?:0(?:\.\d+)?|1(?:\.0+)?|\.\d+)(?![\w]|\.\d)')
    # Question words ignored when measuring question/answer overlap.
    _STOP_WORDS = frozenset(
        {'what', 'how', 'why', 'when', 'where', 'which', 'the', 'and', 'for'}
    )

    def __init__(self):
        """Create the evaluator and, when an API key is configured, its LLM client."""
        self.llm = None
        self.llm_available = self._check_llm()
        if self.llm_available:
            try:
                from .llm import TelecomLLM
                self.llm = TelecomLLM()
            except Exception as e:
                print(f"⚠️ Failed to init LLM for eval: {e}")
                # Keep the flag consistent with the (missing) client.
                self.llm_available = False

    def _check_llm(self) -> bool:
        """Check if LLM is available for evaluation.

        Returns:
            True when the configured provider has an API key that is not the
            placeholder value, False otherwise.
        """
        if LLM_PROVIDER == "openai" and OPENAI_API_KEY and OPENAI_API_KEY != "your_openai_api_key_here":
            return True
        if LLM_PROVIDER == "gemini" and GOOGLE_API_KEY and GOOGLE_API_KEY != "your_google_api_key_here":
            return True
        return False

    def extract_claims(self, answer: str) -> List[str]:
        """Extract factual claims from an answer.

        Simple heuristic: split into sentences, then keep sentences that are
        longer than 20 characters, do not start with "I ", "We " or "You ",
        and contain a digit, an acronym or a definitional keyword.

        Args:
            answer: Generated answer text.

        Returns:
            The retained sentences, stripped, without their terminating
            punctuation.
        """
        claims = []
        for sent in self._SENTENCE_END.split(answer):
            sent = sent.strip()
            # Filter out very short or non-factual sentences
            if len(sent) <= 20 or sent.startswith(('I ', 'We ', 'You ')):
                continue
            if self._FACTUAL_RE.search(sent):
                claims.append(sent)
        return claims

    def check_claim_support(self, claim: str, context: str) -> bool:
        """Check if a claim is supported by the context.

        Uses a keyword-overlap heuristic: every lower-cased word of the claim
        counts once and every acronym counts twice; the claim is supported
        when more than 30% of that weighted total occurs in the context
        (substring match).

        Args:
            claim: A single claim sentence.
            context: Retrieved context text.

        Returns:
            True when the weighted overlap ratio exceeds 0.3, or when the
            claim has no terms to verify.
        """
        terms = self._WORD_RE.findall(claim.lower())
        technical_terms = self._ACRONYM_RE.findall(claim)
        if not terms and not technical_terms:
            return True  # No specific claims to verify

        context_lower = context.lower()
        term_matches = sum(1 for t in terms if t in context_lower)
        # Acronyms are matched case-sensitively against the raw context.
        tech_matches = sum(1 for t in technical_terms if t in context)

        weighted_hits = term_matches + tech_matches * 2
        weighted_total = len(terms) + len(technical_terms) * 2
        return weighted_hits / weighted_total > 0.3

    def calculate_faithfulness(
        self,
        answer: str,
        context: str
    ) -> Tuple[float, List[str], int, int]:
        """Calculate faithfulness score (Heuristic).

        Measures how grounded the answer is in the provided context.

        Args:
            answer: Generated answer text.
            context: Retrieved context text.

        Returns:
            ``(score, claims, supported, total)`` where ``score`` is the
            fraction of extracted claims supported by the context. An answer
            with no extractable claims scores 1.0 with zero counts.
        """
        claims = self.extract_claims(answer)
        if not claims:
            return 1.0, [], 0, 0  # No claims = faithful by default
        supported = sum(
            1 for claim in claims if self.check_claim_support(claim, context)
        )
        return supported / len(claims), claims, supported, len(claims)

    @classmethod
    def _parse_score(cls, response: str) -> Optional[float]:
        """Return the first standalone number in [0, 1] found in *response*, or None."""
        match = cls._SCORE_PATTERN.search(response)
        return float(match.group()) if match else None

    def _llm_score(self, prompt: str, label: str) -> float:
        """Send *prompt* to the LLM and parse a 0-1 score; 0.5 on any failure."""
        try:
            response = self.llm.simple_generate(prompt).strip()
        except Exception as e:
            print(f"⚠️ LLM {label} failed: {e}")
            return 0.5
        score = self._parse_score(response)
        return 0.5 if score is None else score  # 0.5 = neutral fallback

    def calculate_llm_faithfulness(self, answer: str, context: str) -> float:
        """Calculate faithfulness using LLM (More accurate).

        Args:
            answer: Generated answer text.
            context: Retrieved context; only the first 2000 characters are
                sent to the LLM.

        Returns:
            The parsed score in [0, 1]; 0.0 when no LLM client exists; 0.5
            when the call fails or no score can be parsed from the reply.
        """
        if not self.llm:
            return 0.0
        prompt = f"""Rate the faithfulness of the answer to the context on a scale of 0.0 to 1.0.
Faithfulness measures if the answer is derived solely from the context given.
Return ONLY the float score.
Context:
{context[:2000]}...
Answer:
{answer}
Score:"""
        return self._llm_score(prompt, "Faithfulness")

    def calculate_relevancy(self, question: str, answer: str) -> float:
        """Calculate how relevant the answer is to the question.

        Uses keyword overlap: shared lower-cased words count once, shared
        acronyms count twice, normalised by the number of question terms and
        capped at 1.0.

        Args:
            question: User's question.
            answer: Generated answer text.

        Returns:
            Overlap score in [0, 1]; 1.0 when the question has no terms left
            after removing common question words.
        """
        question_terms = set(self._WORD_RE.findall(question.lower())) - self._STOP_WORDS
        question_tech = set(self._ACRONYM_RE.findall(question))
        if not question_terms and not question_tech:
            return 1.0

        answer_terms = set(self._WORD_RE.findall(answer.lower()))
        answer_tech = set(self._ACRONYM_RE.findall(answer))

        # Weight technical terms higher
        overlap = len(question_terms & answer_terms) + len(question_tech & answer_tech) * 2
        total_question = len(question_terms) + len(question_tech)
        return min(1.0, overlap / total_question)

    def calculate_llm_relevancy(self, question: str, answer: str) -> float:
        """Calculate relevancy using LLM.

        Args:
            question: User's question.
            answer: Generated answer text.

        Returns:
            The parsed score in [0, 1]; 0.0 when no LLM client exists; 0.5
            when the call fails or no score can be parsed from the reply.
        """
        if not self.llm:
            return 0.0
        prompt = f"""Rate the relevancy of the answer to the question on a scale of 0.0 to 1.0.
Relevancy measures if the answer actually answers the question asked.
Return ONLY the float score.
Question:
{question}
Answer:
{answer}
Score:"""
        return self._llm_score(prompt, "Relevancy")

    def calculate_retrieval_confidence(
        self,
        similarity_scores: List[float]
    ) -> float:
        """Calculate confidence based on retrieval quality.

        Uses the average of the top three similarity scores, clamped to
        [0, 1]. Because the average never exceeds the best score, a best
        match below ``MIN_SIMILARITY_THRESHOLD`` always yields a confidence
        below that threshold, so the low-retrieval abstention check in
        ``evaluate`` can trigger. (A fixed floor of 0.3 would sit above the
        threshold and rank the worst retrievals above mediocre ones.)

        Args:
            similarity_scores: Similarity scores from retrieval.

        Returns:
            0.0 for an empty list, otherwise the clamped top-3 average.
        """
        if not similarity_scores:
            return 0.0
        top_scores = sorted(similarity_scores, reverse=True)[:3]
        avg_score = sum(top_scores) / len(top_scores)
        return max(0.0, min(1.0, avg_score))

    def evaluate(
        self,
        question: str,
        answer: str,
        context: str,
        similarity_scores: List[float],
        use_llm: bool = False
    ) -> "EvaluationResult":
        """Full evaluation of a RAG response.

        Args:
            question: User's question
            answer: Generated answer
            context: Retrieved context used for generation
            similarity_scores: Similarity scores from retrieval
            use_llm: Whether to use LLM for evaluation (slower, more accurate)

        Returns:
            EvaluationResult with all metrics
        """
        llm_mode = bool(use_llm and self.llm)

        # Heuristic claims are always computed: they feed the claim counts
        # and context recall even when the LLM supplies the score itself.
        faithfulness, claims, supported, total = self.calculate_faithfulness(answer, context)
        if llm_mode:
            faithfulness = self.calculate_llm_faithfulness(answer, context)
            relevancy = self.calculate_llm_relevancy(question, answer)
        else:
            relevancy = self.calculate_relevancy(question, answer)

        retrieval_confidence = self.calculate_retrieval_confidence(similarity_scores)

        # Combined confidence score (weighted average)
        confidence = (
            faithfulness * 0.4 +
            relevancy * 0.3 +
            retrieval_confidence * 0.3
        )

        # Determine abstention; the first failing check supplies the reason.
        should_abstain = True
        if retrieval_confidence < self.MIN_SIMILARITY_THRESHOLD:
            abstention_reason = "Retrieved documents have low relevance to the question"
        elif faithfulness < self.ABSTENTION_THRESHOLD:
            abstention_reason = "Answer may not be fully grounded in available information"
        elif confidence < self.ABSTENTION_THRESHOLD:
            abstention_reason = "Insufficient confidence to provide a reliable answer"
        else:
            should_abstain = False
            abstention_reason = ""

        # Context precision (relevant chunks / total), using similarity
        # scores above 0.5 as a proxy for relevance.
        high_relevance_count = sum(1 for s in similarity_scores if s > 0.5)
        context_precision = high_relevance_count / len(similarity_scores) if similarity_scores else 0.0

        # Context recall (supported claims / total claims); the heuristic
        # supported count is used as a proxy even in LLM mode.
        context_recall = supported / total if total > 0 else 1.0

        # Reliability/Trust Score (Section 5.2 TLM), weighted average of:
        # - Faithfulness (40%): Is it true?
        # - Relevancy (30%): Is it useful?
        # - Ctx Precision (20%): Was retrieval good?
        # - Confidence (10%): Does model feel sure?
        trust_score = (faithfulness * 0.4) + (relevancy * 0.3) + (context_precision * 0.2) + (confidence * 0.1)

        return EvaluationResult(
            faithfulness_score=faithfulness,
            relevancy_score=relevancy,
            confidence_score=confidence,
            should_abstain=should_abstain,
            abstention_reason=abstention_reason,
            claims=claims,
            supported_claims=supported,
            total_claims=total,
            context_precision=context_precision,
            context_recall=context_recall,
            trust_score=trust_score,
            consistency_score=1.0  # Placeholder: Requires multi-generation logic
        )

    def get_abstention_message(self, reason: str) -> str:
        """Generate a polite abstention message.

        Args:
            reason: Why the answer is being withheld; embedded in the message.

        Returns:
            Markdown-formatted text to show instead of the answer.
        """
        return f"""⚠️ **Unable to provide a confident answer**
{reason}
**What you can do:**
- Try rephrasing your question with more specific terms
- Check if the topic is covered in the knowledge base
- Consult official 3GPP documentation for authoritative information
*This response was withheld because the system could not verify the accuracy of the answer based on available sources.*"""
# Module-wide evaluator, created on first request
_evaluator = None


def get_evaluator() -> RAGEvaluator:
    """Return the shared RAGEvaluator, building it the first time it is needed."""
    global _evaluator
    if _evaluator is not None:
        return _evaluator
    _evaluator = RAGEvaluator()
    return _evaluator
if __name__ == "__main__":
    # Smoke test: evaluate one hand-written HARQ question/answer pair with the
    # heuristic scorers and print the headline metrics.
    sample_question = "What is HARQ in 5G NR?"
    sample_answer = "HARQ (Hybrid Automatic Repeat Request) is a error correction mechanism in 5G NR that combines forward error correction with retransmission. It uses soft combining to improve reliability."
    sample_context = "HARQ (Hybrid Automatic Repeat Request) is a combination of high-rate forward error correction (FEC) and ARQ error-control. In 5G NR, HARQ provides reliable data transmission by using incremental redundancy."
    retrieval_scores = [0.85, 0.72, 0.65]

    report = RAGEvaluator().evaluate(
        sample_question, sample_answer, sample_context, retrieval_scores
    )

    print("\n📊 Evaluation Results:")
    print(f" Faithfulness: {report.faithfulness_score:.2f}")
    print(f" Relevancy: {report.relevancy_score:.2f}")
    print(f" Confidence: {report.confidence_score:.2f}")
    print(f" Should Abstain: {report.should_abstain}")
    print(f" Claims: {report.total_claims} total, {report.supported_claims} supported")