enterprise-rag-system / src /evaluation.py
Faraz618's picture
Update src/evaluation.py
9f6648b verified
Raw
History Blame Contribute Delete
4.95 kB
"""
evaluation.py — Custom RAG evaluation metrics (no paid LLM required).
Three cosine-similarity-based metrics:
1. FAITHFULNESS
Does the answer contain only claims supported by the retrieved context?
Method: For each answer sentence, find its max cosine similarity to any
context chunk. Average these scores.
Low score = potential hallucination.
2. ANSWER RELEVANCE
Does the answer address what was actually asked?
Method: Cosine similarity between answer embedding and query embedding.
3. CONTEXT PRECISION
Are the retrieved chunks relevant to the query?
Method: Rank-weighted average of FAISS retrieval scores (weights 1, 1/2, 1/3, ... by rank position, so top-ranked chunks count most).
These are proxy metrics. They correlate well with quality and run in
milliseconds with zero API cost. For production, augment with periodic
human evaluation on a curated golden test set.
"""
import logging
import re
import numpy as np
from src.embeddings import embed_texts
logger = logging.getLogger("enterprise-rag.evaluation")
def compute_faithfulness(answer: str, context_chunks: list) -> float:
    """
    Faithfulness score: how grounded is the answer in the retrieved context?

    The answer is split into sentences; every sentence and every context
    chunk is embedded, and for each sentence the highest cosine similarity
    to any chunk is taken. The score is the mean of those per-sentence maxima.

    Args:
        answer: Generated answer text.
        context_chunks: Retrieved context texts; passed as-is to
            ``embed_texts``.

    Returns:
        A float rounded to 4 decimals and clamped to the range
        0.0 (hallucinated) to 1.0 (fully grounded). Returns 0.0 when either
        input is empty, when no sentence survives splitting, when an
        embedding batch comes back empty, or when the computation raises
        (the failure is logged as a warning, never propagated).
    """
    if not answer or not context_chunks:
        return 0.0
    sentences = _split_sentences(answer)
    if not sentences:
        return 0.0
    try:
        # NOTE(review): assumes embed_texts returns a 2-D array with one row
        # per input text — the code below relies on .shape[0] and row-wise
        # dot products. Confirm against src.embeddings.
        answer_embs = embed_texts(sentences)
        context_embs = embed_texts(context_chunks)
        if answer_embs.shape[0] == 0 or context_embs.shape[0] == 0:
            return 0.0

        # A dot product is only a cosine similarity for unit-length vectors.
        # Normalise explicitly so the metric does not depend on how the
        # embedder is configured; this is a no-op for already-normalised rows.
        answer_norms = np.linalg.norm(answer_embs, axis=1, keepdims=True)
        context_norms = np.linalg.norm(context_embs, axis=1, keepdims=True)
        # Zero-length rows are divided by 1 so they yield similarity 0
        # instead of NaN.
        answer_unit = answer_embs / np.where(answer_norms > 0, answer_norms, 1.0)
        context_unit = context_embs / np.where(context_norms > 0, context_norms, 1.0)

        # shape: (n_sentences, n_chunks)
        sim_matrix = np.dot(answer_unit, context_unit.T)
        best_per_sentence = sim_matrix.max(axis=1)
        score = float(np.mean(best_per_sentence))
        return round(min(max(score, 0.0), 1.0), 4)
    except Exception as e:
        # Best-effort metric: a scoring failure must not break the caller.
        logger.warning("Faithfulness computation failed: %s", e)
        return 0.0
def compute_answer_relevance(answer: str, query: str) -> float:
    """
    Answer relevance: does the answer address the question?

    Embeds the stripped query and the stripped answer in one batch and
    returns the cosine similarity between the two vectors.

    Args:
        answer: Generated answer text.
        query: The user question the answer responds to.

    Returns:
        A float rounded to 4 decimals and clamped to the range
        0.0 (off-topic) to 1.0 (directly answers question). Returns 0.0 when
        either text is empty or whitespace-only, when fewer than two
        embeddings come back, when either embedding has zero length, or when
        the computation raises (logged as a warning, never propagated).
    """
    if not answer or not query:
        return 0.0
    try:
        query_text = query.strip()
        answer_text = answer.strip()
        # Whitespace-only strings are truthy but carry no content; do not
        # embed an empty string and report a meaningless similarity.
        if not query_text or not answer_text:
            return 0.0

        # NOTE(review): assumes embed_texts returns a 2-D array with one row
        # per input text — confirm against src.embeddings.
        embs = embed_texts([query_text, answer_text])
        if embs.shape[0] < 2:
            return 0.0

        query_vec, answer_vec = embs[0], embs[1]
        # Divide by the norms so the result is a true cosine similarity even
        # if the embedder does not return unit-length vectors.
        denom = float(np.linalg.norm(query_vec) * np.linalg.norm(answer_vec))
        if denom == 0.0:
            return 0.0
        score = float(np.dot(query_vec, answer_vec)) / denom
        return round(min(max(score, 0.0), 1.0), 4)
    except Exception as e:
        # Best-effort metric: a scoring failure must not break the caller.
        logger.warning("Answer relevance computation failed: %s", e)
        return 0.0
def compute_context_precision(scores: list) -> float:
    """
    Context precision: are the retrieved chunks relevant?

    Computes a rank-weighted average of the retrieval similarity scores:
    the score at position ``k`` (1-based, in the order given) gets weight
    ``1 / k``, so earlier results dominate.

    Args:
        scores: Retrieval scores in rank order. May be a list, a tuple or a
            NumPy array; a nested/2-D input such as a ``(1, k)`` batch is
            flattened in row-major order.

    Returns:
        A float rounded to 4 decimals and clamped to the range
        0.0 (poor retrieval) to 1.0 (perfect retrieval). Returns 0.0 for
        ``None``, for empty input, and when the average is NaN.
    """
    if scores is None:
        return 0.0
    # Normalise the input to a flat list of Python floats. Truth-testing a
    # multi-element NumPy array raises ValueError, so emptiness is checked
    # on the flattened list instead of on `scores` itself.
    values = np.asarray(scores, dtype=float).ravel().tolist()
    if not values:
        return 0.0

    weights = [1.0 / position for position in range(1, len(values) + 1)]
    weighted_total = sum(value * weight for value, weight in zip(values, weights))
    precision = weighted_total / sum(weights)

    # NaN would slip through the min/max clamp below and break the
    # documented 0.0-1.0 range; treat it as "no usable signal".
    if np.isnan(precision):
        return 0.0
    return round(min(max(precision, 0.0), 1.0), 4)
def run_evaluation(
    query: str,
    answer: str,
    context_chunks: list,
    retrieval_scores: list,
) -> dict:
    """
    Run all three evaluation metrics and return a summary dict.

    Args:
        query: The user question.
        answer: The generated answer to evaluate.
        context_chunks: Retrieved context texts used for faithfulness.
        retrieval_scores: Retrieval similarity scores in rank order, used
            for context precision.

    Returns:
        A dict with the keys ``faithfulness``, ``answer_relevance``,
        ``context_precision``, ``overall`` (mean of the three, rounded to
        4 decimals), ``quality_label`` and ``note``. For fallback or error
        responses every score is 0.0, the label is "N/A" and ``note``
        explains that evaluation was skipped.
    """
    # Skip evaluation for fallback/error responses. The answer is stripped
    # first so that a whitespace-only answer counts as empty and a warning
    # marker preceded by whitespace is still recognised.
    visible_answer = answer.strip() if answer else ""
    is_fallback = (
        not visible_answer
        or visible_answer.startswith("⚠️")
        or "unable to find" in visible_answer.lower()
    )
    if is_fallback:
        return {
            "faithfulness": 0.0,
            "answer_relevance": 0.0,
            "context_precision": 0.0,
            "overall": 0.0,
            "quality_label": "N/A",
            "note": "Evaluation skipped — fallback or error response.",
        }

    faithfulness = compute_faithfulness(answer, context_chunks)
    answer_relevance = compute_answer_relevance(answer, query)
    context_precision = compute_context_precision(retrieval_scores)
    # Unweighted mean: each metric contributes equally to the headline score.
    overall = round((faithfulness + answer_relevance + context_precision) / 3, 4)
    quality_label = _quality_label(overall)

    logger.info(
        "Eval scores — Faithfulness: %s | Relevance: %s | Precision: %s | Overall: %s",
        faithfulness,
        answer_relevance,
        context_precision,
        overall,
    )
    return {
        "faithfulness": faithfulness,
        "answer_relevance": answer_relevance,
        "context_precision": context_precision,
        "overall": overall,
        "quality_label": quality_label,
        "note": "",
    }
def _quality_label(score: float) -> str:
if score >= 0.75:
return "✅ High Quality"
elif score >= 0.50:
return "🟡 Moderate Quality"
else:
return "🔴 Low Quality — Review Answer"
def _split_sentences(text: str) -> list:
"""Simple sentence splitter — no NLTK dependency required."""
sentences = re.split(r"(?<=[.!?])\s+", text.strip())
return [s.strip() for s in sentences if len(s.strip()) > 10]