enterprise-rag-system

Build error

App Files Files Community

enterprise-rag-system / src /evaluation.py

Faraz618

Update src/evaluation.py

9f6648b verified about 1 month ago

Raw

History Blame Contribute Delete

4.95 kB

	"""
	evaluation.py — Custom RAG evaluation metrics (no paid LLM required).

	Three cosine-similarity-based metrics:

	1. FAITHFULNESS
	Does the answer contain only claims supported by the retrieved context?
	Method: For each answer sentence, find its max cosine similarity to any
	context chunk. Average these scores.
	Low score = potential hallucination.

	2. ANSWER RELEVANCE
	Does the answer address what was actually asked?
	Method: Cosine similarity between answer embedding and query embedding.

	3. CONTEXT PRECISION
	Are the retrieved chunks relevant to the query?
	Method: Rank-weighted average of FAISS retrieval scores.

	These are proxy metrics. They correlate well with quality and run in
	milliseconds with zero API cost. For production, augment with periodic
	human evaluation on a curated golden test set.
	"""

	import logging
	import re
	import numpy as np
	from src.embeddings import embed_texts

	# Module-level logger used by the metric functions in this file to report
	# computation failures (warnings) and per-query evaluation scores (info).
	logger = logging.getLogger("enterprise-rag.evaluation")


	def compute_faithfulness(answer: str, context_chunks: list) -> float:
	    """
	    Faithfulness score: how grounded is the answer in the retrieved context?

	    Each answer sentence is compared with every context chunk by cosine
	    similarity; the best match per sentence is kept and those per-sentence
	    maxima are averaged.

	    Args:
	        answer: Generated answer text.
	        context_chunks: Retrieved context texts. ``None`` and blank entries
	            are ignored.

	    Returns:
	        A float in [0.0, 1.0], rounded to 4 decimals, from 0.0 (hallucinated)
	        to 1.0 (fully grounded). Returns 0.0 when the answer or the usable
	        context is empty, when no sentence can be extracted from the answer,
	        or when the computation fails.
	    """
	    if not answer or not context_chunks:
	        return 0.0

	    # Blank chunks carry no evidence; embedding them would only add noise
	    # to the per-sentence maximum.
	    chunks = [c for c in context_chunks if c is not None and str(c).strip()]
	    if not chunks:
	        return 0.0

	    sentences = _split_sentences(answer)
	    if not sentences:
	        return 0.0

	    try:
	        # Assumes embed_texts returns a 2-D array with one row per input text.
	        answer_embs = np.asarray(embed_texts(sentences))
	        context_embs = np.asarray(embed_texts(chunks))

	        if answer_embs.shape[0] == 0 or context_embs.shape[0] == 0:
	            return 0.0

	        # L2-normalise so the dot product below is a true cosine similarity
	        # even if embed_texts returns unnormalised vectors. Zero vectors get
	        # a divisor of 1 and therefore score 0 against everything.
	        answer_norms = np.linalg.norm(answer_embs, axis=1, keepdims=True)
	        context_norms = np.linalg.norm(context_embs, axis=1, keepdims=True)
	        answer_unit = answer_embs / np.where(answer_norms == 0, 1.0, answer_norms)
	        context_unit = context_embs / np.where(context_norms == 0, 1.0, context_norms)

	        # shape: (n_sentences, n_chunks)
	        sim_matrix = np.dot(answer_unit, context_unit.T)
	        max_sims = sim_matrix.max(axis=1)
	        score = float(np.mean(max_sims))
	        if np.isnan(score):
	            # NaN embeddings must not leak out as a NaN metric.
	            return 0.0
	        return round(min(max(score, 0.0), 1.0), 4)
	    except Exception as e:
	        # Best-effort metric: any failure is logged and reported as 0.0
	        # instead of being raised to the caller.
	        logger.warning("Faithfulness computation failed: %s", e)
	        return 0.0


	def compute_answer_relevance(answer: str, query: str) -> float:
	    """
	    Answer relevance: does the answer address the question?

	    Embeds the query and the answer together and returns their cosine
	    similarity.

	    Args:
	        answer: Generated answer text.
	        query: The user's question.

	    Returns:
	        A float in [0.0, 1.0], rounded to 4 decimals, from 0.0 (off-topic)
	        to 1.0 (directly answers the question). Returns 0.0 when either text
	        is empty or whitespace-only, or when the computation fails.
	    """
	    # Strip before the emptiness check so whitespace-only inputs are rejected
	    # here instead of being embedded as empty strings.
	    answer_text = answer.strip() if answer else ""
	    query_text = query.strip() if query else ""
	    if not answer_text or not query_text:
	        return 0.0

	    try:
	        # Assumes embed_texts returns one row per input text, in input order.
	        embs = np.asarray(embed_texts([query_text, answer_text]))
	        if embs.shape[0] < 2:
	            return 0.0

	        query_emb, answer_emb = embs[0], embs[1]
	        # Divide by the norms so the result is a true cosine similarity even
	        # if embed_texts returns unnormalised vectors.
	        denom = float(np.linalg.norm(query_emb) * np.linalg.norm(answer_emb))
	        if denom == 0.0:
	            return 0.0
	        score = float(np.dot(query_emb, answer_emb)) / denom
	        if np.isnan(score):
	            # NaN embeddings must not leak out as a NaN metric.
	            return 0.0
	        return round(min(max(score, 0.0), 1.0), 4)
	    except Exception as e:
	        # Best-effort metric: any failure is logged and reported as 0.0
	        # instead of being raised to the caller.
	        logger.warning("Answer relevance computation failed: %s", e)
	        return 0.0


	def compute_context_precision(scores: list) -> float:
	    """
	    Context precision: are the retrieved chunks relevant?

	    Computes a rank-weighted average of the retrieval similarity scores,
	    where the score at 0-based position ``rank`` has weight
	    ``1 / (rank + 1)``, so earlier results count more.

	    Args:
	        scores: Retrieval scores in rank order. Any iterable of numbers is
	            accepted (list, tuple, generator, 1-D NumPy array); ``None`` is
	            treated as empty.

	    Returns:
	        A float in [0.0, 1.0], rounded to 4 decimals, from 0.0 (poor
	        retrieval) to 1.0 (perfect retrieval). Returns 0.0 for empty input
	        or when the weighted average is NaN.
	    """
	    if scores is None:
	        return 0.0

	    # Materialise once: ``not scores`` is ambiguous for NumPy arrays and
	    # ``len()`` is unavailable on generators.
	    values = [float(s) for s in scores]
	    if not values:
	        return 0.0

	    weights = [1.0 / (rank + 1) for rank in range(len(values))]
	    weighted = sum(value * weight for value, weight in zip(values, weights))
	    # The first weight is always 1.0, so the normaliser can never be zero.
	    score = weighted / sum(weights)

	    # NaN is the only float that is not equal to itself; it would otherwise
	    # pass straight through the clamp below.
	    if score != score:
	        return 0.0
	    return round(min(max(score, 0.0), 1.0), 4)


	def run_evaluation(
	    query: str,
	    answer: str,
	    context_chunks: list,
	    retrieval_scores: list,
	) -> dict:
	    """
	    Run all three evaluation metrics and return a summary dict.
	    Called by app.py after each query to populate the metrics panel.

	    Args:
	        query: The user's question.
	        answer: Generated answer text.
	        context_chunks: Retrieved context texts used to produce the answer.
	        retrieval_scores: Retrieval similarity scores in rank order.

	    Returns:
	        A dict with the keys ``faithfulness``, ``answer_relevance``,
	        ``context_precision``, ``overall`` (mean of the three, rounded to
	        4 decimals), ``quality_label`` and ``note``. For empty, fallback or
	        error answers every score is 0.0, the label is "N/A" and ``note``
	        explains that evaluation was skipped.
	    """
	    # Skip evaluation for fallback/error responses. The check runs on the
	    # stripped text so leading whitespace or a whitespace-only answer cannot
	    # slip past it.
	    answer_text = answer.strip() if answer else ""
	    if (
	        not answer_text
	        or answer_text.startswith("⚠️")
	        or "unable to find" in answer_text.lower()
	    ):
	        return {
	            "faithfulness": 0.0,
	            "answer_relevance": 0.0,
	            "context_precision": 0.0,
	            "overall": 0.0,
	            "quality_label": "N/A",
	            "note": "Evaluation skipped — fallback or error response.",
	        }

	    faithfulness = compute_faithfulness(answer, context_chunks)
	    answer_relevance = compute_answer_relevance(answer, query)
	    context_precision = compute_context_precision(retrieval_scores)

	    overall = round((faithfulness + answer_relevance + context_precision) / 3, 4)
	    quality_label = _quality_label(overall)

	    # Plain "|" separators: "\|" is an invalid escape sequence in a Python
	    # string literal and would also emit a literal backslash.
	    logger.info(
	        "Eval scores — Faithfulness: %s | "
	        "Relevance: %s | "
	        "Precision: %s | "
	        "Overall: %s",
	        faithfulness,
	        answer_relevance,
	        context_precision,
	        overall,
	    )

	    return {
	        "faithfulness": faithfulness,
	        "answer_relevance": answer_relevance,
	        "context_precision": context_precision,
	        "overall": overall,
	        "quality_label": quality_label,
	        "note": "",
	    }


	def _quality_label(score: float) -> str:
	if score >= 0.75:
	return "✅ High Quality"
	elif score >= 0.50:
	return "🟡 Moderate Quality"
	else:
	return "🔴 Low Quality — Review Answer"


	def _split_sentences(text: str) -> list:
	"""Simple sentence splitter — no NLTK dependency required."""
	sentences = re.split(r"(?<=[.!?])\s+", text.strip())
	return [s.strip() for s in sentences if len(s.strip()) > 10]