Spaces:
Build error
Build error
| """ | |
evaluation.py — Custom RAG evaluation metrics (no paid LLM required).

Three cosine-similarity-based metrics:

1. FAITHFULNESS
   Does the answer contain only claims supported by the retrieved context?
   Method: For each answer sentence, find its max cosine similarity to any
   context chunk. Average these scores.
   Low score = potential hallucination.

2. ANSWER RELEVANCE
   Does the answer address what was actually asked?
   Method: Cosine similarity between answer embedding and query embedding.

3. CONTEXT PRECISION
   Are the retrieved chunks relevant to the query?
   Method: Rank-weighted average of FAISS retrieval scores.

These are proxy metrics. They correlate well with quality and run in
milliseconds with zero API cost. For production, augment with periodic
human evaluation on a curated golden test set.
| """ | |
| import logging | |
| import re | |
| import numpy as np | |
| from src.embeddings import embed_texts | |
| logger = logging.getLogger("enterprise-rag.evaluation") | |
def compute_faithfulness(answer: str, context_chunks: list) -> float:
    """
    Faithfulness score: how grounded is the answer in the retrieved context?

    Each answer sentence is compared with every context chunk; the best
    cosine similarity per sentence is kept and those maxima are averaged.

    Args:
        answer: Generated answer text.
        context_chunks: Retrieved context chunks. ``None`` entries and blank
            strings are ignored because they carry no evidence.

    Returns:
        Score rounded to 4 decimals and clamped to the range
        0.0 (hallucinated) to 1.0 (fully grounded). Returns 0.0 when there is
        nothing to compare, when embedding fails, or when the score is not
        finite.
    """
    if not answer or not context_chunks:
        return 0.0

    # Drop empty evidence before embedding so blank chunks cannot contribute
    # an arbitrary similarity value.
    chunks = [
        chunk
        for chunk in context_chunks
        if chunk is not None
        and not (isinstance(chunk, str) and not chunk.strip())
    ]
    if not chunks:
        return 0.0

    sentences = _split_sentences(answer)
    if not sentences:
        return 0.0

    try:
        # Assumes embed_texts returns one row per input text (2-D array-like)
        # — TODO confirm against src.embeddings.
        answer_embs = np.asarray(embed_texts(sentences), dtype=np.float64)
        context_embs = np.asarray(embed_texts(chunks), dtype=np.float64)
        if answer_embs.shape[0] == 0 or context_embs.shape[0] == 0:
            return 0.0

        # Normalise explicitly so the dot product below is a true cosine
        # similarity even if the embeddings are not unit-length. The floor
        # keeps all-zero vectors at similarity 0 instead of dividing by zero.
        eps = 1e-12
        answer_embs = answer_embs / np.maximum(
            np.linalg.norm(answer_embs, axis=1, keepdims=True), eps
        )
        context_embs = context_embs / np.maximum(
            np.linalg.norm(context_embs, axis=1, keepdims=True), eps
        )

        # shape: (n_sentences, n_chunks)
        sim_matrix = np.dot(answer_embs, context_embs.T)
        max_sims = sim_matrix.max(axis=1)
        score = float(np.mean(max_sims))

        # NaN would slip through the min/max clamp unchanged.
        if not np.isfinite(score):
            return 0.0
        return round(min(max(score, 0.0), 1.0), 4)
    except Exception as e:
        # Best-effort metric: a failure here must not break the caller.
        logger.warning("Faithfulness computation failed: %s", e)
        return 0.0
def compute_answer_relevance(answer: str, query: str) -> float:
    """
    Answer relevance: does the answer address the question?

    Computed as the cosine similarity between the query embedding and the
    answer embedding.

    Args:
        answer: Generated answer text.
        query: The user's question.

    Returns:
        Score rounded to 4 decimals and clamped to the range
        0.0 (off-topic) to 1.0 (directly answers question). Returns 0.0 when
        either text is missing or whitespace-only, when embedding fails, or
        when the score is not finite.
    """
    # Strip first: a whitespace-only string is truthy but has nothing to embed.
    answer_text = answer.strip() if isinstance(answer, str) else ""
    query_text = query.strip() if isinstance(query, str) else ""
    if not answer_text or not query_text:
        return 0.0

    try:
        # Assumes embed_texts returns one row per input text — TODO confirm.
        embs = np.asarray(
            embed_texts([query_text, answer_text]), dtype=np.float64
        )
        if embs.shape[0] < 2:
            return 0.0

        query_vec, answer_vec = embs[0], embs[1]
        # Divide by the norms so the result is a cosine similarity even if
        # the embeddings are not unit-length.
        denom = float(np.linalg.norm(query_vec) * np.linalg.norm(answer_vec))
        if denom == 0.0:
            return 0.0
        score = float(np.dot(query_vec, answer_vec)) / denom

        # NaN would slip through the min/max clamp unchanged.
        if not np.isfinite(score):
            return 0.0
        return round(min(max(score, 0.0), 1.0), 4)
    except Exception as e:
        # Best-effort metric: a failure here must not break the caller.
        logger.warning("Answer relevance computation failed: %s", e)
        return 0.0
def compute_context_precision(scores: list) -> float:
    """
    Context precision: are the retrieved chunks relevant?

    Uses a rank-weighted average of retrieval similarity scores, where the
    score at 0-based position ``rank`` gets weight ``1 / (rank + 1)`` so
    earlier results count more.

    Args:
        scores: Retrieval scores in rank order (best first). Lists, tuples
            and NumPy arrays are accepted; multi-dimensional input is
            flattened.

    Returns:
        Score rounded to 4 decimals and clamped to the range
        0.0 (poor retrieval) to 1.0 (perfect retrieval). Returns 0.0 for
        empty input or when the weighted average is not finite (e.g. a NaN
        score).
    """
    # `not array` is ambiguous for NumPy arrays and raises ValueError, so
    # arrays skip the truthiness check and are tested by size below.
    if not isinstance(scores, np.ndarray) and not scores:
        return 0.0

    values = np.asarray(scores, dtype=np.float64).ravel()
    if values.size == 0:
        return 0.0

    # Reciprocal-rank weights: 1, 1/2, 1/3, ...
    weights = 1.0 / np.arange(1, values.size + 1)
    score = float(np.dot(values, weights) / weights.sum())

    # NaN would slip through the min/max clamp unchanged.
    if not np.isfinite(score):
        return 0.0
    return round(min(max(score, 0.0), 1.0), 4)
def run_evaluation(
    query: str,
    answer: str,
    context_chunks: list,
    retrieval_scores: list,
) -> dict:
    """
    Compute faithfulness, answer relevance and context precision for one
    query/answer pair and bundle them into a summary dict.

    The module notes describe this as the entry point app.py calls after
    each query to fill the metrics panel.

    Returns:
        Dict with keys ``faithfulness``, ``answer_relevance``,
        ``context_precision``, ``overall`` (mean of the three, rounded to
        4 decimals), ``quality_label`` and ``note``. Fallback or error
        answers are not scored: all numbers are 0.0 and the label is "N/A".
    """
    # Fallback/error responses are recognised by being empty, by the warning
    # prefix, or by the "unable to find" phrase (case-insensitive).
    is_fallback = (
        not answer
        or answer.startswith("⚠️")
        or "unable to find" in answer.lower()
    )
    if is_fallback:
        skipped = dict.fromkeys(
            ("faithfulness", "answer_relevance", "context_precision", "overall"),
            0.0,
        )
        skipped["quality_label"] = "N/A"
        skipped["note"] = "Evaluation skipped — fallback or error response."
        return skipped

    metrics = {
        "faithfulness": compute_faithfulness(answer, context_chunks),
        "answer_relevance": compute_answer_relevance(answer, query),
        "context_precision": compute_context_precision(retrieval_scores),
    }
    mean_score = round(sum(metrics.values()) / 3, 4)
    label = _quality_label(mean_score)

    message = (
        f"Eval scores — Faithfulness: {metrics['faithfulness']} | "
        f"Relevance: {metrics['answer_relevance']} | "
        f"Precision: {metrics['context_precision']} | "
        f"Overall: {mean_score}"
    )
    logger.info(message)

    return {
        **metrics,
        "overall": mean_score,
        "quality_label": label,
        "note": "",
    }
| def _quality_label(score: float) -> str: | |
| if score >= 0.75: | |
| return "✅ High Quality" | |
| elif score >= 0.50: | |
| return "🟡 Moderate Quality" | |
| else: | |
| return "🔴 Low Quality — Review Answer" | |
| def _split_sentences(text: str) -> list: | |
| """Simple sentence splitter — no NLTK dependency required.""" | |
| sentences = re.split(r"(?<=[.!?])\s+", text.strip()) | |
| return [s.strip() for s in sentences if len(s.strip()) > 10] |