""" FR-06: src/evaluation/ragas_eval.py — RAGAS Faithfulness + Answer Relevancy ============================================================================= Wraps the ragas library to compute: - faithfulness : context-grounded claim verification - answer_relevancy : semantic similarity of answer to question Requires an LLM backend. Supported backends (in priority order): 1. Ollama (local, free) — set OLLAMA_HOST env var or use default localhost:11434 2. OpenAI API — set OPENAI_API_KEY env var 3. Graceful degradation — returns score=None with explanation if no LLM available Usage: from src.evaluation.ragas_eval import score_ragas result = score_ragas(question, answer, context_docs) SRS reference: FR-06, Section 7 (Evaluation Pipeline) """ from __future__ import annotations import logging import os import time from typing import Optional from src.modules.base import EvalResult logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Backend detection # --------------------------------------------------------------------------- def _detect_llm_backend() -> Optional[str]: """Return 'ollama', 'openai', or None.""" if os.getenv("OPENAI_API_KEY"): return "openai" # Check if Ollama is running locally try: import requests host = os.getenv("OLLAMA_HOST", "http://localhost:11434") resp = requests.get(f"{host}/api/tags", timeout=2) if resp.status_code == 200: return "ollama" except Exception: pass return None def _build_ragas_llm(backend: str): """Build a ragas-compatible LLM wrapper.""" if backend == "openai": from langchain_openai import ChatOpenAI return ChatOpenAI(model="gpt-3.5-turbo", temperature=0) elif backend == "ollama": from langchain_community.chat_models import ChatOllama host = os.getenv("OLLAMA_HOST", "http://localhost:11434") model = os.getenv("OLLAMA_MODEL", "mistral") return ChatOllama(base_url=host, model=model) raise ValueError(f"Unknown backend: {backend}") def _build_ragas_embeddings(backend: str): """Build a ragas-compatible embeddings wrapper.""" if backend == "openai": from langchain_openai import OpenAIEmbeddings return OpenAIEmbeddings() elif backend == "ollama": from langchain_community.embeddings import OllamaEmbeddings host = os.getenv("OLLAMA_HOST", "http://localhost:11434") model = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text") return OllamaEmbeddings(base_url=host, model=model) raise ValueError(f"Unknown backend: {backend}") # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def score_ragas( question: str, answer: str, context_docs: list[str], max_contexts: int = 3, ) -> EvalResult: """ Compute RAGAS faithfulness and answer_relevancy scores. Args: question : Original user question. answer : LLM-generated answer. context_docs : Retrieved context passages. max_contexts : Max context chunks to pass to RAGAS (to limit token cost). Returns: EvalResult with module_name="ragas", score in [0,1]. score = mean(faithfulness, answer_relevancy). Returns score=0.5 (neutral) if no LLM backend is available. """ t0 = time.perf_counter() backend = _detect_llm_backend() if backend is None: logger.warning( "No LLM backend available for RAGAS. " "Set OPENAI_API_KEY or start Ollama (ollama serve). " "Returning neutral score (0.5)." ) return EvalResult( module_name="ragas", score=0.5, details={ "backend": None, "faithfulness": None, "answer_relevancy": None, "note": "No LLM backend — set OPENAI_API_KEY or start Ollama", }, latency_ms=int((time.perf_counter() - t0) * 1000), ) try: from datasets import Dataset from ragas import evaluate from ragas.metrics import faithfulness, answer_relevancy llm = _build_ragas_llm(backend) embeddings = _build_ragas_embeddings(backend) # Configure metrics to use our chosen backend faithfulness.llm = llm faithfulness.embeddings = embeddings answer_relevancy.llm = llm answer_relevancy.embeddings = embeddings contexts = context_docs[:max_contexts] dataset = Dataset.from_dict( { "question": [question], "answer": [answer], "contexts": [contexts], } ) result = evaluate(dataset, metrics=[faithfulness, answer_relevancy]) faith_score = float(result["faithfulness"]) relevancy_score = float(result["answer_relevancy"]) composite = (faith_score + relevancy_score) / 2.0 details = { "backend": backend, "faithfulness": round(faith_score, 4), "answer_relevancy": round(relevancy_score, 4), } latency_ms = int((time.perf_counter() - t0) * 1000) logger.info( "RAGAS: faith=%.3f, relevancy=%.3f → composite=%.3f in %d ms", faith_score, relevancy_score, composite, latency_ms, ) return EvalResult( module_name="ragas", score=composite, details=details, latency_ms=latency_ms, ) except Exception as exc: logger.error("RAGAS evaluation failed: %s", exc) return EvalResult( module_name="ragas", score=0.5, details={"backend": backend, "error": str(exc)}, error=str(exc), latency_ms=int((time.perf_counter() - t0) * 1000), )