MediRAG-API / src /evaluation /ragas_eval.py
joytheslothh's picture
deploy: clean build
b6f9fa8
"""
FR-06: src/evaluation/ragas_eval.py — RAGAS Faithfulness + Answer Relevancy
=============================================================================
Wraps the ragas library to compute:
- faithfulness : context-grounded claim verification
- answer_relevancy : semantic similarity of answer to question
Requires an LLM backend. Supported backends (in priority order):
1. Ollama (local, free) — set OLLAMA_HOST env var or use default localhost:11434
2. OpenAI API — set OPENAI_API_KEY env var
3. Graceful degradation — returns score=None with explanation if no LLM available
Usage:
from src.evaluation.ragas_eval import score_ragas
result = score_ragas(question, answer, context_docs)
SRS reference: FR-06, Section 7 (Evaluation Pipeline)
"""
from __future__ import annotations
import logging
import os
import time
from typing import Optional
from src.modules.base import EvalResult
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Backend detection
# ---------------------------------------------------------------------------
def _detect_llm_backend() -> Optional[str]:
"""Return 'ollama', 'openai', or None."""
if os.getenv("OPENAI_API_KEY"):
return "openai"
# Check if Ollama is running locally
try:
import requests
host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
resp = requests.get(f"{host}/api/tags", timeout=2)
if resp.status_code == 200:
return "ollama"
except Exception:
pass
return None
def _build_ragas_llm(backend: str):
"""Build a ragas-compatible LLM wrapper."""
if backend == "openai":
from langchain_openai import ChatOpenAI
return ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
elif backend == "ollama":
from langchain_community.chat_models import ChatOllama
host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
model = os.getenv("OLLAMA_MODEL", "mistral")
return ChatOllama(base_url=host, model=model)
raise ValueError(f"Unknown backend: {backend}")
def _build_ragas_embeddings(backend: str):
"""Build a ragas-compatible embeddings wrapper."""
if backend == "openai":
from langchain_openai import OpenAIEmbeddings
return OpenAIEmbeddings()
elif backend == "ollama":
from langchain_community.embeddings import OllamaEmbeddings
host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
model = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text")
return OllamaEmbeddings(base_url=host, model=model)
raise ValueError(f"Unknown backend: {backend}")
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def score_ragas(
question: str,
answer: str,
context_docs: list[str],
max_contexts: int = 3,
) -> EvalResult:
"""
Compute RAGAS faithfulness and answer_relevancy scores.
Args:
question : Original user question.
answer : LLM-generated answer.
context_docs : Retrieved context passages.
max_contexts : Max context chunks to pass to RAGAS (to limit token cost).
Returns:
EvalResult with module_name="ragas", score in [0,1].
score = mean(faithfulness, answer_relevancy).
Returns score=0.5 (neutral) if no LLM backend is available.
"""
t0 = time.perf_counter()
backend = _detect_llm_backend()
if backend is None:
logger.warning(
"No LLM backend available for RAGAS. "
"Set OPENAI_API_KEY or start Ollama (ollama serve). "
"Returning neutral score (0.5)."
)
return EvalResult(
module_name="ragas",
score=0.5,
details={
"backend": None,
"faithfulness": None,
"answer_relevancy": None,
"note": "No LLM backend — set OPENAI_API_KEY or start Ollama",
},
latency_ms=int((time.perf_counter() - t0) * 1000),
)
try:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
llm = _build_ragas_llm(backend)
embeddings = _build_ragas_embeddings(backend)
# Configure metrics to use our chosen backend
faithfulness.llm = llm
faithfulness.embeddings = embeddings
answer_relevancy.llm = llm
answer_relevancy.embeddings = embeddings
contexts = context_docs[:max_contexts]
dataset = Dataset.from_dict(
{
"question": [question],
"answer": [answer],
"contexts": [contexts],
}
)
result = evaluate(dataset, metrics=[faithfulness, answer_relevancy])
faith_score = float(result["faithfulness"])
relevancy_score = float(result["answer_relevancy"])
composite = (faith_score + relevancy_score) / 2.0
details = {
"backend": backend,
"faithfulness": round(faith_score, 4),
"answer_relevancy": round(relevancy_score, 4),
}
latency_ms = int((time.perf_counter() - t0) * 1000)
logger.info(
"RAGAS: faith=%.3f, relevancy=%.3f → composite=%.3f in %d ms",
faith_score, relevancy_score, composite, latency_ms,
)
return EvalResult(
module_name="ragas",
score=composite,
details=details,
latency_ms=latency_ms,
)
except Exception as exc:
logger.error("RAGAS evaluation failed: %s", exc)
return EvalResult(
module_name="ragas",
score=0.5,
details={"backend": backend, "error": str(exc)},
error=str(exc),
latency_ms=int((time.perf_counter() - t0) * 1000),
)