File size: 6,061 Bytes
b6f9fa8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""
FR-06: src/evaluation/ragas_eval.py — RAGAS Faithfulness + Answer Relevancy
=============================================================================
Wraps the ragas library to compute:
    - faithfulness      : context-grounded claim verification
    - answer_relevancy  : semantic similarity of answer to question

Requires an LLM backend. Supported backends (in detection order):
    1. OpenAI API            — used whenever the OPENAI_API_KEY env var is set
    2. Ollama (local, free)  — set OLLAMA_HOST env var or use default localhost:11434
    3. Graceful degradation  — returns a neutral score=0.5 with an explanation if no LLM is available

Usage:
    from src.evaluation.ragas_eval import score_ragas
    result = score_ragas(question, answer, context_docs)

SRS reference: FR-06, Section 7 (Evaluation Pipeline)
"""
from __future__ import annotations

import logging
import os
import time
from typing import Optional

from src.modules.base import EvalResult

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Backend detection
# ---------------------------------------------------------------------------

def _detect_llm_backend() -> Optional[str]:
    """Return 'ollama', 'openai', or None."""
    if os.getenv("OPENAI_API_KEY"):
        return "openai"
    # Check if Ollama is running locally
    try:
        import requests
        host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
        resp = requests.get(f"{host}/api/tags", timeout=2)
        if resp.status_code == 200:
            return "ollama"
    except Exception:
        pass
    return None


def _build_ragas_llm(backend: str):
    """Build a ragas-compatible LLM wrapper."""
    if backend == "openai":
        from langchain_openai import ChatOpenAI
        return ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    elif backend == "ollama":
        from langchain_community.chat_models import ChatOllama
        host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
        model = os.getenv("OLLAMA_MODEL", "mistral")
        return ChatOllama(base_url=host, model=model)
    raise ValueError(f"Unknown backend: {backend}")


def _build_ragas_embeddings(backend: str):
    """Build a ragas-compatible embeddings wrapper."""
    if backend == "openai":
        from langchain_openai import OpenAIEmbeddings
        return OpenAIEmbeddings()
    elif backend == "ollama":
        from langchain_community.embeddings import OllamaEmbeddings
        host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
        model = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text")
        return OllamaEmbeddings(base_url=host, model=model)
    raise ValueError(f"Unknown backend: {backend}")


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def score_ragas(
    question: str,
    answer: str,
    context_docs: list[str],
    max_contexts: int = 3,
) -> EvalResult:
    """
    Compute RAGAS faithfulness and answer_relevancy scores.

    Args:
        question     : Original user question.
        answer       : LLM-generated answer.
        context_docs : Retrieved context passages.
        max_contexts : Max context chunks to pass to RAGAS (to limit token cost).
                       Only the first ``max_contexts`` entries are used.

    Returns:
        EvalResult with module_name="ragas".
        On success, score = mean(faithfulness, answer_relevancy) and
        ``details`` holds the backend name plus both metrics rounded to
        4 decimals.
        Returns score=0.5 (neutral) if no LLM backend is available.
        Returns score=0.5 (neutral) with ``error`` set if the evaluation
        raises or if RAGAS reports NaN for either metric.

    This function does not raise for evaluation failures; they are logged
    and reported through the returned EvalResult.
    """
    import math  # local import, in line with the lazy imports used below

    t0 = time.perf_counter()

    backend = _detect_llm_backend()
    if backend is None:
        logger.warning(
            "No LLM backend available for RAGAS. "
            "Set OPENAI_API_KEY or start Ollama (ollama serve). "
            "Returning neutral score (0.5)."
        )
        return EvalResult(
            module_name="ragas",
            score=0.5,
            details={
                "backend": None,
                "faithfulness": None,
                "answer_relevancy": None,
                "note": "No LLM backend — set OPENAI_API_KEY or start Ollama",
            },
            latency_ms=int((time.perf_counter() - t0) * 1000),
        )

    try:
        # Heavy optional dependencies are imported lazily so that the
        # no-backend path above works without them installed.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import faithfulness, answer_relevancy

        llm = _build_ragas_llm(backend)
        embeddings = _build_ragas_embeddings(backend)

        # Configure metrics to use our chosen backend.
        # NOTE: these are the metric objects exported by ragas.metrics, so
        # the assignments persist on them after this call returns.
        faithfulness.llm = llm
        faithfulness.embeddings = embeddings
        answer_relevancy.llm = llm
        answer_relevancy.embeddings = embeddings

        contexts = context_docs[:max_contexts]
        # Single-row dataset: one question/answer pair with its contexts.
        dataset = Dataset.from_dict(
            {
                "question": [question],
                "answer": [answer],
                "contexts": [contexts],
            }
        )

        result = evaluate(dataset, metrics=[faithfulness, answer_relevancy])

        faith_score = float(result["faithfulness"])
        relevancy_score = float(result["answer_relevancy"])

        # A NaN metric would otherwise leak into the composite and violate
        # the score contract; treat it as a failed evaluation and let the
        # handler below produce the neutral result.
        nan_metrics = [
            name
            for name, value in (
                ("faithfulness", faith_score),
                ("answer_relevancy", relevancy_score),
            )
            if math.isnan(value)
        ]
        if nan_metrics:
            raise ValueError(
                "RAGAS returned NaN for: " + ", ".join(nan_metrics)
            )

        composite = (faith_score + relevancy_score) / 2.0

        details = {
            "backend": backend,
            "faithfulness": round(faith_score, 4),
            "answer_relevancy": round(relevancy_score, 4),
        }

        latency_ms = int((time.perf_counter() - t0) * 1000)
        logger.info(
            "RAGAS: faith=%.3f, relevancy=%.3f → composite=%.3f in %d ms",
            faith_score, relevancy_score, composite, latency_ms,
        )
        return EvalResult(
            module_name="ragas",
            score=composite,
            details=details,
            latency_ms=latency_ms,
        )

    except Exception as exc:
        # Top-level boundary for the evaluation: never propagate, but keep
        # the traceback in the log so failures can be debugged.
        logger.exception("RAGAS evaluation failed: %s", exc)
        return EvalResult(
            module_name="ragas",
            score=0.5,
            details={"backend": backend, "error": str(exc)},
            error=str(exc),
            latency_ms=int((time.perf_counter() - t0) * 1000),
        )