File size: 6,144 Bytes
cdb228e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""Evaluation module for the RAG system using Ragas.
This script provides tools to measure faithfulness, relevancy, and retrieval precision.

How to run:
    python eval.py <testset_csv_path>
"""

# pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel

import os
import logging
import pandas as pd
from typing import List, Optional, Any
from datasets import Dataset
from ragas import evaluate
from ragas.metrics.collections import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

try:
    from langchain.chat_models import ChatOpenAI
except Exception:
    from langchain_openai import ChatOpenAI

try:
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    from langchain_community.embeddings import HuggingFaceEmbeddings


def run_evaluation(
    questions: List[str],
    answers: List[str],
    contexts: List[List[str]],
    ground_truths: Optional[List[str]] = None,
) -> Any:
    """
    Run Ragas evaluation on a set of QA results.

    Parameters
    ----------
    questions : List[str]
        List of user questions.
    answers : List[str]
        List of generated answers.
    contexts : List[List[str]]
        List of context strings retrieved for each question.
    ground_truths : List[str], optional
        Optional list of ground truth answers for recall metrics.

    Returns
    -------
    Any
        Ragas evaluation results containing metric scores.

    Notes
    -----
    When ``OPENROUTER_API_KEY`` is set, this function mutates the process
    environment (``OPENAI_API_BASE`` / ``OPENAI_API_KEY``) so that
    downstream OpenAI-compatible clients route through OpenRouter.
    """
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
    if ground_truths:
        data["ground_truth"] = ground_truths

    # Ragas evaluate works best with dataset objects
    dataset = Dataset.from_dict(data)

    # Use OpenRouter if key is available, else default to OpenAI
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        # Use OpenRouter-compatible base and forward the key as the OpenAI key
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = openrouter_key

    # Allow overriding the eval model via env var; OPENAI_MODEL wins over
    # EVAL_MODEL, then fall back to a compatible default.
    eval_model = os.getenv(
        "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b")
    )
    logging.info("Using evaluation LLM model=%s", eval_model)
    # Allow overriding how many generations ragas requests from the LLM.
    # Some providers (or models) ignore multi-generation requests; default to 1 to avoid warnings.
    try:
        num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1"))
    except ValueError:
        # os.getenv with a default always yields a str, so a non-numeric
        # override (e.g. "two") is the only failure mode here.
        num_gens = 1
    logging.info("Requesting %s generation(s) per prompt", num_gens)
    try:
        llm = ChatOpenAI(model=eval_model, n=num_gens)
    except TypeError:
        # Some ChatOpenAI wrappers do not accept `n` at construction; fall back to default.
        llm = ChatOpenAI(model=eval_model)

    # Use the same embeddings as the main app for consistency
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    logging.info("Starting Ragas evaluation...")
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=llm,
        embeddings=embeddings,
    )
    logging.info("Evaluation complete.")

    return result


def extract_scalar_metrics(result: Any) -> dict:
    """Extract common scalar metrics (faithfulness, relevancy, precision, recall)
    from a ragas evaluation result. Returns a dict of metric->float or empty dict.

    The result object is walked recursively (dicts, lists/tuples, and object
    ``__dict__``s); any numeric value keyed by a metric name of interest is
    collected. Cyclic structures are tolerated via an id-based visited set,
    so a self-referential result cannot exhaust the recursion stack.
    """
    keys_of_interest = {
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "relevancy",
        "precision",
        "recall",
    }

    found: dict = {}
    # ids of containers already visited; prevents infinite recursion on cycles
    seen: set = set()

    def is_number(x):
        # bool is a subclass of int; exclude it so True/False are not scores
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    def traverse(obj):
        if isinstance(obj, dict):
            if id(obj) in seen:
                return
            seen.add(id(obj))
            for k, v in obj.items():
                if (
                    isinstance(k, str)
                    and k.lower() in keys_of_interest
                    and is_number(v)
                ):
                    found[k.lower()] = float(v)
                traverse(v)
        elif isinstance(obj, (list, tuple)):
            if id(obj) in seen:
                return
            seen.add(id(obj))
            for v in obj:
                traverse(v)
        else:
            # Best effort: descend into arbitrary objects via their __dict__.
            try:
                if hasattr(obj, "__dict__") and id(obj) not in seen:
                    seen.add(id(obj))
                    traverse(vars(obj))
            except Exception:
                pass

    try:
        traverse(result)
        # check common attrs
        for attr in ("metrics", "results", "scores", "score"):
            try:
                val = getattr(result, attr, None)
                if val is not None:
                    traverse(val)
            except Exception:
                pass
    except Exception:
        pass

    return found


def evaluate_from_csv(csv_path: str) -> Any:
    """
    Load a testset from CSV and run evaluation.

    Parameters
    ----------
    csv_path : str
        Path to the testset CSV.

    Returns
    -------
    Any
        Evaluation results.
    """
    import ast

    frame = pd.read_csv(csv_path)

    # Ragas testset generation typically provides 'question', 'answer',
    # 'contexts', 'ground_truth'. CSV serialization turns the list-valued
    # 'contexts' cells into their string repr; parse those back into real
    # lists while leaving any non-string cells untouched.
    def _parse_contexts(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    frame["contexts"] = frame["contexts"].map(_parse_contexts)

    # Ground truths are optional in the testset.
    truths = (
        frame["ground_truth"].tolist() if "ground_truth" in frame.columns else None
    )

    return run_evaluation(
        questions=frame["question"].tolist(),
        answers=frame["answer"].tolist(),
        contexts=frame["contexts"].tolist(),
        ground_truths=truths,
    )


if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    args = sys.argv[1:]
    if not args:
        # No testset supplied; nothing to evaluate.
        logging.info("Eval module ready. Pass a CSV file to evaluate.")
    else:
        print(evaluate_from_csv(args[0]))