"""Evaluation module for the RAG system using Ragas.
This script provides tools to measure faithfulness, relevancy, and retrieval precision.
How to run:
python eval.py <testset_csv_path>
"""
# pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel
import logging
import os
from typing import Any, List, Optional

import pandas as pd
from datasets import Dataset
from ragas import evaluate

# The lowercase metric singletons historically live in `ragas.metrics`;
# fall back to `ragas.metrics.collections` for releases that expose them there.
try:
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )
except ImportError:
    from ragas.metrics.collections import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )

# ChatOpenAI moved between langchain packages across versions.
try:
    from langchain.chat_models import ChatOpenAI
except Exception:
    from langchain_openai import ChatOpenAI

# HuggingFaceEmbeddings likewise moved out of langchain_community.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    from langchain_community.embeddings import HuggingFaceEmbeddings
def run_evaluation(
    questions: List[str],
    answers: List[str],
    contexts: List[List[str]],
    ground_truths: Optional[List[str]] = None,
) -> Any:
    """
    Run Ragas evaluation on a set of QA results.

    Parameters
    ----------
    questions : List[str]
        List of user questions.
    answers : List[str]
        List of generated answers.
    contexts : List[List[str]]
        List of context strings retrieved for each question.
    ground_truths : List[str], optional
        Optional list of ground truth answers for recall metrics.

    Returns
    -------
    Any
        Ragas evaluation results containing metric scores.

    Raises
    ------
    ValueError
        If the input lists are empty or have mismatched lengths.
    """
    # Fail fast on malformed input instead of surfacing a confusing Ragas error.
    if not questions:
        raise ValueError("questions must be a non-empty list")
    if not len(questions) == len(answers) == len(contexts):
        raise ValueError("questions, answers and contexts must have the same length")
    if ground_truths is not None and len(ground_truths) != len(questions):
        raise ValueError("ground_truths must match the number of questions")

    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
    # context_recall needs reference answers; only request it when they exist,
    # otherwise the metric has nothing to score against.
    metrics = [faithfulness, answer_relevancy, context_precision]
    if ground_truths:
        data["ground_truth"] = ground_truths
        metrics.append(context_recall)

    # Ragas evaluate works best with dataset objects
    dataset = Dataset.from_dict(data)

    # Use OpenRouter if key is available, else default to OpenAI
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        # Use OpenRouter-compatible base and forward the key as the OpenAI key
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = openrouter_key

    # Allow overriding the eval model via env var; default to a compatible model
    eval_model = os.getenv(
        "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b")
    )
    logging.info("Using evaluation LLM model=%s", eval_model)

    # Allow overriding how many generations ragas requests from the LLM.
    # Some providers (or models) ignore multi-generation requests; default to 1
    # to avoid warnings.
    try:
        num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1"))
    except ValueError:
        # Non-numeric override: fall back to a single generation.
        num_gens = 1
    logging.info("Requesting %s generation(s) per prompt", num_gens)

    try:
        llm = ChatOpenAI(model=eval_model, n=num_gens)
    except TypeError:
        # Some ChatOpenAI wrappers do not accept `n` at construction; fall back to default.
        llm = ChatOpenAI(model=eval_model)

    # Use the same embeddings as the main app for consistency
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    logging.info("Starting Ragas evaluation...")
    result = evaluate(
        dataset=dataset,
        metrics=metrics,
        llm=llm,
        embeddings=embeddings,
    )
    logging.info("Evaluation complete.")
    return result
def extract_scalar_metrics(result: Any) -> dict:
    """Extract common scalar metrics (faithfulness, relevancy, precision, recall)
    from a ragas evaluation result.

    Walks dicts, lists/tuples and object ``__dict__``s looking for numeric
    values stored under well-known metric names (case-insensitive). Returns a
    dict of metric->float, or an empty dict when nothing matches. Never raises:
    callers treat ``{}`` as "no metrics found".
    """
    keys_of_interest = {
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "relevancy",
        "precision",
        "recall",
    }
    found: dict = {}
    # ids of containers already visited — result objects can be
    # self-referential, and without this guard traverse() would recurse
    # until RecursionError and silently truncate the extracted metrics.
    seen: set = set()

    def is_number(x: Any) -> bool:
        # bool is an int subclass; True/False are not metric scores.
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    def traverse(obj: Any) -> None:
        if id(obj) in seen:
            return
        if isinstance(obj, dict):
            seen.add(id(obj))
            for k, v in obj.items():
                if (
                    isinstance(k, str)
                    and k.lower() in keys_of_interest
                    and is_number(v)
                ):
                    found[k.lower()] = float(v)
                traverse(v)
        elif isinstance(obj, (list, tuple)):
            seen.add(id(obj))
            for v in obj:
                traverse(v)
        else:
            try:
                if hasattr(obj, "__dict__"):
                    seen.add(id(obj))
                    traverse(vars(obj))
            except Exception:
                # Best effort: exotic objects may refuse attribute access.
                pass

    try:
        traverse(result)
        # Also probe attributes where ragas versions commonly stash scores.
        for attr in ("metrics", "results", "scores", "score"):
            try:
                val = getattr(result, attr, None)
                if val is not None:
                    traverse(val)
            except Exception:
                pass
    except Exception:
        # This helper must never raise; return whatever was collected.
        pass
    return found
def evaluate_from_csv(csv_path: str) -> Any:
    """
    Load a testset from CSV and run evaluation.

    Parameters
    ----------
    csv_path : str
        Path to the testset CSV.

    Returns
    -------
    Any
        Evaluation results.
    """
    import ast

    frame = pd.read_csv(csv_path)

    # Ragas testset generation typically provides 'question', 'answer',
    # 'contexts', 'ground_truth'. The 'contexts' column is often stored as a
    # string representation of a list in CSV, so parse it back to a list.
    def _parse_contexts(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    frame["contexts"] = frame["contexts"].map(_parse_contexts)

    truths = None
    if "ground_truth" in frame.columns:
        truths = frame["ground_truth"].tolist()

    return run_evaluation(
        questions=frame["question"].tolist(),
        answers=frame["answer"].tolist(),
        contexts=frame["contexts"].tolist(),
        ground_truths=truths,
    )
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    # Expect the testset CSV path as the first CLI argument.
    if len(sys.argv) < 2:
        logging.info("Eval module ready. Pass a CSV file to evaluate.")
    else:
        print(evaluate_from_csv(sys.argv[1]))