# rags_api / eval.py
# Skier8402's picture — Upload 3 files (cdb228e verified)
"""Evaluation module for the RAG system using Ragas.
This script provides tools to measure faithfulness, relevancy, and retrieval precision.
How to run:
python eval.py <testset_csv_path>
"""
# pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel
import os
import logging
import pandas as pd
from typing import List, Optional, Any
from datasets import Dataset
from ragas import evaluate
from ragas.metrics.collections import (
faithfulness,
answer_relevancy,
context_precision,
context_recall,
)
try:
from langchain.chat_models import ChatOpenAI
except Exception:
from langchain_openai import ChatOpenAI
try:
from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
from langchain_community.embeddings import HuggingFaceEmbeddings
def run_evaluation(
    questions: List[str],
    answers: List[str],
    contexts: List[List[str]],
    ground_truths: Optional[List[str]] = None,
) -> Any:
    """
    Run Ragas evaluation on a set of QA results.

    Parameters
    ----------
    questions : List[str]
        List of user questions.
    answers : List[str]
        List of generated answers.
    contexts : List[List[str]]
        List of context strings retrieved for each question.
    ground_truths : List[str], optional
        Optional list of ground truth answers for recall metrics.

    Returns
    -------
    Any
        Ragas evaluation results containing metric scores.
    """
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
    if ground_truths:
        data["ground_truth"] = ground_truths
    # Ragas evaluate works best with dataset objects.
    dataset = Dataset.from_dict(data)

    # Use OpenRouter if a key is available, else default to OpenAI.
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        # OpenRouter exposes an OpenAI-compatible API: point the OpenAI client
        # at its base URL and forward the OpenRouter key as the OpenAI key.
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = openrouter_key

    # Allow overriding the eval model via env var (OPENAI_MODEL takes
    # precedence over EVAL_MODEL); default to a compatible model.
    eval_model = os.getenv(
        "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b")
    )
    logging.info("Using evaluation LLM model=%s", eval_model)

    # Allow overriding how many generations ragas requests from the LLM.
    # Some providers (or models) ignore multi-generation requests; default to 1
    # to avoid warnings. The env value is always a string, so only a malformed
    # value can fail here — catch ValueError, not a blanket Exception.
    try:
        num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1"))
    except ValueError:
        num_gens = 1
    logging.info("Requesting %s generation(s) per prompt", num_gens)
    try:
        llm = ChatOpenAI(model=eval_model, n=num_gens)
    except TypeError:
        # Some ChatOpenAI wrappers do not accept `n` at construction; fall
        # back to the default number of generations.
        llm = ChatOpenAI(model=eval_model)

    # Use the same embeddings as the main app for consistency.
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    logging.info("Starting Ragas evaluation...")
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=llm,
        embeddings=embeddings,
    )
    logging.info("Evaluation complete.")
    return result
def extract_scalar_metrics(result: Any) -> dict:
    """Extract common scalar metrics (faithfulness, relevancy, precision, recall)
    from a ragas evaluation result.

    Recursively walks dicts, lists/tuples, and object ``__dict__``s looking for
    string keys whose lowercased name matches a known metric and whose value is
    a real number (bools excluded). Extraction is best-effort: any error during
    traversal is swallowed and whatever was found so far is returned.

    Returns a dict of metric name -> float, possibly empty.
    """
    keys_of_interest = {
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "relevancy",
        "precision",
        "recall",
    }
    found: dict = {}
    # Track ids of containers/objects already visited so cyclic structures
    # (e.g. a dict containing itself, or objects referencing each other)
    # terminate cleanly instead of raising RecursionError and losing the
    # metrics collected before the cycle was hit.
    seen: set = set()

    def is_number(x: Any) -> bool:
        # bool is a subclass of int, so exclude it explicitly.
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    def traverse(obj: Any) -> None:
        if isinstance(obj, (dict, list, tuple)) or hasattr(obj, "__dict__"):
            oid = id(obj)
            if oid in seen:
                return
            seen.add(oid)
        if isinstance(obj, dict):
            for k, v in obj.items():
                if (
                    isinstance(k, str)
                    and k.lower() in keys_of_interest
                    and is_number(v)
                ):
                    found[k.lower()] = float(v)
                traverse(v)
        elif isinstance(obj, (list, tuple)):
            for v in obj:
                traverse(v)
        else:
            try:
                if hasattr(obj, "__dict__"):
                    traverse(vars(obj))
            except Exception:
                # Some objects raise from vars()/attribute access; skip them.
                pass

    try:
        traverse(result)
        # Also probe common result attributes explicitly, in case the object
        # exposes them via properties rather than through __dict__.
        for attr in ("metrics", "results", "scores", "score"):
            try:
                val = getattr(result, attr, None)
                if val is not None:
                    traverse(val)
            except Exception:
                pass
    except Exception:
        pass
    return found
def evaluate_from_csv(csv_path: str) -> Any:
    """
    Load a testset from CSV and run evaluation.

    Parameters
    ----------
    csv_path : str
        Path to the testset CSV.

    Returns
    -------
    Any
        Evaluation results.
    """
    import ast

    frame = pd.read_csv(csv_path)

    # Ragas testset generation typically provides 'question', 'answer',
    # 'contexts', 'ground_truth'. The 'contexts' column is often stored as a
    # string representation of a list in CSV, so parse it back when needed.
    def _parse_contexts(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    frame["contexts"] = frame["contexts"].apply(_parse_contexts)

    truths = (
        frame["ground_truth"].tolist() if "ground_truth" in frame.columns else None
    )
    return run_evaluation(
        questions=frame["question"].tolist(),
        answers=frame["answer"].tolist(),
        contexts=frame["contexts"].tolist(),
        ground_truths=truths,
    )
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    # Evaluate the CSV given on the command line, if any; otherwise just
    # announce readiness.
    cli_args = sys.argv[1:]
    if cli_args:
        print(evaluate_from_csv(cli_args[0]))
    else:
        logging.info("Eval module ready. Pass a CSV file to evaluate.")