# NOTE: removed extraction residue ("Spaces: Sleeping / Sleeping") — this was a
# hosting-page status header captured during scraping, not part of the module.
| """Evaluation module for the RAG system using Ragas. | |
| This script provides tools to measure faithfulness, relevancy, and retrieval precision. | |
| How to run: | |
| python eval.py <testset_csv_path> | |
| """ | |
| # pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel | |
| import os | |
| import logging | |
| import pandas as pd | |
| from typing import List, Optional, Any | |
| from datasets import Dataset | |
| from ragas import evaluate | |
| from ragas.metrics.collections import ( | |
| faithfulness, | |
| answer_relevancy, | |
| context_precision, | |
| context_recall, | |
| ) | |
| try: | |
| from langchain.chat_models import ChatOpenAI | |
| except Exception: | |
| from langchain_openai import ChatOpenAI | |
| try: | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| except Exception: | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
def run_evaluation(
    questions: List[str],
    answers: List[str],
    contexts: List[List[str]],
    ground_truths: Optional[List[str]] = None,
) -> Any:
    """
    Run Ragas evaluation on a set of QA results.

    Parameters
    ----------
    questions : List[str]
        List of user questions.
    answers : List[str]
        List of generated answers.
    contexts : List[List[str]]
        List of context strings retrieved for each question.
    ground_truths : List[str], optional
        Optional list of ground truth answers for recall metrics.

    Returns
    -------
    Any
        Ragas evaluation results containing metric scores.
    """
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
    if ground_truths:
        data["ground_truth"] = ground_truths
    # Ragas evaluate works best with dataset objects
    dataset = Dataset.from_dict(data)

    # Use OpenRouter if key is available, else default to OpenAI
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        # Use OpenRouter-compatible base and forward the key as the OpenAI key
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = openrouter_key

    # Allow overriding the eval/model via env var; default to a compatible model
    eval_model = os.getenv(
        "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b")
    )
    logging.info("Using evaluation LLM model=%s", eval_model)

    # Allow overriding how many generations ragas requests from the LLM.
    # Some providers (or models) ignore multi-generation requests; default to 1
    # to avoid warnings.
    try:
        num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1"))
    except (TypeError, ValueError):
        # FIX: was a blanket `except Exception`, which could hide unrelated
        # bugs. Only a non-numeric env value can fail here; catch just that.
        num_gens = 1
    logging.info("Requesting %s generation(s) per prompt", num_gens)

    try:
        llm = ChatOpenAI(model=eval_model, n=num_gens)
    except TypeError:
        # Some ChatOpenAI wrappers do not accept `n` at construction; fall back to default.
        llm = ChatOpenAI(model=eval_model)

    # Use the same embeddings as the main app for consistency
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    logging.info("Starting Ragas evaluation...")
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=llm,
        embeddings=embeddings,
    )
    logging.info("Evaluation complete.")
    return result
def extract_scalar_metrics(result: Any) -> dict:
    """Extract common scalar metrics (faithfulness, relevancy, precision, recall)
    from a ragas evaluation result.

    Walks dicts, lists/tuples, and object attribute dicts looking for numeric
    values stored under well-known metric names (or their short aliases).
    Best-effort by design: never raises; returns a dict of metric->float, or
    an empty dict when nothing numeric is found.
    """
    keys_of_interest = {
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "relevancy",
        "precision",
        "recall",
    }
    found: dict = {}
    # FIX: track visited container ids so cyclic structures (e.g. a dict or
    # object that ultimately references itself) cannot recurse forever.
    seen: set = set()

    def is_number(x: Any) -> bool:
        # bool is an int subclass in Python; exclude it explicitly so that
        # flag-like values are not mistaken for scores.
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    def traverse(obj: Any) -> None:
        if id(obj) in seen:
            return
        if isinstance(obj, dict):
            seen.add(id(obj))
            for k, v in obj.items():
                if (
                    isinstance(k, str)
                    and k.lower() in keys_of_interest
                    and is_number(v)
                ):
                    found[k.lower()] = float(v)
                traverse(v)
        elif isinstance(obj, (list, tuple)):
            seen.add(id(obj))
            for v in obj:
                traverse(v)
        else:
            # Fall back to the object's attribute dict, if it has one.
            try:
                if hasattr(obj, "__dict__"):
                    seen.add(id(obj))
                    traverse(vars(obj))
            except Exception:
                # Exotic objects (slots, proxies) may fail here; skip them.
                pass

    try:
        traverse(result)
        # Also check common attrs used by result-container objects.
        for attr in ("metrics", "results", "scores", "score"):
            try:
                val = getattr(result, attr, None)
                if val is not None:
                    traverse(val)
            except Exception:
                pass
    except Exception:
        pass
    return found
def evaluate_from_csv(csv_path: str) -> Any:
    """
    Load a testset from CSV and run evaluation.

    Parameters
    ----------
    csv_path : str
        Path to the testset CSV. Ragas testset generation typically provides
        'question', 'answer', 'contexts', and optionally 'ground_truth'.

    Returns
    -------
    Any
        Evaluation results from run_evaluation.
    """
    import ast

    df = pd.read_csv(csv_path)

    def _parse_contexts(x: Any) -> Any:
        # 'contexts' is often stored as a string representation of a list in
        # CSV. FIX: a plain, non-literal string used to crash
        # ast.literal_eval; treat it as a single retrieved context instead.
        if not isinstance(x, str):
            return x
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return [x]

    df["contexts"] = df["contexts"].apply(_parse_contexts)
    return run_evaluation(
        questions=df["question"].tolist(),
        answers=df["answer"].tolist(),
        contexts=df["contexts"].tolist(),
        ground_truths=(
            df["ground_truth"].tolist() if "ground_truth" in df.columns else None
        ),
    )
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    cli_args = sys.argv[1:]
    if cli_args:
        # First positional argument is the testset CSV path.
        print(evaluate_from_csv(cli_args[0]))
    else:
        logging.info("Eval module ready. Pass a CSV file to evaluate.")