"""Evaluation module for the RAG system using Ragas. This script provides tools to measure faithfulness, relevancy, and retrieval precision. How to run: python eval.py """ # pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel import os import logging import pandas as pd from typing import List, Optional, Any from datasets import Dataset from ragas import evaluate from ragas.metrics.collections import ( faithfulness, answer_relevancy, context_precision, context_recall, ) try: from langchain.chat_models import ChatOpenAI except Exception: from langchain_openai import ChatOpenAI try: from langchain_huggingface import HuggingFaceEmbeddings except Exception: from langchain_community.embeddings import HuggingFaceEmbeddings def run_evaluation( questions: List[str], answers: List[str], contexts: List[List[str]], ground_truths: Optional[List[str]] = None, ) -> Any: """ Run Ragas evaluation on a set of QA results. Parameters ---------- questions : List[str] List of user questions. answers : List[str] List of generated answers. contexts : List[List[str]] List of context strings retrieved for each question. ground_truths : List[str], optional Optional list of ground truth answers for recall metrics. Returns ------- Any Ragas evaluation results containing metric scores. """ data = { "question": questions, "answer": answers, "contexts": contexts, } if ground_truths: data["ground_truth"] = ground_truths # Ragas evaluate works best with dataset objects dataset = Dataset.from_dict(data) # Use OpenRouter if key is available, else default to OpenAI openrouter_key = os.getenv("OPENROUTER_API_KEY") if openrouter_key: # Use OpenRouter-compatible base and forward the key as the OpenAI key os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1" os.environ["OPENAI_API_KEY"] = openrouter_key # Allow overriding the eval/model via env var; default to a compatible model eval_model = os.getenv( "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b") ) logging.info("Using evaluation LLM model=%s", eval_model) # Allow overriding how many generations ragas requests from the LLM. # Some providers (or models) ignore multi-generation requests; default to 1 to avoid warnings. try: num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1")) except Exception: num_gens = 1 logging.info("Requesting %s generation(s) per prompt", num_gens) try: llm = ChatOpenAI(model=eval_model, n=num_gens) except TypeError: # Some ChatOpenAI wrappers do not accept `n` at construction; fall back to default. llm = ChatOpenAI(model=eval_model) # Use the same embeddings as the main app for consistency embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5") logging.info("Starting Ragas evaluation...") result = evaluate( dataset=dataset, metrics=[faithfulness, answer_relevancy, context_precision, context_recall], llm=llm, embeddings=embeddings, ) logging.info("Evaluation complete.") return result def extract_scalar_metrics(result: Any) -> dict: """Extract common scalar metrics (faithfulness, relevancy, precision, recall) from a ragas evaluation result. Returns a dict of metric->float or empty dict. """ keys_of_interest = { "faithfulness", "answer_relevancy", "context_precision", "context_recall", "relevancy", "precision", "recall", } found: dict = {} def is_number(x): return isinstance(x, (int, float)) and not isinstance(x, bool) def traverse(obj): if isinstance(obj, dict): for k, v in obj.items(): if ( isinstance(k, str) and k.lower() in keys_of_interest and is_number(v) ): found[k.lower()] = float(v) traverse(v) elif isinstance(obj, (list, tuple)): for v in obj: traverse(v) else: try: if hasattr(obj, "__dict__"): traverse(vars(obj)) except Exception: pass try: traverse(result) # check common attrs for attr in ("metrics", "results", "scores", "score"): try: val = getattr(result, attr, None) if val is not None: traverse(val) except Exception: pass except Exception: pass return found def evaluate_from_csv(csv_path: str) -> Any: """ Load a testset from CSV and run evaluation. Parameters ---------- csv_path : str Path to the testset CSV. Returns ------- Any Evaluation results. """ df = pd.read_csv(csv_path) # Ragas testset generation typically provides 'question', 'answer', 'contexts', 'ground_truth' # 'contexts' is often stored as a string representation of a list in CSV import ast df["contexts"] = df["contexts"].apply( lambda x: ast.literal_eval(x) if isinstance(x, str) else x ) return run_evaluation( questions=df["question"].tolist(), answers=df["answer"].tolist(), contexts=df["contexts"].tolist(), ground_truths=( df["ground_truth"].tolist() if "ground_truth" in df.columns else None ), ) if __name__ == "__main__": import sys logging.basicConfig(level=logging.INFO) if len(sys.argv) > 1: res = evaluate_from_csv(sys.argv[1]) print(res) else: logging.info("Eval module ready. Pass a CSV file to evaluate.")