# NOTE: removed extraction residue ("Spaces: Sleeping / Sleeping") — this was a
# hosting-page status header captured during scraping, not part of the module.
| """Evaluation module for the RAG system using Ragas. | |
| This script provides tools to measure faithfulness, relevancy, and retrieval precision. | |
| How to run: | |
| python eval.py <testset_csv_path> | |
| """ | |
| # pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel | |
| import os | |
| import logging | |
| import pandas as pd | |
| from typing import List, Optional, Any | |
| from datasets import Dataset | |
| from ragas import evaluate | |
| from ragas.metrics.collections import ( | |
| faithfulness, | |
| answer_relevancy, | |
| context_precision, | |
| context_recall, | |
| ) | |
| try: | |
| from langchain.chat_models import ChatOpenAI | |
| except Exception: | |
| from langchain_openai import ChatOpenAI | |
| try: | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| except Exception: | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
def run_evaluation(
    questions: List[str],
    answers: List[str],
    contexts: List[List[str]],
    ground_truths: Optional[List[str]] = None,
) -> Any:
    """
    Run Ragas evaluation on a set of QA results.

    Parameters
    ----------
    questions : List[str]
        List of user questions.
    answers : List[str]
        List of generated answers.
    contexts : List[List[str]]
        List of context strings retrieved for each question.
    ground_truths : List[str], optional
        Optional list of ground truth answers for recall metrics.

    Returns
    -------
    Any
        Ragas evaluation results containing metric scores.
    """
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
    if ground_truths:
        data["ground_truth"] = ground_truths
    # Ragas evaluate works best with dataset objects
    dataset = Dataset.from_dict(data)

    # Use OpenRouter if key is available, else default to OpenAI
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        # Use OpenRouter-compatible base and forward the key as the OpenAI key
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = openrouter_key

    # Allow overriding the eval/model via env var; default to a compatible model
    eval_model = os.getenv(
        "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b")
    )
    logging.info("Using evaluation LLM model=%s", eval_model)

    # Allow overriding how many generations ragas requests from the LLM.
    # Some providers (or models) ignore multi-generation requests; default to 1
    # to avoid warnings.
    try:
        num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1"))
    except (TypeError, ValueError):
        # FIX: was a blanket `except Exception`, which could hide unrelated
        # bugs. Only a non-numeric env value can fail here; catch just that.
        num_gens = 1
    logging.info("Requesting %s generation(s) per prompt", num_gens)

    try:
        llm = ChatOpenAI(model=eval_model, n=num_gens)
    except TypeError:
        # Some ChatOpenAI wrappers do not accept `n` at construction; fall back to default.
        llm = ChatOpenAI(model=eval_model)

    # Use the same embeddings as the main app for consistency
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
    logging.info("Starting Ragas evaluation...")
    result = evaluate(
        dataset=dataset,
        metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
        llm=llm,
        embeddings=embeddings,
    )
    logging.info("Evaluation complete.")
    return result
def extract_scalar_metrics(result: Any) -> dict:
    """Extract common scalar metrics (faithfulness, relevancy, precision, recall)
    from a ragas evaluation result.

    Walks dicts, lists/tuples, and object attribute dicts looking for numeric
    values stored under well-known metric names (or their short aliases).
    Best-effort by design: never raises; returns a dict of metric->float, or
    an empty dict when nothing numeric is found.
    """
    keys_of_interest = {
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "relevancy",
        "precision",
        "recall",
    }
    found: dict = {}
    # FIX: track visited container ids so cyclic structures (e.g. a dict or
    # object that ultimately references itself) cannot recurse forever.
    seen: set = set()

    def is_number(x: Any) -> bool:
        # bool is an int subclass in Python; exclude it explicitly so that
        # flag-like values are not mistaken for scores.
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    def traverse(obj: Any) -> None:
        if id(obj) in seen:
            return
        if isinstance(obj, dict):
            seen.add(id(obj))
            for k, v in obj.items():
                if (
                    isinstance(k, str)
                    and k.lower() in keys_of_interest
                    and is_number(v)
                ):
                    found[k.lower()] = float(v)
                traverse(v)
        elif isinstance(obj, (list, tuple)):
            seen.add(id(obj))
            for v in obj:
                traverse(v)
        else:
            # Fall back to the object's attribute dict, if it has one.
            try:
                if hasattr(obj, "__dict__"):
                    seen.add(id(obj))
                    traverse(vars(obj))
            except Exception:
                # Exotic objects (slots, proxies) may fail here; skip them.
                pass

    try:
        traverse(result)
        # Also check common attrs used by result-container objects.
        for attr in ("metrics", "results", "scores", "score"):
            try:
                val = getattr(result, attr, None)
                if val is not None:
                    traverse(val)
            except Exception:
                pass
    except Exception:
        pass
    return found
def evaluate_from_csv(csv_path: str) -> Any:
    """
    Load a testset from CSV and run evaluation.

    Parameters
    ----------
    csv_path : str
        Path to the testset CSV. Ragas testset generation typically provides
        'question', 'answer', 'contexts', and optionally 'ground_truth'.

    Returns
    -------
    Any
        Evaluation results from run_evaluation.
    """
    import ast

    df = pd.read_csv(csv_path)

    def _parse_contexts(x: Any) -> Any:
        # 'contexts' is often stored as a string representation of a list in
        # CSV. FIX: a plain, non-literal string used to crash
        # ast.literal_eval; treat it as a single retrieved context instead.
        if not isinstance(x, str):
            return x
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return [x]

    df["contexts"] = df["contexts"].apply(_parse_contexts)
    return run_evaluation(
        questions=df["question"].tolist(),
        answers=df["answer"].tolist(),
        contexts=df["contexts"].tolist(),
        ground_truths=(
            df["ground_truth"].tolist() if "ground_truth" in df.columns else None
        ),
    )
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    cli_args = sys.argv[1:]
    if cli_args:
        # First positional argument is the testset CSV path.
        print(evaluate_from_csv(cli_args[0]))
    else:
        logging.info("Eval module ready. Pass a CSV file to evaluate.")