"""Evaluation module for the RAG system using Ragas.
This script provides tools to measure faithfulness, relevancy, and retrieval precision.
How to run:
python eval.py <testset_csv_path>
"""
# pylint: disable=import-error,no-name-in-module,invalid-name,broad-except,missing-function-docstring,missing-class-docstring,wrong-import-order,ungrouped-imports,line-too-long,logging-fstring-interpolation,import-outside-toplevel
import logging
import os
from typing import Any, List, Optional

import pandas as pd
from datasets import Dataset
from ragas import evaluate

# The lowercase metric singletons historically live in `ragas.metrics`;
# fall back to `ragas.metrics.collections` for releases that expose them there.
try:
    from ragas.metrics import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )
except ImportError:
    from ragas.metrics.collections import (
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
    )

# ChatOpenAI moved between langchain packages across versions.
try:
    from langchain.chat_models import ChatOpenAI
except Exception:
    from langchain_openai import ChatOpenAI

# HuggingFaceEmbeddings likewise moved out of langchain_community.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except Exception:
    from langchain_community.embeddings import HuggingFaceEmbeddings
def run_evaluation(
    questions: List[str],
    answers: List[str],
    contexts: List[List[str]],
    ground_truths: Optional[List[str]] = None,
) -> Any:
    """
    Run Ragas evaluation on a set of QA results.

    Parameters
    ----------
    questions : List[str]
        List of user questions.
    answers : List[str]
        List of generated answers.
    contexts : List[List[str]]
        List of context strings retrieved for each question.
    ground_truths : List[str], optional
        Optional list of ground truth answers for recall metrics.

    Returns
    -------
    Any
        Ragas evaluation results containing metric scores.

    Raises
    ------
    ValueError
        If the input lists are empty or have mismatched lengths.
    """
    # Fail fast on malformed input instead of surfacing a confusing Ragas error.
    if not questions:
        raise ValueError("questions must be a non-empty list")
    if not len(questions) == len(answers) == len(contexts):
        raise ValueError("questions, answers and contexts must have the same length")
    if ground_truths is not None and len(ground_truths) != len(questions):
        raise ValueError("ground_truths must match the number of questions")

    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
    # context_recall needs reference answers; only request it when they exist,
    # otherwise the metric has nothing to score against.
    metrics = [faithfulness, answer_relevancy, context_precision]
    if ground_truths:
        data["ground_truth"] = ground_truths
        metrics.append(context_recall)

    # Ragas evaluate works best with dataset objects
    dataset = Dataset.from_dict(data)

    # Use OpenRouter if key is available, else default to OpenAI
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        # Use OpenRouter-compatible base and forward the key as the OpenAI key
        os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
        os.environ["OPENAI_API_KEY"] = openrouter_key

    # Allow overriding the eval model via env var; default to a compatible model
    eval_model = os.getenv(
        "OPENAI_MODEL", os.getenv("EVAL_MODEL", "openai/gpt-oss-120b")
    )
    logging.info("Using evaluation LLM model=%s", eval_model)

    # Allow overriding how many generations ragas requests from the LLM.
    # Some providers (or models) ignore multi-generation requests; default to 1
    # to avoid warnings.
    try:
        num_gens = int(os.getenv("RAGAS_NUM_GENERATIONS", "1"))
    except ValueError:
        # Non-numeric override: fall back to a single generation.
        num_gens = 1
    logging.info("Requesting %s generation(s) per prompt", num_gens)

    try:
        llm = ChatOpenAI(model=eval_model, n=num_gens)
    except TypeError:
        # Some ChatOpenAI wrappers do not accept `n` at construction; fall back to default.
        llm = ChatOpenAI(model=eval_model)

    # Use the same embeddings as the main app for consistency
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    logging.info("Starting Ragas evaluation...")
    result = evaluate(
        dataset=dataset,
        metrics=metrics,
        llm=llm,
        embeddings=embeddings,
    )
    logging.info("Evaluation complete.")
    return result
def extract_scalar_metrics(result: Any) -> dict:
    """Extract common scalar metrics (faithfulness, relevancy, precision, recall)
    from a ragas evaluation result.

    Walks dicts, lists/tuples and object ``__dict__``s looking for numeric
    values stored under well-known metric names (case-insensitive). Returns a
    dict of metric->float, or an empty dict when nothing matches. Never raises:
    callers treat ``{}`` as "no metrics found".
    """
    keys_of_interest = {
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "relevancy",
        "precision",
        "recall",
    }
    found: dict = {}
    # ids of containers already visited — result objects can be
    # self-referential, and without this guard traverse() would recurse
    # until RecursionError and silently truncate the extracted metrics.
    seen: set = set()

    def is_number(x: Any) -> bool:
        # bool is an int subclass; True/False are not metric scores.
        return isinstance(x, (int, float)) and not isinstance(x, bool)

    def traverse(obj: Any) -> None:
        if id(obj) in seen:
            return
        if isinstance(obj, dict):
            seen.add(id(obj))
            for k, v in obj.items():
                if (
                    isinstance(k, str)
                    and k.lower() in keys_of_interest
                    and is_number(v)
                ):
                    found[k.lower()] = float(v)
                traverse(v)
        elif isinstance(obj, (list, tuple)):
            seen.add(id(obj))
            for v in obj:
                traverse(v)
        else:
            try:
                if hasattr(obj, "__dict__"):
                    seen.add(id(obj))
                    traverse(vars(obj))
            except Exception:
                # Best effort: exotic objects may refuse attribute access.
                pass

    try:
        traverse(result)
        # Also probe attributes where ragas versions commonly stash scores.
        for attr in ("metrics", "results", "scores", "score"):
            try:
                val = getattr(result, attr, None)
                if val is not None:
                    traverse(val)
            except Exception:
                pass
    except Exception:
        # This helper must never raise; return whatever was collected.
        pass
    return found
def evaluate_from_csv(csv_path: str) -> Any:
    """
    Load a testset from CSV and run evaluation.

    Parameters
    ----------
    csv_path : str
        Path to the testset CSV.

    Returns
    -------
    Any
        Evaluation results.
    """
    import ast

    frame = pd.read_csv(csv_path)

    # Ragas testset generation typically provides 'question', 'answer',
    # 'contexts', 'ground_truth'. The 'contexts' column is often stored as a
    # string representation of a list in CSV, so parse it back to a list.
    def _parse_contexts(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    frame["contexts"] = frame["contexts"].map(_parse_contexts)

    truths = None
    if "ground_truth" in frame.columns:
        truths = frame["ground_truth"].tolist()

    return run_evaluation(
        questions=frame["question"].tolist(),
        answers=frame["answer"].tolist(),
        contexts=frame["contexts"].tolist(),
        ground_truths=truths,
    )
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    # Expect the testset CSV path as the first CLI argument.
    if len(sys.argv) < 2:
        logging.info("Eval module ready. Pass a CSV file to evaluate.")
    else:
        print(evaluate_from_csv(sys.argv[1]))