File size: 6,374 Bytes
336f4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from statistics import variance
from typing import Optional, Union

import numpy as np
import weave
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from weave import Scorer


class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
                            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # NOTE: the original declared these as e.g. `threshold: float = Optional[None]`,
    # which sets the *default value* to `NoneType` (the class object) — the
    # annotation and default were swapped. `Optional[T] = None` is the correct form.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        # Scorer (pydantic model) validates and stores the declared fields.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # DeepEval names the console-logging flag `verbose_mode`, hence the rename.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

        Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using the
        same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The LLM generated response to evaluate (key "output") and the retrieval
                context (key "retrieval_context"). Defaults to None, which is treated as an empty response.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        # Guard: the declared default is None, but the original called `.get` on it
        # unconditionally, which would raise AttributeError.
        output = output or {}

        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries containing the following keys:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.

        Returns:
            dict: A dictionary containing the following keys:
                - "answer_relevancy_score" (dict): A dictionary containing the following keys:
                    - "score" (float): The average answer relevancy score.
                    - "variance" (float): The variance of the answer relevancy scores.
                    - "std" (float): The standard deviation of the answer relevancy scores.
                    - "count" (int): The number of answer relevancy scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # Guard: np.mean/var/std on an empty list emit a RuntimeWarning and
        # return NaN; report explicit zeros with count 0 instead.
        if not scores:
            return {"answer_relevancy_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        summary = {
            "score": np.mean(scores).item(),
            "variance": np.var(scores).item(),
            "std": np.std(scores).item(),
            "count": len(scores),
        }
        return {"answer_relevancy_score": summary}