from statistics import variance
from typing import Optional, Union

import numpy as np
import weave
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from weave import Scorer


class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the
    relevance and accuracy of LLM generated answers compared to the input
    query. The answer relevancy metric measures the quality of the RAG
    pipeline's generator by determining how relevant the actual output of an
    LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation
            score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0
            otherwise). Overrides the threshold to 1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (Optional[AnswerRelevancyMetric]): The AnswerRelevancyMetric instance
            used to calculate the score; built in __init__.
    """

    # FIX: these fields were declared as `field: type = Optional[None]`, which
    # assigns the `typing.Optional[None]` special form itself as the default
    # value. Declare real defaults matching __init__'s signature instead;
    # __init__ overwrites them with the caller-supplied values.
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy,
                defaults to 0.5.
            model (str): The name of the LLM model used for evaluation,
                defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the
                evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect
                relevancy, 0 otherwise). Overrides the threshold to 1.
                Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously,
                defaults to True.
            verbose (bool): Whether to print intermediate steps to the
                console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )
        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose
        # NOTE: DeepEval names the console-logging flag `verbose_mode`, not
        # `verbose` — keep the mapping explicit here.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following
        equation:

            Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made
        in the `actual_output`, before using the same LLM to classify whether
        each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The LLM generated response to evaluate and
                the retrieval context, under the keys "output" and
                "retrieval_context". Defaults to None (treated as empty).
            expected_output (Optional[str]): The expected or reference output,
                defaults to None.
            context (Optional[list[str]]): Additional context for the
                evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        # FIX: `output` defaults to None but was dereferenced unconditionally
        # (`output.get(...)`), raising AttributeError. Treat a missing payload
        # as an empty dict so the defaults below apply.
        payload = output or {}
        test_case = LLMTestCase(
            input=input,
            actual_output=payload.get("output", ""),
            expected_output=expected_output,
            retrieval_context=payload.get("retrieval_context", [""]),
            context=context,
        )
        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries, each containing:
                - "score" (float): The computed answer relevancy score.
                Rows missing the key contribute 0.0.

        Returns:
            dict: A dictionary with a single key "answer_relevancy_score"
            mapping to a dictionary containing:
                - "score" (float): The average answer relevancy score.
                - "variance" (float): The variance of the scores.
                - "std" (float): The standard deviation of the scores.
                - "count" (int): The number of scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]
        # FIX: np.mean/np.var/np.std on an empty list emit a RuntimeWarning
        # and return nan; report explicit zeros for an empty evaluation.
        if not scores:
            stats = {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}
        else:
            stats = {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        return {"answer_relevancy_score": stats}