File size: 6,374 Bytes
336f4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from statistics import variance
from typing import Optional, Union

import numpy as np
import weave
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from weave import Scorer


class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
                            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # NOTE: the original declared these as e.g. `threshold: float = Optional[None]`,
    # which sets the *default value* to `NoneType` (the class object) — the
    # annotation and default were swapped. `Optional[T] = None` is the correct form.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        # Scorer (pydantic model) validates and stores the declared fields.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # DeepEval names the console-logging flag `verbose_mode`, hence the rename.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

        Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using the
        same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The LLM generated response to evaluate (key "output") and the retrieval
                context (key "retrieval_context"). Defaults to None, which is treated as an empty response.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        # Guard: the declared default is None, but the original called `.get` on it
        # unconditionally, which would raise AttributeError.
        output = output or {}

        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries containing the following keys:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.

        Returns:
            dict: A dictionary containing the following keys:
                - "answer_relevancy_score" (dict): A dictionary containing the following keys:
                    - "score" (float): The average answer relevancy score.
                    - "variance" (float): The variance of the answer relevancy scores.
                    - "std" (float): The standard deviation of the answer relevancy scores.
                    - "count" (int): The number of answer relevancy scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # Guard: np.mean/var/std on an empty list emit a RuntimeWarning and
        # return NaN; report explicit zeros with count 0 instead.
        if not scores:
            return {"answer_relevancy_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        summary = {
            "score": np.mean(scores).item(),
            "variance": np.var(scores).item(),
            "std": np.std(scores).item(),
            "count": len(scores),
        }
        return {"answer_relevancy_score": summary}