File size: 7,236 Bytes
336f4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
from typing import Optional, Union

import weave
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase
from weave import Scorer


class SummarizationScorer(Scorer):
    """Summarization Scorer.

    This scorer uses DeepEval's `Summarization` Metric to assess how well the generated output
    aligns with the reference context.

    The summarization metric uses LLMs to determine whether the LLM application is generating
    factually correct summaries while including the necessary details from the original text.

    Attributes:
        threshold (float): Minimum passing threshold, defaults to 0.5.
        model (str): LLM model for scoring, defaults to "gpt-4".
        include_reason (bool): Include reason for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
        async_mode (bool): Use asynchronous scoring, defaults to True.
        verbose (bool): Print intermediate steps used for scoring, defaults to False.
        assessment_questions (Optional[list[str]]): A list of close-ended questions that can be
            answered with either a 'yes' or a 'no'. These are questions you want your summary to
            ideally be able to answer, and they are especially helpful if you already know what a
            good summary for your use case looks like. If not provided, the metric will generate
            a set of assessment questions at evaluation time.
        n (Optional[int]): The number of assessment questions to generate when
            `assessment_questions` is not provided. Defaults to 5.
        truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
            from the retrieval_context. Defaults to None.
        metric (Optional[SummarizationMetric]): An instance of DeepEval's `SummarizationMetric`
            used for scoring; always constructed in `__init__`, so it is non-None after init.
    """

    # Field declarations with real defaults mirroring `__init__`.
    # (Previously these were all assigned `Optional[None]`, i.e. the typing construct
    # itself, which is not a usable default value.)
    threshold: float = 0.5
    model: str = "gpt-4"
    include_reason: bool = True
    strict_mode: bool = False
    async_mode: bool = True
    verbose: bool = False
    assessment_questions: Optional[list[str]] = None
    n: Optional[int] = 5
    truths_extraction_limit: Optional[int] = None
    metric: Optional[SummarizationMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
        assessment_questions: Optional[list[str]] = None,
        n: Optional[int] = 5,
        truths_extraction_limit: Optional[int] = None,
    ):
        """Initialize the Summarization Scorer with DeepEval's Summarization Metric.

        Args:
            threshold (float): Minimum passing threshold, defaults to 0.5.
            model (str): LLM model for scoring, defaults to "gpt-4".
            include_reason (bool): Include reason for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary metric scoring (1 or 0), defaults to False.
            async_mode (bool): Use asynchronous scoring, defaults to True.
            verbose (bool): Print intermediate steps used for scoring, defaults to False.
            assessment_questions (Optional[list[str]]): a list of close-ended questions that can be answered with either
                a 'yes' or a 'no'. These are questions you want your summary to be able to ideally answer, and is
                especially helpful if you already know what a good summary for your use case looks like. If
                `assessment_questions` is not provided, the metric will generate a set of `assessment_questions` at
                evaluation time.
            n (Optional[int]): The number of assessment questions to generate when `assessment_questions` is not
                provided. Defaults to 5.
            truths_extraction_limit (Optional[int]): Maximum number of factual truths to extract
                from the retrieval_context. Defaults to None.
        """
        # The base Scorer is keyword-initialized; this call sets all of the declared
        # fields, so no manual re-assignment is needed afterwards.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
            assessment_questions=assessment_questions,
            n=n,
            truths_extraction_limit=truths_extraction_limit,
        )

        # Note: SummarizationMetric's flag is named `verbose_mode`, while this scorer
        # exposes it as `verbose`.
        self.metric = SummarizationMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
            assessment_questions=self.assessment_questions,
            n=self.n,
            truths_extraction_limit=self.truths_extraction_limit,
        )

    @weave.op
    def score(
        self,
        input: str,
        actual_output: str,
        expected_output: Optional[str] = None,
        retrieval_context: Optional[list[str]] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the quality of summarization of an LLM generated response.

        The Summarization score is calculated according to the following equation:

        Summarization = min(Alignment Score, Coverage Score)

        where,
        - Alignment Score: determines whether the summary contains hallucinated or contradictory information to the original text.
        - Coverage Score: determines whether the summary contains the necessary information from the original text.

        While the Alignment Score is similar to that of the Hallucination Score, the Coverage Score is first calculated
        by generating n closed-ended questions that can only be answered with either a 'yes' or a 'no', before
        calculating the ratio of which the original text and summary yields the same answer.

        Args:
            input (str): The input query or prompt that triggered the output.
            actual_output (str): The LLM generated response to evaluate.
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            retrieval_context (Optional[list[str]]): The context containing factual information to compare against.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed summarization score.
                - "reason" (str): A detailed explanation for the assigned score.
        """
        test_case = LLMTestCase(
            input=input,
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=retrieval_context,
            context=context,
        )

        # `measure` mutates the metric in place; read the score/reason back off it.
        self.metric.measure(test_case)
        return {"score": self.metric.score, "reason": self.metric.reason}