|
|
from statistics import variance |
|
|
from typing import Optional, Union |
|
|
|
|
|
import numpy as np |
|
|
import weave |
|
|
from deepeval.metrics import AnswerRelevancyMetric |
|
|
from deepeval.test_case import LLMTestCase |
|
|
from weave import Scorer |
|
|
|
|
|
|
|
|
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # Field declarations; real values are assigned in `__init__`.
    # NOTE: the original code used `= Optional[None]`, which assigns the
    # typing construct itself (NoneType) as the default value — fixed to
    # `Optional[...] = None`.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # DeepEval's metric uses `verbose_mode`; our attribute is `verbose`.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

        Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The LLM generated response to evaluate and the retrieval context,
                under the keys "output" and "retrieval_context". Defaults to None (treated as empty).
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        # Guard against the documented default; the original crashed with
        # AttributeError on `None.get(...)`.
        output = output or {}
        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries containing the following keys:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.

        Returns:
            dict: A dictionary containing the following keys:
                - "answer_relevancy_score" (dict): A dictionary containing the following keys:
                    - "score" (float): The average answer relevancy score.
                    - "variance" (float): The variance of the answer relevancy scores.
                    - "std" (float): The standard deviation of the answer relevancy scores.
                    - "count" (int): The number of answer relevancy scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # np.mean/var/std on an empty list emit a RuntimeWarning and return
        # nan; report an explicit zeroed summary instead.
        if not scores:
            return {"answer_relevancy_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        return {
            "answer_relevancy_score": {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        }
|
|
|