|
|
from statistics import variance |
|
|
from typing import Optional, Union |
|
|
|
|
|
import numpy as np |
|
|
import weave |
|
|
from deepeval.metrics import AnswerRelevancyMetric |
|
|
from deepeval.test_case import LLMTestCase |
|
|
from weave import Scorer |
|
|
|
|
|
|
|
|
class AnswerRelevancyScorer(Scorer):
    """Evaluate the relevancy of answers generated by a LLM.

    This scorer uses DeepEval's `AnswerRelevancy` Metric to assess the relevance and accuracy of LLM generated answers
    compared to the input query.

    The answer relevancy metric measures the quality of the RAG pipeline's generator by determining how relevant the
    actual output of an LLM application is in relation to the input query.

    Attributes:
        threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
        model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
        include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
        strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to
            1. Defaults to False.
        async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
        verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        metric (AnswerRelevancyMetric): An instance of AnswerRelevancyMetric to calculate the score.
    """

    # Field declarations; real values are assigned in `__init__`.
    # NOTE: the original code used `= Optional[None]`, which assigns the
    # typing construct itself (NoneType) as the default value — fixed to
    # `Optional[...] = None`.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[AnswerRelevancyMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = False,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the AnswerRelevancy Scorer with the specified parameters.

        Args:
            threshold (float): The minimum passing threshold for relevancy, defaults to 0.5.
            model (str): The name of the LLM model used for evaluation, defaults to "gpt-4".
            include_reason (bool): Whether to include an explanation for the evaluation score, defaults to True.
            strict_mode (bool): Enforces binary scoring (1 for perfect relevancy, 0 otherwise). Overrides the threshold to 1. Defaults to False.
            async_mode (bool): Whether to perform scoring asynchronously, defaults to True.
            verbose (bool): Whether to print intermediate steps to the console, defaults to False.
        """
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )

        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose

        # DeepEval's metric uses `verbose_mode`; our attribute is `verbose`.
        self.metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the relevancy and accuracy of answers generated by a LLM.

        The AnswerRelevancy score is calculated according to the following equation:

        Answer Relevancy = Number of Relevant Statements / Total Number of Statements

        The AnswerRelevancy Scorer uses an LLM to extract all statements made in the `actual_output`, before using the same LLM to classify whether each statement is relevant to the input.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The LLM generated response to evaluate and the retrieval context,
                under the keys "output" and "retrieval_context". Defaults to None (treated as empty).
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed answer relevancy score.
        """
        # Guard against the documented default; the original crashed with
        # AttributeError on `None.get(...)`.
        output = output or {}
        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )

        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the AnswerRelevancy Scorer.

        Args:
            score_rows (list): A list of dictionaries containing the following keys:
                - "score" (float): The computed answer relevancy score.
                - "reason" (str): A detailed explanation for the assigned score.

        Returns:
            dict: A dictionary containing the following keys:
                - "answer_relevancy_score" (dict): A dictionary containing the following keys:
                    - "score" (float): The average answer relevancy score.
                    - "variance" (float): The variance of the answer relevancy scores.
                    - "std" (float): The standard deviation of the answer relevancy scores.
                    - "count" (int): The number of answer relevancy scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]

        # np.mean/var/std on an empty list emit a RuntimeWarning and return
        # nan; report an explicit zeroed summary instead.
        if not scores:
            return {"answer_relevancy_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}

        return {
            "answer_relevancy_score": {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        }
|
|
|