# Source: test-ragp / src/rag_pipelines/evaluation/retrieval/contextual_precision.py
# Uploaded by awinml ("Upload 107 files", commit 336f4a9, verified)
from typing import Optional, Union
import numpy as np
import weave
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase
from weave import Scorer
class ContextualPrecisionScorer(Scorer):
    """Evaluate the contextual precision of the retrieved context for a given input.

    This scorer uses DeepEval's `ContextualPrecisionMetric` to assess the quality of the
    pipeline's retriever: it checks whether results in the `retrieval_context` that are
    relevant to the given input are ranked higher than irrelevant ones.

    Attributes:
        threshold (Optional[float]): A float representing the minimum passing threshold,
            defaults to 0.5 (set in ``__init__``).
        model (Optional[str]): The LLM model to use for scoring, defaults to "gpt-4".
        include_reason (Optional[bool]): Whether to include a reason for the evaluation
            score, defaults to True.
        strict_mode (Optional[bool]): A boolean which when set to True, enforces a binary
            metric score: 1 for perfection, 0 otherwise. It also overrides the current
            threshold and sets it to 1. Defaults to True.
        async_mode (Optional[bool]): Whether to use asynchronous scoring, defaults to True.
        verbose (Optional[bool]): Whether to print the intermediate steps used to calculate
            the metric to the console, defaults to False.
        metric (Optional[ContextualPrecisionMetric]): The DeepEval ContextualPrecisionMetric,
            constructed in ``__init__``.
    """

    # NOTE: the original code assigned the typing construct ``Optional[None]`` as the
    # *default value* of each field (e.g. ``threshold: float = Optional[None]``), which is
    # not a valid default and breaks pydantic validation in the ``weave.Scorer`` base
    # class. Declared here as optional fields; concrete values are supplied in __init__.
    threshold: Optional[float] = None
    model: Optional[str] = None
    include_reason: Optional[bool] = None
    strict_mode: Optional[bool] = None
    async_mode: Optional[bool] = None
    verbose: Optional[bool] = None
    metric: Optional[ContextualPrecisionMetric] = None

    def __init__(
        self,
        threshold: float = 0.5,
        model: str = "gpt-4",
        include_reason: bool = True,
        strict_mode: bool = True,
        async_mode: bool = True,
        verbose: bool = False,
    ):
        """Initialize the Contextual Precision Scorer using DeepEval's Contextual Precision Metric.

        Args:
            threshold (float): A float representing the minimum passing threshold, defaults to 0.5.
            model (str): The LLM model to use for scoring, defaults to "gpt-4".
            include_reason (bool): Whether to include a reason for the evaluation score, defaults to True.
            strict_mode (bool): A boolean which when set to True, enforces a binary metric score:
                1 for perfection, 0 otherwise. It also overrides the current threshold and sets
                it to 1. Defaults to True.
            async_mode (bool): Whether to use asynchronous scoring, defaults to True.
            verbose (bool): Whether to print the intermediate steps used to calculate the metric
                to the console, defaults to False.
        """
        # The pydantic-based Scorer base class requires field values to go through
        # super().__init__ for validation.
        super().__init__(
            threshold=threshold,
            model=model,
            include_reason=include_reason,
            strict_mode=strict_mode,
            async_mode=async_mode,
            verbose=verbose,
        )
        self.threshold = threshold
        self.model = model
        self.include_reason = include_reason
        self.strict_mode = strict_mode
        self.async_mode = async_mode
        self.verbose = verbose
        self.metric = ContextualPrecisionMetric(
            threshold=self.threshold,
            model=self.model,
            include_reason=self.include_reason,
            async_mode=self.async_mode,
            strict_mode=self.strict_mode,
            verbose_mode=self.verbose,
        )

    @weave.op
    def score(
        self,
        input: str,
        output: Optional[dict] = None,
        expected_output: Optional[str] = None,
        context: Optional[list[str]] = None,
    ) -> dict[str, Union[str, float]]:
        """Evaluate the contextual precision of the retrieved context for the input.

        The Contextual Precision score is the weighted cumulative precision:

            ContextualPrecision = (1 / R) * sum_k [ (r_k / k) * b_k ]

        where R is the number of relevant results, k is the 1-based rank of a result,
        r_k is the number of relevant results in the top k, and b_k is the binary
        relevance of the k-th result. An LLM first judges, for each result in the
        ``retrieval_context``, whether it is relevant to the input given the
        ``expected_output``; the weighted cumulative precision is then computed.

        Args:
            input (str): The input query or prompt that triggered the output.
            output (Optional[dict]): The pipeline response to evaluate. Expected keys:
                "output" (str, the generated answer) and "retrieval_context"
                (list[str], the retrieved passages). Defaults to None (treated as empty).
            expected_output (Optional[str]): The expected or reference output, defaults to None.
            context (Optional[list[str]]): Additional context for the evaluation, defaults to None.

        Returns:
            dict[str, Union[str, float]]: A dictionary containing:
                - "score" (float): The computed contextual precision score.
        """
        # Guard: ``output`` defaults to None but is read with .get() below.
        output = output or {}
        test_case = LLMTestCase(
            input=input,
            actual_output=output.get("output", ""),
            expected_output=expected_output,
            retrieval_context=output.get("retrieval_context", [""]),
            context=context,
        )
        self.metric.measure(test_case)
        return {"score": self.metric.score}

    @weave.op()
    def summarize(self, score_rows: list) -> dict:
        """Summarize the results of the Contextual Precision Scorer.

        Args:
            score_rows (list): A list of dictionaries, each containing:
                - "score" (float): A computed contextual precision score.

        Returns:
            dict: A dictionary with key "contextual_precision_score" mapping to:
                - "score" (float): The mean contextual precision score.
                - "variance" (float): The variance of the scores.
                - "std" (float): The standard deviation of the scores.
                - "count" (int): The number of scores.
        """
        scores = [float(row.get("score", 0.0)) for row in score_rows]
        if not scores:
            # np.mean([]) would emit a RuntimeWarning and return NaN; report zeros instead.
            return {"contextual_precision_score": {"score": 0.0, "variance": 0.0, "std": 0.0, "count": 0}}
        return {
            "contextual_precision_score": {
                "score": np.mean(scores).item(),
                "variance": np.var(scores).item(),
                "std": np.std(scores).item(),
                "count": len(scores),
            }
        }