test-ragp / src /rag_pipelines /evaluation /evaluator.py

Upload 107 files

336f4a9 verified 11 months ago

2.02 kB

	import asyncio

	from weave import Dataset, Evaluation, Model, Scorer


	class Evaluator:
	    """Evaluate a model on a dataset using a list of scorers.

	    Attributes:
	        evaluation_name (str): The name of the evaluation run.
	        evaluation_dataset (Dataset): The dataset used for evaluation.
	        evaluation_scorers (list[Scorer]): A list of scorer objects used to evaluate the pipeline.
	        pipeline (Model): The pipeline (model) to be evaluated.
	    """

	    def __init__(
	        self,
	        evaluation_name: str,
	        evaluation_dataset: Dataset,
	        evaluation_scorers: list[Scorer],
	        pipeline: Model,
	    ):
	        """Initialize the Evaluator instance with the specified evaluation parameters.

	        Args:
	            evaluation_name (str): A unique identifier for the evaluation run.
	            evaluation_dataset (Dataset): A `Dataset` object representing the data for evaluation.
	            evaluation_scorers (list[Scorer]): A list of `Scorer` objects that calculate various metrics.
	            pipeline (Model): The model or pipeline to evaluate.
	        """
	        self.evaluation_name = evaluation_name
	        self.evaluation_dataset = evaluation_dataset
	        self.evaluation_scorers = evaluation_scorers
	        self.pipeline = pipeline

	    def evaluate(self) -> dict:
	        """Perform evaluation of the pipeline using the specified dataset and scorers.

	        This method creates an `Evaluation` object from the stored name, dataset and
	        scorers, runs its asynchronous `evaluate` coroutine to completion with
	        `asyncio.run`, and returns whatever that coroutine produced.

	        Note:
	            `asyncio.run` cannot be called while another event loop is running in
	            the same thread; in that situation the resulting error is re-raised
	            as the `RuntimeError` described below.

	        Returns:
	            dict: The evaluation results returned by `Evaluation.evaluate` for
	            the pipeline.

	        Raises:
	            RuntimeError: If the evaluation run raises any exception. The original
	                exception is attached as the cause.
	        """
	        evaluation = Evaluation(
	            evaluation_name=self.evaluation_name,
	            dataset=self.evaluation_dataset,
	            scorers=self.evaluation_scorers,
	        )

	        try:
	            # `Evaluation.evaluate` is a coroutine; drive it synchronously here.
	            evaluation_results = asyncio.run(evaluation.evaluate(self.pipeline))
	        except Exception as exception:
	            # Wrap every failure in one exception type, keeping the cause chained.
	            msg = f"Evaluation run failed: {exception}"
	            raise RuntimeError(msg) from exception

	        return evaluation_results