| import asyncio | |
| from weave import Dataset, Evaluation, Model, Scorer | |
class Evaluator:
    """Evaluate a model on a dataset using a list of scorers.

    Attributes:
        evaluation_name (str): The name of the evaluation run.
        evaluation_dataset (Dataset): The dataset used for evaluation.
        evaluation_scorers (list[Scorer]): A list of scorer objects used to
            evaluate the pipeline.
        pipeline (Model): The pipeline (model) to be evaluated.
    """

    def __init__(
        self,
        evaluation_name: str,
        evaluation_dataset: Dataset,
        evaluation_scorers: list[Scorer],
        pipeline: Model,
    ):
        """Initialize the Evaluator instance with the specified evaluation parameters.

        The arguments are stored as-is on the instance; nothing is validated
        or executed until `evaluate` is called.

        Args:
            evaluation_name (str): A unique identifier for the evaluation run.
            evaluation_dataset (Dataset): A `Dataset` object representing the
                data for evaluation.
            evaluation_scorers (list[Scorer]): A list of `Scorer` objects that
                calculate various metrics.
            pipeline (Model): The model or pipeline to evaluate.
        """
        self.evaluation_name = evaluation_name
        self.evaluation_dataset = evaluation_dataset
        self.evaluation_scorers = evaluation_scorers
        self.pipeline = pipeline

    def evaluate(self) -> dict:
        """Perform evaluation of the pipeline using the specified dataset and scorers.

        This method creates an `Evaluation` object from the stored name,
        dataset and scorers, runs its asynchronous `evaluate` coroutine to
        completion with `asyncio.run`, and returns the result.

        Returns:
            dict: The results returned by `Evaluation.evaluate` for the
                pipeline.

        Raises:
            RuntimeError: If the evaluation run raises any exception. The
                original exception is chained as `__cause__`. Because
                `asyncio.run` cannot be used while another event loop is
                running in the same thread, calling this method from such a
                context also ends up here.
        """
        evaluation = Evaluation(
            evaluation_name=self.evaluation_name,
            dataset=self.evaluation_dataset,
            scorers=self.evaluation_scorers,
        )
        try:
            # Evaluation.evaluate is a coroutine; drive it synchronously so
            # callers of this class do not need their own event loop.
            evaluation_results = asyncio.run(evaluation.evaluate(self.pipeline))
        except Exception as exception:
            # Re-raise under one exception type while keeping the original
            # failure available through exception chaining.
            msg = f"Evaluation run failed: {exception}"
            raise RuntimeError(msg) from exception
        return evaluation_results