from time import sleep
from unittest import TestCase, mock

import numpy as np
from datasets import ClassLabel, Dataset, Features, Sequence, Value
from PIL import Image
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    AutoModelForImageClassification,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)

from evaluate import (
    AudioClassificationEvaluator,
    AutomaticSpeechRecognitionEvaluator,
    Evaluator,
    ImageClassificationEvaluator,
    QuestionAnsweringEvaluator,
    Text2TextGenerationEvaluator,
    TextClassificationEvaluator,
    TextGenerationEvaluator,
    TokenClassificationEvaluator,
    evaluator,
    load,
)

from .utils import slow


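# The Dummy*Pipeline classes below mimic the call signature and output format of the
# corresponding `transformers` pipelines, returning fixed predictions so that the
# evaluator logic can be exercised without loading any model.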
class DummyTextGenerationPipeline:
    def __init__(self, prefix="generated", task="text-generation", num_return_sequences=1):
        self.task = task
        self.prefix = prefix
        self.num_return_sequences = num_return_sequences

    def __call__(self, inputs, **kwargs):
        return [[{f"{self.prefix}_text": "Lorem ipsum"} for _ in range(self.num_return_sequences)] for _ in inputs]


class DummyText2TextGenerationPipeline:
    def __init__(self, prefix="generated", task="text2text-generation"):
        self.task = task
        self.prefix = prefix

    def __call__(self, inputs, **kwargs):
        return [{f"{self.prefix}_text": "Lorem ipsum"} for _ in inputs]


class DummyTextClassificationPipeline:
    def __init__(self, sleep_time=None):
        self.task = "text-classification"
        self.sleep_time = sleep_time

    def __call__(self, inputs, **kwargs):
        if self.sleep_time is not None:
            sleep(self.sleep_time)
        return [{"label": "NEGATIVE"} if i % 2 == 1 else {"label": "POSITIVE"} for i, _ in enumerate(inputs)]


class DummyImageClassificationPipeline:
    def __init__(self):
        self.task = "image-classification"

    def __call__(self, images, **kwargs):
        return [[{"score": 0.9, "label": "yurt"}, {"score": 0.1, "label": "umbrella"}] for i, _ in enumerate(images)]


class DummyQuestionAnsweringPipeline:
    def __init__(self, v2: bool):
        self.task = "question-answering"
        self.v2 = v2

    def __call__(self, question, context, **kwargs):
        if self.v2:
            return [
                {"score": 0.95, "start": 31, "end": 39, "answer": "Felix"}
                if i % 2 == 0
                else {"score": 0.95, "start": 0, "end": 0, "answer": ""}
                for i in range(len(question))
            ]
        else:
            return [{"score": 0.95, "start": 31, "end": 39, "answer": "Felix"} for _ in question]


class DummyTokenClassificationPipeline:
    def __init__(self):
        self.task = "token-classification"

    def __call__(self, inputs, **kwargs):
        result = [
            {"start": 0, "entity": "B-LOC"},
            {"start": 2, "entity": "I-LOC"},
            {"start": 4, "entity": "I-LOC"},
            {"start": 9, "entity": "O"},
            {"start": 11, "entity": "O"},
            {"start": 16, "entity": "B-LOC"},
            {"start": 21, "entity": "O"},
        ]

        return [result]


class DummyAutomaticSpeechRecognitionPipeline:
    def __init__(self) -> None:
        self.task = "automatic-speech-recognition"

    def __call__(self, inputs, **kwargs):
        return [{"text": "Lorem ipsum"} for _ in inputs]


class DummyAudioClassificationPipeline:
    def __init__(self):
        self.task = "audio-classification"

    def __call__(self, audio, **kwargs):
        return [[{"score": 0.9, "label": "yes"}, {"score": 0.1, "label": "no"}] for i, _ in enumerate(audio)]


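# Each Test*Evaluator class below covers one task evaluator: construction via `evaluator()` or the
# evaluator class itself, `compute()` with dummy pipelines, data loading, and (marked @slow) runs
# against real checkpoints.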
class TestEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
        self.default_ckpt = "hf-internal-testing/tiny-random-bert"
        self.default_model = AutoModelForSequenceClassification.from_pretrained(self.default_ckpt, num_labels=2)
        self.default_tokenizer = AutoTokenizer.from_pretrained(self.default_ckpt)
        self.pipe = pipeline("text-classification", model=self.default_model, tokenizer=self.default_tokenizer)
        self.evaluator = evaluator("text-classification")
        self.label_mapping = {"LABEL_0": 0.0, "LABEL_1": 1.0}

    def test_wrong_task(self):
        self.assertRaises(KeyError, evaluator, "bad_task")

    def test_device_placement(self):
        orig_import = __import__
        pt_mock = mock.Mock()
        tf_mock = mock.Mock()

        def import_pt_tf_mock(name, *args):
            if name == "torch":
                if pt_available:
                    return pt_mock
                else:
                    raise ImportError
            if name == "tensorflow":
                if tf_available:
                    return tf_mock
                else:
                    raise ImportError
            return orig_import(name, *args)

        with mock.patch("builtins.__import__", side_effect=import_pt_tf_mock):
            # neither PyTorch nor TensorFlow available: fall back to CPU
            pt_available = False
            tf_available = False
            self.assertEqual(Evaluator._infer_device(), -1)

            # PyTorch available but no GPU
            pt_available = True
            pt_mock.cuda.is_available.return_value = False
            self.assertEqual(Evaluator._infer_device(), -1)

            # PyTorch available with a GPU
            pt_mock.cuda.is_available.return_value = True
            self.assertEqual(Evaluator._infer_device(), 0)

            # TensorFlow available but no GPU
            pt_available = False
            tf_available = True
            tf_mock.config.list_physical_devices.return_value = []
            self.assertEqual(Evaluator._infer_device(), -1)

            # TensorFlow available with GPUs
            tf_mock.config.list_physical_devices.return_value = ["GPU:0", "GPU:1"]
            self.assertEqual(Evaluator._infer_device(), 0)

            # a GPU is inferred while the pipeline runs on CPU: expect a device-mismatch error
            pt_mock.cuda.is_available.return_value = True
            self.assertRaises(
                ValueError, Evaluator.check_for_mismatch_in_device_setup, Evaluator._infer_device(), self.pipe
            )

            pt_available = False
            tf_available = True
            self.assertRaises(
                ValueError, Evaluator.check_for_mismatch_in_device_setup, Evaluator._infer_device(), self.pipe
            )

    def test_pipe_init(self):
        self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )

    def test_model_init(self):
        self.evaluator.compute(
            model_or_pipeline=self.default_model,
            tokenizer=self.default_tokenizer,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )

    def test_model_str_init(self):
        self.evaluator.compute(
            model_or_pipeline=self.default_ckpt,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )


class TestTextClassificationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
        self.default_model = "lvwerra/distilbert-imdb"
        self.input_column = "text"
        self.label_column = "label"
        self.pipe = DummyTextClassificationPipeline()
        self.perf_pipe = DummyTextClassificationPipeline(sleep_time=0.1)
        self.evaluator = evaluator("text-classification")
        self.label_mapping = {"NEGATIVE": 0.0, "POSITIVE": 1.0}

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

        model = AutoModelForSequenceClassification.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            tokenizer=tokenizer,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    def test_class_init(self):
        evaluator = TextClassificationEvaluator()
        self.assertEqual(evaluator.task, "text-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="f1",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["f1"], 1.0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    def test_data_loading(self):
        # load a dataset by name with an explicit split
        data = self.evaluator.load_data("evaluate/imdb-ci", split="test[:1]")
        self.evaluator.prepare_data(data=data, input_column="text", label_column="label", second_input_column=None)

        # load a dataset by name without a split
        data = self.evaluator.load_data("evaluate/imdb-ci")
        self.evaluator.prepare_data(data=data, input_column="text", label_column="label", second_input_column=None)

        # the default split is picked up
        self.assertEqual(data.split, "test")

        # the dataset contents are intact
        self.assertEqual(data[0]["text"], "I love movies about whales!")

        # loading a dataset with a subset works with and without an explicit split
        data = self.evaluator.load_data("evaluate/glue-ci", subset="cola", split="test")
        self.assertIsInstance(data, Dataset)

        data = self.evaluator.load_data("evaluate/glue-ci", subset="cola")
        self.assertIsInstance(data, Dataset)

    def test_overwrite_default_metric(self):
        accuracy = load("accuracy")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    def test_bootstrap(self):
        data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})

        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=data,
            metric="accuracy",
            label_mapping=self.label_mapping,
            strategy="bootstrap",
            n_resamples=10,
            random_state=0,
        )
        self.assertAlmostEqual(results["accuracy"]["score"], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][0], 0.33333, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][1], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["standard_error"], 0.22498, 5)

    def test_perf(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.perf_pipe,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping,
            n_resamples=10,
            random_state=0,
        )
        self.assertEqual(results["accuracy"], 1.0)
        self.assertAlmostEqual(results["total_time_in_seconds"], 0.1, 1)
        self.assertAlmostEqual(results["samples_per_second"], len(self.data) / results["total_time_in_seconds"], 5)
        self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(self.data), 5)

    def test_bootstrap_and_perf(self):
        data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})

        results = self.evaluator.compute(
            model_or_pipeline=self.perf_pipe,
            data=data,
            metric="accuracy",
            input_column=self.input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping,
            strategy="bootstrap",
            n_resamples=10,
            random_state=0,
        )
        self.assertAlmostEqual(results["accuracy"]["score"], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][0], 0.333333, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][1], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["standard_error"], 0.22498285, 5)
        self.assertAlmostEqual(results["total_time_in_seconds"], 0.1, 1)
        self.assertAlmostEqual(results["samples_per_second"], len(data) / results["total_time_in_seconds"], 5)
        self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(data), 5)


class TestTextClassificationEvaluatorTwoColumns(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "label": [1, 0],
                "premise": ["great car", "great movie"],
                "hypothesis": ["great vehicle", "horrible movie"],
            }
        )
        self.default_model = "prajjwal1/bert-tiny-mnli"
        self.input_column = "premise"
        self.second_input_column = "hypothesis"
        self.label_column = "label"
        self.pipe = DummyTextClassificationPipeline()
        self.evaluator = evaluator("text-classification")
        self.label_mapping = {"NEGATIVE": 0.0, "POSITIVE": 1.0}
        self.label_mapping2 = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            input_column=self.input_column,
            second_input_column=self.second_input_column,
            label_column="label",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            second_input_column=self.second_input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping2,
        )
        self.assertEqual(results["accuracy"], 1.0)

        model = AutoModelForSequenceClassification.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            second_input_column=self.second_input_column,
            tokenizer=tokenizer,
            label_mapping=self.label_mapping2,
        )
        self.assertEqual(results["accuracy"], 1.0)


class TestImageClassificationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "label": [2, 2],
                "image": [Image.new("RGB", (500, 500), (255, 255, 255)), Image.new("RGB", (500, 500), (170, 95, 170))],
            }
        )
        self.default_model = "lysandre/tiny-vit-random"
        self.pipe = DummyImageClassificationPipeline()
        self.evaluator = evaluator("image-classification")
        self.label_mapping = AutoConfig.from_pretrained(self.default_model).label2id

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

        model = AutoModelForImageClassification.from_pretrained(self.default_model)
        feature_extractor = AutoFeatureExtractor.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            feature_extractor=feature_extractor,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_class_init(self):
        evaluator = ImageClassificationEvaluator()
        self.assertEqual(evaluator.task, "image-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_overwrite_default_metric(self):
        accuracy = load("accuracy")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)


class TestQuestionAnsweringEvaluator(TestCase):
    def setUp(self):
        # SQuAD-style data where every question has an answer
        self.data = Dataset.from_dict(
            {
                "id": ["56be4db0acb8001400a502ec", "56be4db0acb8001400a502ed"],
                "context": ["My name is Felix and I love cookies!", "Misa name is Felix and misa love cookies!"],
                "answers": [{"text": ["Felix"], "answer_start": [11]}, {"text": ["Felix"], "answer_start": [13]}],
                "question": ["What is my name?", "What is my name?"],
            }
        )
        # SQuAD v2-style data where the second question is unanswerable
        self.data_v2 = Dataset.from_dict(
            {
                "id": ["56be4db0acb8001400a502ec", "56be4db0acb8001400a502ed"],
                "context": ["My name is Felix and I love cookies!", "Let's explore the city!"],
                "answers": [{"text": ["Felix"], "answer_start": [11]}, {"text": [], "answer_start": []}],
                "question": ["What is my name?", "What is my name?"],
            }
        )

        self.default_model = "mrm8488/bert-tiny-finetuned-squadv2"
        self.pipe = DummyQuestionAnsweringPipeline(v2=False)
        self.pipe_v2 = DummyQuestionAnsweringPipeline(v2=True)
        self.evaluator = evaluator("question-answering")

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="squad",
        )
        self.assertEqual(results["exact_match"], 0)
        self.assertEqual(results["f1"], 100 / 3)

        model = AutoModelForQuestionAnswering.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="squad",
            tokenizer=tokenizer,
        )
        self.assertEqual(results["exact_match"], 0)
        self.assertEqual(results["f1"], 100 / 3)

    def test_class_init(self):
        # SQuAD v1-style data and metric
        evaluator = QuestionAnsweringEvaluator()
        self.assertEqual(evaluator.task, "question-answering")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="squad",
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

        # SQuAD v2-style data and metric, with an unanswerable question
        evaluator = QuestionAnsweringEvaluator()
        self.assertEqual(evaluator.task, "question-answering")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe_v2,
            data=self.data_v2,
            metric="squad_v2",
        )
        self.assertDictEqual(
            {key: results[key] for key in ["HasAns_f1", "NoAns_f1"]}, {"HasAns_f1": 100.0, "NoAns_f1": 100.0}
        )

    @slow
    def test_default_pipe_init(self):
        # default metric for SQuAD v1-style data
        results = self.evaluator.compute(
            data=self.data,
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

        # explicitly request the SQuAD v2 metric
        results = self.evaluator.compute(
            data=self.data_v2,
            metric="squad_v2",
        )
        self.assertDictEqual(
            {key: results[key] for key in ["HasAns_f1", "NoAns_f1"]}, {"HasAns_f1": 100.0, "NoAns_f1": 0.0}
        )

    def test_data_loading(self):
        # load a dataset by name with an explicit split
        data = self.evaluator.load_data("evaluate/squad-ci", split="validation[:1]")
        self.evaluator.prepare_data(
            data=data, question_column="question", context_column="context", id_column="id", label_column="answers"
        )

        # load a dataset by name without a split
        data = self.evaluator.load_data("evaluate/squad-ci")
        self.evaluator.prepare_data(
            data=data, question_column="question", context_column="context", id_column="id", label_column="answers"
        )

        # the default split is picked up
        self.assertEqual(data.split, "validation")

        # the dataset contents are intact
        self.assertEqual(data[0]["id"], "56be4db0acb8001400a502ec")

    def test_overwrite_default_metric(self):
        # override the default metric with a loaded metric object and with a metric name
        squad = load("squad")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=squad,
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="squad",
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)


class TestTokenClassificationEvaluator(TestCase):
    def setUp(self):
        features = Features(
            {
                "tokens": Sequence(feature=Value(dtype="string")),
                "ner_tags": Sequence(feature=ClassLabel(names=["O", "B-LOC", "I-LOC"])),
            }
        )

        self.data = Dataset.from_dict(
            {
                "tokens": [["New", "York", "a", "nice", "City", "."]],
                "ner_tags": [[1, 2, 0, 0, 1, 0]],
            },
            features=features,
        )
        self.default_model = "hf-internal-testing/tiny-bert-for-token-classification"
        self.pipe = DummyTokenClassificationPipeline()
        self.evaluator = evaluator("token-classification")

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="seqeval",
        )
        self.assertEqual(results["overall_accuracy"], 0.5)

        model = AutoModelForTokenClassification.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)
        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="seqeval",
            tokenizer=tokenizer,
        )
        self.assertEqual(results["overall_accuracy"], 0.5)

    def test_class_init(self):
        evaluator = TokenClassificationEvaluator()
        self.assertEqual(evaluator.task, "token-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="seqeval",
        )
        self.assertEqual(results["overall_accuracy"], 1.0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
        )
        self.assertEqual(results["overall_accuracy"], 2 / 3)

    def test_overwrite_default_metric(self):
        accuracy = load("seqeval")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
        )
        self.assertEqual(results["overall_accuracy"], 1.0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="seqeval",
        )
        self.assertEqual(results["overall_accuracy"], 1.0)

    def test_data_loading(self):
        # load a dataset by name with an explicit split
        data = self.evaluator.load_data("evaluate/conll2003-ci", split="validation[:1]")
        self.evaluator.prepare_data(
            data=data,
            input_column="tokens",
            label_column="ner_tags",
            join_by=" ",
        )

        # load a dataset by name without a split
        data = self.evaluator.load_data("evaluate/conll2003-ci")
        self.evaluator.prepare_data(
            data=data,
            input_column="tokens",
            label_column="ner_tags",
            join_by=" ",
        )

        # the default split is picked up
        self.assertEqual(data.split, "test")

        # the dataset contents are intact
        self.assertEqual(data[0]["id"], "0")

    def test_wrong_task(self):
        self.assertRaises(KeyError, evaluator, "bad_task")

    def test_words_to_offsets(self):
        task_evaluator = evaluator("token-classification")

        words = ["This", "is", "a", "test", "."]
        join_by = " "

        offsets = task_evaluator.words_to_offsets(words, join_by)

        self.assertListEqual([(0, 3), (5, 6), (8, 8), (10, 13), (15, 15)], offsets)

        words = ["日", "本", "語", "はなせるの?"]
        join_by = ""

        offsets = task_evaluator.words_to_offsets(words, join_by)

        self.assertListEqual([(0, 0), (1, 1), (2, 2), (3, 8)], offsets)

    def test_predictions_processor(self):
        task_evaluator = evaluator("token-classification")
        join_by = " "
        words = [["New", "York", "a", "nice", "City", "."]]

        # every word start has a prediction; the extra sub-word prediction at start=2 is ignored
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 2, "entity": "I-LOC"},
                {"start": 4, "entity": "I-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "I-LOC", "O", "O", "B-LOC", "O"])

        # no prediction starts at the second word ("York"), so it falls back to "O"
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 2, "entity": "I-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "O", "O", "O", "B-LOC", "O"])

        # a prediction starts in the middle of the second word rather than at its first character
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 6, "entity": "I-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "O", "O", "O", "B-LOC", "O"])

        # no prediction at all for the second word
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "O", "O", "O", "B-LOC", "O"])


class TestTextGenerationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict({"text": ["Lorem ipsum"]})
        self.pipe = DummyTextGenerationPipeline(num_return_sequences=4)
        self.evaluator = evaluator("text-generation")

    def test_class_init(self):
        evaluator = TextGenerationEvaluator()
        self.assertEqual(evaluator.task, "text-generation")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="word_count",
        )
        self.assertIsInstance(results["unique_words"], int)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(data=self.data)
        self.assertIsInstance(results["unique_words"], int)

    def test_overwrite_default_metric(self):
        word_length = load("word_length")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=word_length,
        )
        self.assertIsInstance(results["average_word_length"], int)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="word_length",
        )
        self.assertIsInstance(results["average_word_length"], int)

    def test_process_predictions_multiple_return_sequences(self):
        processed_predictions = self.evaluator.predictions_processor(
            [
                [{"generated_text": "A"}, {"generated_text": "B"}],
                [{"generated_text": "C"}, {"generated_text": "D"}],
            ]
        )
        self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]})


class TestText2TextGenerationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "text": ["Lorem ipsum"] * 4,
                "label": ["Ipsum Lorem"] * 4,
            }
        )
        self.pipe = DummyText2TextGenerationPipeline()
        self.evaluator = evaluator("text2text-generation")

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
        )
        self.assertEqual(results["bleu"], 0)

    def test_class_init(self):
        evaluator = Text2TextGenerationEvaluator()
        self.assertEqual(evaluator.task, "text2text-generation")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="bleu",
        )
        self.assertEqual(results["bleu"], 0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(data=self.data)
        self.assertEqual(results["bleu"], 0)

    def test_overwrite_default_metric(self):
        rouge = load("rouge")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=rouge,
        )
        self.assertEqual(results["rouge1"], 1.0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="rouge",
        )
        self.assertEqual(results["rouge1"], 1.0)

    def test_summarization(self):
        pipe = DummyText2TextGenerationPipeline(task="summarization", prefix="summary")
        e = evaluator("summarization")

        results = e.compute(
            model_or_pipeline=pipe,
            data=self.data,
        )
        self.assertEqual(results["rouge1"], 1.0)

    def test_translation(self):
        pipe = DummyText2TextGenerationPipeline(task="translation", prefix="translation")
        e = evaluator("translation")

        results = e.compute(
            model_or_pipeline=pipe,
            data=self.data,
        )
        self.assertEqual(results["bleu"], 0)


class TestAutomaticSpeechRecognitionEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "path": [
                    "https://cdn-media.huggingface.co/speech_samples/sample1.flac",
                    "https://cdn-media.huggingface.co/speech_samples/sample2.flac",
                ],
                "sentence": ["Ipsum Lorem"] * 2,
            }
        )
        self.pipe = DummyAutomaticSpeechRecognitionPipeline()
        self.evaluator = evaluator("automatic-speech-recognition")

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
        )
        self.assertEqual(results["wer"], 1.0)

    def test_class_init(self):
        evaluator = AutomaticSpeechRecognitionEvaluator()
        self.assertEqual(evaluator.task, "automatic-speech-recognition")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="wer",
        )
        self.assertEqual(results["wer"], 1.0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(data=self.data)
        self.assertGreater(results["wer"], 1.0)

    def test_overwrite_default_metric(self):
        cer = load("cer")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=cer,
        )
        self.assertEqual(results["cer"], 0.7272727272727273)

        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="cer",
        )
        self.assertEqual(results["cer"], 0.7272727272727273)


class TestAudioClassificationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {"file": ["https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac"], "label": [11]}
        )
        self.raw_data = Dataset.from_dict(
            {
                "audio": [
                    np.array(
                        [-0.00048828, -0.00018311, -0.00137329, 0.00079346, 0.00091553, 0.00085449], dtype=np.float32
                    )
                ],
                "label": [11],
            }
        )
        self.default_model = "superb/wav2vec2-base-superb-ks"
        self.pipe = DummyAudioClassificationPipeline()
        self.evaluator = evaluator("audio-classification")
        self.label_mapping = AutoConfig.from_pretrained(self.default_model).label2id

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_raw_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe, data=self.raw_data, label_mapping=self.label_mapping, input_column="audio"
        )
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

        model = AutoModelForAudioClassification.from_pretrained(self.default_model)
        feature_extractor = AutoFeatureExtractor.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            feature_extractor=feature_extractor,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_class_init(self):
        evaluator = AudioClassificationEvaluator()
        self.assertEqual(evaluator.task, "audio-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        results_raw = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.raw_data,
            label_mapping=self.label_mapping,
            metric="accuracy",
            input_column="audio",
        )
        self.assertEqual(results_raw["accuracy"], 0)
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_overwrite_default_metric(self):
        accuracy = load("accuracy")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)