from time import sleep
from unittest import TestCase, mock

import numpy as np
from datasets import ClassLabel, Dataset, Features, Sequence, Value
from PIL import Image
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    AutoModelForImageClassification,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)

from evaluate import (
    AudioClassificationEvaluator,
    AutomaticSpeechRecognitionEvaluator,
    Evaluator,
    ImageClassificationEvaluator,
    QuestionAnsweringEvaluator,
    Text2TextGenerationEvaluator,
    TextClassificationEvaluator,
    TextGenerationEvaluator,
    TokenClassificationEvaluator,
    evaluator,
    load,
)

from .utils import slow


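# The Dummy*Pipeline classes below mimic the call signature and output format of the
# corresponding `transformers` pipelines, returning fixed predictions so that the
# evaluator logic can be exercised without loading any model.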
class DummyTextGenerationPipeline:
    def __init__(self, prefix="generated", task="text-generation", num_return_sequences=1):
        self.task = task
        self.prefix = prefix
        self.num_return_sequences = num_return_sequences

    def __call__(self, inputs, **kwargs):
        return [[{f"{self.prefix}_text": "Lorem ipsum"} for _ in range(self.num_return_sequences)] for _ in inputs]


class DummyText2TextGenerationPipeline:
    def __init__(self, prefix="generated", task="text2text-generation"):
        self.task = task
        self.prefix = prefix

    def __call__(self, inputs, **kwargs):
        return [{f"{self.prefix}_text": "Lorem ipsum"} for _ in inputs]


class DummyTextClassificationPipeline:
    def __init__(self, sleep_time=None):
        self.task = "text-classification"
        self.sleep_time = sleep_time

    def __call__(self, inputs, **kwargs):
        if self.sleep_time is not None:
            sleep(self.sleep_time)
        return [{"label": "NEGATIVE"} if i % 2 == 1 else {"label": "POSITIVE"} for i, _ in enumerate(inputs)]


class DummyImageClassificationPipeline:
    def __init__(self):
        self.task = "image-classification"

    def __call__(self, images, **kwargs):
        return [[{"score": 0.9, "label": "yurt"}, {"score": 0.1, "label": "umbrella"}] for i, _ in enumerate(images)]


class DummyQuestionAnsweringPipeline:
    def __init__(self, v2: bool):
        self.task = "question-answering"
        self.v2 = v2

    def __call__(self, question, context, **kwargs):
        if self.v2:
            return [
                {"score": 0.95, "start": 31, "end": 39, "answer": "Felix"}
                if i % 2 == 0
                else {"score": 0.95, "start": 0, "end": 0, "answer": ""}
                for i in range(len(question))
            ]
        else:
            return [{"score": 0.95, "start": 31, "end": 39, "answer": "Felix"} for _ in question]


class DummyTokenClassificationPipeline:
    def __init__(self):
        self.task = "token-classification"

    def __call__(self, inputs, **kwargs):
        result = [
            {"start": 0, "entity": "B-LOC"},
            {"start": 2, "entity": "I-LOC"},
            {"start": 4, "entity": "I-LOC"},
            {"start": 9, "entity": "O"},
            {"start": 11, "entity": "O"},
            {"start": 16, "entity": "B-LOC"},
            {"start": 21, "entity": "O"},
        ]

        return [result]


class DummyAutomaticSpeechRecognitionPipeline:
    def __init__(self) -> None:
        self.task = "automatic-speech-recognition"

    def __call__(self, inputs, **kwargs):
        return [{"text": "Lorem ipsum"} for _ in inputs]


class DummyAudioClassificationPipeline:
    def __init__(self):
        self.task = "audio-classification"

    def __call__(self, audio, **kwargs):
        return [[{"score": 0.9, "label": "yes"}, {"score": 0.1, "label": "no"}] for i, _ in enumerate(audio)]


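# Each Test*Evaluator class below covers one task evaluator: construction via `evaluator()` or the
# evaluator class itself, `compute()` with dummy pipelines, data loading, and (marked @slow) runs
# against real checkpoints.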
class TestEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
        self.default_ckpt = "hf-internal-testing/tiny-random-bert"
        self.default_model = AutoModelForSequenceClassification.from_pretrained(self.default_ckpt, num_labels=2)
        self.default_tokenizer = AutoTokenizer.from_pretrained(self.default_ckpt)
        self.pipe = pipeline("text-classification", model=self.default_model, tokenizer=self.default_tokenizer)
        self.evaluator = evaluator("text-classification")
        self.label_mapping = {"LABEL_0": 0.0, "LABEL_1": 1.0}

    def test_wrong_task(self):
        self.assertRaises(KeyError, evaluator, "bad_task")

    def test_device_placement(self):
        orig_import = __import__
        pt_mock = mock.Mock()
        tf_mock = mock.Mock()

        def import_pt_tf_mock(name, *args):
            if name == "torch":
                if pt_available:
                    return pt_mock
                else:
                    raise ImportError
            if name == "tensorflow":
                if tf_available:
                    return tf_mock
                else:
                    raise ImportError
            return orig_import(name, *args)

        with mock.patch("builtins.__import__", side_effect=import_pt_tf_mock):
            # neither PyTorch nor TensorFlow available: fall back to CPU
            pt_available = False
            tf_available = False
            self.assertEqual(Evaluator._infer_device(), -1)

            # PyTorch available but no GPU
            pt_available = True
            pt_mock.cuda.is_available.return_value = False
            self.assertEqual(Evaluator._infer_device(), -1)

            # PyTorch available with a GPU
            pt_mock.cuda.is_available.return_value = True
            self.assertEqual(Evaluator._infer_device(), 0)

            # TensorFlow available but no GPU
            pt_available = False
            tf_available = True
            tf_mock.config.list_physical_devices.return_value = []
            self.assertEqual(Evaluator._infer_device(), -1)

            # TensorFlow available with GPUs
            tf_mock.config.list_physical_devices.return_value = ["GPU:0", "GPU:1"]
            self.assertEqual(Evaluator._infer_device(), 0)

            # a GPU is inferred while the pipeline runs on CPU: expect a device-mismatch error
            pt_mock.cuda.is_available.return_value = True
            self.assertRaises(
                ValueError, Evaluator.check_for_mismatch_in_device_setup, Evaluator._infer_device(), self.pipe
            )

            pt_available = False
            tf_available = True
            self.assertRaises(
                ValueError, Evaluator.check_for_mismatch_in_device_setup, Evaluator._infer_device(), self.pipe
            )

    def test_pipe_init(self):
        self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )

    def test_model_init(self):
        self.evaluator.compute(
            model_or_pipeline=self.default_model,
            tokenizer=self.default_tokenizer,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )

    def test_model_str_init(self):
        self.evaluator.compute(
            model_or_pipeline=self.default_ckpt,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )


class TestTextClassificationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict({"label": [1, 0], "text": ["great movie", "horrible movie"]})
        self.default_model = "lvwerra/distilbert-imdb"
        self.input_column = "text"
        self.label_column = "label"
        self.pipe = DummyTextClassificationPipeline()
        self.perf_pipe = DummyTextClassificationPipeline(sleep_time=0.1)
        self.evaluator = evaluator("text-classification")
        self.label_mapping = {"NEGATIVE": 0.0, "POSITIVE": 1.0}

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            input_column="text",
            label_column="label",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

        model = AutoModelForSequenceClassification.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            tokenizer=tokenizer,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    def test_class_init(self):
        evaluator = TextClassificationEvaluator()
        self.assertEqual(evaluator.task, "text-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="f1",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["f1"], 1.0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    def test_data_loading(self):
        # load a dataset by name with an explicit split
        data = self.evaluator.load_data("evaluate/imdb-ci", split="test[:1]")
        self.evaluator.prepare_data(data=data, input_column="text", label_column="label", second_input_column=None)

        # load a dataset by name without a split
        data = self.evaluator.load_data("evaluate/imdb-ci")
        self.evaluator.prepare_data(data=data, input_column="text", label_column="label", second_input_column=None)

        # the default split is picked up
        self.assertEqual(data.split, "test")

        # the dataset contents are intact
        self.assertEqual(data[0]["text"], "I love movies about whales!")

        # loading a dataset with a subset works with and without an explicit split
        data = self.evaluator.load_data("evaluate/glue-ci", subset="cola", split="test")
        self.assertIsInstance(data, Dataset)

        data = self.evaluator.load_data("evaluate/glue-ci", subset="cola")
        self.assertIsInstance(data, Dataset)

    def test_overwrite_default_metric(self):
        accuracy = load("accuracy")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    def test_bootstrap(self):
        data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})

        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=data,
            metric="accuracy",
            label_mapping=self.label_mapping,
            strategy="bootstrap",
            n_resamples=10,
            random_state=0,
        )
        self.assertAlmostEqual(results["accuracy"]["score"], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][0], 0.33333, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][1], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["standard_error"], 0.22498, 5)

    def test_perf(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.perf_pipe,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping,
            n_resamples=10,
            random_state=0,
        )
        self.assertEqual(results["accuracy"], 1.0)
        self.assertAlmostEqual(results["total_time_in_seconds"], 0.1, 1)
        self.assertAlmostEqual(results["samples_per_second"], len(self.data) / results["total_time_in_seconds"], 5)
        self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(self.data), 5)

    def test_bootstrap_and_perf(self):
        data = Dataset.from_dict({"label": [1, 0, 0], "text": ["great movie", "great movie", "horrible movie"]})

        results = self.evaluator.compute(
            model_or_pipeline=self.perf_pipe,
            data=data,
            metric="accuracy",
            input_column=self.input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping,
            strategy="bootstrap",
            n_resamples=10,
            random_state=0,
        )
        self.assertAlmostEqual(results["accuracy"]["score"], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][0], 0.333333, 5)
        self.assertAlmostEqual(results["accuracy"]["confidence_interval"][1], 0.666666, 5)
        self.assertAlmostEqual(results["accuracy"]["standard_error"], 0.22498285, 5)
        self.assertAlmostEqual(results["total_time_in_seconds"], 0.1, 1)
        self.assertAlmostEqual(results["samples_per_second"], len(data) / results["total_time_in_seconds"], 5)
        self.assertAlmostEqual(results["latency_in_seconds"], results["total_time_in_seconds"] / len(data), 5)


class TestTextClassificationEvaluatorTwoColumns(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "label": [1, 0],
                "premise": ["great car", "great movie"],
                "hypothesis": ["great vehicle", "horrible movie"],
            }
        )
        self.default_model = "prajjwal1/bert-tiny-mnli"
        self.input_column = "premise"
        self.second_input_column = "hypothesis"
        self.label_column = "label"
        self.pipe = DummyTextClassificationPipeline()
        self.evaluator = evaluator("text-classification")
        self.label_mapping = {"NEGATIVE": 0.0, "POSITIVE": 1.0}
        self.label_mapping2 = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            input_column=self.input_column,
            second_input_column=self.second_input_column,
            label_column="label",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 1.0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            second_input_column=self.second_input_column,
            label_column=self.label_column,
            label_mapping=self.label_mapping2,
        )
        self.assertEqual(results["accuracy"], 1.0)

        model = AutoModelForSequenceClassification.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            input_column=self.input_column,
            second_input_column=self.second_input_column,
            tokenizer=tokenizer,
            label_mapping=self.label_mapping2,
        )
        self.assertEqual(results["accuracy"], 1.0)


class TestImageClassificationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "label": [2, 2],
                "image": [Image.new("RGB", (500, 500), (255, 255, 255)), Image.new("RGB", (500, 500), (170, 95, 170))],
            }
        )
        self.default_model = "lysandre/tiny-vit-random"
        self.pipe = DummyImageClassificationPipeline()
        self.evaluator = evaluator("image-classification")
        self.label_mapping = AutoConfig.from_pretrained(self.default_model).label2id

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

        model = AutoModelForImageClassification.from_pretrained(self.default_model)
        feature_extractor = AutoFeatureExtractor.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            feature_extractor=feature_extractor,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_class_init(self):
        evaluator = ImageClassificationEvaluator()
        self.assertEqual(evaluator.task, "image-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_overwrite_default_metric(self):
        accuracy = load("accuracy")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)


class TestQuestionAnsweringEvaluator(TestCase):
    def setUp(self):
        # SQuAD-style data where every question has an answer
        self.data = Dataset.from_dict(
            {
                "id": ["56be4db0acb8001400a502ec", "56be4db0acb8001400a502ed"],
                "context": ["My name is Felix and I love cookies!", "Misa name is Felix and misa love cookies!"],
                "answers": [{"text": ["Felix"], "answer_start": [11]}, {"text": ["Felix"], "answer_start": [13]}],
                "question": ["What is my name?", "What is my name?"],
            }
        )
        # SQuAD v2-style data where the second question is unanswerable
        self.data_v2 = Dataset.from_dict(
            {
                "id": ["56be4db0acb8001400a502ec", "56be4db0acb8001400a502ed"],
                "context": ["My name is Felix and I love cookies!", "Let's explore the city!"],
                "answers": [{"text": ["Felix"], "answer_start": [11]}, {"text": [], "answer_start": []}],
                "question": ["What is my name?", "What is my name?"],
            }
        )

        self.default_model = "mrm8488/bert-tiny-finetuned-squadv2"
        self.pipe = DummyQuestionAnsweringPipeline(v2=False)
        self.pipe_v2 = DummyQuestionAnsweringPipeline(v2=True)
        self.evaluator = evaluator("question-answering")

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="squad",
        )
        self.assertEqual(results["exact_match"], 0)
        self.assertEqual(results["f1"], 100 / 3)

        model = AutoModelForQuestionAnswering.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="squad",
            tokenizer=tokenizer,
        )
        self.assertEqual(results["exact_match"], 0)
        self.assertEqual(results["f1"], 100 / 3)

    def test_class_init(self):
        # SQuAD v1-style data and metric
        evaluator = QuestionAnsweringEvaluator()
        self.assertEqual(evaluator.task, "question-answering")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="squad",
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

        # SQuAD v2-style data and metric, with an unanswerable question
        evaluator = QuestionAnsweringEvaluator()
        self.assertEqual(evaluator.task, "question-answering")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe_v2,
            data=self.data_v2,
            metric="squad_v2",
        )
        self.assertDictEqual(
            {key: results[key] for key in ["HasAns_f1", "NoAns_f1"]}, {"HasAns_f1": 100.0, "NoAns_f1": 100.0}
        )

    @slow
    def test_default_pipe_init(self):
        # default metric for SQuAD v1-style data
        results = self.evaluator.compute(
            data=self.data,
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

        # explicitly request the SQuAD v2 metric
        results = self.evaluator.compute(
            data=self.data_v2,
            metric="squad_v2",
        )
        self.assertDictEqual(
            {key: results[key] for key in ["HasAns_f1", "NoAns_f1"]}, {"HasAns_f1": 100.0, "NoAns_f1": 0.0}
        )

    def test_data_loading(self):
        # load a dataset by name with an explicit split
        data = self.evaluator.load_data("evaluate/squad-ci", split="validation[:1]")
        self.evaluator.prepare_data(
            data=data, question_column="question", context_column="context", id_column="id", label_column="answers"
        )

        # load a dataset by name without a split
        data = self.evaluator.load_data("evaluate/squad-ci")
        self.evaluator.prepare_data(
            data=data, question_column="question", context_column="context", id_column="id", label_column="answers"
        )

        # the default split is picked up
        self.assertEqual(data.split, "validation")

        # the dataset contents are intact
        self.assertEqual(data[0]["id"], "56be4db0acb8001400a502ec")

    def test_overwrite_default_metric(self):
        # override the default metric with a loaded metric object and with a metric name
        squad = load("squad")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=squad,
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)

        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="squad",
        )
        self.assertEqual(results["exact_match"], 100.0)
        self.assertEqual(results["f1"], 100.0)


class TestTokenClassificationEvaluator(TestCase):
    def setUp(self):
        features = Features(
            {
                "tokens": Sequence(feature=Value(dtype="string")),
                "ner_tags": Sequence(feature=ClassLabel(names=["O", "B-LOC", "I-LOC"])),
            }
        )

        self.data = Dataset.from_dict(
            {
                "tokens": [["New", "York", "a", "nice", "City", "."]],
                "ner_tags": [[1, 2, 0, 0, 1, 0]],
            },
            features=features,
        )
        self.default_model = "hf-internal-testing/tiny-bert-for-token-classification"
        self.pipe = DummyTokenClassificationPipeline()
        self.evaluator = evaluator("token-classification")

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="seqeval",
        )
        self.assertEqual(results["overall_accuracy"], 0.5)

        model = AutoModelForTokenClassification.from_pretrained(self.default_model)
        tokenizer = AutoTokenizer.from_pretrained(self.default_model)
        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="seqeval",
            tokenizer=tokenizer,
        )
        self.assertEqual(results["overall_accuracy"], 0.5)

    def test_class_init(self):
        evaluator = TokenClassificationEvaluator()
        self.assertEqual(evaluator.task, "token-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="seqeval",
        )
        self.assertEqual(results["overall_accuracy"], 1.0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
        )
        self.assertEqual(results["overall_accuracy"], 2 / 3)

    def test_overwrite_default_metric(self):
        accuracy = load("seqeval")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
        )
        self.assertEqual(results["overall_accuracy"], 1.0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="seqeval",
        )
        self.assertEqual(results["overall_accuracy"], 1.0)

    def test_data_loading(self):
        # load a dataset by name with an explicit split
        data = self.evaluator.load_data("evaluate/conll2003-ci", split="validation[:1]")
        self.evaluator.prepare_data(
            data=data,
            input_column="tokens",
            label_column="ner_tags",
            join_by=" ",
        )

        # load a dataset by name without a split
        data = self.evaluator.load_data("evaluate/conll2003-ci")
        self.evaluator.prepare_data(
            data=data,
            input_column="tokens",
            label_column="ner_tags",
            join_by=" ",
        )

        # the default split is picked up
        self.assertEqual(data.split, "test")

        # the dataset contents are intact
        self.assertEqual(data[0]["id"], "0")

    def test_wrong_task(self):
        self.assertRaises(KeyError, evaluator, "bad_task")

    def test_words_to_offsets(self):
        task_evaluator = evaluator("token-classification")

        words = ["This", "is", "a", "test", "."]
        join_by = " "

        offsets = task_evaluator.words_to_offsets(words, join_by)

        self.assertListEqual([(0, 3), (5, 6), (8, 8), (10, 13), (15, 15)], offsets)

        words = ["日", "本", "語", "はなせるの?"]
        join_by = ""

        offsets = task_evaluator.words_to_offsets(words, join_by)

        self.assertListEqual([(0, 0), (1, 1), (2, 2), (3, 8)], offsets)

    def test_predictions_processor(self):
        task_evaluator = evaluator("token-classification")
        join_by = " "
        words = [["New", "York", "a", "nice", "City", "."]]

        # every word start has a prediction; the extra sub-word prediction at start=2 is ignored
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 2, "entity": "I-LOC"},
                {"start": 4, "entity": "I-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "I-LOC", "O", "O", "B-LOC", "O"])

        # no prediction starts at the second word ("York"), so it falls back to "O"
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 2, "entity": "I-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "O", "O", "O", "B-LOC", "O"])

        # a prediction starts in the middle of the second word rather than at its first character
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 6, "entity": "I-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "O", "O", "O", "B-LOC", "O"])

        # no prediction at all for the second word
        predictions = [
            [
                {"start": 0, "entity": "B-LOC"},
                {"start": 9, "entity": "O"},
                {"start": 11, "entity": "O"},
                {"start": 16, "entity": "B-LOC"},
                {"start": 21, "entity": "O"},
            ]
        ]
        predictions = task_evaluator.predictions_processor(predictions, words, join_by)
        self.assertListEqual(predictions["predictions"][0], ["B-LOC", "O", "O", "O", "B-LOC", "O"])


class TestTextGenerationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict({"text": ["Lorem ipsum"]})
        self.pipe = DummyTextGenerationPipeline(num_return_sequences=4)
        self.evaluator = evaluator("text-generation")

    def test_class_init(self):
        evaluator = TextGenerationEvaluator()
        self.assertEqual(evaluator.task, "text-generation")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="word_count",
        )
        self.assertIsInstance(results["unique_words"], int)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(data=self.data)
        self.assertIsInstance(results["unique_words"], int)

    def test_overwrite_default_metric(self):
        word_length = load("word_length")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=word_length,
        )
        self.assertIsInstance(results["average_word_length"], int)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="word_length",
        )
        self.assertIsInstance(results["average_word_length"], int)

    def test_process_predictions_multiple_return_sequences(self):
        processed_predictions = self.evaluator.predictions_processor(
            [
                [{"generated_text": "A"}, {"generated_text": "B"}],
                [{"generated_text": "C"}, {"generated_text": "D"}],
            ]
        )
        self.assertEqual(processed_predictions, {"data": ["A", "B", "C", "D"]})


class TestText2TextGenerationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "text": ["Lorem ipsum"] * 4,
                "label": ["Ipsum Lorem"] * 4,
            }
        )
        self.pipe = DummyText2TextGenerationPipeline()
        self.evaluator = evaluator("text2text-generation")

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
        )
        self.assertEqual(results["bleu"], 0)

    def test_class_init(self):
        evaluator = Text2TextGenerationEvaluator()
        self.assertEqual(evaluator.task, "text2text-generation")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="bleu",
        )
        self.assertEqual(results["bleu"], 0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(data=self.data)
        self.assertEqual(results["bleu"], 0)

    def test_overwrite_default_metric(self):
        rouge = load("rouge")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=rouge,
        )
        self.assertEqual(results["rouge1"], 1.0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="rouge",
        )
        self.assertEqual(results["rouge1"], 1.0)

    def test_summarization(self):
        pipe = DummyText2TextGenerationPipeline(task="summarization", prefix="summary")
        e = evaluator("summarization")

        results = e.compute(
            model_or_pipeline=pipe,
            data=self.data,
        )
        self.assertEqual(results["rouge1"], 1.0)

    def test_translation(self):
        pipe = DummyText2TextGenerationPipeline(task="translation", prefix="translation")
        e = evaluator("translation")

        results = e.compute(
            model_or_pipeline=pipe,
            data=self.data,
        )
        self.assertEqual(results["bleu"], 0)


class TestAutomaticSpeechRecognitionEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {
                "path": [
                    "https://cdn-media.huggingface.co/speech_samples/sample1.flac",
                    "https://cdn-media.huggingface.co/speech_samples/sample2.flac",
                ],
                "sentence": ["Ipsum Lorem"] * 2,
            }
        )
        self.pipe = DummyAutomaticSpeechRecognitionPipeline()
        self.evaluator = evaluator("automatic-speech-recognition")

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
        )
        self.assertEqual(results["wer"], 1.0)

    def test_class_init(self):
        evaluator = AutomaticSpeechRecognitionEvaluator()
        self.assertEqual(evaluator.task, "automatic-speech-recognition")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="wer",
        )
        self.assertEqual(results["wer"], 1.0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(data=self.data)
        self.assertGreater(results["wer"], 1.0)

    def test_overwrite_default_metric(self):
        cer = load("cer")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=cer,
        )
        self.assertEqual(results["cer"], 0.7272727272727273)

        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="cer",
        )
        self.assertEqual(results["cer"], 0.7272727272727273)


class TestAudioClassificationEvaluator(TestCase):
    def setUp(self):
        self.data = Dataset.from_dict(
            {"file": ["https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac"], "label": [11]}
        )
        self.raw_data = Dataset.from_dict(
            {
                "audio": [
                    np.array(
                        [-0.00048828, -0.00018311, -0.00137329, 0.00079346, 0.00091553, 0.00085449], dtype=np.float32
                    )
                ],
                "label": [11],
            }
        )
        self.default_model = "superb/wav2vec2-base-superb-ks"
        self.pipe = DummyAudioClassificationPipeline()
        self.evaluator = evaluator("audio-classification")
        self.label_mapping = AutoConfig.from_pretrained(self.default_model).label2id

    def test_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_raw_pipe_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe, data=self.raw_data, label_mapping=self.label_mapping, input_column="audio"
        )
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_model_init(self):
        results = self.evaluator.compute(
            model_or_pipeline=self.default_model,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

        model = AutoModelForAudioClassification.from_pretrained(self.default_model)
        feature_extractor = AutoFeatureExtractor.from_pretrained(self.default_model)

        results = self.evaluator.compute(
            model_or_pipeline=model,
            data=self.data,
            metric="accuracy",
            feature_extractor=feature_extractor,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_class_init(self):
        evaluator = AudioClassificationEvaluator()
        self.assertEqual(evaluator.task, "audio-classification")
        self.assertIsNone(evaluator.default_metric_name)

        results = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        results_raw = evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.raw_data,
            label_mapping=self.label_mapping,
            metric="accuracy",
            input_column="audio",
        )
        self.assertEqual(results_raw["accuracy"], 0)
        self.assertEqual(results["accuracy"], 0)

    @slow
    def test_default_pipe_init(self):
        results = self.evaluator.compute(
            data=self.data,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)

    def test_overwrite_default_metric(self):
        accuracy = load("accuracy")
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric=accuracy,
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)
        results = self.evaluator.compute(
            model_or_pipeline=self.pipe,
            data=self.data,
            metric="accuracy",
            label_mapping=self.label_mapping,
        )
        self.assertEqual(results["accuracy"], 0)