|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""FrugalScore metric.""" |
|
|
|
|
|
import datasets |
|
|
import torch |
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments |
|
|
|
|
|
import evaluate |
|
|
|
|
|
|
|
|
_CITATION = """\ |
|
|
@article{eddine2021frugalscore, |
|
|
title={FrugalScore: Learning Cheaper, Lighter and Faster Evaluation Metrics for Automatic Text Generation}, |
|
|
author={Eddine, Moussa Kamal and Shang, Guokan and Tixier, Antoine J-P and Vazirgiannis, Michalis}, |
|
|
journal={arXiv preprint arXiv:2110.08559}, |
|
|
year={2021} |
|
|
} |
|
|
""" |
|
|
|
|
|
_DESCRIPTION = """\ |
|
|
FrugalScore is a reference-based metric for NLG models evaluation. It is based on a distillation approach that allows to learn a fixed, low cost version of any expensive NLG metric, while retaining most of its original performance. |
|
|
""" |
|
|
|
|
|
|
|
|
_KWARGS_DESCRIPTION = """ |
|
|
Calculates how good are predictions given some references, using certain scores. |
|
|
Args: |
|
|
predictions (list of str): list of predictions to score. Each predictions |
|
|
should be a string. |
|
|
references (list of str): list of reference for each prediction. Each |
|
|
reference should be a string. |
|
|
batch_size (int): the batch size for predictions. |
|
|
max_length (int): maximum sequence length. |
|
|
device (str): either gpu or cpu |
|
|
Returns: |
|
|
scores (list of int): list of scores. |
|
|
Examples: |
|
|
>>> frugalscore = evaluate.load("frugalscore") |
|
|
>>> results = frugalscore.compute(predictions=['hello there', 'huggingface'], references=['hello world', 'hugging face']) |
|
|
>>> print([round(s, 3) for s in results["scores"]]) |
|
|
[0.631, 0.645] |
|
|
""" |
|
|
|
|
|
|
|
|
# FrugalScore metric: scores each (prediction, reference) sentence pair with a
# sequence-classification checkpoint, run in inference mode through a
# `transformers.Trainer`.
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class FRUGALSCORE(evaluate.Metric):
    def _info(self):
        """Return the metric's metadata and input schema.

        Both `predictions` and `references` are declared as plain strings.
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            homepage="https://github.com/moussaKam/FrugalScore",
        )

    def _download_and_prepare(self, dl_manager):
        """Load the scoring model and its tokenizer.

        The ``"default"`` configuration maps to the
        ``moussaKam/frugalscore_tiny_bert-base_bert-score`` checkpoint; any
        other configuration name is handed to ``from_pretrained`` unchanged.

        Args:
            dl_manager: not used by this method.
        """
        if self.config_name == "default":
            checkpoint = "moussaKam/frugalscore_tiny_bert-base_bert-score"
        else:
            checkpoint = self.config_name
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def _compute(
        self,
        predictions,
        references,
        batch_size=32,
        max_length=128,
        device=None,
    ):
        """Score every prediction against its reference.

        Args:
            predictions: list of predicted sentences.
            references: list of reference sentences, aligned with
                `predictions` (same length).
            batch_size: per-device batch size used for inference.
            max_length: maximum tokenized length; longer pairs are truncated.
            device: ``"gpu"``, ``"cpu"`` or ``None``. ``None`` selects
                ``"gpu"`` when CUDA is available and ``"cpu"`` otherwise.

        Returns:
            ``{"scores": [...]}`` with one float per input pair.

        Raises:
            AssertionError: if the two lists differ in length or `device` is
                not one of the accepted values.
        """
        # NOTE(review): `assert` is skipped under `python -O`; kept as-is so
        # callers keep seeing AssertionError for invalid input.
        assert len(predictions) == len(
            references
        ), "predictions and references should have the same number of sentences."
        if device is not None:
            assert device in ["gpu", "cpu"], "device should be either gpu or cpu."
        else:
            device = "gpu" if torch.cuda.is_available() else "cpu"

        use_gpu = device == "gpu"
        # NOTE(review): report_to="all" is preserved from the original code;
        # confirm that experiment-tracker reporting is wanted for a metric.
        training_args = TrainingArguments(
            "trainer",
            fp16=use_gpu,  # half precision only when running on GPU
            per_device_eval_batch_size=batch_size,
            report_to="all",
            no_cuda=(device == "cpu"),
            log_level="warning",
        )

        # Predictions and references become the two segments of a
        # sentence-pair input.
        raw_datasets = datasets.Dataset.from_dict({"sentence1": predictions, "sentence2": references})

        def tokenize_function(data):
            return self.tokenizer(
                data["sentence1"], data["sentence2"], max_length=max_length, truncation=True, padding=True
            )

        tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
        # `remove_columns` returns a new dataset rather than modifying in
        # place, so the result has to be re-bound for the raw text columns to
        # actually be dropped.
        tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2"])

        trainer = Trainer(self.model, training_args, tokenizer=self.tokenizer)
        # Separate name so the `predictions` argument is not shadowed.
        output = trainer.predict(tokenized_datasets)
        # Drop the trailing singleton axis and convert to plain Python floats
        # (instead of NumPy scalars) so the scores print and serialise cleanly.
        return {"scores": output.predictions.squeeze(-1).tolist()}
|
|
|