""" BLEU metric. """ |
|
|
|
|
|
import datasets |
|
|
|
|
|
import evaluate |
|
|
|
|
|
from .nmt_bleu import compute_bleu |
|
|
from .tokenizer_13a import Tokenizer13a |
|
|
|
|
|
|
|
|
_CITATION = """\ |
@INPROCEEDINGS{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL)},
    year = {2002},
    pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew and
      Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
"""


_DESCRIPTION = """\
BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is" –
this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.

Scores are calculated for individual translated segments (generally sentences) by comparing them with a set of good quality reference translations.
Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
Neither intelligibility nor grammatical correctness is taken into account.
"""


_KWARGS_DESCRIPTION = """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of references for each translation (or a list with one reference string per translation).
    tokenizer: approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: maximum n-gram order to use when computing the BLEU score.
    smooth: whether or not to apply Lin et al. 2004 smoothing.
Returns:
    'bleu': bleu score,
    'precisions': n-gram precisions (one value per order, up to `max_order`),
    'brevity_penalty': brevity penalty,
    'length_ratio': ratio of translation length to reference length,
    'translation_length': translation_length,
    'reference_length': reference_length
Examples:

    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
    >>> references = [
    ...     ["hello there general kenobi", "hello there!"],
    ...     ["foo bar foobar"]
    ... ]
    >>> bleu = evaluate.load("bleu")
    >>> results = bleu.compute(predictions=predictions, references=references)
    >>> print(results["bleu"])
    1.0
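
    >>> # References may also be given as one string per prediction; each is
    >>> # wrapped into a single-reference list internally:
    >>> results = bleu.compute(predictions=predictions, references=["hello there general kenobi", "foo bar foobar"])
    >>> print(results["bleu"])
    1.0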
""" |


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Bleu(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/BLEU",
                "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
            ],
        )

    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False):
        # When one reference string is given per prediction, wrap each in a
        # single-element list so both accepted input formats are handled alike.
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        # Tokenize every reference and prediction before computing n-gram overlap.
        references = [[tokenizer(r) for r in ref] for ref in references]
        predictions = [tokenizer(p) for p in predictions]
        score = compute_bleu(
            reference_corpus=references, translation_corpus=predictions, max_order=max_order, smooth=smooth
        )
        (bleu, precisions, bp, ratio, translation_length, reference_length) = score
        return {
            "bleu": bleu,
            "precisions": precisions,
            "brevity_penalty": bp,
            "length_ratio": ratio,
            "translation_length": translation_length,
            "reference_length": reference_length,
        }
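

# Usage sketch (illustrative only, not part of this module's API): per the
# docstring above, `tokenizer` accepts any callable that maps a string to a
# list of tokens, so a naive whitespace split can stand in for the default
# `tokenizer_13a`:
#
#   bleu = evaluate.load("bleu")
#   results = bleu.compute(
#       predictions=["hello there"],
#       references=[["hello there"]],
#       tokenizer=str.split,  # naive whitespace tokenizer
#   )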