| """ BLEU metric. """ |
|
|
import datasets
|
|
import evaluate
|
|
from .nmt_bleu import compute_bleu  # From: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py
from .tokenizer_13a import Tokenizer13a
|
|
|
|
_CITATION = """\
@inproceedings{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {{BLEU}: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {},
    year = {2002},
    pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew and
      Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
"""
|
|
_DESCRIPTION = """\
BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is"
– this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.

Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations.
Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
Neither intelligibility nor grammatical correctness is taken into account.
"""
|
|
_KWARGS_DESCRIPTION = """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of references (or a single list with one reference per translation).
    tokenizer: approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: maximum n-gram order to use when computing BLEU score.
    smooth: whether or not to apply Lin et al. 2004 smoothing.
Returns:
    'bleu': BLEU score,
    'precisions': n-gram precisions for orders 1 through `max_order`,
    'brevity_penalty': brevity penalty,
    'length_ratio': ratio of translation length to reference length,
    'translation_length': total length of the translations,
    'reference_length': effective total length of the references
Examples:

    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
    >>> references = [
    ...     ["hello there general kenobi", "hello there!"],
    ...     ["foo bar foobar"]
    ... ]
    >>> bleu = evaluate.load("bleu")
    >>> results = bleu.compute(predictions=predictions, references=references)
    >>> print(results["bleu"])
    1.0
"""
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Bleu(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/BLEU",
                "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
            ],
        )
|
|
    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False):
        # If a single reference string is passed per prediction, wrap each one
        # in a list so that every prediction has a list of references.
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        # Tokenize references and predictions with the supplied tokenizer.
        references = [[tokenizer(r) for r in ref] for ref in references]
        predictions = [tokenizer(p) for p in predictions]
        score = compute_bleu(
            reference_corpus=references, translation_corpus=predictions, max_order=max_order, smooth=smooth
        )
        (bleu, precisions, bp, ratio, translation_length, reference_length) = score
        return {
            "bleu": bleu,
            "precisions": precisions,
            "brevity_penalty": bp,
            "length_ratio": ratio,
            "translation_length": translation_length,
            "reference_length": reference_length,
        }
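

if __name__ == "__main__":
    # Minimal usage sketch, not part of the metric itself. It assumes this
    # module is available through `evaluate.load("bleu")`. `str.split` is only
    # a stand-in whitespace tokenizer illustrating the `tokenizer` hook; the
    # default is Tokenizer13a.
    bleu = evaluate.load("bleu")
    results = bleu.compute(
        predictions=["the cat sat on the mat"],
        references=[["the cat is on the mat", "there is a cat on the mat"]],
        tokenizer=str.split,  # any callable mapping a string to a list of tokens
        max_order=4,
        smooth=True,  # Lin et al. 2004 smoothing, useful for short segments
    )
    # `results` holds: bleu, precisions, brevity_penalty, length_ratio,
    # translation_length and reference_length.
    print(results)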
|
|