""" METEOR metric. """ |

import datasets
import numpy as np
from nltk.translate import meteor_score
from packaging import version

import evaluate


if evaluate.config.PY_VERSION < version.parse("3.8"):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata


NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
if NLTK_VERSION >= version.Version("3.6.4"):
    from nltk import word_tokenize

_CITATION = """\
@inproceedings{banerjee2005,
    title = {{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
    author = {Banerjee, Satanjeev and Lavie, Alon},
    booktitle = {Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
    month = jun,
    year = {2005},
    address = {Ann Arbor, Michigan},
    publisher = {Association for Computational Linguistics},
    url = {https://www.aclweb.org/anthology/W05-0909},
    pages = {65--72},
}
"""

_DESCRIPTION = """\
METEOR, an automatic metric for machine translation evaluation
that is based on a generalized concept of unigram matching between the
machine-produced translation and human-produced reference translations.
Unigrams can be matched based on their surface forms, stemmed forms,
and meanings; furthermore, METEOR can be easily extended to include more
advanced matching strategies. Once all generalized unigram matches
between the two strings have been found, METEOR computes a score for
this matching using a combination of unigram-precision, unigram-recall, and
a measure of fragmentation that is designed to directly capture how
well-ordered the matched words in the machine translation are in relation
to the reference.

METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic
data and 0.331 on the Chinese data. This is shown to be an improvement on
using simply unigram-precision, unigram-recall and their harmonic F1
combination.
"""
|
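# A minimal sketch of the scoring rule described above (illustration only;
# this module delegates all matching, chunking, and scoring to
# nltk.translate.meteor_score). Given unigram precision P and recall R,
# METEOR takes a parameterized harmonic mean of the two and discounts it by a
# fragmentation penalty. The helper below is hypothetical and unused here.
def _meteor_formula_sketch(matches, pred_len, ref_len, chunks, alpha=0.9, beta=3, gamma=0.5):
    if matches == 0:
        return 0.0
    precision = matches / pred_len  # P: matched unigrams over prediction length
    recall = matches / ref_len  # R: matched unigrams over reference length
    f_mean = precision * recall / (alpha * precision + (1 - alpha) * recall)
    penalty = gamma * (chunks / matches) ** beta  # fewer, longer chunks -> smaller penalty
    return f_mean * (1 - penalty)
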

_KWARGS_DESCRIPTION = """
Computes METEOR score of translated segments against one or more references.
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of references for each prediction. Each
        reference should be a string with tokens separated by spaces.
    alpha: Parameter for controlling relative weights of precision and recall. default: 0.9
    beta: Parameter for controlling shape of penalty as a function of fragmentation. default: 3
    gamma: Relative weight assigned to fragmentation penalty. default: 0.5
Returns:
    'meteor': meteor score.
Examples:

    >>> meteor = evaluate.load('meteor')
    >>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
    >>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
    >>> results = meteor.compute(predictions=predictions, references=references)
    >>> print(round(results["meteor"], 4))
    0.6944
"""
|
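# Usage sketch (an assumption based on the features declared below, which
# accept either one reference string or a list of reference strings per
# prediction): multiple references can be passed as a list of lists, e.g.
#
#     meteor = evaluate.load('meteor')
#     predictions = ["the cat sat on the mat"]
#     references = [["the cat sat on the mat", "a cat was sitting on the mat"]]
#     results = meteor.compute(predictions=predictions, references=references)
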

@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Meteor(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py"],
            reference_urls=[
                "https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.meteor_score",
                "https://en.wikipedia.org/wiki/METEOR",
            ],
        )

    def _download_and_prepare(self, dl_manager):
        import nltk

        # WordNet is required for synonym matching; newer NLTK releases also
        # need the punkt tokenizer models and the Open Multilingual Wordnet.
        nltk.download("wordnet")
        if NLTK_VERSION >= version.Version("3.6.5"):
            nltk.download("punkt")
        if NLTK_VERSION >= version.Version("3.6.6"):
            nltk.download("omw-1.4")

    def _compute(self, predictions, references, alpha=0.9, beta=3, gamma=0.5):
        multiple_refs = isinstance(references[0], list)
        if NLTK_VERSION >= version.Version("3.6.5"):
            # METEOR in NLTK >= 3.6.5 expects pre-tokenized inputs.
            if multiple_refs:
                scores = [
                    meteor_score.meteor_score(
                        [word_tokenize(ref) for ref in refs],
                        word_tokenize(pred),
                        alpha=alpha,
                        beta=beta,
                        gamma=gamma,
                    )
                    for refs, pred in zip(references, predictions)
                ]
            else:
                scores = [
                    meteor_score.single_meteor_score(
                        word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
                    )
                    for ref, pred in zip(references, predictions)
                ]
        else:
            # Older NLTK versions tokenize internally, so pass raw strings.
            # Each prediction is scored against its own reference group, not
            # just the first group in the batch.
            if multiple_refs:
                scores = [
                    meteor_score.meteor_score(refs, pred, alpha=alpha, beta=beta, gamma=gamma)
                    for refs, pred in zip(references, predictions)
                ]
            else:
                scores = [
                    meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
                    for ref, pred in zip(references, predictions)
                ]

        # The reported value is the arithmetic mean of the segment-level scores.
        return {"meteor": np.mean(scores)}
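
# Minimal smoke test (illustrative only): assumes the metric is available on
# the Hugging Face hub under the id "meteor" and that the NLTK data above has
# been downloaded. Not exercised when this module is loaded as a metric.
if __name__ == "__main__":
    meteor = evaluate.load("meteor")
    predictions = ["the cat sat on the mat"]
    references = ["there is a cat on the mat"]
    print(meteor.compute(predictions=predictions, references=references))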