"""ParaPLUIE metric.""" |
|
|
|
|
|
import evaluate |
|
|
import datasets |
|
|
from .ppluie import ppluie |
|
|
|
|
|
_CITATION = """\ |
|
|
@inproceedings{lemesle-etal-2025-paraphrase, |
|
|
title = "Paraphrase Generation Evaluation Powered by an {LLM}: A Semantic Metric, Not a Lexical One", |
|
|
author = "Lemesle, Quentin and |
|
|
Chevelu, Jonathan and |
|
|
Martin, Philippe and |
|
|
Lolive, Damien and |
|
|
Delhay, Arnaud and |
|
|
Barbot, Nelly", |
|
|
editor = "Rambow, Owen and |
|
|
Wanner, Leo and |
|
|
Apidianaki, Marianna and |
|
|
Al-Khalifa, Hend and |
|
|
Eugenio, Barbara Di and |
|
|
Schockaert, Steven", |
|
|
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics", |
|
|
month = jan, |
|
|
year = "2025", |
|
|
address = "Abu Dhabi, UAE", |
|
|
publisher = "Association for Computational Linguistics", |
|
|
url = "https://aclanthology.org/2025.coling-main.538/", |
|
|
pages = "8057--8087", |
|
|
abstract = "Evaluating automatic paraphrase production systems is a difficult task as it involves, among other things, assessing the semantic proximity between two sentences. Usual measures are based on lexical distances, or at least on semantic embedding alignments. The rise of Large Language Models (LLM) has provided tools to model relationships within a text thanks to the attention mechanism. In this article, we introduce ParaPLUIE, a new measure based on a log likelihood ratio from an LLM, to assess the quality of a potential paraphrase. This measure is compared with usual measures on two known by the NLP community datasets prior to this study. Three new small datasets have been built to allow metrics to be compared in different scenario and to avoid data contamination bias. According to evaluations, the proposed measure is better for sorting pairs of sentences by semantic proximity. In particular, it is much more independent to lexical distance and provides an interpretable classification threshold between paraphrases and non-paraphrases." |
|
|
} |
|
|
""" |
_DESCRIPTION = """\
ParaPLUIE is a metric for evaluating the semantic proximity of two sentences.
It uses the perplexity of an LLM to compute a confidence score. It has shown
the highest correlation with human judgement on paraphrase classification
while keeping the computational cost low, roughly that of generating a
single token.
"""


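# Illustration only -- the actual scoring logic lives in `.ppluie`, and this
# hypothetical helper is not used by the metric. The paper describes ParaPLUIE
# as a log-likelihood ratio from an LLM; a toy sketch of that idea, assuming
# an HF causal LM and a prompt that asks whether two sentences are paraphrases
# (expecting a one-token "Yes"/"No" answer), could look like:
def _llr_score_sketch(model, tokenizer, prompt, yes="Yes", no="No"):
    import torch

    # Log-probabilities over the next token following the prompt.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]
    logprobs = torch.log_softmax(logits, dim=-1)
    # Ratio between the "Yes" and "No" answers: higher favours "paraphrase".
    yes_id = tokenizer(yes, add_special_tokens=False).input_ids[0]
    no_id = tokenizer(no, add_special_tokens=False).input_ids[0]
    return (logprobs[yes_id] - logprobs[no_id]).item()

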
_KWARGS_DESCRIPTION = """
Scores the semantic proximity between source sentences and candidate
paraphrases.
Args:
    source: list of source sentences. Each source should be a string.
    hypothese: list of hypothesis sentences (the candidate paraphrases),
        one per source. Each hypothesis should be a string.
Returns:
    score: ParaPLUIE score of the semantic proximity between each source
        and its hypothesis.
Examples:
    The scorer must first be initialised with an LLM via `init` (the model
    id below is a placeholder):

    >>> parapluie = evaluate.load("parapluie")
    >>> parapluie.init(model="<hf-causal-lm-id>")
    >>> results = parapluie.compute(
    ...     source=["The cat sat on the mat."],
    ...     hypothese=["A cat was sitting on the mat."],
    ... )
    >>> print(results["score"])
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Parapluie(evaluate.Metric):
    """ParaPLUIE: an LLM-based semantic metric for paraphrase evaluation."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each example is a (source, hypothesis) pair of sentences.
            features=datasets.Features(
                {
                    "source": datasets.Value("string"),
                    "hypothese": datasets.Value("string"),
                }
            ),
            codebase_urls=[
                "https://gitlab.inria.fr/expression/paraphrase-generation-evaluation-powered-by-an-llm-a-semantic-metric-not-a-lexical-one-coling-2025"
            ],
        )

    def _download_and_prepare(self, dl_manager):
        # Nothing to download: the LLM scorer is loaded lazily through
        # `init()` so that users can choose the model and device.
        self.scorer = None

    def init(
        self,
        model,
        device="cuda:0",
        template="FS-DIRECT",
        use_chat_template=True,
        half_mode=True,
        n_right_specials_tokens=1,
    ):
        """Load the LLM scorer. Must be called once before `compute()`."""
        self.scorer = ppluie(
            model, device, template, use_chat_template, half_mode, n_right_specials_tokens
        )

    def _compute(self, source, hypothese):
        """Return the ParaPLUIE scores for the source/hypothesis pairs."""
        # `evaluate` passes inputs as keyword arguments named after the
        # features declared in `_info`, so the parameters must be called
        # `source` and `hypothese`.
        if self.scorer is None:
            raise ValueError("Call `init(model, ...)` before `compute()`.")
        score = self.scorer(source, hypothese)
        return {"score": score}
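

# Usage sketch (assumptions: this module's folder is resolvable by
# `evaluate.load`, and "<hf-causal-lm-id>" is replaced by a causal LM that is
# available locally). Running this file directly will not work because of the
# relative import above; load it through `evaluate` instead:
#
#     import evaluate
#
#     parapluie = evaluate.load("path/to/parapluie")
#     parapluie.init(model="<hf-causal-lm-id>", device="cuda:0")
#     results = parapluie.compute(
#         source=["The cat sat on the mat.", "He bought a car."],
#         hypothese=["A cat was sitting on the mat.", "He sold his bike."],
#     )
#     print(results["score"])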