File size: 5,368 Bytes
766a079
 
 
 
 
 
 
 
 
 
 
 
 
db044f5
766a079
 
 
e0a8e70
766a079
 
50bd00a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
766a079
 
 
 
50bd00a
 
 
766a079
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
faab91a
 
766a079
 
 
0a73688
766a079
 
 
 
 
 
 
 
 
 
 
faab91a
 
766a079
9093803
766a079
 
13237c8
 
 
766a079
 
13237c8
 
 
 
 
 
 
 
 
 
 
 
766a079
13237c8
766a079
13237c8
766a079
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ParaPLUIE metric."""

import evaluate
import datasets
from .ppluie import ppluie

# BibTeX entry for the COLING 2025 paper that introduces ParaPLUIE; it is
# passed to `evaluate.MetricInfo(citation=...)` in `Parapluie._info`.
_CITATION = """\
@inproceedings{lemesle-etal-2025-paraphrase,
    title = "Paraphrase Generation Evaluation Powered by an {LLM}: A Semantic Metric, Not a Lexical One",
    author = "Lemesle, Quentin  and
      Chevelu, Jonathan  and
      Martin, Philippe  and
      Lolive, Damien  and
      Delhay, Arnaud  and
      Barbot, Nelly",
    editor = "Rambow, Owen  and
      Wanner, Leo  and
      Apidianaki, Marianna  and
      Al-Khalifa, Hend  and
      Eugenio, Barbara Di  and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.538/",
    pages = "8057--8087",
    abstract = "Evaluating automatic paraphrase production systems is a difficult task as it involves, among other things, assessing the semantic proximity between two sentences. Usual measures are based on lexical distances, or at least on semantic embedding alignments. The rise of Large Language Models (LLM) has provided tools to model relationships within a text thanks to the attention mechanism. In this article, we introduce ParaPLUIE, a new measure based on a log likelihood ratio from an LLM, to assess the quality of a potential paraphrase. This measure is compared with usual measures on two known by the NLP community datasets prior to this study. Three new small datasets have been built to allow metrics to be compared in different scenario and to avoid data contamination bias. According to evaluations, the proposed measure is better for sorting pairs of sentences by semantic proximity. In particular, it is much more independent to lexical distance and provides an interpretable classification threshold between paraphrases and non-paraphrases."
}
"""

# Short human-readable summary of the metric; used as the module description
# and prepended to the class docstring.
_DESCRIPTION = """\
ParaPLUIE is a metric for evaluating the semantic proximity of two sentences.
ParaPLUIE uses the perplexity of an LLM to compute a confidence score.
It has shown the highest correlation with human judgement on paraphrase classification while keeping the computational cost low, as it is roughly equal to the cost of generating one token.
"""


# Documentation of the module's inputs and outputs. The input names mirror the
# `features` declared in `Parapluie._info` and the output key mirrors the dict
# built in `Parapluie._compute`.
_KWARGS_DESCRIPTION = """
Scores how close a hypothesis sentence is in meaning to a source sentence.
The scorer must be created with `init(model, ...)` before any score is computed.
Args:
    source: the source sentence(s), given as string(s).
    hypothese: the candidate paraphrase(s) of the source, given as string(s).
Returns:
    score: the value returned by the ParaPLUIE scorer for the given input.
"""


# BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Parapluie(evaluate.Metric):
    """ParaPLUIE evaluation module wrapping the `ppluie` scorer.

    The scorer is not built when the module is prepared: call `init` with a
    model first, then compute scores.
    """

    def _info(self):
        """Return the module metadata: description, citation and input schema."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each example is a pair of strings: the source sentence and the
            # hypothesis compared against it.
            features=datasets.Features({
                'source': datasets.Value("string"),
                'hypothese': datasets.Value("string"),
            }),
            codebase_urls=["https://gitlab.inria.fr/expression/paraphrase-generation-evaluation-powered-by-an-llm-a-semantic-metric-not-a-lexical-one-coling-2025"],
        )

    def _download_and_prepare(self, dl_manager):
        """Reset the scorer placeholder; nothing is downloaded here.

        The scorer can only be built once a model has been chosen, which
        happens in `init`.
        """
        self.scorer = None

    def init(
            self,
            model,
            device="cuda:0",
            template="FS-DIRECT",
            use_chat_template=True,
            half_mode=True,
            n_right_specials_tokens=1,
            ):
        """Build the `ppluie` scorer used by `_compute`.

        All arguments are forwarded positionally, in this order, to `ppluie`;
        their exact semantics are defined by that class.

        Args:
            model: model to score with (required, no default).
            device: device specifier; defaults to "cuda:0".
            template: prompt template name; defaults to "FS-DIRECT".
            use_chat_template: defaults to True.
            half_mode: defaults to True.
            n_right_specials_tokens: defaults to 1.
        """
        self.scorer = ppluie(model, device, template, use_chat_template, half_mode, n_right_specials_tokens)

    def _compute(self, S=None, H=None, source=None, hypothese=None):
        """Score a source/hypothesis input with the configured scorer.

        The inputs can be given either as `S`/`H` (the historical parameter
        names, positional or keyword) or as `source`/`hypothese`, the feature
        names declared in `_info`, which is how `evaluate` forwards inputs
        from `compute`. When both forms are given, `S`/`H` take precedence.

        NOTE(review): when called through `compute`, `evaluate` passes each
        declared input as a whole column; confirm the scorer accepts that form.

        Args:
            S: source input (alias of `source`).
            H: hypothesis input (alias of `hypothese`).
            source: source input, named after the declared feature.
            hypothese: hypothesis input, named after the declared feature.

        Returns:
            A dict with the single key "score" holding the scorer's result.

        Raises:
            TypeError: if the source or the hypothesis is missing.
            RuntimeError: if `init` has not been called yet.
        """
        if S is None:
            S = source
        if H is None:
            H = hypothese
        if S is None or H is None:
            raise TypeError(
                "_compute() requires both a source (S/source) and a hypothesis (H/hypothese)"
            )

        # `scorer` is None after `_download_and_prepare` and missing entirely
        # if the module was never prepared; both mean `init` was not called.
        scorer = getattr(self, "scorer", None)
        if scorer is None:
            raise RuntimeError(
                "ParaPLUIE scorer is not initialised: call `init(model, ...)` before computing scores."
            )

        return {
            "score": scorer(S, H),
        }