"""ParaPLUIE metric."""

import evaluate
import datasets

from .config import *
from .templates import *
from .ppluie import *

_CITATION = """\
@inproceedings{lemesle-etal-2025-paraphrase,
    title = "Paraphrase Generation Evaluation Powered by an {LLM}: A Semantic Metric, Not a Lexical One",
    author = "Lemesle, Quentin and
      Chevelu, Jonathan and
      Martin, Philippe and
      Lolive, Damien and
      Delhay, Arnaud and
      Barbot, Nelly",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Eugenio, Barbara Di and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.538/",
    pages = "8057--8087",
    abstract = "Evaluating automatic paraphrase production systems is a difficult task as it involves, among other things, assessing the semantic proximity between two sentences. Usual measures are based on lexical distances, or at least on semantic embedding alignments. The rise of Large Language Models (LLM) has provided tools to model relationships within a text thanks to the attention mechanism. In this article, we introduce ParaPLUIE, a new measure based on a log likelihood ratio from an LLM, to assess the quality of a potential paraphrase. This measure is compared with usual measures on two known by the NLP community datasets prior to this study. Three new small datasets have been built to allow metrics to be compared in different scenario and to avoid data contamination bias. According to evaluations, the proposed measure is better for sorting pairs of sentences by semantic proximity. In particular, it is much more independent to lexical distance and provides an interpretable classification threshold between paraphrases and non-paraphrases."
}
"""

_DESCRIPTION = """\
ParaPLUIE is a metric for evaluating the semantic proximity of two sentences.
ParaPLUIE uses the perplexity of an LLM to compute a confidence score.
It has shown the highest correlation with human judgement on paraphrase classification while keeping the computational cost low, roughly equal to the cost of generating a single token.
"""

_KWARGS_DESCRIPTION = """
Args:
    sources (`list` of `string`): Source sentences.
    hypotheses (`list` of `string`): Candidate paraphrases to evaluate against the sources.
Returns:
    scores (`list` of `float`): ParaPLUIE scores, one per sentence pair. The minimum possible value is -inf and the maximum is +inf. A score greater than 0 means the sentences are paraphrases; a score lower than 0 means they are not.
Examples:
    import evaluate

    ppluie = evaluate.load("qlemesle/parapluie")
    ppluie.init(model="mistralai/Mistral-7B-Instruct-v0.2")

    S = "Have you ever seen a tsunami ?"
    H = "Have you ever seen a tiramisu ?"

    results = ppluie.compute(sources=[S], hypotheses=[H])
    print(results)
    >>> {'scores': [-16.97607421875]}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Parapluie(evaluate.Metric):
    """ParaPLUIE: an LLM-based semantic proximity metric for paraphrase evaluation."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'sources': datasets.Value("string"),
                'hypotheses': datasets.Value("string"),
            }),
            codebase_urls=["https://gitlab.inria.fr/expression/paraphrase-generation-evaluation-powered-by-an-llm-a-semantic-metric-not-a-lexical-one-coling-2025"],
        )

    def _download_and_prepare(self, dl_manager):
        # Nothing to download here: the LLM scorer is created lazily via init().
        self.scorer = None

    def init(
        self,
        model,
        device="cuda:0",
        template="FS-DIRECT",
        use_chat_template=True,
        half_mode=True,
        n_right_specials_tokens=1,
    ):
        # Load the underlying LLM and prompt template used to score sentence pairs.
        self.scorer = ppluie(model, device, template, use_chat_template, half_mode, n_right_specials_tokens)

    def show_templates(self):
        self.scorer.show_templates()

    def check_end_tokens_tmpl(self):
        # Note: the underlying scorer method is spelled "chech_end_tokens_tmpl".
        self.scorer.chech_end_tokens_tmpl()

    def show_available_models(self):
        self.scorer.show_available_models()

    def setTemplate(self, tmplt):
        self.scorer.setTemplate(tmplt)

    def _compute(self, sources, hypotheses):
        if self.scorer is None:
            print("init() has not been called; auto-initializing with the default model.")
            self.init(model="mistralai/Mistral-7B-Instruct-v0.2", device="cpu")
            print("Loading Mistral done.")
        scores = []
        for source, hypothesis in zip(sources, hypotheses):
            scores.append(self.scorer(source, hypothesis))
        return {
            "scores": scores,
        }
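

# Minimal usage sketch for running the metric directly, without evaluate.load().
# Assumptions (not taken from the original module): this file is importable locally,
# the default Mistral checkpoint is available, and the device and sentences below are
# illustrative only.
if __name__ == "__main__":
    metric = Parapluie()
    # Choose the backbone LLM before scoring; see init() for the other options.
    metric.init(model="mistralai/Mistral-7B-Instruct-v0.2", device="cpu")
    results = metric.compute(
        sources=["Have you ever seen a tsunami ?"],
        hypotheses=["Have you ever seen a tiramisu ?"],
    )
    # A score above 0 indicates a paraphrase; a score below 0 indicates it is not.
    print(results)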