# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """Metric to calculate the accuracy for the TRAM benchmark by Wang et al. (2024).""" | |
| import re | |
| import evaluate | |
| import datasets | |
| _CITATION = """\ | |
| @InProceedings{auss:tram_accuracy, | |
| title = {TRAM Accuracy}, | |
| authors={Auss Abbood}, | |
| year={2025} | |
| } | |
| """ | |
| _DESCRIPTION = """\ | |
| Accuracy metric for the (multiple choice) TRAM datasets by Wang et al. (2024). | |
| """ | |

_KWARGS_DESCRIPTION = """
Calculates the accuracy for the TRAM datasets by extracting the final answer from the prediction and comparing it to the reference answer.
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with the model's response, which contains the final answer.
    references: list of references, one for each prediction. Each
        reference is a single letter representing the correct answer.
    return_average: whether to return the average accuracy or a list with a 0/1 score per prediction.
Returns:
    accuracy: the accuracy for the TRAM datasets.
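Examples (illustrative; assumes the TRAMAccuracy class is imported directly):
    >>> tram_accuracy = TRAMAccuracy()
    >>> results = tram_accuracy.compute(
    ...     predictions=["Let's think step by step. The final answer is (A)."],
    ...     references=["A"],
    ... )
    >>> print(results)
    {'accuracy': 1.0}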
| """ | |
TRAM_ANSWER_REGEX = re.compile(r"[Tt]he final answer is .([A-D]).")


class TRAMAccuracy(evaluate.Metric):
    """Calculates the accuracy for the (multiple choice) TRAM datasets by extracting the final answer from the prediction and comparing it to the reference answer."""

    def _info(self):
        # Specifies the evaluate.EvaluationModuleInfo object for this metric.
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
        )

    def _compute(self, predictions, references, return_average=True):
        """Returns the accuracy for the (multiple choice) TRAM datasets."""
        # Locate the "The final answer is ..." statement in each model response.
        predictions_matches = [
            TRAM_ANSWER_REGEX.search(prediction) for prediction in predictions
        ]
        # Extract the option letter; responses without a parsable final answer
        # yield None and are scored as incorrect below.
        predictions_extracted = [
            match.group(1) if match is not None else None
            for match in predictions_matches
        ]
        # Score 1 for an exact match with the reference letter, else 0.
        accuracy = [
            1 if response == label else 0
            for response, label in zip(predictions_extracted, references)
        ]
        if return_average:
            return {"accuracy": sum(accuracy) / len(accuracy)}
        else:
            return {"accuracy": accuracy}