TwT-6
/

api-demo

Model card Files Files and versions

api-demo / opencompass-my-api /build /lib /opencompass /datasets /crowspairs.py

TwT-6's picture

Upload 2667 files

256a159 verified about 2 years ago

history blame contribute delete

2.7 kB

	import re
	from typing import List

	from datasets import load_dataset

	from opencompass.openicl.icl_evaluator import BaseEvaluator
	from opencompass.registry import LOAD_DATASET

	from .base import BaseDataset


	@LOAD_DATASET.register_module()
	class crowspairsDataset(BaseDataset):

	@staticmethod
	def load(**kwargs):

	dataset = load_dataset(**kwargs)

	def preprocess(example):
	example['label'] = 0
	return example

	return dataset.map(preprocess)


	@LOAD_DATASET.register_module()
	class crowspairsDataset_V2(BaseDataset):

	@staticmethod
	def load(**kwargs):
	dataset = load_dataset(**kwargs)

	def preprocess(example):
	example['label'] = 'A'
	return example

	return dataset.map(preprocess)


	def crowspairs_postprocess(text: str) -> str:
	"""Cannot cover all the cases, try to be as accurate as possible."""
	if re.search('Neither', text) or re.search('Both', text):
	return 'invalid'

	if text != '':
	first_option = text[0]
	if first_option.isupper() and first_option in 'AB':
	return first_option

	if re.search(' A ', text) or re.search('A.', text):
	return 'A'

	if re.search(' B ', text) or re.search('B.', text):
	return 'B'

	return 'invalid'


	class CrowspairsEvaluator(BaseEvaluator):
	"""Calculate accuracy and valid accuracy according the prediction for
	crows-pairs dataset."""

	def __init__(self) -> None:
	super().__init__()

	def score(self, predictions: List, references: List) -> dict:
	"""Calculate scores and accuracy.

	Args:
	predictions (List): List of probabilities for each class of each
	sample.
	references (List): List of target labels for each sample.

	Returns:
	dict: calculated scores.
	"""
	if len(predictions) != len(references):
	return {
	'error': 'predictions and references have different length.'
	}
	all_match = 0
	for i, j in zip(predictions, references):
	all_match += i == j

	valid_match = 0
	valid_length = 0
	for i, j in zip(predictions, references):
	if i != 'invalid':
	valid_length += 1
	valid_match += i == j

	accuracy = round(all_match / len(predictions), 4) * 100
	valid_accuracy = round(valid_match / valid_length, 4) * 100
	valid_frac = round(valid_length / len(predictions), 4) * 100
	return dict(accuracy=accuracy,
	valid_accuracy=valid_accuracy,
	valid_frac=valid_frac)