TwT-6
/

api-demo

Model card Files Files and versions

api-demo / opencompass-my-api /build /lib /opencompass /datasets /bbh.py

TwT-6's picture

Upload 2667 files

256a159 verified about 2 years ago

history blame contribute delete

2.74 kB

	import json
	import os.path as osp
	import re

	from datasets import Dataset

	from opencompass.openicl.icl_evaluator import BaseEvaluator
	from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
	TEXT_POSTPROCESSORS)

	from .base import BaseDataset


	@LOAD_DATASET.register_module()
	class BBHDataset(BaseDataset):
	    """Dataset loader that reads one BBH task from a JSON file."""

	    @staticmethod
	    def load(path: str, name: str):
	        """Load ``<path>/<name>.json`` and wrap its examples in a Dataset.

	        Args:
	            path: Directory containing the task JSON files.
	            name: Task name, i.e. the file name without the ``.json``
	                suffix.

	        Returns:
	            A ``Dataset`` built with ``Dataset.from_list`` from the list
	            stored under the top-level ``'examples'`` key of the file.

	        Raises:
	            KeyError: If the JSON document has no ``'examples'`` key.
	        """
	        file_path = osp.join(path, f'{name}.json')
	        # JSON text is UTF-8; pin the encoding so parsing does not depend
	        # on the platform's locale default.
	        with open(file_path, 'r', encoding='utf-8') as f:
	            data = json.load(f)['examples']
	        return Dataset.from_list(data)


	@TEXT_POSTPROCESSORS.register_module('bbh-mcq')
	def bbh_mcq_postprocess(text: str) -> str:
	    """Reduce a multiple-choice response to its option letter.

	    When the response contains the marker ``'answer is '``, only the
	    stripped text between the first marker and the next one (or the end
	    of the response) is inspected; otherwise the whole response is.

	    Args:
	        text: Raw model response.

	    Returns:
	        The first capital letter that follows an opening parenthesis if
	        there is one, else the first capital letter anywhere in the
	        inspected text, else the inspected text unchanged.
	    """
	    candidate = text
	    if 'answer is ' in text:
	        candidate = text.split('answer is ')[1].strip()

	    # Try the parenthesised form first, then fall back to any capital.
	    for pattern in (r'\(([A-Z])\)*', r'([A-Z])'):
	        found = re.search(pattern, candidate)
	        if found is not None:
	            return found.group(1)
	    return candidate


	@TEXT_POSTPROCESSORS.register_module('bbh-freeform')
	def bbh_freeform_postprocess(text: str) -> str:
	    """Extract the free-form answer from a model response.

	    When the response contains the marker ``'answer is '``, the text
	    between the first marker and the next one (or the end of the
	    response) is used; otherwise the whole response is. Only the first
	    line of that text is kept, with surrounding whitespace and a single
	    trailing period removed.

	    Args:
	        text: Raw model response.

	    Returns:
	        The cleaned answer string.
	    """
	    ans = text
	    ans_line = ans.split('answer is ')
	    if len(ans_line) != 1:
	        ans = ans_line[1].strip()
	    # Strip the selected line itself: whitespace before the line break
	    # (or a carriage return) would otherwise hide the trailing period
	    # and leak into the exact-match comparison done by the evaluator.
	    ans = ans.split('\n')[0].strip()
	    if ans.endswith('.'):
	        ans = ans[:-1].strip()
	    return ans


	@ICL_EVALUATORS.register_module()
	class BBHEvaluator(BaseEvaluator):
	    """Exact-match evaluator that post-processes free-form predictions."""

	    def score(self, predictions, references):
	        """Score predictions against references by exact equality.

	        Each prediction is first passed through
	        ``bbh_freeform_postprocess``; the cleaned value is what is compared
	        and what is reported in the details.

	        Args:
	            predictions: Sequence of raw model responses.
	            references: Sequence of expected answers, same length as
	                ``predictions``.

	        Returns:
	            ``{'error': ...}`` when the two sequences differ in length.
	            Otherwise ``{'score': ..., 'details': ...}`` where ``score`` is
	            the percentage of exact matches (0.0 for empty input) and
	            ``details`` holds one ``pred``/``answer``/``correct`` record
	            per pair.
	        """
	        if len(predictions) != len(references):
	            return {
	                'error': 'predictions and references have different '
	                'length'
	            }

	        predictions = [bbh_freeform_postprocess(pred) for pred in predictions]

	        details = []
	        cnt = 0
	        for pred, ref in zip(predictions, references):
	            detail = {'pred': pred, 'answer': ref, 'correct': False}
	            if pred == ref:
	                cnt += 1
	                detail['correct'] = True
	            details.append(detail)

	        # Guard the division: two empty sequences pass the length check
	        # above and would otherwise raise ZeroDivisionError.
	        total = len(predictions)
	        score = cnt / total * 100 if total else 0.0

	        return {'score': score, 'details': details}


	@ICL_EVALUATORS.register_module()
	class BBHEvaluator_mcq(BaseEvaluator):
	    """Exact-match evaluator that compares predictions as given."""

	    def score(self, predictions, references):
	        """Score predictions against references by exact equality.

	        Unlike ``BBHEvaluator``, no post-processing is applied here; the
	        predictions are compared to the references unchanged.

	        Args:
	            predictions: Sequence of predicted answers.
	            references: Sequence of expected answers, same length as
	                ``predictions``.

	        Returns:
	            ``{'error': ...}`` when the two sequences differ in length.
	            Otherwise ``{'score': ..., 'details': ...}`` where ``score`` is
	            the percentage of exact matches (0.0 for empty input) and
	            ``details`` holds one ``pred``/``answer``/``correct`` record
	            per pair.
	        """
	        if len(predictions) != len(references):
	            return {
	                'error': 'predictions and references have different '
	                'length'
	            }

	        details = []
	        cnt = 0
	        for pred, ref in zip(predictions, references):
	            detail = {'pred': pred, 'answer': ref, 'correct': False}
	            if pred == ref:
	                cnt += 1
	                detail['correct'] = True
	            details.append(detail)

	        # Guard the division: two empty sequences pass the length check
	        # above and would otherwise raise ZeroDivisionError.
	        total = len(predictions)
	        score = cnt / total * 100 if total else 0.0

	        return {'score': score, 'details': details}