TwT-6
/

api-demo

Model card Files Files and versions

api-demo / opencompass-my-api /opencompass /datasets /arc.py

TwT-6's picture

Upload 2667 files

256a159 verified almost 2 years ago

history blame contribute delete

3.27 kB

	import json
	import os.path as osp

	from datasets import Dataset

	from opencompass.registry import LOAD_DATASET

	from .base import BaseDataset


	@LOAD_DATASET.register_module()
	class ARCDataset(BaseDataset):

	@staticmethod
	def load(path: str):
	with open(path, 'r', errors='ignore') as in_f:
	rows = []
	for line in in_f:
	item = json.loads(line.strip())
	question = item['question']
	if len(question['choices']) != 4:
	continue
	labels = [c['label'] for c in question['choices']]
	answerKey = 'ABCD'[labels.index(item['answerKey'])]
	rows.append({
	'question': question['stem'],
	'answerKey': answerKey,
	'textA': question['choices'][0]['text'],
	'textB': question['choices'][1]['text'],
	'textC': question['choices'][2]['text'],
	'textD': question['choices'][3]['text'],
	})
	return Dataset.from_list(rows)


	class ARCDatasetClean(BaseDataset):

	# load the contamination annotations of CEval from
	# https://github.com/liyucheng09/Contamination_Detector
	@staticmethod
	def load_contamination_annotations(path, split='val'):
	import requests

	assert split == 'test', 'We only have test set annotation for ARC'
	annotation_cache_path = osp.join(
	path, f'ARC_c_{split}_contamination_annotations.json')
	if osp.exists(annotation_cache_path):
	with open(annotation_cache_path, 'r') as f:
	annotations = json.load(f)
	return annotations
	link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ARC_annotations.json' # noqa
	annotations = json.loads(requests.get(link_of_annotations).text)
	with open(annotation_cache_path, 'w') as f:
	json.dump(annotations, f)
	return annotations

	@staticmethod
	def load(path: str):
	annotations = ARCDatasetClean.load_contamination_annotations(
	osp.dirname(path), 'test')
	with open(path, 'r', errors='ignore') as in_f:
	rows = []
	for line in in_f:
	item = json.loads(line.strip())
	id_ = item['id']
	question = item['question']
	if id_ in annotations:
	is_clean = annotations[id_][0]
	else:
	is_clean = 'not labeled'
	if len(question['choices']) != 4:
	continue
	labels = [c['label'] for c in question['choices']]
	answerKey = 'ABCD'[labels.index(item['answerKey'])]
	rows.append({
	'question': question['stem'],
	'answerKey': answerKey,
	'textA': question['choices'][0]['text'],
	'textB': question['choices'][1]['text'],
	'textC': question['choices'][2]['text'],
	'textD': question['choices'][3]['text'],
	'is_clean': is_clean,
	})
	return Dataset.from_list(rows)