koichi12
/

llm-scripts

Model card Files Files and versions

llm-scripts / scripts /yans /eval /lm-evaluation-harness /lm_eval /tasks /sat.py

koichi12's picture

Add files using upload-large-folder tool

42c6c18 verified over 1 year ago

history blame contribute delete

2.13 kB

	"""
	Similarity of Semantic Relations
	https://arxiv.org/pdf/cs/0608100.pdf

	SAT (Scholastic Aptitude Test) Analogy Questions is a dataset of 374
	multiple-choice analogy questions, each with 5 answer choices.

	Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)
	"""
	import inspect
	import lm_eval.datasets.sat_analogies.sat_analogies
	from lm_eval.base import MultipleChoiceTask


	# BibTeX entry for Turney (2006), "Similarity of Semantic Relations",
	# published in Computational Linguistics.
	# NOTE(review): where this constant is consumed is not visible in this file.
	_CITATION = """
	@article{article,
	author = {Turney, Peter},
	year = {2006},
	month = {09},
	pages = {379-416},
	title = {Similarity of Semantic Relations},
	volume = {32},
	journal = {Computational Linguistics},
	doi = {10.1162/coli.2006.32.3.379}
	}
	"""


	class SATAnalogies(MultipleChoiceTask):
	    """Multiple-choice task built on the SAT analogy questions.

	    Only a validation split is exposed. Each processed document holds a
	    stem word pair (the query) and candidate word pairs (the choices); the
	    gold label is the index of the correct candidate.
	    """

	    VERSION = 0
	    # Filesystem path of the dataset module imported at the top of the file.
	    DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies)
	    DATASET_NAME = None

	    def __init__(self, data_dir: str):
	        """Set up the task from a locally stored copy of the data.

	        The SAT Analogy Questions are not publicly available. Request the
	        data from Peter Turney by email, save it to a local directory, and
	        pass that directory path as `data_dir`.
	        """
	        super().__init__(data_dir=data_dir)

	    def has_training_docs(self):
	        """Report that no training split exists."""
	        return False

	    def has_validation_docs(self):
	        """Report that a validation split exists."""
	        return True

	    def has_test_docs(self):
	        """Report that no test split exists."""
	        return False

	    def training_docs(self):
	        """Return an empty list, since there is no training split."""
	        return []

	    def validation_docs(self):
	        """Return a lazy iterator of processed validation documents."""
	        raw_split = self.dataset["validation"]
	        return map(self._process_doc, raw_split)

	    def test_docs(self):
	        """Return an empty list, since there is no test split."""
	        return []

	    def _process_doc(self, doc):
	        """Convert a raw dataset record into the multiple-choice layout.

	        The first two space-separated tokens of `doc["stem"]` become the
	        query, and the first two tokens of every entry in `doc["choices"]`
	        are rendered as "X is to Y". Any further tokens are dropped. The
	        gold label is the position of the stripped `doc["solution"]` letter
	        within "a".."e".
	        """
	        pair_template = "{} is to {}"

	        # Fields are read in the order source, stem, choices, solution.
	        processed = {"source": doc["source"]}
	        processed["query"] = doc["stem"].split(" ")[:2]

	        candidates = []
	        for raw_choice in doc["choices"]:
	            words = raw_choice.split(" ")
	            candidates.append(pair_template.format(*words[:2]))
	        processed["choices"] = candidates

	        answer_letter = doc["solution"].strip()
	        processed["gold"] = list("abcde").index(answer_letter)
	        return processed

	    def doc_to_text(self, doc):
	        """Build the prompt "<w1> is to <w2> as" from the query words."""
	        prompt_template = "{} is to {} as"
	        return prompt_template.format(*doc["query"])

	    def should_decontaminate(self):
	        """Report that this task supplies a decontamination query."""
	        return True

	    def doc_to_decontamination_query(self, doc):
	        """Return the source line, a newline, then the space-joined query."""
	        prefix = doc["source"] + "\n"
	        return prefix + " ".join(doc["query"])