koichi12's picture
Add files using upload-large-folder tool
42c6c18 verified
"""
Similarity of Semantic Relations
https://arxiv.org/pdf/cs/0608100.pdf
SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374
multiple-choice analogy questions; 5 choices per question.
Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)
"""
import inspect
import lm_eval.datasets.sat_analogies.sat_analogies
from lm_eval.base import MultipleChoiceTask
_CITATION = """
@article{article,
author = {Turney, Peter},
year = {2006},
month = {09},
pages = {379-416},
title = {Similarity of Semantic Relations},
volume = {32},
journal = {Computational Linguistics},
doi = {10.1162/coli.2006.32.3.379}
}
"""
class SATAnalogies(MultipleChoiceTask):
VERSION = 0
DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies)
DATASET_NAME = None
def __init__(self, data_dir: str):
"""
SAT Analog Questions is not publicly available. You must request the data
by emailing Peter Turney and then download it to a local directory path
which should be passed into the `data_dir` arg.
"""
super().__init__(data_dir=data_dir)
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def training_docs(self):
return []
def validation_docs(self):
return map(self._process_doc, self.dataset["validation"])
def test_docs(self):
return []
def _process_doc(self, doc):
return {
"source": doc["source"],
"query": doc["stem"].split(" ")[:2],
"choices": [
"{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"]
],
"gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()),
}
def doc_to_text(self, doc):
return "{} is to {} as".format(*doc["query"])
def should_decontaminate(self):
return True
def doc_to_decontamination_query(self, doc):
return doc["source"] + "\n" + " ".join(doc["query"])