koichi12
/

llm-scripts

Model card Files Files and versions

llm-scripts / scripts /yans /eval /lm-evaluation-harness /lm_eval /tasks /ja /jblimp.py

koichi12's picture

Add files using upload-large-folder tool

42c6c18 verified over 1 year ago

history blame contribute delete

1.62 kB

	"""
	JBLiMP: Japanese Benchmark of Linguistic Minimal Pairs
	https://aclanthology.org/2023.findings-eacl.117/

	JBLiMP is a novel dataset for targeted syntactic evaluations of language models in Japanese. JBLiMP consists of 331 minimal pairs, which are created based on acceptability judgments extracted from journal articles in theoretical linguistics. These minimal pairs are grouped into 11 categories, each covering a different linguistic phenomenon.

	Homepage: https://github.com/osekilab/JBLiMP/tree/main
	"""
	from lm_eval.base import rf, Task
	from lm_eval.metrics import mean
	from lm_eval.tasks.blimp import BlimpTask

	_CITATION = """
	@inproceedings{Someya2023JBLiMPJB,
	title={JBLiMP: Japanese Benchmark of Linguistic Minimal Pairs},
	author={Taiga Someya and Yohei Oseki},
	booktitle={Findings},
	year={2023}
	}
	""" # noqa: W605


	class JBlimpTask(BlimpTask):
	VERSION = 0
	DATASET_PATH = "polm-stability/jblimp"
	DATASET_NAME = None


	class JBlimp(JBlimpTask):
	DATASET_NAME = "jblimp"

	# NOTE: This is very confusing, but while BLiMP uses keys like `sentence_good`,
	# JBLiMP uses keys like `good_sentence`.

	def doc_to_decontamination_query(self, doc):
	return doc["good_sentence"] + " " + doc["bad_sentence"]

	def construct_requests(self, doc, ctx):
	assert not ctx

	# Calculate the loglikelihood for the good and the bad sentence.
	# Note that loglikelihood translates the "" prefix to the "<\|endoftext\|>" token
	return [
	rf.loglikelihood("", doc["good_sentence"]),
	rf.loglikelihood("", doc["bad_sentence"]),
	]