"""DROP dataset."""
| |
|
| |
|
| | import json |
| | import os |
| |
|
| | import datasets |
| |
|
| |
|
# BibTeX citation for the DROP paper (arXiv:1903.00161).
_CITATION = """\
@misc{dua2019drop,
title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs},
author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},
year={2019},
eprint={1903.00161},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Human-readable summary surfaced through DatasetInfo.
_DESCRIPTION = """\
DROP is a QA dataset which tests comprehensive understanding of paragraphs. In
this crowdsourced, adversarially-created, 96k question-answering benchmark, a
system must resolve multiple references in a question, map them onto a paragraph,
and perform discrete operations over them (such as addition, counting, or sorting).
"""

# Official dataset homepage.
_HOMEPAGE = "https://allenai.org/data/drop"

# License string (left empty in this script; see the homepage for terms).
_LICENSE = ""

# Download location of the zipped dataset, keyed by builder-config name.
_URLS = {
    "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
}

# Placeholder emitted when a question carries no "validated_answers" key
# (presumably the train split — verify against the raw files), so every
# example still matches the fixed feature schema declared in Drop._info().
_EMPTY_VALIDATED_ANSWER = [
    {
        "number": "",
        "date": {
            "day": "",
            "month": "",
            "year": "",
        },
        "spans": [],
        "worker_id": "",
        "hit_id": "",
    }
]
| |
|
| |
|
class Drop(datasets.GeneratorBasedBuilder):
    """DROP is a QA dataset which tests comprehensive understanding of paragraphs."""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="drop", version=VERSION, description="The DROP dataset."
        ),
    ]

    def _info(self):
        """Return the DatasetInfo describing the fixed example schema."""

        def answer_feature():
            # One answer record: a free-form number, a partial date, text
            # spans, and crowdsourcing bookkeeping ids. A fresh dict is built
            # per call so the "answer" and "validated_answers" slots never
            # share a mutable feature-spec instance.
            return {
                "number": datasets.Value("string"),
                "date": {
                    "day": datasets.Value("string"),
                    "month": datasets.Value("string"),
                    "year": datasets.Value("string"),
                },
                "spans": datasets.features.Sequence(datasets.Value("string")),
                "worker_id": datasets.Value("string"),
                "hit_id": datasets.Value("string"),
            }

        features = datasets.Features(
            {
                "section_id": datasets.Value("string"),
                "passage": datasets.Value("string"),
                "question": datasets.Value("string"),
                "query_id": datasets.Value("string"),
                "answer": answer_feature(),
                "validated_answers": datasets.features.Sequence(answer_feature()),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract the archive and declare the train/validation splits."""
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        split_specs = [
            (datasets.Split.TRAIN, "drop_dataset_train.json", "train"),
            (datasets.Split.VALIDATION, "drop_dataset_dev.json", "validation"),
        ]
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "drop_dataset", filename),
                    "split": split,
                },
            )
            for split_name, filename, split in split_specs
        ]

    @staticmethod
    def _normalize_answer(raw):
        """Coerce a raw answer dict from the JSON into the flat schema.

        Missing optional keys are filled with empty defaults so every yielded
        example has an identical structure.

        Args:
            raw: one answer dict as found in the DROP JSON (possibly empty).

        Returns:
            A dict matching the "answer" feature declared in ``_info``.
        """
        date = raw.get("date", {})
        return {
            "number": raw.get("number", ""),
            "date": {
                "day": date.get("day", ""),
                "month": date.get("month", ""),
                "year": date.get("year", ""),
            },
            # Fix: the fallback must be an empty list, not "" — the schema
            # declares "spans" as a Sequence of strings.
            "spans": raw.get("spans", []),
            "worker_id": raw.get("worker_id", ""),
            "hit_id": raw.get("hit_id", ""),
        }

    def _generate_examples(self, filepath, split):
        """Yield (key, example) pairs from one DROP JSON file.

        Args:
            filepath: path to ``drop_dataset_{train,dev}.json``.
            split: split name ("train" or "validation"); unused here but part
                of the gen_kwargs contract declared in ``_split_generators``.
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
        key = 0
        for section_id, example in data.items():
            for qa in example["qa_pairs"]:
                if "validated_answers" in qa:
                    validated_answers = [
                        self._normalize_answer(va)
                        for va in qa["validated_answers"]
                    ]
                else:
                    # Build a fresh empty placeholder per example instead of
                    # re-yielding the shared mutable module-level constant
                    # (_EMPTY_VALIDATED_ANSWER), so no state is shared across
                    # yielded examples.
                    validated_answers = [self._normalize_answer({})]
                yield key, {
                    "section_id": section_id,
                    "passage": example["passage"],
                    "question": qa["question"],
                    "query_id": qa["query_id"],
                    "answer": self._normalize_answer(qa["answer"]),
                    "validated_answers": validated_answers,
                }
                key += 1
| |
|