| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """QuAC dataset.""" |
|
|
|
|
| import json |
|
|
| import datasets |
|
|
|
|
# BibTeX citation for the QuAC paper (Choi et al., 2018).
_CITATION = """\
@article{choi2018quac,
title={Quac: Question answering in context},
author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:1808.07036},
year={2018}
}
"""

# Human-readable summary used for the DatasetInfo description.
_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text.
"""

# Official dataset homepage.
_HOMEPAGE = "https://quac.ai/"

# No license string is recorded here; passed through to DatasetInfo as-is.
_LICENSE = ""

# Raw QuAC v0.2 JSON download URLs, keyed by split name.
_URLS = {
    "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
    "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
}
|
|
|
|
class Quac(datasets.GeneratorBasedBuilder):
    """Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="quac", version=VERSION, description="The QuAC dataset"
        ),
    ]

    def _info(self):
        """Return the DatasetInfo for the flattened (question, answer) schema."""
        features = datasets.Features(
            {
                "title": datasets.Value("string"),
                "section_title": datasets.Value("string"),
                "paragraph": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the raw JSON files and declare the train/validation splits."""
        # _URLS already maps split name -> URL; pass it straight through
        # instead of rebuilding an identical dict.
        data_dir = dl_manager.download_and_extract(_URLS)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_dir["train"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Yield one flat example per (question, answer) pair in the file.

        Args:
            filepath: Path to a downloaded QuAC v0.2 JSON file.
            split: Split name from gen_kwargs; accepted for the builder
                contract but unused here.

        Only the first paragraph of each row is read (index 0), and only the
        first reference answer of each question is kept, matching the
        original behavior. The "CANNOTANSWER" sentinel that QuAC appends to
        each context is stripped from the paragraph text.
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)["data"]
        key = 0
        for row in data:
            # Hoist the single paragraph lookup instead of indexing twice,
            # and drop the intermediate qa_pairs list — iterate qas directly.
            paragraph_obj = row["paragraphs"][0]
            paragraph = paragraph_obj["context"].replace("CANNOTANSWER", "")
            for qa in paragraph_obj["qas"]:
                yield key, {
                    "title": row["title"],
                    "section_title": row["section_title"],
                    "paragraph": paragraph,
                    "question": qa["question"],
                    "answer": qa["answers"][0]["text"],
                }
                key += 1
|
|