koichi12's picture
Add files using upload-large-folder tool
4c8cf60 verified
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO: Address all TODOs and remove all explanatory comments
"""QuAC dataset."""
import json
import datasets
_CITATION = """\
@article{choi2018quac,
title={Quac: Question answering in context},
author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
journal={arXiv preprint arXiv:1808.07036},
year={2018}
}
"""
_DESCRIPTION = """\
Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
participating in information seeking dialog. Data instances consist of an interactive
dialog between two crowd workers: (1) a student who poses a sequence of freeform
questions to learn as much as possible about a hidden Wikipedia text, and (2)
a teacher who answers the questions by providing short excerpts (spans) from the text.
"""
_HOMEPAGE = "https://quac.ai/"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = {
"train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
"validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
}
class Quac(datasets.GeneratorBasedBuilder):
"""Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog."""
VERSION = datasets.Version("1.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name="quac", version=VERSION, description="The QuAC dataset"
),
]
def _info(self):
features = datasets.Features(
{
"title": datasets.Value("string"),
"section_title": datasets.Value("string"),
"paragraph": datasets.Value("string"),
"question": datasets.Value("string"),
"answer": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
data_dir = dl_manager.download_and_extract(urls)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={
"filepath": data_dir["train"],
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepath": data_dir["validation"], "split": "validation"},
),
]
# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
def _generate_examples(self, filepath, split):
with open(filepath, encoding="utf-8") as f:
data = json.load(f)["data"]
key = 0
for row in data:
paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "")
qas = row["paragraphs"][0]["qas"]
qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas]
for (question, answer) in qa_pairs:
# Yields examples as (key, example) tuples
yield key, {
"title": row["title"],
"section_title": row["section_title"],
"paragraph": paragraph,
"question": question,
"answer": answer,
}
key += 1