| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """MuTual dataset.""" |
|
|
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| import datasets |
|
|
|
|
# BibTeX entry for the MuTual paper; passed as `citation` to the DatasetInfo.
| _CITATION = """\ |
| @inproceedings{mutual, |
| title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", |
| author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , |
| booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", |
| year = "2020", |
| publisher = "Association for Computational Linguistics", |
| } |
| """ |
|
|
# Short dataset summary; `_info` prepends it to the per-config description.
| _DESCRIPTION = """\ |
| MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is |
| modified from Chinese high school English listening comprehension test data. |
| """ |
|
|
# Upstream repository URL, reported as the dataset homepage.
| _HOMEPAGE = "https://github.com/Nealcly/MuTual" |
|
|
| |
# No license text is recorded in this script (empty string).
| _LICENSE = "" |
|
|
# Zip archive of the repository's master branch; `_split_generators` hands it
# to `dl_manager.download_and_extract`.
| _URLS = "https://github.com/Nealcly/MuTual/archive/master.zip" |
|
|
|
|
class Mutual(datasets.GeneratorBasedBuilder):
    """Dataset builder for MuTual, a multi-turn dialogue reasoning dataset.

    Two configurations are declared: ``mutual`` and ``mutual_plus``. The
    configuration name is also used as the directory name under ``data/``
    in the extracted archive.
    """

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="mutual",
            version=VERSION,
            description="The MuTual dataset.",
        ),
        datasets.BuilderConfig(
            name="mutual_plus",
            version=VERSION,
            description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
        ),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing the schema and metadata."""
        schema = {
            "answers": datasets.Value("string"),
            "options": datasets.features.Sequence(datasets.Value("string")),
            "article": datasets.Value("string"),
            "id": datasets.Value("string"),
        }
        # Generic summary first, then the description of the active config.
        full_description = "{}\n{}".format(_DESCRIPTION, self.config.description)
        return datasets.DatasetInfo(
            description=full_description,
            features=datasets.Features(schema),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the three splits.

        Args:
            dl_manager: Object providing ``download_and_extract``.

        Returns:
            A list of ``SplitGenerator`` objects in the order train, test,
            validation. Each one carries the split's directory as
            ``basepath`` and the directory name as ``split``.
        """
        extracted_root = dl_manager.download_and_extract(_URLS)
        # Directory holding the per-split folders for the selected config.
        config_root = os.path.join(
            extracted_root, "MuTual-master", "data", self.config.name
        )
        # (split identifier, on-disk folder name); validation lives in "dev".
        layout = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )
        generators = []
        for split_name, folder in layout:
            generators.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "basepath": os.path.join(config_root, folder),
                        "split": folder,
                    },
                )
            )
        return generators

    def _generate_examples(self, basepath, split):
        """Yield ``(key, example)`` pairs from the ``.txt`` files in a directory.

        Args:
            basepath: Directory containing one JSON document per ``.txt`` file.
            split: Name of the split; accepted but not used by this method.

        Yields:
            Tuples of a running integer key and a dict with the fields
            ``answers``, ``options``, ``article`` and ``id``.
        """
        wanted_fields = ("answers", "options", "article", "id")
        txt_files = sorted(
            entry for entry in Path(basepath).iterdir() if entry.suffix == ".txt"
        )
        next_key = 0
        for txt_file in txt_files:
            raw = txt_file.read_text(encoding="utf-8")
            if raw:
                record = json.loads(raw)
                yield next_key, {name: record[name] for name in wanted_fields}
                # Keys advance only for files that produced an example, so
                # empty files leave no gaps in the numbering.
                next_key += 1
|
|