Spaces:
Build error
Build error
| import os | |
| import json | |
| import datasets | |
| """Summscreen dataset.""" | |
# BibTeX entry for the paper that introduced the dataset.
_CITATION = """
@article{DBLP:journals/corr/abs-2104-07091,
author = {Mingda Chen and
Zewei Chu and
Sam Wiseman and
Kevin Gimpel},
title = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization},
journal = {CoRR},
volume = {abs/2104.07091},
year = {2021},
url = {https://arxiv.org/abs/2104.07091},
archivePrefix = {arXiv},
eprint = {2104.07091},
timestamp = {Mon, 19 Apr 2021 16:45:47 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

# Human-readable summary surfaced through the dataset metadata.
# The previous text described the unrelated CL-Scisumm shared task.
_DESCRIPTION = """
SummScreen is an abstractive summarization dataset made of pairs of TV series transcripts and
human-written recaps, collected from the ForeverDreaming and TVMegaSite sources.
"""

# Project page of the dataset.
_HOMEPAGE = "https://github.com/mingdachen/SummScreen"

# License name as reported in the dataset metadata.
_LICENSE = "MIT License"

# Download location of the dataset archive (hosted on Google Drive).
_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF"
class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
    """Dataset builder for SummScreen.

    Each example carries three fields: ``entry_number`` (the source file
    name), ``transcript`` (a sequence of strings) and ``recap`` (a string).
    Every split is read from two JSON-lines files, one under
    ``ForeverDreaming`` and one under ``TVMegaSite``.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the dataset metadata together with the example schema."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "transcript": datasets.features.Sequence(datasets.Value("string")),
                "recap": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and describe the train/validation/test splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; used to
                fetch and extract the archive located at ``_URLs``.

        Returns:
            A list of three ``datasets.SplitGenerator`` objects. Each one
            passes ``filepaths`` (a ``(root, relative_paths)`` tuple) and
            ``split`` (``"train"``, ``"dev"`` or ``"test"``) on to
            ``_generate_examples``.
        """
        extracted_dir = dl_manager.download_and_extract(_URLs)
        root = os.path.join(extracted_dir, "SummScreen")

        def source_files(tag):
            # Both sources follow the same "<prefix>_<tag>.json" naming scheme.
            return [
                os.path.join("ForeverDreaming", f"fd_{tag}.json"),
                os.path.join("TVMegaSite", f"tms_{tag}.json"),
            ]

        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "dev"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (root, source_files(tag)), "split": tag},
            )
            for split_name, tag in splits
        ]

    def _generate_examples(self, filepaths, split):
        """Yield ``(key, example)`` pairs read from the JSON-lines files.

        Args:
            filepaths: tuple ``(root, relative_paths)``; every relative path
                is joined onto ``root`` and read line by line.
            split: split tag received from ``gen_kwargs``; not used here.

        Yields:
            ``(entry_number, example)`` where ``example`` holds the keys
            ``entry_number``, ``transcript`` and ``recap``.
        """
        root, relative_filepaths = filepaths
        for relative_path in relative_filepaths:
            extraction_path = os.path.join(root, relative_path)
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(extraction_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Remove every "@@ " occurrence before JSON decoding.
                    processed_line = line.replace("@@ ", "")
                    # Blank lines (e.g. a trailing newline) carry no record.
                    if not processed_line.strip():
                        continue
                    instance = json.loads(processed_line)
                    entry = {
                        "entry_number": instance["filename"],
                        "transcript": instance["Transcript"],
                        # Recap is a single string in list
                        "recap": instance["Recap"][0],
                    }
                    yield entry["entry_number"], entry