Spaces:

akhaliq
/

SummerTime

Build error

SummerTime / dataset /non_huggingface_datasets_builders /summscreen.py

akhaliq3

spaces demo

546a9ba over 4 years ago

4.28 kB

	import os
	import json
	import datasets


	# NOTE: this bare string follows the imports, so Python evaluates it as a
	# no-op expression statement rather than as the module docstring.
	"""Summscreen dataset."""


	# BibTeX entry for the SummScreen paper (Chen et al., 2021); the builder
	# passes it to `datasets.DatasetInfo` as the `citation` field.
	_CITATION = """
	@article{DBLP:journals/corr/abs-2104-07091,
	author = {Mingda Chen and
	Zewei Chu and
	Sam Wiseman and
	Kevin Gimpel},
	title = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization},
	journal = {CoRR},
	volume = {abs/2104.07091},
	year = {2021},
	url = {https://arxiv.org/abs/2104.07091},
	archivePrefix = {arXiv},
	eprint = {2104.07091},
	timestamp = {Mon, 19 Apr 2021 16:45:47 +0200},
	biburl = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib},
	bibsource = {dblp computer science bibliography, https://dblp.org}
	}
	"""

	_DESCRIPTION = """
	A summary of scientific papers should ideally incorporate the impact of the papers on the research community
	reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
	the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
	"""

	_HOMEPAGE = "https://github.com/mingdachen/SummScreen"

	_LICENSE = "MIT Licencse"

	_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF"


	class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
	    """Dataset builder for SummScreen: episode transcripts paired with recaps.

	    Downloads and extracts the archive at ``_URLs``, then reads the
	    ForeverDreaming and TVMegaSite JSON-lines files of each split and yields
	    one example per line.
	    """

	    VERSION = datasets.Version("1.1.0")

	    # A single default configuration; this builder defines no variants.
	    BUILDER_CONFIGS = [
	        datasets.BuilderConfig(),
	    ]

	    def _info(self):
	        """Return the dataset metadata together with the example schema.

	        Each example has a string ``entry_number``, a ``transcript`` given as
	        a sequence of strings, and a string ``recap``.
	        """
	        features = datasets.Features(
	            {
	                "entry_number": datasets.Value("string"),
	                "transcript": datasets.features.Sequence(datasets.Value("string")),
	                "recap": datasets.Value("string"),
	            }
	        )
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=features,
	            supervised_keys=None,
	            homepage=_HOMEPAGE,
	            license=_LICENSE,
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Download the archive and describe the train/validation/test splits.

	        Args:
	            dl_manager: download manager supplied by ``datasets``; only its
	                ``download_and_extract`` method is used.

	        Returns:
	            A list of three ``datasets.SplitGenerator`` objects. Each one's
	            ``gen_kwargs`` holds ``filepaths`` -- a ``(root, relative_paths)``
	            tuple -- and ``split`` -- the file-name tag of the split.
	        """
	        path = dl_manager.download_and_extract(_URLs)
	        path = os.path.join(path, "SummScreen")

	        # (sub-directory, file-name prefix) of every source inside the archive.
	        sources = (
	            ("ForeverDreaming", "fd"),
	            ("TVMegaSite", "tms"),
	        )
	        # (split identifier, tag used in the data file names).
	        splits = (
	            (datasets.Split.TRAIN, "train"),
	            (datasets.Split.VALIDATION, "dev"),
	            (datasets.Split.TEST, "test"),
	        )

	        return [
	            datasets.SplitGenerator(
	                name=split_name,
	                # These kwargs will be passed to _generate_examples
	                gen_kwargs={
	                    "filepaths": (
	                        path,
	                        [
	                            os.path.join(directory, "{}_{}.json".format(prefix, tag))
	                            for directory, prefix in sources
	                        ],
	                    ),
	                    "split": tag,
	                },
	            )
	            for split_name, tag in splits
	        ]

	    def _generate_examples(self, filepaths, split):
	        """Yield ``(key, example)`` pairs read from the files of one split.

	        Args:
	            filepaths: ``(root, relative_paths)`` tuple; every relative path is
	                joined onto ``root`` and read as a JSON-lines file.
	            split: tag of the split being generated. It is not used here and is
	                accepted only because it is part of ``gen_kwargs``.

	        Yields:
	            Tuples of the entry's ``filename`` value and a dict with the keys
	            ``entry_number``, ``transcript`` and ``recap``.
	        """
	        path, relative_filepaths = filepaths
	        for filepath in relative_filepaths:
	            extraction_path = os.path.join(path, filepath)

	            # Decode explicitly as UTF-8 so the result does not depend on the
	            # platform's default encoding.
	            with open(extraction_path, "r", encoding="utf-8") as f:
	                for line in f:
	                    # A blank (e.g. trailing) line is not a JSON document.
	                    if not line.strip():
	                        continue

	                    # Strip the "@@ " markers from the raw text before parsing
	                    # (presumably subword-segmentation markers -- TODO confirm
	                    # against the dataset's own documentation).
	                    instance = json.loads(line.replace("@@ ", ""))

	                    entry = {
	                        "entry_number": instance["filename"],
	                        "transcript": instance["Transcript"],
	                        # Recap is a single string in list
	                        "recap": instance["Recap"][0],
	                    }

	                    yield entry["entry_number"], entry