# Preprocessing script: standardize summarization datasets to jsonl and run
# the spacy + alignment workflow to produce a processed DataPanel.
| import logging | |
| from argparse import ArgumentParser | |
| from typing import List | |
| from meerkat import DataPanel, SpacyColumn | |
| from meerkat.logging.utils import set_logging_level | |
| from spacy import load | |
| from align import BertscoreAligner, NGramAligner, StaticEmbeddingAligner, Aligner | |
| from utils import clean_text | |
# Silence meerkat's internal logging and this module's logger; the workflow
# reports progress to the user via print() instead.
set_logging_level('critical')
logger = logging.getLogger(__name__)
logger.setLevel(logging.CRITICAL)
def _run_aligners(
    dataset: DataPanel,
    aligners: List[Aligner],
    doc_column: str,
    reference_column: str,
    summary_columns: List[str] = None,
):
    """Run each aligner over (document, target) and (reference, summary) pairs.

    Args:
        dataset: the DataPanel holding the text columns to align.
        aligners: aligner instances to run; each produces its own columns.
        doc_column: name of the document column.
        reference_column: name of the reference-summary column, or None.
        summary_columns: names of the other summary columns (may be empty).

    Returns:
        The dataset with one alignment column added per (aligner, pair).
    """
    summary_columns = summary_columns or []

    # Targets compared against the document: the reference (if present)
    # followed by all other summaries.
    targets = ([reference_column] if reference_column is not None else []) + summary_columns

    def column_name(aligner, source, target):
        # Column naming scheme: "<AlignerClass>:<source>:<target>".
        # `target` may be a list, in which case its str() form is embedded.
        return f'{type(aligner).__name__}:{source}:{target}'

    for aligner in aligners:
        # Align the document against every target at once.
        doc_key = column_name(aligner, doc_column, targets)
        dataset = dataset.update(
            lambda x: {
                doc_key: aligner.align(
                    x[doc_column],
                    [x[c] for c in targets],
                ),
            },
        )

        ref_key = None
        if reference_column is not None and summary_columns:
            # Align the reference against every other summary.
            ref_key = column_name(aligner, reference_column, summary_columns)
            dataset = dataset.update(
                lambda x: {
                    ref_key: aligner.align(
                        x[reference_column],
                        [x[c] for c in summary_columns],
                    ),
                },
            )

        if len(targets) > 1:
            # Split the single combined (document, targets) column into one
            # column per target, then drop the combined column.
            combined = dataset[doc_key]
            for idx, target in enumerate(targets):
                dataset.add_column(
                    column_name(aligner, doc_column, target),
                    [row[idx] for row in combined],
                )
            dataset.remove_column(doc_key)

        if reference_column is not None and len(summary_columns) > 1:
            # Likewise split the combined (reference, summaries) column.
            combined = dataset[ref_key]
            for idx, target in enumerate(summary_columns):
                dataset.add_column(
                    column_name(aligner, reference_column, target),
                    [row[idx] for row in combined],
                )
            dataset.remove_column(ref_key)

    return dataset
def load_nlp():
    """Load the spacy `en_core_web_lg` pipeline.

    Returns:
        The loaded spacy Language object.

    Raises:
        OSError: if the model is not installed locally.
    """
    try:
        return load('en_core_web_lg')
    except OSError as e:
        # Fix: the original message had a misplaced quote ("'en_core_web_lg
        # model'") and the two concatenated string literals were missing a
        # separating space ("...cached file.To install..."). Also chain the
        # original error for debuggability.
        raise OSError(
            "The 'en_core_web_lg' model is required unless loading from cached file. "
            "To install: 'python -m spacy download en_core_web_lg'"
        ) from e
def run_workflow(
    jsonl_path: str,
    doc_column: str = None,
    reference_column: str = None,
    summary_columns: List[str] = None,
    bert_aligner_threshold: float = 0.5,
    bert_aligner_top_k: int = 3,
    embedding_aligner_threshold: float = 0.5,
    embedding_aligner_top_k: int = 3,
    processed_dataset_path: str = None,
    n_samples: int = None,
    no_clean: bool = None,
):
    """Run the preprocessing workflow: clean text, spacy-process, and align.

    Args:
        jsonl_path: path to the standardized jsonl dataset (required).
        doc_column: document column name; defaults to 'document'.
        reference_column: reference summary column; defaults to
            'summary:reference' (dropped if absent from the dataset).
        summary_columns: other summary columns; auto-detected by the
            'summary:' prefix when not given.
        bert_aligner_threshold / bert_aligner_top_k: BertscoreAligner config.
        embedding_aligner_threshold / embedding_aligner_top_k:
            StaticEmbeddingAligner config.
        processed_dataset_path: where to write the processed dataset (required).
        n_samples: optionally restrict to the first n rows.
        no_clean: skip text cleaning. Defaults to the CLI `--no_clean` flag
            when running as a script, else False.

    Returns:
        The processed DataPanel (also written to `processed_dataset_path`).
    """
    if not jsonl_path:
        raise ValueError("'jsonl_path' is required")
    if not processed_dataset_path:
        raise ValueError("Please specify a path to save the dataset.")

    # Fix: the original read the module-level CLI namespace (`args.no_clean`)
    # directly, which raised NameError for programmatic (non-CLI) callers.
    # Accept it as a parameter and fall back to the CLI namespace only when
    # one exists, preserving script behavior.
    if no_clean is None:
        cli_args = globals().get('args')
        no_clean = bool(getattr(cli_args, 'no_clean', False))

    # Load the dataset
    dataset = DataPanel.from_jsonl(jsonl_path)

    if doc_column is None:
        # Assume `doc_column` is called "document"
        doc_column = 'document'
        assert doc_column in dataset.columns, \
            f"`doc_column={doc_column}` is not a column in datapanel."
        print("Assuming `doc_column` is called 'document'.")

    if reference_column is None:
        # Assume `reference_column` is called "summary:reference"
        reference_column = 'summary:reference'
        print("Assuming `reference_column` is called 'summary:reference'.")
    if reference_column not in dataset.columns:
        # No reference available: run document-vs-summary alignments only.
        print("No reference summary loaded")
        reference_column = None

    if summary_columns is None or len(summary_columns) == 0:
        # Assume `summary_columns` are prefixed by "summary:"
        summary_columns = [
            col for col in dataset.columns
            if col.startswith("summary:") and col != "summary:reference"
        ]
        print(f"Reading summary columns from datapanel. Found {summary_columns}.")

    if len(summary_columns) == 0 and reference_column is None:
        raise ValueError("At least one summary is required")

    # Restrict to the first `n_samples`
    if n_samples:
        print(f"Restricting to {n_samples} samples.")
        dataset = dataset.head(n_samples)
    print("size of dataset:", len(dataset))

    # Combine the text columns into one list
    text_columns = [doc_column] + ([reference_column] if reference_column else []) + summary_columns

    # Preprocessing all the text columns
    print("Preprocessing text columns")
    dataset = dataset.update(
        lambda x: {
            f'preprocessed_{k}': x[k] if no_clean else clean_text(x[k])
            for k in text_columns
        }
    )

    # Run the Spacy pipeline on all preprocessed text columns
    nlp = load_nlp()
    nlp.add_pipe('sentencizer', before="parser")
    print("Running spacy processing")
    for col in text_columns:
        dataset.add_column(
            f'spacy:{col}',
            SpacyColumn.from_docs(nlp.pipe(dataset[f'preprocessed_{col}'])),
        )

    # Run the 3 align pipelines
    bert_aligner = BertscoreAligner(
        threshold=bert_aligner_threshold,
        top_k=bert_aligner_top_k,
    )
    embedding_aligner = StaticEmbeddingAligner(
        threshold=embedding_aligner_threshold,
        top_k=embedding_aligner_top_k,
    )
    ngram_aligner = NGramAligner()

    dataset = _run_aligners(
        dataset=dataset,
        aligners=[bert_aligner, embedding_aligner, ngram_aligner],
        doc_column=f'spacy:{doc_column}',
        reference_column=f'spacy:{reference_column}' if reference_column else None,
        summary_columns=[f'spacy:{col}' for col in summary_columns],
    )

    # Save the dataset
    dataset.write(processed_dataset_path)
    return dataset
def standardize_dataset(
    dataset_name: str,
    dataset_version: str,
    dataset_split: str,
    save_jsonl_path: str,
    doc_column: str = None,
    reference_column: str = None,
    n_samples: int = None
):
    """Load a dataset from Huggingface and dump it to disk in standard form.

    The document column is renamed to 'document' and the reference summary
    column to 'summary:reference' before writing jsonl.

    Args:
        dataset_name: Huggingface dataset name (required).
        dataset_version: Huggingface dataset version, or None.
        dataset_split: dataset split to load (required).
        save_jsonl_path: output jsonl path (required).
        doc_column / reference_column: source column names; inferred for
            known datasets when not given.
        n_samples: optionally restrict to the first n rows.

    Returns:
        The standardized dataset (also written to `save_jsonl_path`).
    """
    # Fix: the original validated the module-level CLI `args` instead of this
    # function's own parameters, so programmatic calls either bypassed
    # validation or raised NameError.
    if dataset_name is None or dataset_split is None or save_jsonl_path is None:
        raise ValueError('Missing command line argument')

    # Load the dataset from Huggingface
    dataset = get_dataset(
        dataset_name=dataset_name,
        dataset_version=dataset_version,
        dataset_split=dataset_split
    )

    if n_samples:
        dataset = dataset[:n_samples]

    if doc_column is None:
        if reference_column is not None:
            raise ValueError("You must specify `doc_column` if you specify `reference_column`")
        try:
            doc_column, reference_column = {
                'cnn_dailymail': ('article', 'highlights'),
                'xsum': ('document', 'summary')
            }[dataset_name]
        except KeyError:  # fix: was a bare `except`, which swallowed everything
            raise NotImplementedError(
                "Please specify `doc_column`."
            ) from None

    # Rename the columns to the standardized names. Each rename is guarded:
    # fix — the original's unguarded add/remove would delete the column when
    # it was already named 'summary:reference'.
    if doc_column != 'document':
        dataset.add_column('document', dataset[doc_column])
        dataset.remove_column(doc_column)
    if reference_column != 'summary:reference':
        dataset.add_column('summary:reference', dataset[reference_column])
        dataset.remove_column(reference_column)

    # Save the dataset back to disk
    dataset.to_jsonl(save_jsonl_path)
    return dataset
def get_dataset(
    dataset_name: str = None,
    dataset_version: str = None,
    dataset_split: str = 'test',
    dataset_jsonl: str = None,
):
    """Load a dataset from Huggingface or from a jsonl file on disk.

    Exactly one of `dataset_name` and `dataset_jsonl` must be provided.

    Raises:
        ValueError: if neither or both sources are specified.
    """
    # Fix: was an `assert`, which is stripped under `python -O`; input
    # validation should raise explicitly.
    if (dataset_name is not None) == (dataset_jsonl is not None):
        raise ValueError("Specify one of `dataset_name` or `dataset_jsonl`.")

    # Load the dataset
    if dataset_name is not None:
        return get_hf_dataset(dataset_name, dataset_version, dataset_split)
    return DataPanel.from_jsonl(json_path=dataset_jsonl)
def get_hf_dataset(name: str, version: str = None, split: str = 'test'):
    """Fetch a dataset from Huggingface, optionally pinned to a version."""
    if not version:
        return DataPanel.from_huggingface(name, split=split)
    return DataPanel.from_huggingface(name, version, split=split)
if __name__ == '__main__':
    # Command-line entry point. Two modes (not mutually exclusive):
    #   --standardize: dump a Huggingface dataset to standardized jsonl.
    #   --workflow: run the preprocessing/alignment workflow on a jsonl file.
    parser = ArgumentParser()
    parser.add_argument('--dataset', type=str, choices=['cnn_dailymail', 'xsum'],
                        help="Huggingface dataset name.")
    parser.add_argument('--version', type=str,
                        help="Huggingface dataset version.")
    parser.add_argument('--split', type=str, default='test',
                        help="Huggingface dataset split.")
    parser.add_argument('--dataset_jsonl', type=str,
                        help="Path to a jsonl file for the dataset.")
    parser.add_argument('--save_jsonl_path', type=str,
                        help="Path to save the processed jsonl dataset.")
    parser.add_argument('--doc_column', type=str,
                        help="Name of the document column in the dataset.")
    parser.add_argument('--reference_column', type=str,
                        help="Name of the reference summary column in the dataset.")
    parser.add_argument('--summary_columns', nargs='+', default=[],
                        help="Name of other summary columns in/added to the dataset.")
    parser.add_argument('--bert_aligner_threshold', type=float, default=0.1,
                        help="Minimum threshold for BERT alignment.")
    parser.add_argument('--bert_aligner_top_k', type=int, default=10,
                        help="Top-k for BERT alignment.")
    parser.add_argument('--embedding_aligner_threshold', type=float, default=0.1,
                        help="Minimum threshold for embedding alignment.")
    parser.add_argument('--embedding_aligner_top_k', type=int, default=10,
                        help="Top-k for embedding alignment.")
    parser.add_argument('--processed_dataset_path', type=str,
                        help="Path to store the final processed dataset.")
    parser.add_argument('--n_samples', type=int,
                        help="Number of dataset samples to process.")
    parser.add_argument('--workflow', action='store_true', default=False,
                        help="Whether to run the preprocessing workflow.")
    parser.add_argument('--standardize', action='store_true', default=False,
                        help="Whether to standardize the dataset and save to jsonl.")
    parser.add_argument('--no_clean', action='store_true', default=False,
                        help="Do not clean text (remove extraneous spaces, newlines).")
    # NOTE: `args` is deliberately module-level — `run_workflow` consults
    # `args.no_clean` from here.
    args = parser.parse_args()

    if args.standardize:
        # Dump a Huggingface dataset to standardized jsonl format
        standardize_dataset(
            dataset_name=args.dataset,
            dataset_version=args.version,
            dataset_split=args.split,
            save_jsonl_path=args.save_jsonl_path,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            n_samples=args.n_samples
        )
    if args.workflow:
        # Run the processing workflow
        run_workflow(
            jsonl_path=args.dataset_jsonl,
            doc_column=args.doc_column,
            reference_column=args.reference_column,
            summary_columns=args.summary_columns,
            bert_aligner_threshold=args.bert_aligner_threshold,
            bert_aligner_top_k=args.bert_aligner_top_k,
            embedding_aligner_threshold=args.embedding_aligner_threshold,
            embedding_aligner_top_k=args.embedding_aligner_top_k,
            processed_dataset_path=args.processed_dataset_path,
            n_samples=args.n_samples
        )