Spaces:

ngupta949
/

playground_pdfsum

Sleeping

Upload 3 files

ffb9e96 verified 11 months ago

881 Bytes

	from transformers import pipeline
	import PyPDF2

	# Build the summarization pipeline once, at module import, so that
	# summarize_text() reuses the same object on every call. The checkpoint
	# requested is "sshleifer/distilbart-cnn-12-6".
	summarizer_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

	def _read_pdf_text(stream):
	    """Return the text of every page readable from *stream*, one page per line block."""
	    reader = PyPDF2.PdfReader(stream)
	    page_texts = (page.extract_text() for page in reader.pages)
	    # Pages whose extraction yields nothing (None or "") are skipped. The
	    # remaining pages are separated by a newline so the last word of one
	    # page is not fused with the first word of the next.
	    return "\n".join(chunk for chunk in page_texts if chunk)


	def extract_text_from_pdf(file):
	    """Extract the text content of a PDF.

	    Args:
	        file: Either a filesystem path given as ``str``, or an object that
	            ``PyPDF2.PdfReader`` accepts directly (e.g. a binary file-like
	            object).

	    Returns:
	        The text of all pages joined with newlines, with leading and
	        trailing whitespace stripped. An empty string is returned when no
	        page yields any text.
	    """
	    if isinstance(file, str):
	        # Extraction is done while the file handle is still open.
	        with open(file, "rb") as handle:
	            text = _read_pdf_text(handle)
	    else:
	        text = _read_pdf_text(file)
	    return text.strip()

	def summarize_text(text, max_length=130, min_length=30):
	    """Summarize *text* using the module-level ``summarizer_pipeline``.

	    Args:
	        text: The text to summarize. ``None``, empty, or whitespace-only
	            input is treated as "no text". Only the first 3000 characters
	            (after stripping surrounding whitespace) are sent to the model.
	        max_length: Forwarded to the pipeline as ``max_length``.
	        min_length: Forwarded to the pipeline as ``min_length``.

	    Returns:
	        The ``summary_text`` field of the first pipeline result, or the
	        string ``"No valid text found in the PDF."`` when there is no text.
	    """
	    cleaned = text.strip() if text else ""
	    if not cleaned:
	        return "No valid text found in the PDF."
	    summary = summarizer_pipeline(
	        cleaned[:3000],
	        max_length=max_length,
	        min_length=min_length,
	        do_sample=False,
	        # A character cap does not bound the number of tokens; ask the
	        # tokenizer to truncate rather than fail on over-long input.
	        truncation=True,
	    )
	    return summary[0]["summary_text"]