Spaces:

akazmi
/

pdfreaderandsummarizer

Build error

App Files Files Community

pdfreaderandsummarizer / app.py

akazmi

Update app.py

748f7c1 verified about 1 year ago

raw

history blame contribute delete

1.94 kB

	import gradio as gr
	from transformers import pipeline
	import PyPDF2

	# Build the Hugging Face "summarization" pipeline once, at import time, using
	# the facebook/bart-large-cnn checkpoint. summarize_pdf() reads this
	# module-level object, so every request reuses the same pipeline instance.
	summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

	def pdf_to_text(pdf_file):
	    """Extract the text of every page of a PDF file.

	    Args:
	        pdf_file: Where to find the PDF. Either a filesystem path (``str``,
	            ``bytes`` or ``os.PathLike``) or an upload object that exposes
	            the path through a ``name`` attribute, such as a temporary-file
	            wrapper handed over by a file-upload widget.

	    Returns:
	        The text of all non-empty pages joined by newlines, with leading and
	        trailing whitespace removed (``""`` when no page yields text). This
	        function never raises: on any failure it returns a message that
	        starts with ``"Error reading PDF:"``.
	    """
	    import os  # local import so the block does not rely on file-level imports

	    if pdf_file is None:
	        # Nothing was uploaded; report it in the same style as other failures.
	        return "Error reading PDF: no file was provided"

	    if isinstance(pdf_file, (str, bytes, os.PathLike)):
	        # Real paths are used untouched. (pathlib.Path also has a ``name``
	        # attribute, but that is only the basename, so it must not be used.)
	        path = pdf_file
	    else:
	        # Upload wrappers carry the on-disk location in ``.name``.
	        path = getattr(pdf_file, "name", pdf_file)

	    try:
	        with open(path, "rb") as file:
	            reader = PyPDF2.PdfReader(file)
	            # Extraction must happen while the file is still open.
	            page_texts = [page.extract_text() for page in reader.pages]
	    except Exception as e:
	        # Top-level boundary for the UI: surface the problem as text.
	        return f"Error reading PDF: {e}"

	    # Skip pages without extractable text (None or "").
	    return "\n".join(chunk for chunk in page_texts if chunk).strip()

	def _split_into_chunks(text, max_chars):
	    """Split *text* into pieces of at most *max_chars* characters.

	    Cuts are placed on the last space or newline inside each window when one
	    exists, so words are not broken in half; a window without any whitespace
	    is cut at exactly *max_chars*. Whitespace-only pieces are dropped.
	    """
	    chunks = []
	    start = 0
	    total = len(text)
	    while start < total:
	        end = min(start + max_chars, total)
	        if end < total:
	            cut = max(text.rfind(" ", start, end), text.rfind("\n", start, end))
	            # Only accept a cut that still makes progress past ``start``.
	            if cut > start:
	                end = cut
	        piece = text[start:end].strip()
	        if piece:
	            chunks.append(piece)
	        start = end
	    return chunks

	def summarize_pdf(pdf_file):
	    """Summarize the content of a PDF file.

	    Args:
	        pdf_file: The uploaded PDF, passed straight to ``pdf_to_text``.

	    Returns:
	        The chunk summaries joined by blank lines, or a human-readable
	        message when the PDF cannot be read, holds no text, holds too little
	        text, or the summarizer fails.
	    """
	    text = pdf_to_text(pdf_file)

	    # pdf_to_text reports failures as text; show them to the user instead of
	    # feeding the error message to the model as if it were document content.
	    if text.startswith("Error reading PDF:"):
	        return text

	    if not text:
	        return "No text found in the PDF."

	    # Check if the text is too short for summarization
	    if len(text) < 50:  # Adjust this threshold if necessary
	        return "The text extracted is too short for summarization."

	    # Chunk size in CHARACTERS. This is a rough stand-in for the model's
	    # 1024-token input limit; truncation=True below guards the rare chunk
	    # that still tokenizes to more than the model accepts.
	    max_input_length = 1024

	    summaries = []
	    for chunk in _split_into_chunks(text, max_input_length):
	        try:
	            summary = summarizer(
	                chunk,
	                max_length=130,
	                min_length=30,
	                do_sample=False,
	                truncation=True,
	            )
	            summaries.append(summary[0]['summary_text'])
	        except Exception as e:
	            return f"Error summarizing text: {e}"

	    return "\n\n".join(summaries)  # Join summaries from chunks

	# Create the Gradio interface: one file-upload input wired to summarize_pdf,
	# one multi-line text box for the result. The upload widget is limited to
	# .pdf files so other file types are rejected before they reach the PDF parser.
	interface = gr.Interface(
	    fn=summarize_pdf,
	    inputs=gr.File(label="Upload a PDF file", file_types=[".pdf"]),
	    outputs=gr.Textbox(label="Summary", lines=10),
	    title="PDF Summarizer - by Atif Kazmi",
	    description="Upload a PDF file to receive a summary.",
	)

	# Launch the interface only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    interface.launch()