Spaces:

Amelia-James
/

pdf-summarizer

Build error

App Files Files Community

pdf-summarizer / app.py

Amelia-James

Update app.py

817af55 verified over 1 year ago

raw

history blame contribute delete

4.99 kB

	import logging
	from concurrent.futures import ThreadPoolExecutor

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	from docx import Document
	from PyPDF2 import PdfReader
	from transformers import BartForConditionalGeneration, BartTokenizer

	# Load the pretrained summarization model and its matching tokenizer once,
	# at import time; both are shared by every function below.
	_CHECKPOINT = 'facebook/bart-large-cnn'
	model = BartForConditionalGeneration.from_pretrained(_CHECKPOINT)
	tokenizer = BartTokenizer.from_pretrained(_CHECKPOINT)

	def chunk_text(text, chunk_size=1024):
	    """Tokenize ``text`` and split the token ids into consecutive slices.

	    Returns a list of token-id lists; each holds ``chunk_size`` ids except
	    possibly the last one, which holds the remainder.
	    """
	    token_ids = tokenizer.encode(text, truncation=False)
	    pieces = []
	    for start in range(0, len(token_ids), chunk_size):
	        pieces.append(token_ids[start:start + chunk_size])
	    return pieces

	def summarize_chunk(chunk, summary_max_length=150):
	    """Generate a summary for one chunk of token ids and return it as text.

	    The chunk is decoded back to a string, re-encoded as a single-item batch
	    (truncated to 1024 tokens), and passed to beam-search generation.
	    """
	    source_text = tokenizer.decode(chunk, skip_special_tokens=True)
	    encoded = tokenizer(
	        [source_text],
	        max_length=1024,
	        return_tensors='pt',
	        truncation=True,
	    )
	    generated = model.generate(
	        encoded['input_ids'],
	        num_beams=4,
	        max_length=summary_max_length,
	        early_stopping=True,
	    )
	    # Only one sequence was submitted, so the first result is the summary.
	    first_sequence = generated[0]
	    return tokenizer.decode(first_sequence, skip_special_tokens=True)

	def summarize_chunks_parallel(chunks, summary_max_length=150):
	    """Summarize every chunk on a thread pool and join the results with spaces.

	    Summaries are joined in the same order as ``chunks``.
	    """
	    def _summarize(piece):
	        return summarize_chunk(piece, summary_max_length)

	    with ThreadPoolExecutor() as pool:
	        partial_summaries = list(pool.map(_summarize, chunks))
	    return ' '.join(partial_summaries)

	def summarize_text(text, title=None, author=None, length_ratio=0.25):
	    """Summarize ``text`` chunk by chunk and optionally prefix an introduction.

	    Args:
	        text: Document to summarize.
	        title: Optional title mentioned in the introductory sentence.
	        author: Optional author mentioned in the introductory sentence.
	        length_ratio: Target summary length as a fraction of the input
	            length; it scales the token budget given to each chunk.

	    Returns:
	        The chunk summaries joined with spaces, preceded by an introductory
	        sentence when ``title`` or ``author`` is given. An empty string is
	        returned when ``text`` is empty or only whitespace.
	    """
	    # Nothing to summarize; skip tokenization and generation entirely.
	    if not text or not text.strip():
	        return ""

	    # Count the tokens of the *whole* document. With truncation enabled the
	    # tokenizer caps the count at the model's input limit, which pinned the
	    # chunk size below to its 512 floor regardless of document length.
	    input_length = len(tokenizer.encode(text, truncation=False))
	    chunk_size = min(1024, max(512, input_length // 8))

	    # Break text into chunks
	    chunks = chunk_text(text, chunk_size=chunk_size)

	    # The budget is applied to every chunk separately, so derive it from the
	    # size of one chunk rather than from the number of chunks (which made
	    # the per-chunk limit grow without bound on long documents).
	    tokens_per_chunk = min(chunk_size, input_length)
	    summary_max_length = int(tokens_per_chunk * length_ratio)
	    # Clamp to the 1024-token window used elsewhere in this file.
	    # NOTE(review): the floor of 56 mirrors what is believed to be this
	    # checkpoint's default generation min_length - confirm against its config.
	    summary_max_length = max(56, min(1024, summary_max_length))

	    # Summarize each chunk in parallel and combine the summaries
	    summary = summarize_chunks_parallel(chunks, summary_max_length=summary_max_length)

	    # Adding introductory sentence if title or author is available
	    if title or author:
	        intro = f"The text titled '{title}'" if title else "The text"
	        if author:
	            intro += f" by {author}"
	        intro += " discusses the following main points: "
	        summary = intro + summary

	    return summary

	def extract_text_from_url(url, timeout=10):
	    """Download ``url`` and return the text of its ``<p>`` elements.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up; without
	            one a stalled server would block the request indefinitely.

	    Returns:
	        The paragraph texts joined with single spaces, or an empty string
	        when the page could not be fetched or parsed. The failure is logged
	        rather than returned, so callers never mistake an error message for
	        page content.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()
	        soup = BeautifulSoup(response.text, 'html.parser')
	        return ' '.join(para.get_text() for para in soup.find_all('p'))
	    except Exception as exc:
	        # Best-effort by design: this function never raises. An empty result
	        # lets the caller report that no text was extracted.
	        logging.getLogger(__name__).warning(
	            "Could not extract text from URL %s: %s", url, exc
	        )
	        return ""

	def extract_text_from_pdf(file):
	    """Return the concatenated text of every page of a PDF.

	    Args:
	        file: Whatever ``PdfReader`` accepts as its source argument.

	    Returns:
	        The text of all pages joined without a separator, or an empty string
	        when the PDF could not be read. The failure is logged rather than
	        returned, so callers never mistake an error message for document
	        content.
	    """
	    try:
	        reader = PdfReader(file)
	        # `or ""` guards against a page yielding no text object at all
	        # (presumably possible for image-only pages - TODO confirm).
	        return "".join(page.extract_text() or "" for page in reader.pages)
	    except Exception as exc:
	        # Best-effort by design: this function never raises.
	        logging.getLogger(__name__).warning(
	            "Could not extract text from PDF: %s", exc
	        )
	        return ""

	def extract_text_from_docx(file):
	    """Return the text of every paragraph of a DOCX document.

	    Args:
	        file: Whatever ``Document`` accepts as its source argument.

	    Returns:
	        Each paragraph's text followed by a newline, concatenated in document
	        order, or an empty string when the document could not be read. The
	        failure is logged rather than returned, so callers never mistake an
	        error message for document content.
	    """
	    try:
	        doc = Document(file)
	        return "".join(para.text + "\n" for para in doc.paragraphs)
	    except Exception as exc:
	        # Best-effort by design: this function never raises.
	        logging.getLogger(__name__).warning(
	            "Could not extract text from DOCX: %s", exc
	        )
	        return ""

	def process_input(text=None, url=None, file=None, length_ratio=0.25):
	    """Summarize the first usable input among text, URL, and uploaded file.

	    Inputs are tried in that order; the first non-blank one wins.

	    Args:
	        text: Raw text to summarize.
	        url: Address of a web page whose paragraphs should be summarized.
	        file: Uploaded PDF or DOCX, given either as an object exposing a
	            ``name`` attribute or as a plain path string.
	        length_ratio: Summary length as a fraction of the original,
	            forwarded to ``summarize_text``.

	    Returns:
	        The summary, or a human-readable message when no input was supplied,
	        nothing could be extracted, or the file type is unsupported.
	    """
	    # Whitespace-only fields count as "not provided".
	    if text and text.strip():
	        # Summarize the provided text
	        return summarize_text(text, length_ratio=length_ratio)

	    url = (url or "").strip()
	    if url:
	        # Extract text from the provided URL and summarize it
	        extracted = extract_text_from_url(url)
	        if extracted:
	            return summarize_text(extracted, length_ratio=length_ratio)
	        return "No text extracted from the URL."

	    if file:
	        # NOTE(review): depending on the Gradio version the upload is believed
	        # to arrive either as a file-like object with `.name` or as a plain
	        # path string - accept both instead of assuming `.name` exists.
	        file_name = str(getattr(file, "name", file)).lower()

	        # Extract text from the provided file (PDF or DOCX) and summarize it
	        if file_name.endswith('.pdf'):
	            extracted = extract_text_from_pdf(file)
	        elif file_name.endswith('.docx'):
	            extracted = extract_text_from_docx(file)
	        else:
	            return "Unsupported file type. Please upload a PDF or DOCX file."

	        if extracted:
	            return summarize_text(extracted, length_ratio=length_ratio)
	        return "No text extracted from the file."

	    return "Please provide text, a URL, or upload a file."

	# Build the Gradio UI. The input components are listed in the same order as
	# process_input's parameters: text, url, file, length_ratio.
	_input_components = [
	    gr.Textbox(label="Input Text", placeholder="Enter text here...", lines=10),
	    gr.Textbox(label="URL", placeholder="Enter URL here...", lines=2),
	    gr.File(label="Upload a file (PDF or DOCX)"),
	    gr.Slider(
	        label="Summary Length Ratio (as a fraction of the original)",
	        minimum=0.1,
	        maximum=1.0,
	        step=0.05,
	        value=0.25,
	    ),
	]
	_summary_box = gr.Textbox(label="Summary", lines=20)

	interface = gr.Interface(
	    fn=process_input,
	    inputs=_input_components,
	    outputs=_summary_box,
	    title="Text Summarization Tool",
	    description="Enter text, paste a URL, or upload a PDF/DOCX file to generate a summary. Adjust the summary length with the slider.",
	)

	# Start the web server for the app.
	interface.launch()