pdf-summarizer / app.py
Amelia-James's picture
Update app.py
817af55 verified
import logging
from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import requests
from bs4 import BeautifulSoup
from docx import Document
from PyPDF2 import PdfReader
from transformers import BartForConditionalGeneration, BartTokenizer
# Load the summarization model and its tokenizer once, at import time, from a
# single checkpoint name so the two can never drift apart.
_CHECKPOINT = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(_CHECKPOINT)
tokenizer = BartTokenizer.from_pretrained(_CHECKPOINT)
def chunk_text(text, chunk_size=1024):
    """Split *text* into consecutive lists of token ids.

    Args:
        text: The raw string to tokenize with the module-level ``tokenizer``.
        chunk_size: Maximum number of token ids per chunk; must be positive.

    Returns:
        A list of token-id lists, each at most ``chunk_size`` long, in
        document order. The list is empty when *text* produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # Encode without special tokens so BOS/EOS do not count against the chunk
    # budget and empty input does not produce a chunk made only of them.
    tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    return [tokens[start:start + chunk_size] for start in range(0, len(tokens), chunk_size)]
def summarize_chunk(chunk, summary_max_length=150):
    """Summarize one chunk of token ids with the module-level BART model.

    Args:
        chunk: Token ids for one slice of the document.
        summary_max_length: Requested upper bound, in tokens, on the
            generated summary. Values above the model's position limit are
            clamped to that limit.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Round-trip through text so the tokenizer adds its special tokens and
    # enforces the 1024-token input limit.
    chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
    encoded = tokenizer([chunk_str], max_length=1024, return_tensors='pt', truncation=True)
    # Callers may request more tokens than the decoder can address; cap the
    # budget at the configured position limit (falling back to the same 1024
    # used for the input above).
    position_limit = getattr(model.config, 'max_position_embeddings', 1024)
    max_length = min(summary_max_length, position_limit)
    summary_ids = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        num_beams=4,
        max_length=max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def summarize_chunks_parallel(chunks, summary_max_length=150):
    """Run ``summarize_chunk`` over every chunk on a thread pool.

    The per-chunk summaries are returned as one string, joined by single
    spaces in the same order as *chunks*.
    """
    def _summarize(piece):
        # Bind the shared length limit so the pool can map over chunks alone.
        return summarize_chunk(piece, summary_max_length)

    with ThreadPoolExecutor() as pool:
        partial_summaries = list(pool.map(_summarize, chunks))
    return ' '.join(partial_summaries)
def summarize_text(text, title=None, author=None, length_ratio=0.25):
    """Summarize *text* chunk by chunk and optionally prepend an intro.

    Args:
        text: The document to summarize.
        title: Optional title mentioned in the introductory sentence.
        author: Optional author mentioned in the introductory sentence.
        length_ratio: Target summary length as a fraction of each chunk's
            token budget.

    Returns:
        The combined chunk summaries, prefixed with an introductory sentence
        when a title or author is given.
    """
    # Count the whole document. Truncating here would cap the count at the
    # tokenizer's maximum length, so ``input_length // 8`` could never exceed
    # the 512 floor and the chunk size would not adapt at all.
    input_length = len(tokenizer.encode(text, truncation=False))

    # Leave room for the special tokens the tokenizer adds when a chunk is
    # re-encoded for the model, so full-size chunks are not clipped.
    max_chunk = 1024 - tokenizer.num_special_tokens_to_add()
    chunk_size = min(max_chunk, max(512, input_length // 8))

    chunks = chunk_text(text, chunk_size=chunk_size)

    # The limit applies to each chunk individually, so derive it from the
    # chunk size rather than from the number of chunks (which would grow
    # without bound for long documents).
    # NOTE(review): the floor keeps very small ratios usable; confirm it stays
    # above any min_length in the checkpoint's generation config.
    min_summary_tokens = 64
    summary_max_length = min(chunk_size, max(min_summary_tokens, int(chunk_size * length_ratio)))

    summary = summarize_chunks_parallel(chunks, summary_max_length=summary_max_length)

    # Add an introductory sentence if a title or author is available.
    if title or author:
        intro = f"The text titled '{title}'" if title else "The text"
        if author:
            intro += f" by {author}"
        intro += " discusses the following main points: "
        summary = intro + summary
    return summary
def extract_text_from_url(url, timeout=10):
    """Fetch *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the request indefinitely.

    Returns:
        The paragraph texts joined by single spaces, or an empty string when
        the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return ' '.join(para.get_text() for para in soup.find_all('p'))
    except Exception:
        # Best-effort boundary: log the failure and report "no text" instead
        # of returning the error message, which the caller would otherwise
        # treat as page content and summarize.
        logging.getLogger(__name__).warning(
            "Could not extract text from URL %r", url, exc_info=True
        )
        return ""
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: A path or file-like object accepted by ``PdfReader``.

    Returns:
        The text of all pages joined together, or an empty string when the
        document cannot be read.
    """
    try:
        reader = PdfReader(file)
        # Guard against pages whose extraction yields nothing, so one such
        # page does not discard the text of the rest of the document.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        # Best-effort boundary: log the failure and report "no text" instead
        # of returning the error message, which the caller would otherwise
        # treat as document content and summarize.
        logging.getLogger(__name__).warning(
            "Could not extract text from PDF", exc_info=True
        )
        return ""
def extract_text_from_docx(file):
    """Return the text of every paragraph in a DOCX document.

    Args:
        file: A path or file-like object accepted by ``Document``.

    Returns:
        Each paragraph's text followed by a newline, concatenated in document
        order, or an empty string when the document cannot be read.
    """
    try:
        doc = Document(file)
        return "".join(para.text + "\n" for para in doc.paragraphs)
    except Exception:
        # Best-effort boundary: log the failure and report "no text" instead
        # of returning the error message, which the caller would otherwise
        # treat as document content and summarize.
        logging.getLogger(__name__).warning(
            "Could not extract text from DOCX", exc_info=True
        )
        return ""
def process_input(text=None, url=None, file=None, length_ratio=0.25):
    """Summarize whichever input was supplied: text, then URL, then file.

    Args:
        text: Raw text to summarize; takes priority when non-blank.
        url: Web page address whose paragraph text is summarized.
        file: Uploaded PDF or DOCX, either an object with a ``name``
            attribute or a plain path string.
        length_ratio: Summary length as a fraction of the original, forwarded
            to ``summarize_text``.

    Returns:
        The summary, or a short message explaining why none was produced.
    """
    # Blank or whitespace-only fields count as "not provided" so an
    # accidental space in the text box does not mask a URL or file.
    text = text.strip() if text else ""
    url = url.strip() if url else ""

    if text:
        return summarize_text(text, length_ratio=length_ratio)

    if url:
        extracted = extract_text_from_url(url)
        if extracted and extracted.strip():
            return summarize_text(extracted, length_ratio=length_ratio)
        return "No text extracted from the URL."

    if file:
        # Accept both upload objects exposing ``.name`` and bare path strings,
        # and compare extensions case-insensitively (e.g. "REPORT.PDF").
        filename = str(getattr(file, "name", file)).lower()
        if filename.endswith('.pdf'):
            extracted = extract_text_from_pdf(file)
        elif filename.endswith('.docx'):
            extracted = extract_text_from_docx(file)
        else:
            return "Unsupported file type. Please upload a PDF or DOCX file."
        if extracted and extracted.strip():
            return summarize_text(extracted, length_ratio=length_ratio)
        return "No text extracted from the file."

    return "Please provide text, a URL, or upload a file."
# Build the Gradio UI: three alternative inputs (text, URL, file) plus a
# slider for the summary length, wired to process_input in that order.
_input_components = [
    gr.Textbox(label="Input Text", placeholder="Enter text here...", lines=10),
    gr.Textbox(label="URL", placeholder="Enter URL here...", lines=2),
    gr.File(label="Upload a file (PDF or DOCX)"),
    gr.Slider(
        label="Summary Length Ratio (as a fraction of the original)",
        minimum=0.1,
        maximum=1.0,
        step=0.05,
        value=0.25,
    ),
]
_summary_box = gr.Textbox(label="Summary", lines=20)

interface = gr.Interface(
    fn=process_input,
    inputs=_input_components,
    outputs=_summary_box,
    title="Text Summarization Tool",
    description="Enter text, paste a URL, or upload a PDF/DOCX file to generate a summary. Adjust the summary length with the slider.",
)

# Start the web server for the app.
interface.launch()