# akazmi's picture
# Update app.py
# 748f7c1 verified
import gradio as gr
from transformers import pipeline
import PyPDF2
# Load the summarization pipeline once, at import time, so that every call to
# summarize_pdf reuses the same "facebook/bart-large-cnn" pipeline object.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file: Path to the PDF (``str``, ``bytes`` or ``os.PathLike``), or
            an upload object that exposes the path of its temporary file as
            ``.name`` (some Gradio versions pass such an object for
            ``gr.File`` inputs instead of a plain path).

    Returns:
        The text of all pages that yielded any text, joined with newlines and
        stripped of leading/trailing whitespace. If the file cannot be opened
        or parsed, a message starting with ``"Error reading PDF: "`` is
        returned instead of raising, so callers must check for that prefix.
    """
    import os  # local import so the block does not rely on a file-level one

    # Resolve upload wrappers to a filesystem path. Genuine paths are passed
    # through untouched: ``pathlib.Path.name`` is only the basename, so the
    # ``.name`` lookup must not be applied to path-like inputs.
    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        path = pdf_file
    else:
        path = getattr(pdf_file, "name", pdf_file)

    pages = []
    try:
        with open(path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:  # Only keep pages that produced some text
                    pages.append(page_text)
    except Exception as e:
        # Deliberate best-effort boundary: any failure (missing file, corrupt
        # PDF, bad argument) is reported to the UI as text, not a traceback.
        return f"Error reading PDF: {e}"
    return "\n".join(pages).strip()
def summarize_pdf(pdf_file):
    """Summarize the content of a PDF file.

    The extracted text is cut into chunks of at most 1024 characters, each
    chunk is summarized independently, and the partial summaries are joined
    with blank lines.

    Args:
        pdf_file: The uploaded PDF, passed straight to ``pdf_to_text``.

    Returns:
        The combined summary, or a human-readable message when the PDF could
        not be read, contained no text, contained too little text, or a chunk
        failed to summarize.
    """
    import textwrap  # local import so the block does not rely on a file-level one

    text = pdf_to_text(pdf_file)
    if not text:
        return "No text found in the PDF."

    # pdf_to_text reports failures as a message instead of raising. Surface
    # that message as-is rather than feeding it to the model as if it were
    # the document's content.
    if text.startswith("Error reading PDF:"):
        return text

    # Check if the text is too short for summarization
    if len(text) < 50:  # Adjust this threshold if necessary
        return "The text extracted is too short for summarization."

    # Chunk size is measured in characters, not tokens. Break at whitespace
    # so words are never cut in half (textwrap also collapses newlines to
    # spaces and hard-splits any single word longer than the limit).
    max_input_length = 1024
    text_chunks = textwrap.wrap(text, width=max_input_length)

    # Attempt to summarize the text
    summaries = []
    for chunk in text_chunks:
        try:
            # truncation=True guards against a chunk whose token count still
            # exceeds the model's input limit (e.g. dense non-Latin text).
            summary = summarizer(
                chunk,
                max_length=130,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            return f"Error summarizing text: {str(e)}"
    return "\n\n".join(summaries)  # Join summaries from chunks
# Build the Gradio UI: one PDF upload wired to summarize_pdf, with the
# returned text shown in a ten-line "Summary" text box.
interface = gr.Interface(
    title="PDF Summarizer - by Atif Kazmi",
    description="Upload a PDF file to receive a summary.",
    fn=summarize_pdf,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=gr.Textbox(label="Summary", lines=10),
)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()