"""Gradio app that extracts the text of an uploaded PDF and summarizes it."""

import os

import gradio as gr
import PyPDF2
from transformers import pipeline

# Upper bound, in characters, for each piece of text sent to the model.
# This is a character budget, not a token count; it is a conservative proxy
# for the model's input limit, which is enforced separately via
# ``truncation=True`` in ``summarize_pdf``.
MAX_CHUNK_CHARS = 1024

# Extracted text shorter than this many characters is not worth summarizing.
MIN_TEXT_CHARS = 50

# Load the summarization pipeline once at import time so every request
# reuses the same model instance.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


def _resolve_path(pdf_file):
    """Return something ``open()`` accepts for the value the UI handed over.

    Args:
        pdf_file: A path (``str``/``bytes``/``os.PathLike``) or a file-like
            object exposing the on-disk location as ``.name``.

    Returns:
        The path to open. Values that are neither path-like nor carry a
        ``.name`` attribute are returned unchanged so ``open()`` reports
        the problem.
    """
    # Check path-likes first: ``pathlib.Path`` also has a ``.name``
    # attribute, but it holds only the final path component.
    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        return pdf_file
    # NOTE(review): presumably some Gradio versions pass a tempfile wrapper
    # rather than a path string - confirm against the installed version.
    return getattr(pdf_file, "name", pdf_file)


def _extract_text(pdf_path):
    """Read every page of the PDF at ``pdf_path`` and return its text.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all non-empty pages joined by newlines, with leading
        and trailing whitespace removed. Empty if no page yields text.

    Raises:
        Exception: Whatever ``open`` or PyPDF2 raises for an unreadable
            or malformed file.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Pages without extractable text produce a falsy value; skip them.
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(t for t in page_texts if t).strip()


def _chunk_text(text, max_chars=MAX_CHUNK_CHARS):
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    Words are kept whole so the model never sees a word cut in half; runs
    of whitespace (including newlines) collapse to single spaces. A single
    word longer than ``max_chars`` is hard-split as a last resort.

    Args:
        text: The text to split.
        max_chars: Maximum length of each chunk, in characters.

    Returns:
        A list of non-empty chunks in document order.
    """
    chunks = []
    current = []
    current_len = 0
    for word in text.split():
        # An oversized "word" (e.g. text extracted without spaces) cannot
        # fit in any chunk, so flush what we have and slice it directly.
        while len(word) > max_chars:
            if current:
                chunks.append(" ".join(current))
                current, current_len = [], 0
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        # Account for the joining space when the chunk is not empty.
        added = len(word) + (1 if current else 0)
        if current_len + added > max_chars:
            chunks.append(" ".join(current))
            current, current_len = [word], len(word)
        else:
            current.append(word)
            current_len += added
    if current:
        chunks.append(" ".join(current))
    return chunks


def pdf_to_text(pdf_file):
    """Extract text from a PDF file.

    Args:
        pdf_file: Path to the PDF, or a file-like object exposing ``.name``.

    Returns:
        The extracted text, stripped of surrounding whitespace. On failure
        the string ``"Error reading PDF: <reason>"`` is returned instead of
        raising; callers that must tell errors from content should not
        rely on this function.
    """
    try:
        return _extract_text(_resolve_path(pdf_file))
    except Exception as e:
        return f"Error reading PDF: {str(e)}"


def summarize_pdf(pdf_file):
    """Summarize the content of a PDF file.

    Args:
        pdf_file: The uploaded file as delivered by the ``gr.File`` input
            (a path or a file-like object), or ``None`` if nothing was
            uploaded.

    Returns:
        The per-chunk summaries joined by blank lines, or a human-readable
        message when there is no file, the PDF cannot be read, it contains
        too little text, or the model fails.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # Extract with the raising helper so a read failure is reported to the
    # user directly instead of being fed to the model as if it were content.
    try:
        text = _extract_text(_resolve_path(pdf_file))
    except Exception as e:
        return f"Error reading PDF: {str(e)}"

    if not text:
        return "No text found in the PDF."
    if len(text) < MIN_TEXT_CHARS:
        return "The text extracted is too short for summarization."

    summaries = []
    for chunk in _chunk_text(text):
        try:
            # truncation=True keeps token-dense chunks within the model's
            # input limit instead of failing the whole document.
            result = summarizer(
                chunk,
                max_length=130,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
        except Exception as e:
            return f"Error summarizing text: {str(e)}"
        summaries.append(result[0]["summary_text"])

    return "\n\n".join(summaries)


# Create the Gradio interface.
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=gr.Textbox(label="Summary", lines=10),
    title="PDF Summarizer - by Atif Kazmi",
    description="Upload a PDF file to receive a summary.",
)

# Launch the interface only when run as a script.
if __name__ == "__main__":
    interface.launch()