Spaces:
Build error
Build error
| import gradio as gr | |
| from transformers import pipeline | |
| import PyPDF2 | |
# Build the summarization pipeline once at import time; summarize_pdf reuses
# this single module-level instance for every chunk it processes.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file: Either a filesystem path (``str``, ``bytes`` or
            ``os.PathLike``) or an uploaded-file object that exposes the
            path of the file on disk as ``.name`` (for example a temp-file
            wrapper handed over by the UI layer).

    Returns:
        The text of all pages that produced any text, one page per line
        group, with surrounding whitespace stripped. If the file is missing
        or cannot be opened or parsed, a message starting with
        ``"Error reading PDF:"`` is returned instead of raising.
    """
    import os

    if pdf_file is None:
        return "Error reading PDF: no file was provided."

    # Real paths are opened as-is. Anything else is treated as an upload
    # wrapper whose ``.name`` holds the path; checking the path types first
    # matters because ``pathlib.Path.name`` is only the final component.
    if isinstance(pdf_file, (str, bytes, os.PathLike)):
        path = pdf_file
    else:
        path = getattr(pdf_file, "name", pdf_file)

    try:
        with open(path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Extract while the file is still open; the reader reads lazily.
            page_texts = [page.extract_text() for page in reader.pages]
    except Exception as e:
        # Reported as text (not raised) so the caller can show it in the UI.
        return f"Error reading PDF: {e}"

    # Skip pages that yielded no text; terminate every kept page with "\n".
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text).strip()
def summarize_pdf(pdf_file):
    """Summarize the content of a PDF file.

    The extracted text is cut into fixed-size character chunks, each chunk is
    summarized independently, and the chunk summaries are joined together.

    Args:
        pdf_file: The uploaded PDF, passed unchanged to ``pdf_to_text``.

    Returns:
        The chunk summaries separated by blank lines, or a human-readable
        message when the PDF could not be read, contained no text, contained
        too little text, or the summarizer raised an error.
    """
    text = pdf_to_text(pdf_file)

    # pdf_to_text reports failures as a message instead of raising. Return
    # that message as-is rather than summarizing it as if it were content.
    if text.startswith("Error reading PDF:"):
        return text

    if not text:
        return "No text found in the PDF."

    # Check if the text is too short for summarization
    if len(text) < 50:  # Adjust this threshold if necessary
        return "The text extracted is too short for summarization."

    # The chunk size is measured in characters, not tokens. It is a rough
    # stand-in for the model's input limit, so truncation=True below guards
    # against a chunk that still tokenizes to more than the model accepts.
    chunk_size = 1024
    text_chunks = [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

    summaries = []
    for chunk in text_chunks:
        try:
            summary = summarizer(
                chunk,
                max_length=130,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary[0]["summary_text"])
        except Exception as e:
            # Any failure aborts the whole request with a readable message.
            return f"Error summarizing text: {e}"

    return "\n\n".join(summaries)  # Join summaries from chunks
# Gradio UI: a single file upload feeds summarize_pdf, and the returned
# string is shown in a ten-line text box.
interface = gr.Interface(
    title="PDF Summarizer - by Atif Kazmi",
    description="Upload a PDF file to receive a summary.",
    fn=summarize_pdf,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=gr.Textbox(label="Summary", lines=10),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()