Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import pipeline | |
| import PyPDF2 | |
| import pdfplumber | |
# Build the summarization pipeline once, at module import, so that every
# request handled by this app reuses the same loaded model.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF using PyPDF2 with a fallback to pdfplumber.

    Args:
        pdf_file: Anything accepted by both ``PyPDF2.PdfReader`` and
            ``pdfplumber.open`` (this file passes the value produced by the
            Gradio file input).

    Returns:
        The text of every page concatenated in page order. Pages for which
        the extractor produces no text contribute an empty string.

    Raises:
        Exception: Whatever pdfplumber raises if the fallback fails as well.
    """
    try:
        # First try with PyPDF2
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` guards against an extractor returning None for a page with
        # no extractable text. Joining before returning also means a failure
        # midway through the document discards the partial result instead of
        # leaking it into the fallback output.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        print(f"PyPDF2 failed: {e}")

    # Fallback to pdfplumber: start from scratch so no page is duplicated.
    with pdfplumber.open(pdf_file) as pdf:
        return "".join(page.extract_text() or "" for page in pdf.pages)
def chunk_text(text, max_chunk_size=1024):
    """Split text into chunks of at most ``max_chunk_size`` words.

    The text is split on whitespace and re-joined with single spaces, so
    the original spacing and line breaks are not preserved. The size is
    measured in whitespace-separated words, NOT model tokens; a chunk may
    therefore still tokenize to more tokens than ``max_chunk_size``.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of words per chunk; must be positive.

    Yields:
        Consecutive chunks of the text as strings. Nothing is yielded for
        empty or whitespace-only text.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop all of the
    # text; a zero step raises an unhelpful range() error. Fail clearly.
    if max_chunk_size <= 0:
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size!r}"
        )

    words = text.split()
    for start in range(0, len(words), max_chunk_size):
        yield " ".join(words[start:start + max_chunk_size])
def summarize_pdf(pdf_file):
    """Extract text from a PDF, chunk it, and summarize each chunk.

    Args:
        pdf_file: The value delivered by the Gradio file input, or ``None``
            when no file was provided.

    Returns:
        The per-chunk summaries joined by blank lines. On any failure a
        human-readable message starting with "❌" is returned instead of
        raising, so the message is what the UI textbox displays.
    """
    # Nothing uploaded: answer directly rather than surfacing a confusing
    # library exception message from the extractor.
    if pdf_file is None:
        return "❌ Please upload a PDF file."

    try:
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_file)
        if not text.strip():
            return "❌ Could not extract any text from the PDF. Please upload a readable document."

        # Summarize each chunk. Chunks are sized in words, so a chunk can
        # tokenize to more tokens than the model accepts; truncation=True
        # trims the overflow instead of letting the model fail on it.
        summaries = [
            summarizer(
                chunk,
                max_length=200,
                min_length=50,
                do_sample=False,
                truncation=True,
            )[0]["summary_text"]
            for chunk in chunk_text(text)
        ]

        # Combine all summaries into one
        return "\n\n".join(summaries)
    except Exception as e:
        # Top-level UI boundary: report the problem as text.
        return f"❌ An error occurred: {e}"
# Gradio UI: one file upload as input, one textbox showing the summary
# string returned by summarize_pdf.
interface = gr.Interface(
    title="PDF Summarizer",
    description="Upload a PDF file to extract and summarize its content using state-of-the-art AI.",
    fn=summarize_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Summary"),
)

# Start serving the app.
interface.launch()