pdf_summarization1

Sleeping

App Files Files Community

LovnishVerma committed on May 31, 2025

Commit

97d1ec6

verified ·

1 Parent(s): 3b0bb36

Update app.py

Browse files

Files changed (1) hide show

app.py +266 -39

app.py CHANGED Viewed

@@ -1,47 +1,274 @@
 import gradio as gr
-from pdfminer.high_level import extract_text
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-import os
-# Load summarization model
-model_name = "google/pegasus-xsum"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
-# Extract text from PDF
-def extract_text_from_pdf(pdf_file):
-    with open(pdf_file.name, "rb") as f:
-        return extract_text(f)
-# Summarize text (truncate to first 1024 words)
-def summarize_text(text):
-    # Optional preprocessing
-    text = text.replace("\n", " ")
-    words = text.split()
-    if len(words) > 1024:
-        text = " ".join(words[:1024])
     try:
-        summary = summarizer(text, max_length=128, min_length=30, do_sample=False)[0]["summary_text"]
-        return summary
     except Exception as e:
-        return f"❌ Error: {str(e)}"
-# Gradio interface
-def summarize_pdf(pdf_file):
-    text = extract_text_from_pdf(pdf_file)
-    if not text.strip():
-        return "⚠️ No extractable text found in the PDF."
-    return summarize_text(text)
-# UI
-with gr.Blocks() as demo:
-    gr.Markdown("# 📄 PDF Summarizer (Fast, Pegasus-XSum)\nUpload a PDF file to generate a quick and accurate summary.")
-    with gr.Row():
-        file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
-    with gr.Row():
-        output = gr.Textbox(label="Summary", lines=15)
-    btn = gr.Button("Summarize")
-    btn.click(summarize_pdf, inputs=file_input, outputs=output)
-demo.launch()

 import gradio as gr
+import PyPDF2
+import io
+from transformers import pipeline, AutoTokenizer
+import torch
+import re
+from typing import Optional
+import logging
+# Set up logging: configure the root logger at INFO level and create a
+# module-level logger named after this module for use by the code below.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class PDFSummarizer:
+    """Extract text from a PDF and summarize it with a transformers pipeline.
+
+    Attributes set by ``__init__``:
+        device: ``"cuda"`` when ``torch.cuda.is_available()``, else ``"cpu"``.
+        tokenizer: tokenizer belonging to the loaded pipeline, or ``None``
+            if none could be obtained.
+        summarizer: the ``"summarization"`` pipeline used for all summaries.
+    """
+    # Model tried first, and the smaller model used when loading it fails.
+    PRIMARY_MODEL = "facebook/bart-large-cnn"
+    FALLBACK_MODEL = "sshleifer/distilbart-cnn-12-6"
+    # Only the first MAX_CHUNKS chunks of a long text are summarized (speed).
+    MAX_CHUNKS = 5
+    # Generation length bounds for each summary-length preference.
+    LENGTH_PARAMS = {
+        "short": {"max_length": 100, "min_length": 30},
+        "medium": {"max_length": 200, "min_length": 50},
+        "long": {"max_length": 400, "min_length": 100},
+    }
+    def __init__(self):
+        """Initialize the PDF summarizer with optimized models."""
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info("Using device: %s", self.device)
+        # Defined up front so the attribute exists even if loading fails.
+        self.tokenizer = None
+        try:
+            # Load tokenizer and pipeline
+            self.tokenizer = AutoTokenizer.from_pretrained(self.PRIMARY_MODEL)
+            self.summarizer = pipeline(
+                "summarization",
+                model=self.PRIMARY_MODEL,
+                tokenizer=self.tokenizer,
+                device=0 if self.device == "cuda" else -1,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+            )
+            logger.info("Model loaded successfully")
+        except Exception as e:
+            logger.error("Error loading model: %s", e)
+            # Fallback to a smaller model
+            self.summarizer = pipeline("summarization", model=self.FALLBACK_MODEL)
+            # Keep the tokenizer in sync with whichever model actually loaded.
+            self.tokenizer = getattr(self.summarizer, "tokenizer", None)
+    def extract_text_from_pdf(self, pdf_file) -> str:
+        """Extract and clean the text of every page of a PDF.
+
+        Args:
+            pdf_file: raw PDF bytes, a file path, or an uploaded-file object
+                exposing ``read`` and a string ``name`` path (the forms a
+                file-upload widget may hand over, depending on its settings).
+
+        Returns:
+            The cleaned text, or a message starting with
+            ``"Error reading PDF:"`` if the file could not be read.
+        """
+        try:
+            if isinstance(pdf_file, (bytes, bytearray)):
+                source = io.BytesIO(pdf_file)
+            elif hasattr(pdf_file, "read") and isinstance(
+                getattr(pdf_file, "name", None), str
+            ):
+                # Temp-file style upload object: reopen it by path so the
+                # current read position of the handle does not matter.
+                source = pdf_file.name
+            else:
+                # A path (str / Path) or a plain binary stream.
+                source = pdf_file
+            pdf_reader = PyPDF2.PdfReader(source)
+            # A page without extractable text may yield None; treat as empty.
+            text = "\n".join(
+                page.extract_text() or "" for page in pdf_reader.pages
+            )
+            return self.clean_text(text)
+        except Exception as e:
+            logger.error("Error extracting PDF text: %s", e)
+            return f"Error reading PDF: {str(e)}"
+    def clean_text(self, text: str) -> str:
+        """Collapse whitespace and drop characters outside a small allowed set.
+
+        Word characters, whitespace and the punctuation ``. , ! ? ; : - ' " ( )``
+        are kept; everything else is removed.
+        """
+        # Remove extra whitespace and newlines
+        text = re.sub(r'\s+', ' ', text)
+        # Remove special characters but keep punctuation
+        text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
+        return text.strip()
+    def chunk_text(self, text: str, max_chunk_length: int = 1000) -> list:
+        """Split text into chunks of at most ``max_chunk_length`` characters.
+
+        The text is split on ``". "`` and the pieces are packed greedily
+        into chunks. A single piece longer than the limit is cut into
+        fixed-size slices so no chunk exceeds the limit.
+
+        Raises:
+            ValueError: if ``max_chunk_length`` is not positive.
+        """
+        if max_chunk_length < 1:
+            raise ValueError("max_chunk_length must be a positive integer")
+        pieces = text.split('. ')
+        # Every piece but the last lost its period to the split; restore it
+        # there only, so a final "...end." does not become "...end..".
+        sentences = [piece + '.' for piece in pieces[:-1]]
+        sentences.append(pieces[-1])
+        chunks = []
+        current = []
+        current_length = 0
+        for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            if current and current_length + len(sentence) + 1 > max_chunk_length:
+                chunks.append(" ".join(current))
+                current, current_length = [], 0
+            if len(sentence) > max_chunk_length:
+                # No sentence break available: fall back to hard slices.
+                chunks.extend(
+                    sentence[start:start + max_chunk_length]
+                    for start in range(0, len(sentence), max_chunk_length)
+                )
+                continue
+            current.append(sentence)
+            current_length += len(sentence) + 1
+        if current:
+            chunks.append(" ".join(current))
+        return chunks
+    def _summarize(self, text: str, max_length: int, min_length: int) -> str:
+        """Run the pipeline once (no sampling) and return the summary text."""
+        result = self.summarizer(
+            text,
+            max_length=max_length,
+            min_length=min_length,
+            do_sample=False,
+        )
+        return result[0]['summary_text']
+    def summarize_text(self, text: str, summary_length: str = "medium") -> str:
+        """Summarize the extracted text.
+
+        Args:
+            text: text to summarize.
+            summary_length: ``"short"``, ``"medium"`` or ``"long"``; any
+                other value is treated as ``"medium"``.
+
+        Returns:
+            The summary, or an explanatory message when the text is shorter
+            than 50 characters or summarization fails.
+        """
+        if not text or len(text.strip()) < 50:
+            return "Text too short to summarize or empty content."
+        try:
+            params = self.LENGTH_PARAMS.get(
+                summary_length, self.LENGTH_PARAMS["medium"]
+            )
+            if len(text) <= 1024:
+                # Direct summarization for shorter texts
+                return self._summarize(
+                    text, params["max_length"], params["min_length"]
+                )
+            # Handle long texts by chunking; the length budget is shared
+            # evenly between the chunks that are actually processed.
+            chunks = self.chunk_text(text, 900)[:self.MAX_CHUNKS]
+            share = len(chunks) or 1
+            chunk_max = max(1, params["max_length"] // share)
+            chunk_min = params["min_length"] // share
+            summaries = []
+            for chunk in chunks:
+                try:
+                    summaries.append(self._summarize(chunk, chunk_max, chunk_min))
+                except Exception as e:
+                    # Best effort: skip the failing chunk, keep the others.
+                    logger.error("Error summarizing chunk: %s", e)
+            if not summaries:
+                # Report the failure instead of returning an empty summary.
+                return "Error generating summary: no text chunk could be summarized."
+            # Combine chunk summaries
+            combined_summary = " ".join(summaries)
+            # Final summarization if combined text is still long
+            if len(combined_summary) > 512:
+                return self._summarize(
+                    combined_summary, params["max_length"], params["min_length"]
+                )
+            return combined_summary
+        except Exception as e:
+            logger.error("Error during summarization: %s", e)
+            return f"Error generating summary: {str(e)}"
+# Initialize the summarizer once at import time; process_pdf below uses this
+# shared module-level instance for every request.
+pdf_summarizer = PDFSummarizer()
+def process_pdf(pdf_file, summary_length):
+    """Extract the text of an uploaded PDF and summarize it.
+
+    Args:
+        pdf_file: the uploaded file as delivered by the UI, or ``None``
+            when nothing has been uploaded.
+        summary_length: ``"short"``, ``"medium"`` or ``"long"``.
+
+    Returns:
+        A ``(summary, preview)`` tuple. ``preview`` is the extracted text,
+        cut to its first 1000 characters plus ``"..."`` when longer. On
+        failure ``summary`` holds the error message and ``preview`` is ``""``.
+    """
+    if pdf_file is None:
+        return "Please upload a PDF file.", ""
     try:
+        # Extract text from PDF
+        extracted_text = pdf_summarizer.extract_text_from_pdf(pdf_file)
+        # Match the extractor's exact failure prefix; a bare "Error" check
+        # would misreport any document whose own text starts with "Error".
+        if extracted_text.startswith("Error reading PDF:"):
+            return extracted_text, ""
+        # Generate summary
+        summary = pdf_summarizer.summarize_text(extracted_text, summary_length)
+        if len(extracted_text) > 1000:
+            preview = extracted_text[:1000] + "..."
+        else:
+            preview = extracted_text
+        return summary, preview
     except Exception as e:
+        logger.error("Error processing PDF: %s", e)
+        return f"Error processing PDF: {str(e)}", ""
+# Create Gradio interface
+def create_interface():
+    """Create and configure the Gradio interface.
+
+    Builds a Blocks app with a header, a two-column row (upload, length
+    selector and button on the left; summary and extracted-text preview on
+    the right) and a tips panel, wires both the button click and the file
+    change event to ``process_pdf``, and returns the Blocks object without
+    launching it.
+    """
+    # Components are created inside nested context managers; their creation
+    # order determines the page layout, so statement order matters here.
+    with gr.Blocks(
+        title="PDF Summarizer",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {
+            max-width: 1200px;
+            margin: 0 auto;
+        }
+        .header {
+            text-align: center;
+            margin-bottom: 2rem;
+        }
+        """
+    ) as app:
+        # Page header; uses the "header" class defined in the css above.
+        gr.HTML("""
+        <div class="header">
+            <h1>🚀 Fast PDF Summarizer</h1>
+            <p>Upload a PDF file and get an instant AI-powered summary!</p>
+        </div>
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                # Input components
+                # NOTE(review): no `type` is passed, so the value handed to
+                # process_pdf follows the Gradio default for File - confirm
+                # it matches what extract_text_from_pdf expects.
+                pdf_input = gr.File(
+                    label="Upload PDF File",
+                    file_types=[".pdf"],
+                    file_count="single"
+                )
+                # Choices mirror the keys summarize_text looks up.
+                summary_length = gr.Radio(
+                    choices=["short", "medium", "long"],
+                    value="medium",
+                    label="Summary Length",
+                    info="Choose how detailed you want the summary to be"
+                )
+                summarize_btn = gr.Button(
+                    "Summarize PDF",
+                    variant="primary",
+                    size="lg"
+                )
+            with gr.Column(scale=2):
+                # Output components
+                summary_output = gr.Textbox(
+                    label="Summary",
+                    lines=10,
+                    placeholder="Your PDF summary will appear here...",
+                    max_lines=15
+                )
+                # Collapsed by default; holds the preview of the extracted text.
+                with gr.Accordion("View Extracted Text", open=False):
+                    extracted_text_output = gr.Textbox(
+                        label="Extracted Text (Preview)",
+                        lines=5,
+                        max_lines=10,
+                        placeholder="Extracted text preview will appear here..."
+                    )
+        # Event handlers
+        # The button runs process_pdf and fills both output boxes.
+        summarize_btn.click(
+            fn=process_pdf,
+            inputs=[pdf_input, summary_length],
+            outputs=[summary_output, extracted_text_output],
+            show_progress=True
+        )
+        # Auto-process when file is uploaded
+        # Same handler, inputs and outputs as the button above.
+        # NOTE(review): presumably this also fires when the file is cleared;
+        # process_pdf returns a prompt message for a None file.
+        pdf_input.change(
+            fn=process_pdf,
+            inputs=[pdf_input, summary_length],
+            outputs=[summary_output, extracted_text_output]
+        )
+        # Static usage tips shown below the main row
+        gr.HTML("""
+        <div style="margin-top: 2rem; padding: 1rem; background-color: #f0f0f0; border-radius: 8px;">
+            <h3>💡 Tips for Best Results:</h3>
+            <ul>
+                <li>Upload clear, text-based PDFs (not scanned images)</li>
+                <li>Choose 'short' for quick overviews, 'long' for detailed summaries</li>
+                <li>Large PDFs are automatically chunked for faster processing</li>
+                <li>The app works best with documents under 50 pages</li>
+            </ul>
+        </div>
+        """)
+    return app
+# Create and launch the app
+if __name__ == "__main__":
+    # Script entry point: build the UI, then start the server with the
+    # options below (public share link, all interfaces, port 7860, and a
+    # 10 MB upload limit).
+    launch_options = {
+        "share": True,
+        "server_name": "0.0.0.0",
+        "server_port": 7860,
+        "max_file_size": "10mb",
+    }
+    app = create_interface()
+    app.launch(**launch_options)