pdf_summarization1

Sleeping

App Files Files Community

LovnishVerma committed on May 31, 2025

Commit

bb6a28b

verified ·

1 Parent(s): fc57051

Update app.py

Browse files

Files changed (1) hide show

app.py +184 -173

app.py CHANGED Viewed

@@ -4,75 +4,70 @@ import io
 from transformers import pipeline, AutoTokenizer
 import torch
 import re
-from typing import Optional
-import logging
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 class PDFSummarizer:
     def __init__(self):
-        """Initialize the PDF summarizer with optimized models."""
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        logger.info(f"Using device: {self.device}")
         # Use a fast, efficient model for summarization
-        model_name = "facebook/bart-large-cnn"
         try:
-            # Load tokenizer and pipeline
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.summarizer = pipeline(
                 "summarization",
-                model=model_name,
-                tokenizer=self.tokenizer,
-                device=0 if self.device == "cuda" else -1,
-                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
             )
-            logger.info("Model loaded successfully")
         except Exception as e:
-            logger.error(f"Error loading model: {e}")
-            # Fallback to a smaller model
-            self.summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
     def extract_text_from_pdf(self, pdf_file) -> str:
-        """Extract text from uploaded PDF file."""
         try:
-            # Read the PDF file
             pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
             text = ""
-            # Extract text from all pages
-            for page_num in range(len(pdf_reader.pages)):
-                page = pdf_reader.pages[page_num]
-                text += page.extract_text() + "\n"
-            # Clean the text
-            text = self.clean_text(text)
-            return text
         except Exception as e:
-            logger.error(f"Error extracting PDF text: {e}")
-            return f"Error reading PDF: {str(e)}"
     def clean_text(self, text: str) -> str:
-        """Clean and preprocess the extracted text."""
-        # Remove extra whitespace and newlines
         text = re.sub(r'\s+', ' ', text)
         # Remove special characters but keep punctuation
-        text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
         return text.strip()
-    def chunk_text(self, text: str, max_chunk_length: int = 1000) -> list:
-        """Split text into chunks for processing."""
         sentences = text.split('. ')
         chunks = []
         current_chunk = ""
         for sentence in sentences:
-            if len(current_chunk) + len(sentence) < max_chunk_length:
-                current_chunk += sentence + ". "
             else:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
@@ -83,192 +78,208 @@ class PDFSummarizer:
         return chunks
-    def summarize_text(self, text: str, summary_length: str = "medium") -> str:
-        """Summarize the extracted text."""
-        if not text or len(text.strip()) < 50:
-            return "Text too short to summarize or empty content."
         try:
-            # Set summary parameters based on length preference
-            length_params = {
-                "short": {"max_length": 100, "min_length": 30},
-                "medium": {"max_length": 200, "min_length": 50},
-                "long": {"max_length": 400, "min_length": 100}
-            }
-            params = length_params.get(summary_length, length_params["medium"])
-            # Handle long texts by chunking
-            if len(text) > 1024:
-                chunks = self.chunk_text(text, 900)
-                summaries = []
-                for chunk in chunks[:5]:  # Limit to first 5 chunks for speed
-                    try:
-                        summary = self.summarizer(
-                            chunk,
-                            max_length=params["max_length"] // len(chunks[:5]),
-                            min_length=params["min_length"] // len(chunks[:5]),
-                            do_sample=False
-                        )
-                        summaries.append(summary[0]['summary_text'])
-                    except Exception as e:
-                        logger.error(f"Error summarizing chunk: {e}")
-                        continue
-                # Combine chunk summaries
-                combined_summary = " ".join(summaries)
-                # Final summarization if combined text is still long
-                if len(combined_summary) > 512:
-                    final_summary = self.summarizer(
-                        combined_summary,
-                        max_length=params["max_length"],
-                        min_length=params["min_length"],
-                        do_sample=False
-                    )
-                    return final_summary[0]['summary_text']
-                else:
-                    return combined_summary
-            else:
-                # Direct summarization for shorter texts
-                summary = self.summarizer(
-                    text,
-                    max_length=params["max_length"],
-                    min_length=params["min_length"],
-                    do_sample=False
                 )
-                return summary[0]['summary_text']
         except Exception as e:
-            logger.error(f"Error during summarization: {e}")
-            return f"Error generating summary: {str(e)}"
 # Initialize the summarizer
 pdf_summarizer = PDFSummarizer()
-def process_pdf(pdf_file, summary_length):
-    """Main function to process PDF and return summary."""
     if pdf_file is None:
-        return "Please upload a PDF file.", ""
     try:
-        # Extract text from PDF
-        extracted_text = pdf_summarizer.extract_text_from_pdf(pdf_file)
-        if extracted_text.startswith("Error"):
-            return extracted_text, ""
-        # Generate summary
-        summary = pdf_summarizer.summarize_text(extracted_text, summary_length)
-        return summary, extracted_text[:1000] + "..." if len(extracted_text) > 1000 else extracted_text
     except Exception as e:
-        logger.error(f"Error processing PDF: {e}")
-        return f"Error processing PDF: {str(e)}", ""
 # Create Gradio interface
 def create_interface():
-    """Create and configure the Gradio interface."""
     with gr.Blocks(
-        title="PDF Summarizer",
         theme=gr.themes.Soft(),
         css="""
         .gradio-container {
-            max-width: 1200px;
-            margin: 0 auto;
         }
-        .header {
-            text-align: center;
-            margin-bottom: 2rem;
         }
         """
-    ) as app:
-        gr.HTML("""
-        <div class="header">
-            <h1>🚀 Fast PDF Summarizer</h1>
-            <p>Upload a PDF file and get an instant AI-powered summary!</p>
-        </div>
         """)
         with gr.Row():
             with gr.Column(scale=1):
-                # Input components
                 pdf_input = gr.File(
-                    label="Upload PDF File",
                     file_types=[".pdf"],
-                    file_count="single"
                 )
-                summary_length = gr.Radio(
-                    choices=["short", "medium", "long"],
-                    value="medium",
-                    label="Summary Length",
                     info="Choose how detailed you want the summary to be"
                 )
                 summarize_btn = gr.Button(
-                    "Summarize PDF",
                     variant="primary",
                     size="lg"
                 )
             with gr.Column(scale=2):
-                # Output components
                 summary_output = gr.Textbox(
-                    label="Summary",
-                    lines=10,
-                    placeholder="Your PDF summary will appear here...",
-                    max_lines=15
                 )
-                with gr.Accordion("View Extracted Text", open=False):
-                    extracted_text_output = gr.Textbox(
-                        label="Extracted Text (Preview)",
-                        lines=5,
-                        max_lines=10,
-                        placeholder="Extracted text preview will appear here..."
-                    )
-        # Event handlers
         summarize_btn.click(
-            fn=process_pdf,
-            inputs=[pdf_input, summary_length],
-            outputs=[summary_output, extracted_text_output],
-            show_progress=True
         )
         # Auto-process when file is uploaded
         pdf_input.change(
-            fn=process_pdf,
-            inputs=[pdf_input, summary_length],
-            outputs=[summary_output, extracted_text_output]
         )
-        # Examples section
-        gr.HTML("""
-        <div style="margin-top: 2rem; padding: 1rem; background-color: #f0f0f0; border-radius: 8px;">
-            <h3>💡 Tips for Best Results:</h3>
-            <ul>
-                <li>Upload clear, text-based PDFs (not scanned images)</li>
-                <li>Choose 'short' for quick overviews, 'long' for detailed summaries</li>
-                <li>Large PDFs are automatically chunked for faster processing</li>
-                <li>The app works best with documents under 50 pages</li>
-            </ul>
-        </div>
-        """)
-    return app
-# Create and launch the app
 if __name__ == "__main__":
-    app = create_interface()
-    app.launch(
-        share=True,
-        server_name="0.0.0.0",
-        server_port=7860,
-        max_file_size="10mb"
-    )

 from transformers import pipeline, AutoTokenizer
 import torch
 import re
+from typing import List, Tuple
+import warnings
+warnings.filterwarnings("ignore")
 class PDFSummarizer:
     def __init__(self):
         # Use a fast, efficient model for summarization
+        self.model_name = "facebook/bart-large-cnn"
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {self.device}")
         try:
+            # Initialize the summarization pipeline
             self.summarizer = pipeline(
                 "summarization",
+                model=self.model_name,
+                device=0 if self.device == "cuda" else -1
             )
+            # Initialize tokenizer for length calculations
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            print("Model loaded successfully")
         except Exception as e:
+            print(f"Error loading model: {e}")
+            raise e
     def extract_text_from_pdf(self, pdf_file) -> str:
+        """Extract text content from PDF file"""
         try:
             pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
             text = ""
+            for page_num, page in enumerate(pdf_reader.pages):
+                page_text = page.extract_text()
+                if page_text.strip():
+                    text += f"\n--- Page {page_num + 1} ---\n"
+                    text += page_text
+            return text.strip()
         except Exception as e:
+            raise Exception(f"Error extracting text from PDF: {str(e)}")
     def clean_text(self, text: str) -> str:
+        """Clean and preprocess text"""
+        # Remove extra whitespaces and newlines
         text = re.sub(r'\s+', ' ', text)
         # Remove special characters but keep punctuation
+        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
+        # Remove page markers
+        text = re.sub(r'--- Page \d+ ---', '', text)
         return text.strip()
+    def chunk_text(self, text: str, max_chunk_length: int = 900) -> List[str]:
+        """Split text into manageable chunks for processing"""
         sentences = text.split('. ')
         chunks = []
         current_chunk = ""
         for sentence in sentences:
+            # Check if adding this sentence would exceed the limit
+            potential_chunk = current_chunk + sentence + ". "
+            if len(self.tokenizer.encode(potential_chunk)) <= max_chunk_length:
+                current_chunk = potential_chunk
             else:
                 if current_chunk:
                     chunks.append(current_chunk.strip())
         return chunks
+    def summarize_chunk(self, chunk: str, max_length: int = 150, min_length: int = 50) -> str:
+        """Summarize a single chunk of text"""
+        try:
+            summary = self.summarizer(
+                chunk,
+                max_length=max_length,
+                min_length=min_length,
+                do_sample=False,
+                truncation=True
+            )
+            return summary[0]['summary_text']
+        except Exception as e:
+            return f"Error summarizing chunk: {str(e)}"
+    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
+        """Main function to process PDF and generate summary"""
         try:
+            # Extract text from PDF
+            raw_text = self.extract_text_from_pdf(pdf_file)
+            if not raw_text.strip():
+                return "❌ Error: No text could be extracted from the PDF.", "", ""
+            # Clean the text
+            cleaned_text = self.clean_text(raw_text)
+            # Calculate text statistics
+            word_count = len(cleaned_text.split())
+            char_count = len(cleaned_text)
+            if word_count < 50:
+                return "❌ Error: PDF contains too little text to summarize.", "", ""
+            # Chunk the text for processing
+            chunks = self.chunk_text(cleaned_text)
+            # Determine summary parameters based on type
+            if summary_type == "Brief (Quick)":
+                max_len, min_len = 100, 30
+            elif summary_type == "Detailed":
+                max_len, min_len = 200, 80
+            else:  # Comprehensive
+                max_len, min_len = 300, 120
+            # Summarize each chunk
+            chunk_summaries = []
+            for i, chunk in enumerate(chunks):
+                summary = self.summarize_chunk(chunk, max_len, min_len)
+                chunk_summaries.append(summary)
+            # Combine summaries
+            combined_summary = " ".join(chunk_summaries)
+            # If we have multiple chunks, create a final summary
+            if len(chunks) > 1:
+                final_summary = self.summarize_chunk(
+                    combined_summary,
+                    max_length=min(500, max_len * 2),
+                    min_length=min_len
                 )
+            else:
+                final_summary = combined_summary
+            # Create statistics
+            summary_stats = f"""
+📊 **Document Statistics:**
+- Original word count: {word_count:,}
+- Original character count: {char_count:,}
+- Pages processed: {len(chunks)}
+- Summary word count: {len(final_summary.split()):,}
+- Compression ratio: {word_count / len(final_summary.split()):.1f}:1
+            """
+            return final_summary, summary_stats, "✅ Summary generated successfully!"
         except Exception as e:
+            return f"❌ Error processing PDF: {str(e)}", "", ""
 # Initialize the summarizer
 pdf_summarizer = PDFSummarizer()
+def summarize_pdf_interface(pdf_file, summary_type):
+    """Gradio interface function"""
     if pdf_file is None:
+        return "❌ Please upload a PDF file.", "", ""
     try:
+        # Read the uploaded file
+        with open(pdf_file.name, 'rb') as f:
+            pdf_content = f.read()
+        # Process the PDF
+        summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
+        return summary, stats, status
     except Exception as e:
+        return f"❌ Error: {str(e)}", "", ""
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(
+        title="📄 AI PDF Summarizer",
         theme=gr.themes.Soft(),
         css="""
         .gradio-container {
+            max-width: 1200px !important;
         }
+        .summary-box {
+            border-left: 4px solid #2196F3;
+            padding: 16px;
+            background-color: #f8f9fa;
         }
         """
+    ) as interface:
+        gr.Markdown("""
+        # 📄 AI-Powered PDF Summarizer
+        Upload any PDF document and get an intelligent summary in seconds!
+        Perfect for research papers, reports, articles, and books.
+        **Features:**
+        - ⚡ Fast processing with BART model
+        - 📊 Document statistics
+        - 🎯 Multiple summary lengths
+        - 🔍 Smart text chunking
         """)
         with gr.Row():
             with gr.Column(scale=1):
                 pdf_input = gr.File(
+                    label="📁 Upload PDF File",
                     file_types=[".pdf"],
+                    type="binary"
                 )
+                summary_type = gr.Radio(
+                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
+                    value="Detailed",
+                    label="📏 Summary Length",
                     info="Choose how detailed you want the summary to be"
                 )
                 summarize_btn = gr.Button(
+                    "🚀 Generate Summary",
                     variant="primary",
                     size="lg"
                 )
+                status_output = gr.Textbox(
+                    label="📋 Status",
+                    interactive=False,
+                    max_lines=2
+                )
             with gr.Column(scale=2):
                 summary_output = gr.Textbox(
+                    label="📝 Generated Summary",
+                    lines=15,
+                    max_lines=20,
+                    interactive=False,
+                    elem_classes=["summary-box"]
                 )
+                stats_output = gr.Markdown(
+                    label="📊 Document Statistics",
+                    value="Upload a PDF to see statistics"
+                )
+        # Examples section
+        gr.Markdown("""
+        ## 💡 Tips for Best Results:
+        - **File Quality**: Ensure your PDF has selectable text (not just images)
+        - **Length**: Works best with documents between 500-10,000 words
+        - **Language**: Optimized for English content
+        - **Format**: Clean, well-formatted PDFs produce better summaries
+        ## 🔧 Technical Details:
+        - **Model**: Facebook BART-Large-CNN (state-of-the-art summarization)
+        - **Processing**: Smart text chunking with overlap prevention
+        - **Speed**: GPU-accelerated when available
+        """)
+        # Connect the button to the function
         summarize_btn.click(
+            fn=summarize_pdf_interface,
+            inputs=[pdf_input, summary_type],
+            outputs=[summary_output, stats_output, status_output]
         )
         # Auto-process when file is uploaded
         pdf_input.change(
+            fn=summarize_pdf_interface,
+            inputs=[pdf_input, summary_type],
+            outputs=[summary_output, stats_output, status_output]
         )
+    return interface
+# Launch the application
 if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch()