LovnishVerma committed on
Commit
099d4aa
·
verified ·
1 Parent(s): 7272788

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -218
app.py CHANGED
@@ -6,325 +6,293 @@ import torch
6
  import re
7
  from typing import List, Tuple
8
  import warnings
9
- import nltk
10
- from nltk.tokenize import sent_tokenize
11
- import heapq
12
- import numpy as np
13
- from collections import Counter
14
- import string
15
-
16
  warnings.filterwarnings("ignore")
17
 
18
- # Download required NLTK data
19
- try:
20
- nltk.data.find('tokenizers/punkt_tab')
21
- except LookupError:
22
- print("Downloading NLTK data...")
23
- nltk.download('punkt_tab', quiet=True)
24
- # Fallback for older NLTK versions
25
- try:
26
- nltk.data.find('tokenizers/punkt')
27
- except LookupError:
28
- nltk.download('punkt', quiet=True)
29
-
30
- class FastPDFSummarizer:
31
  def __init__(self):
32
- # Use the fastest available model for critical path
33
- self.model_name = "facebook/bart-large-cnn" # Fastest stable option
34
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
35
  print(f"Using device: {self.device}")
36
- # Initialize summarizer with maximum speed optimizations
37
- self.summarizer = None
38
- self.tokenizer = None
39
- self._initialize_model()
40
-
41
- def _initialize_model(self):
42
- """Initialize model with lazy loading and speed optimizations"""
43
  try:
44
- # Only initialize when first needed
45
- if self.summarizer is None:
46
- print("Loading model...")
47
- self.summarizer = pipeline(
48
- "summarization",
49
- model=self.model_name,
50
- device=0 if self.device == "cuda" else -1,
51
- framework="pt",
52
- model_kwargs={
53
- "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
54
- "low_cpu_mem_usage": True,
55
- "use_cache": True
56
- },
57
- tokenizer_kwargs={"padding": True, "truncation": True}
58
- )
59
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
60
- print("Model loaded successfully")
61
  except Exception as e:
62
  print(f"Error loading model: {e}")
63
- # Ultra-fast fallback - extractive summarization
64
- self.use_extractive = True
65
-
 
 
 
66
  def extract_text_from_pdf(self, pdf_file) -> str:
67
- """Extract text with better error handling and speed"""
68
  try:
69
- if isinstance(pdf_file, str):
70
- with open(pdf_file, 'rb') as f:
71
- pdf_content = f.read()
72
- else:
73
- pdf_content = pdf_file
74
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
75
- text_parts = []
76
- # Limit pages for speed (process max 20 pages)
77
- max_pages = min(20, len(pdf_reader.pages))
78
- for page_num in range(max_pages):
79
- try:
80
- page_text = pdf_reader.pages[page_num].extract_text()
81
- if page_text.strip():
82
- text_parts.append(page_text)
83
- except Exception:
84
- continue # Skip problematic pages
85
- return " ".join(text_parts)
86
  except Exception as e:
87
  raise Exception(f"Error extracting text from PDF: {str(e)}")
88
-
89
- def clean_text_fast(self, text: str) -> str:
90
- """Ultra-fast text cleaning"""
91
- # Remove excessive whitespace
92
  text = re.sub(r'\s+', ' ', text)
93
- # Remove page numbers and headers/footers (common patterns)
94
- text = re.sub(r'\b\d+\b(?=\s|$)', '', text) # Remove standalone numbers
95
  text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
96
- # Additional cleaning for PDF artifacts
97
- text = re.sub(r'\uf0b7', '', text) # Remove bullet points
98
- text = re.sub(r'\u2022', '', text) # Remove bullet points
99
- text = re.sub(r'β€’', '', text) # Remove bullet points
100
- text = re.sub(r'\u00a0', ' ', text) # Replace non-breaking spaces
101
- text = re.sub(r'\n+', ' ', text) # Replace multiple newlines with a single space
102
  return text.strip()
103
-
104
- def extractive_summary(self, text: str, num_sentences: int = 5) -> str:
105
- """Ultra-fast extractive summarization as fallback"""
106
- try:
107
- sentences = sent_tokenize(text)
108
- except LookupError:
109
- # Fallback to simple sentence splitting if NLTK fails
110
- sentences = text.split('. ')
111
- sentences = [s.strip() + '.' for s in sentences if s.strip()]
112
- if len(sentences) <= num_sentences:
113
- return text
114
- # Simple frequency-based scoring
115
- words = text.lower().split()
116
- word_freq = Counter(word for word in words
117
- if word not in string.punctuation and len(word) > 3)
118
- sentence_scores = {}
119
- for sentence in sentences:
120
- words_in_sentence = sentence.lower().split()
121
- score = sum(word_freq[word] for word in words_in_sentence
122
- if word in word_freq)
123
- sentence_scores[sentence] = score
124
- # Get top sentences
125
- top_sentences = heapq.nlargest(num_sentences, sentence_scores.keys(),
126
- key=lambda x: sentence_scores[x])
127
- # Return in original order
128
- result = []
129
- for sentence in sentences:
130
- if sentence in top_sentences:
131
- result.append(sentence)
132
- return " ".join(result)
133
-
134
- def smart_chunk_text(self, text: str, max_length: int = 1000) -> List[str]:
135
- """Smarter, faster text chunking"""
136
- # For short texts, don't chunk
137
- if len(text.split()) <= max_length:
138
- return [text]
139
- try:
140
- sentences = sent_tokenize(text)
141
- except LookupError:
142
- # Fallback to simple sentence splitting
143
- sentences = text.split('. ')
144
- sentences = [s.strip() + '.' for s in sentences if s.strip()]
145
  chunks = []
146
- current_chunk = []
147
- current_length = 0
148
  for sentence in sentences:
149
- sentence_length = len(sentence.split())
150
- if current_length + sentence_length <= max_length:
151
- current_chunk.append(sentence)
152
- current_length += sentence_length
 
153
  else:
154
  if current_chunk:
155
- chunks.append(" ".join(current_chunk))
156
- current_chunk = [sentence]
157
- current_length = sentence_length
158
  if current_chunk:
159
- chunks.append(" ".join(current_chunk))
160
- # Ensure chunks are meaningful
161
- return chunks
162
-
163
- def fast_summarize(self, text: str, max_length: int = 150) -> str:
164
- """Optimized summarization with fallbacks"""
 
165
  try:
166
- # Initialize model if not already done
167
- if self.summarizer is None:
168
- self._initialize_model()
169
- # Use extractive summarization for very long texts or as fallback
170
- if hasattr(self, 'use_extractive') or len(text.split()) > 2000:
171
- return self.extractive_summary(text, num_sentences=max_length//25)
172
- # Fast abstractive summarization
173
- result = self.summarizer(
174
- text,
175
  max_length=max_length,
176
- min_length=max_length//3,
177
  do_sample=False,
178
  truncation=True,
179
  early_stopping=True,
180
- num_beams=2, # Increase number of beams for better quality
181
- length_penalty=1.0,
182
- repetition_penalty=1.1
183
  )
184
- return result[0]['summary_text']
185
  except Exception as e:
186
- print(f"Falling back to extractive summarization: {e}")
187
- return self.extractive_summary(text, num_sentences=max_length//25)
188
-
189
- def process_pdf_fast(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
190
- """Optimized main processing function"""
191
  try:
192
- # Extract text
193
- print("Extracting text...")
194
  raw_text = self.extract_text_from_pdf(pdf_file)
 
195
  if not raw_text.strip():
196
  return "❌ Error: No text could be extracted from the PDF.", "", ""
197
- # Fast cleaning
198
- cleaned_text = self.clean_text_fast(raw_text)
 
 
 
199
  word_count = len(cleaned_text.split())
 
 
200
  if word_count < 50:
201
  return "❌ Error: PDF contains too little text to summarize.", "", ""
202
- # Determine summary length
 
 
 
 
203
  if summary_type == "Brief (Quick)":
204
- target_length = min(100, word_count // 10)
205
  elif summary_type == "Detailed":
206
- target_length = min(200, word_count // 5)
207
  else: # Comprehensive
208
- target_length = min(300, word_count // 3)
209
- print("Generating summary...")
210
- # For very short documents, use direct summarization
211
- if word_count <= 1000:
212
- summary = self.fast_summarize(cleaned_text, target_length)
 
 
 
 
 
 
 
 
 
 
213
  else:
214
- # Chunk and summarize
215
- chunks = self.smart_chunk_text(cleaned_text)
216
- chunk_summaries = []
217
- for chunk in chunks:
218
- chunk_summary = self.fast_summarize(chunk, target_length // len(chunks))
219
- chunk_summaries.append(chunk_summary)
220
- # Combine summaries
221
- if len(chunk_summaries) == 1:
222
- summary = chunk_summaries[0]
223
- else:
224
- combined = " ".join(chunk_summaries)
225
- if len(combined.split()) > target_length:
226
- summary = self.fast_summarize(combined, target_length)
227
- else:
228
- summary = combined
229
- # Statistics
230
- summary_word_count = len(summary.split())
231
- stats = f"""
232
  πŸ“Š **Document Statistics:**
233
- - Original words: {word_count:,}
234
- - Summary words: {summary_word_count:,}
235
- - Compression: {word_count/summary_word_count:.1f}:1
236
- - Processing: ⚑ Optimized mode
 
237
  """
238
- return summary, stats, "βœ… Summary generated successfully!"
 
 
239
  except Exception as e:
240
- return f"❌ Error: {str(e)}", "", ""
241
 
242
- # Global instance for reuse
243
- pdf_summarizer = FastPDFSummarizer()
244
 
245
  def summarize_pdf_interface(pdf_file, summary_type):
246
  """Gradio interface function"""
247
  if pdf_file is None:
248
  return "❌ Please upload a PDF file.", "", ""
 
249
  try:
250
- return pdf_summarizer.process_pdf_fast(pdf_file, summary_type)
 
 
 
 
 
 
 
 
251
  except Exception as e:
252
  return f"❌ Error: {str(e)}", "", ""
253
 
 
254
  def create_interface():
255
  with gr.Blocks(
256
- title="⚑ Ultra-Fast PDF Summarizer",
257
  theme=gr.themes.Soft(),
258
  css="""
259
  .gradio-container {
260
  max-width: 1200px !important;
261
  }
262
  .summary-box {
263
- border-left: 4px solid #4CAF50;
264
  padding: 16px;
265
  background-color: #f8f9fa;
266
  }
267
  """
268
  ) as interface:
 
269
  gr.Markdown("""
270
- # ⚑ Ultra-Fast AI PDF Summarizer
271
- **Optimized for Speed & Accuracy!** Get intelligent summaries in seconds.
272
- **Speed & Accuracy Optimizations:**
273
- - πŸš€ Lazy model loading
274
- - 🎯 Smart text chunking (max 3 chunks)
275
- - ⚑ Extractive fallback for large docs
276
- - πŸ”§ Multi-beam generation for better quality
277
- - πŸ“„ Page limit (20 pages max)
 
 
278
  """)
 
279
  with gr.Row():
280
  with gr.Column(scale=1):
281
  pdf_input = gr.File(
282
- label="πŸ“ Upload PDF",
283
  file_types=[".pdf"],
284
  type="filepath"
285
  )
 
286
  summary_type = gr.Radio(
287
  choices=["Brief (Quick)", "Detailed", "Comprehensive"],
288
- value="Brief (Quick)",
289
- label="πŸ“ Summary Type"
 
290
  )
 
291
  summarize_btn = gr.Button(
292
- "⚑ Generate Summary",
293
  variant="primary",
294
  size="lg"
295
  )
 
296
  status_output = gr.Textbox(
297
  label="πŸ“‹ Status",
298
- interactive=False
 
299
  )
 
300
  with gr.Column(scale=2):
301
  summary_output = gr.Textbox(
302
- label="πŸ“ Summary",
303
- lines=12,
 
304
  interactive=False,
305
  elem_classes=["summary-box"]
306
  )
307
- stats_output = gr.Markdown()
 
 
 
 
 
 
308
  gr.Markdown("""
309
- ## ⚑ Speed & Accuracy Features:
310
- - **Smart Processing**: Automatically switches to extractive summarization for large documents
311
- - **Limited Pages**: Processes max 20 pages for speed
312
- - **Optimized Models**: Uses fastest available AI models
313
- - **Chunking**: Max 3 chunks to reduce processing time
314
- - **Multi-beam Generation**: Improves summary quality
 
 
 
 
 
315
  """)
 
 
316
  summarize_btn.click(
317
  fn=summarize_pdf_interface,
318
  inputs=[pdf_input, summary_type],
319
  outputs=[summary_output, stats_output, status_output]
320
  )
 
 
321
  pdf_input.change(
322
  fn=summarize_pdf_interface,
323
  inputs=[pdf_input, summary_type],
324
  outputs=[summary_output, stats_output, status_output]
325
  )
 
326
  return interface
327
 
 
328
  if __name__ == "__main__":
329
  interface = create_interface()
330
  interface.launch()
 
6
  import re
7
  from typing import List, Tuple
8
  import warnings
 
 
 
 
 
 
 
9
  warnings.filterwarnings("ignore")
10
 
11
+ class PDFSummarizer:
 
 
 
 
 
 
 
 
 
 
 
 
12
  def __init__(self):
13
+ # Use a much faster, lighter model for summarization
14
+ self.model_name = "sshleifer/distilbart-cnn-12-6" # Much faster than BART-large
15
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
16
  print(f"Using device: {self.device}")
17
+
 
 
 
 
 
 
18
  try:
19
+ # Initialize the summarization pipeline with optimizations
20
+ self.summarizer = pipeline(
21
+ "summarization",
22
+ model=self.model_name,
23
+ device=0 if self.device == "cuda" else -1,
24
+ framework="pt",
25
+ model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
26
+ )
27
+
28
+ # Initialize tokenizer for length calculations
29
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
30
+ print("Model loaded successfully")
31
+
 
 
 
 
32
  except Exception as e:
33
  print(f"Error loading model: {e}")
34
+ # Fallback to an even faster model
35
+ self.model_name = "facebook/bart-large-cnn"
36
+ self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
37
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
38
+ print("Fallback model loaded")
39
+
40
  def extract_text_from_pdf(self, pdf_file) -> str:
41
+ """Extract text content from PDF file"""
42
  try:
43
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
44
+ text = ""
45
+
46
+ for page_num, page in enumerate(pdf_reader.pages):
47
+ page_text = page.extract_text()
48
+ if page_text.strip():
49
+ text += f"\n--- Page {page_num + 1} ---\n"
50
+ text += page_text
51
+
52
+ return text.strip()
 
 
 
 
 
 
 
53
  except Exception as e:
54
  raise Exception(f"Error extracting text from PDF: {str(e)}")
55
+
56
+ def clean_text(self, text: str) -> str:
57
+ """Clean and preprocess text"""
58
+ # Remove extra whitespaces and newlines
59
  text = re.sub(r'\s+', ' ', text)
60
+ # Remove special characters but keep punctuation
 
61
  text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
62
+ # Remove page markers
63
+ text = re.sub(r'--- Page \d+ ---', '', text)
 
 
 
 
64
  return text.strip()
65
+
66
+ def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
67
+ """Split text into smaller, more manageable chunks for faster processing"""
68
+ sentences = text.split('. ')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  chunks = []
70
+ current_chunk = ""
71
+
72
  for sentence in sentences:
73
+ # Check if adding this sentence would exceed the limit
74
+ potential_chunk = current_chunk + sentence + ". "
75
+ # Use faster length estimation
76
+ if len(potential_chunk.split()) <= max_chunk_length:
77
+ current_chunk = potential_chunk
78
  else:
79
  if current_chunk:
80
+ chunks.append(current_chunk.strip())
81
+ current_chunk = sentence + ". "
82
+
83
  if current_chunk:
84
+ chunks.append(current_chunk.strip())
85
+
86
+ # Limit number of chunks for speed
87
+ return chunks[:5] # Process max 5 chunks for speed
88
+
89
+ def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
90
+ """Summarize a single chunk of text with speed optimizations"""
91
  try:
92
+ # Speed optimizations
93
+ summary = self.summarizer(
94
+ chunk,
 
 
 
 
 
 
95
  max_length=max_length,
96
+ min_length=min_length,
97
  do_sample=False,
98
  truncation=True,
99
  early_stopping=True,
100
+ num_beams=2 # Reduced from default 4 for speed
 
 
101
  )
102
+ return summary[0]['summary_text']
103
  except Exception as e:
104
+ return f"Error summarizing chunk: {str(e)}"
105
+
106
+ def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
107
+ """Main function to process PDF and generate summary"""
 
108
  try:
109
+ # Extract text from PDF
 
110
  raw_text = self.extract_text_from_pdf(pdf_file)
111
+
112
  if not raw_text.strip():
113
  return "❌ Error: No text could be extracted from the PDF.", "", ""
114
+
115
+ # Clean the text
116
+ cleaned_text = self.clean_text(raw_text)
117
+
118
+ # Calculate text statistics
119
  word_count = len(cleaned_text.split())
120
+ char_count = len(cleaned_text)
121
+
122
  if word_count < 50:
123
  return "❌ Error: PDF contains too little text to summarize.", "", ""
124
+
125
+ # Chunk the text for processing
126
+ chunks = self.chunk_text(cleaned_text)
127
+
128
+ # Determine summary parameters based on type (optimized for speed)
129
  if summary_type == "Brief (Quick)":
130
+ max_len, min_len = 60, 20
131
  elif summary_type == "Detailed":
132
+ max_len, min_len = 100, 40
133
  else: # Comprehensive
134
+ max_len, min_len = 150, 60
135
+
136
+ # Summarize each chunk (with progress tracking)
137
+ chunk_summaries = []
138
+ for i, chunk in enumerate(chunks):
139
+ print(f"Processing chunk {i+1}/{len(chunks)}")
140
+ summary = self.summarize_chunk(chunk, max_len, min_len)
141
+ chunk_summaries.append(summary)
142
+
143
+ # Combine summaries
144
+ combined_summary = " ".join(chunk_summaries)
145
+
146
+ # Skip final summarization for speed if we have few chunks
147
+ if len(chunks) <= 2:
148
+ final_summary = combined_summary
149
  else:
150
+ # Quick final summary for multiple chunks
151
+ final_summary = self.summarize_chunk(
152
+ combined_summary,
153
+ max_length=min(200, max_len * 1.5),
154
+ min_length=min_len
155
+ )
156
+
157
+ # Create statistics
158
+ summary_stats = f"""
 
 
 
 
 
 
 
 
 
159
  πŸ“Š **Document Statistics:**
160
+ - Original word count: {word_count:,}
161
+ - Original character count: {char_count:,}
162
+ - Pages processed: {len(chunks)}
163
+ - Summary word count: {len(final_summary.split()):,}
164
+ - Compression ratio: {word_count / len(final_summary.split()):.1f}:1
165
  """
166
+
167
+ return final_summary, summary_stats, "βœ… Summary generated successfully!"
168
+
169
  except Exception as e:
170
+ return f"❌ Error processing PDF: {str(e)}", "", ""
171
 
172
+ # Initialize the summarizer
173
+ pdf_summarizer = PDFSummarizer()
174
 
175
  def summarize_pdf_interface(pdf_file, summary_type):
176
  """Gradio interface function"""
177
  if pdf_file is None:
178
  return "❌ Please upload a PDF file.", "", ""
179
+
180
  try:
181
+ # Read the uploaded file - pdf_file is already the file path
182
+ with open(pdf_file, 'rb') as f:
183
+ pdf_content = f.read()
184
+
185
+ # Process the PDF
186
+ summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
187
+
188
+ return summary, stats, status
189
+
190
  except Exception as e:
191
  return f"❌ Error: {str(e)}", "", ""
192
 
193
+ # Create Gradio interface
194
  def create_interface():
195
  with gr.Blocks(
196
+ title="πŸ“„ AI PDF Summarizer",
197
  theme=gr.themes.Soft(),
198
  css="""
199
  .gradio-container {
200
  max-width: 1200px !important;
201
  }
202
  .summary-box {
203
+ border-left: 4px solid #2196F3;
204
  padding: 16px;
205
  background-color: #f8f9fa;
206
  }
207
  """
208
  ) as interface:
209
+
210
  gr.Markdown("""
211
+ # πŸ“„ AI-Powered PDF Summarizer
212
+
213
+ Upload any PDF document and get an intelligent summary in seconds!
214
+ Perfect for research papers, reports, articles, and books.
215
+
216
+ **Features:**
217
+ - ⚑ Fast processing with BART model
218
+ - πŸ“Š Document statistics
219
+ - 🎯 Multiple summary lengths
220
+ - πŸ” Smart text chunking
221
  """)
222
+
223
  with gr.Row():
224
  with gr.Column(scale=1):
225
  pdf_input = gr.File(
226
+ label="πŸ“ Upload PDF File",
227
  file_types=[".pdf"],
228
  type="filepath"
229
  )
230
+
231
  summary_type = gr.Radio(
232
  choices=["Brief (Quick)", "Detailed", "Comprehensive"],
233
+ value="Detailed",
234
+ label="πŸ“ Summary Length",
235
+ info="Choose how detailed you want the summary to be"
236
  )
237
+
238
  summarize_btn = gr.Button(
239
+ "πŸš€ Generate Summary",
240
  variant="primary",
241
  size="lg"
242
  )
243
+
244
  status_output = gr.Textbox(
245
  label="πŸ“‹ Status",
246
+ interactive=False,
247
+ max_lines=2
248
  )
249
+
250
  with gr.Column(scale=2):
251
  summary_output = gr.Textbox(
252
+ label="πŸ“ Generated Summary",
253
+ lines=15,
254
+ max_lines=20,
255
  interactive=False,
256
  elem_classes=["summary-box"]
257
  )
258
+
259
+ stats_output = gr.Markdown(
260
+ label="πŸ“Š Document Statistics",
261
+ value="Upload a PDF to see statistics"
262
+ )
263
+
264
+ # Examples section
265
  gr.Markdown("""
266
+ ## πŸ’‘ Tips for Best Results:
267
+
268
+ - **File Quality**: Ensure your PDF has selectable text (not just images)
269
+ - **Length**: Works best with documents between 500-10,000 words
270
+ - **Language**: Optimized for English content
271
+ - **Format**: Clean, well-formatted PDFs produce better summaries
272
+
273
+ ## πŸ”§ Technical Details:
274
+ - **Model**: Facebook BART-Large-CNN (state-of-the-art summarization)
275
+ - **Processing**: Smart text chunking with overlap prevention
276
+ - **Speed**: GPU-accelerated when available
277
  """)
278
+
279
+ # Connect the button to the function
280
  summarize_btn.click(
281
  fn=summarize_pdf_interface,
282
  inputs=[pdf_input, summary_type],
283
  outputs=[summary_output, stats_output, status_output]
284
  )
285
+
286
+ # Auto-process when file is uploaded
287
  pdf_input.change(
288
  fn=summarize_pdf_interface,
289
  inputs=[pdf_input, summary_type],
290
  outputs=[summary_output, stats_output, status_output]
291
  )
292
+
293
  return interface
294
 
295
+ # Launch the application
296
  if __name__ == "__main__":
297
  interface = create_interface()
298
  interface.launch()