Update app.py
app.py
CHANGED
@@ -4,92 +4,178 @@ import io
@@ -97,202 +183,343 @@ class PDFSummarizer:
import gradio as gr
import PyPDF2
import io
from transformers import pipeline, AutoTokenizer
import torch
import re
from typing import List, Tuple, Optional
import warnings
import time
import logging
from pathlib import Path
import nltk
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("ignore")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
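
# Note: Optional, Path, SentenceTransformer, numpy and cosine_similarity are imported
# above but are not referenced anywhere else in this file.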

class EnhancedPDFSummarizer:
    def __init__(self):
        # Multiple model options for different speed/quality tradeoffs
        self.models = {
            "fast": "sshleifer/distilbart-cnn-12-6",
            "balanced": "facebook/bart-large-cnn",
            "quality": "microsoft/DialoGPT-large"
        }

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.current_model = "fast"
        logger.info(f"Using device: {self.device}")

        # Initialize with fast model by default
        self.load_model(self.current_model)

        # Download required NLTK data
        try:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
        except Exception:
            logger.warning("Could not download NLTK data")
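
    # Note: the "quality" entry above points at microsoft/DialoGPT-large, a dialogue
    # model rather than a summarization checkpoint; if it cannot be loaded into a
    # "summarization" pipeline, load_model() below falls back to the "fast" model.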

    def load_model(self, model_type: str = "fast"):
        """Load summarization model with error handling"""
        try:
            model_name = self.models[model_type]
            logger.info(f"Loading model: {model_name}")

            # Model configuration for better performance
            model_kwargs = {
                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
                "low_cpu_mem_usage": True
            }

            self.summarizer = pipeline(
                "summarization",
                model=model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs=model_kwargs
            )

            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.current_model = model_type
            logger.info(f"Model {model_name} loaded successfully")

        except Exception as e:
            logger.error(f"Error loading model {model_type}: {e}")
            # Fallback to simplest model
            if model_type != "fast":
                logger.info("Falling back to fast model")
                self.load_model("fast")
            else:
                raise Exception(f"Could not load any summarization model: {e}")
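
    # Illustrative sketch of switching models by hand (the app itself does this via
    # process_pdf's model_choice argument):
    #   summarizer = EnhancedPDFSummarizer()   # __init__ loads the "fast" model
    #   summarizer.load_model("balanced")      # switch to facebook/bart-large-cnn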

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, int]:
        """Extract text content from PDF file with better error handling"""
        try:
            if isinstance(pdf_file, str):
                # If it's a file path
                with open(pdf_file, 'rb') as f:
                    pdf_content = f.read()
            else:
                # If it's already bytes
                pdf_content = pdf_file

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            text = ""
            page_count = len(pdf_reader.pages)

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text and page_text.strip():
                        text += f"\n--- Page {page_num + 1} ---\n"
                        text += page_text
                except Exception as e:
                    logger.warning(f"Could not extract text from page {page_num + 1}: {e}")
                    continue

            if not text.strip():
                raise Exception("No readable text found in PDF. The PDF might be image-based or encrypted.")

            return text.strip(), page_count

        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")
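
    # Illustrative call (hypothetical file name): the method accepts either a file
    # path or raw bytes and returns the extracted text plus the page count.
    #   text, pages = EnhancedPDFSummarizer().extract_text_from_pdf("report.pdf")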

    def clean_and_preprocess_text(self, text: str) -> str:
        """Enhanced text cleaning and preprocessing"""
        # Remove page markers
        text = re.sub(r'--- Page \d+ ---', '', text)

        # Fix common PDF extraction issues
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)  # Fix hyphenated words
        text = re.sub(r'[ \t]+', ' ', text)  # Normalize whitespace within lines (keep newlines for the repetition check below)
        text = re.sub(r'[^\w\s.,!?;:()\-"\'\n]', ' ', text)  # Remove special chars

        # Remove excessive repetition (common in PDFs)
        lines = text.split('\n')
        cleaned_lines = []
        prev_line = ""

        for line in lines:
            line = line.strip()
            if line and line != prev_line and len(line) > 10:  # Avoid short repeated lines
                cleaned_lines.append(line)
                prev_line = line

        return ' '.join(cleaned_lines).strip()
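
    # Example of the hyphenation fix above: an extraction artifact such as
    # "summa-\nrization" is rejoined to "summarization" before whitespace is normalized.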

    def intelligent_chunking(self, text: str, max_chunk_length: int = 512) -> List[str]:
        """Intelligent text chunking based on sentences and semantic boundaries"""
        try:
            # Try to use NLTK for better sentence splitting
            sentences = nltk.sent_tokenize(text)
        except Exception:
            # Fallback to simple splitting
            sentences = text.split('. ')
            sentences = [s + '.' for s in sentences[:-1]] + [sentences[-1]]

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Estimate token count (rough approximation: 1 token ≈ 4 characters)
            potential_chunk = current_chunk + " " + sentence
            estimated_tokens = len(potential_chunk) // 4

            if estimated_tokens <= max_chunk_length:
                current_chunk = potential_chunk.strip()
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence.strip()

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Filter out very short chunks
        chunks = [chunk for chunk in chunks if len(chunk.split()) >= 20]

        return chunks
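
    # Budget implied by the 4-characters-per-token estimate above: with the default
    # max_chunk_length of 512, a chunk grows to roughly 512 * 4 = 2048 characters
    # before a new chunk is started.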

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize a single chunk with enhanced parameters"""
        try:
            # Adjust parameters based on chunk length
            chunk_words = len(chunk.split())

            # Dynamic length adjustment
            if chunk_words < 100:
                max_length = min(max_length, chunk_words // 2)
                min_length = min(min_length, max_length // 2)

            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=3,  # Balanced quality/speed
                length_penalty=1.0,
                repetition_penalty=1.1
            )

            return summary[0]['summary_text'].strip()

        except Exception as e:
            logger.error(f"Error summarizing chunk: {e}")
            # Return first few sentences as fallback
            sentences = chunk.split('. ')[:3]
            return '. '.join(sentences) + '.' if sentences else chunk[:200] + '...'
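
    # Worked example of the dynamic length adjustment above: for a 60-word chunk and
    # the default arguments, max_length becomes min(100, 60 // 2) = 30 and
    # min_length becomes min(30, 30 // 2) = 15.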

    def generate_extractive_summary(self, text: str, num_sentences: int = 5) -> str:
        """Generate extractive summary as fallback or complement"""
        try:
            sentences = nltk.sent_tokenize(text)
            if len(sentences) <= num_sentences:
                return text

            # Simple scoring based on word frequency
            words = re.findall(r'\w+', text.lower())
            word_freq = {}
            for word in words:
                if len(word) > 3:  # Ignore short words
                    word_freq[word] = word_freq.get(word, 0) + 1

            # Score sentences
            sentence_scores = []
            for i, sentence in enumerate(sentences):
                score = 0
                words_in_sentence = re.findall(r'\w+', sentence.lower())
                for word in words_in_sentence:
                    if word in word_freq:
                        score += word_freq[word]

                # Boost score for sentences with numbers (often important)
                if re.search(r'\d+', sentence):
                    score *= 1.2

                sentence_scores.append((score, i, sentence))

            # Get top sentences
            sentence_scores.sort(reverse=True)
            selected_sentences = sorted(sentence_scores[:num_sentences], key=lambda x: x[1])

            return ' '.join([sent[2] for sent in selected_sentences])

        except Exception as e:
            logger.error(f"Error in extractive summary: {e}")
            return text[:1000] + '...' if len(text) > 1000 else text
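
    # Scoring sketch for the extractive fallback above: each sentence scores the sum
    # of the document-wide frequencies of its words longer than three characters,
    # gets a 1.2x boost if it contains a digit, and the top num_sentences sentences
    # are returned in their original document order.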

    def process_pdf(self, pdf_file, summary_type: str, model_choice: str = "fast") -> Tuple[str, str, str]:
        """Enhanced PDF processing with better error handling and features"""
        start_time = time.time()

        try:
            # Switch model if needed
            if model_choice != self.current_model:
                self.load_model(model_choice)

            # Extract text from PDF
            logger.info("Extracting text from PDF...")
            raw_text, page_count = self.extract_text_from_pdf(pdf_file)

            if not raw_text.strip():
                return "Error: No text could be extracted from the PDF.", "", ""

            # Clean and preprocess text
            logger.info("Cleaning and preprocessing text...")
            cleaned_text = self.clean_and_preprocess_text(raw_text)

            # Calculate text statistics
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)

            if word_count < 50:
                return "Error: PDF contains too little text to summarize (minimum 50 words required).", "", ""

            # Determine processing strategy based on text length
            if word_count < 500:
                # Short document - direct summarization
                chunks = [cleaned_text]
            else:
                # Long document - intelligent chunking
                logger.info("Chunking text...")
                chunks = self.intelligent_chunking(cleaned_text)

            # Limit chunks based on summary type for performance
            max_chunks = {"Brief (Quick)": 3, "Detailed": 6, "Comprehensive": 10}.get(summary_type, 6)
            if len(chunks) > max_chunks:
                # Select most representative chunks
                chunks = chunks[:max_chunks]

            # Set summary parameters
            summary_params = {
                "Brief (Quick)": {"max_len": 80, "min_len": 20},
                "Detailed": {"max_len": 130, "min_len": 40},
                "Comprehensive": {"max_len": 200, "min_len": 60}
            }
            params = summary_params.get(summary_type, summary_params["Detailed"])

            # Process chunks
            logger.info(f"Processing {len(chunks)} chunks...")
            chunk_summaries = []

            for i, chunk in enumerate(chunks):
                logger.info(f"Processing chunk {i+1}/{len(chunks)}")
                try:
                    summary = self.summarize_chunk(
                        chunk,
                        max_length=params["max_len"],
                        min_length=params["min_len"]
                    )
                    if summary and len(summary.strip()) > 10:
                        chunk_summaries.append(summary)
                except Exception as e:
                    logger.warning(f"Failed to summarize chunk {i+1}: {e}")
                    # Use extractive summary as fallback
                    extractive = self.generate_extractive_summary(chunk, 2)
                    chunk_summaries.append(extractive)

            if not chunk_summaries:
                return "Error: Could not generate any summaries from the PDF content.", "", ""

            # Combine and refine summary
            combined_summary = " ".join(chunk_summaries)

            # Final summarization step for multi-chunk documents
            if len(chunks) > 2 and len(combined_summary.split()) > params["max_len"]:
                logger.info("Generating final summary...")
                try:
                    final_summary = self.summarize_chunk(
                        combined_summary,
                        max_length=min(300, params["max_len"] * 2),
                        min_length=params["min_len"]
                    )
                except Exception:
                    final_summary = combined_summary
            else:
                final_summary = combined_summary

            # Processing time
            processing_time = time.time() - start_time

            # Enhanced statistics
            summary_words = len(final_summary.split())
            compression_ratio = word_count / summary_words if summary_words > 0 else 0

            stats = f"""
**Document Analysis:**
- **Pages:** {page_count}
- **Original words:** {word_count:,}
- **Original characters:** {char_count:,}
- **Chunks processed:** {len(chunks)}
- **Summary words:** {summary_words:,}
- **Compression ratio:** {compression_ratio:.1f}:1
- **Processing time:** {processing_time:.1f}s
- **Model used:** {self.models[self.current_model]}

**Quality Metrics:**
- **Readability:** {'High' if summary_words > 50 else 'Medium' if summary_words > 20 else 'Low'}
- **Coverage:** {min(100, (len(chunks) * 100) // max(1, page_count)):.0f}%
"""

            success_message = f"Summary generated successfully! ({summary_words} words in {processing_time:.1f}s)"

            return final_summary, stats, success_message

        except Exception as e:
            logger.error(f"Error processing PDF: {e}")
            return f"Error processing PDF: {str(e)}", "", "Processing failed"
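
# Illustrative end-to-end call outside the Gradio UI (hypothetical file name):
#   summarizer = EnhancedPDFSummarizer()
#   summary, stats, status = summarizer.process_pdf("paper.pdf", "Detailed", "fast")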

# Initialize the enhanced summarizer
pdf_summarizer = EnhancedPDFSummarizer()

def summarize_pdf_interface(pdf_file, summary_type, model_choice):
    """Enhanced Gradio interface function"""
    if pdf_file is None:
        return "Please upload a PDF file.", "", "Waiting for file upload..."

    try:
        # Process the PDF
        summary, stats, status = pdf_summarizer.process_pdf(pdf_file, summary_type, model_choice)
        return summary, stats, status

    except Exception as e:
        logger.error(f"Interface error: {e}")
        return f"Error: {str(e)}", "", "Processing failed"

def create_enhanced_interface():
    """Create enhanced Gradio interface"""

    custom_css = """
    .gradio-container {
        max-width: 1400px !important;
        margin: auto;
    }
    .summary-box {
        border-left: 4px solid #2196F3;
        padding: 20px;
        background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
        border-radius: 8px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .stats-box {
        background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
        padding: 15px;
        border-radius: 8px;
        border-left: 4px solid #1976d2;
    }
    .header-title {
        text-align: center;
        color: #1976d2;
        margin-bottom: 20px;
    }
    """

    with gr.Blocks(
        title="Enhanced AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as interface:

        gr.HTML("""
        <div class="header-title">
            <h1>Enhanced AI-Powered PDF Summarizer</h1>
            <p>Advanced document processing with multiple AI models and intelligent text analysis</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="Upload PDF Document",
                    file_types=[".pdf"],
                    type="filepath"
                )

                with gr.Row():
                    summary_type = gr.Radio(
                        choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                        value="Detailed",
                        label="Summary Detail Level",
                        info="Choose the depth of analysis"
                    )

                    model_choice = gr.Radio(
                        choices=["fast", "balanced", "quality"],
                        value="fast",
                        label="AI Model",
                        info="Speed vs Quality tradeoff"
                    )

                summarize_btn = gr.Button(
                    "Generate Smart Summary",
                    variant="primary",
                    size="lg"
                )

                status_output = gr.Textbox(
                    label="Processing Status",
                    value="Ready to process your PDF...",
                    interactive=False,
                    max_lines=2
                )

            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="AI-Generated Summary",
                    lines=18,
                    max_lines=25,
                    interactive=False,
                    elem_classes=["summary-box"],
                    placeholder="Your intelligent summary will appear here..."
                )

                stats_output = gr.Markdown(
                    value="Upload a PDF to see detailed analysis and statistics",
                    elem_classes=["stats-box"]
                )

        # Enhanced information section
        with gr.Accordion("How to Get the Best Results", open=False):
            gr.Markdown("""
            ### Document Requirements:
            - **Text-based PDFs**: Ensure your PDF contains selectable text (not scanned images)
            - **Optimal length**: 500-50,000 words work best
            - **Language**: Optimized for English content
            - **Quality**: Well-formatted documents produce superior summaries

            ### Summary Types:
            - **Brief (Quick)**: Fast overview, 60-80 words per section
            - **Detailed**: Balanced analysis, 100-130 words per section
            - **Comprehensive**: In-depth summary, 150-200 words per section

            ### AI Models:
            - **Fast**: DistilBART - Quick processing, good quality
            - **Balanced**: BART-Large - Better quality, moderate speed
            - **Quality**: Premium model - Best results, slower processing

            ### Advanced Features:
            - **Intelligent Chunking**: Semantic boundary detection
            - **Multi-stage Processing**: Hierarchical summarization
            - **Quality Metrics**: Readability and coverage analysis
            - **Fallback Systems**: Extractive summarization backup
            """)

        # Connect functionality
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type, model_choice],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process on file upload
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type, model_choice],
            outputs=[summary_output, stats_output, status_output]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 20px; color: #666;">
            <p>Powered by Transformers 🤗 | Enhanced with intelligent text processing</p>
        </div>
        """)

    return interface

# Launch the enhanced application
if __name__ == "__main__":
    interface = create_enhanced_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False
    )
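
# Running locally (assuming the dependencies imported above are installed):
#   python app.py
# starts the Gradio server on port 7860, as configured in interface.launch() above.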