File size: 11,691 Bytes
bcdad9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import gradio as gr
import PyPDF2
import io
from transformers import pipeline, AutoTokenizer
import torch
import re
from typing import List, Tuple
import warnings
warnings.filterwarnings("ignore")

class PDFSummarizer:
    """Summarize the text of a PDF with a Hugging Face summarization pipeline.

    The workflow is: extract text page by page, clean it, split it into
    sentence-aligned chunks, summarize every chunk, and (for more than two
    chunks) summarize the concatenated chunk summaries once more.
    """

    def __init__(self):
        """Load the summarization pipeline and the matching tokenizer.

        DistilBART is tried first (on GPU in float16 when CUDA is available).
        If loading fails for any reason, ``facebook/bart-large-cnn`` is loaded
        on CPU instead.
        """
        # Use a much faster, lighter model for summarization
        self.model_name = "sshleifer/distilbart-cnn-12-6"  # Much faster than BART-large
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            # Initialize the summarization pipeline with optimizations
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
            )

            # Initialize tokenizer for length calculations
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")

        except Exception as e:
            print(f"Error loading model: {e}")
            # Fallback to the full-size BART checkpoint with default settings.
            # It is slower than DistilBART, but avoids the GPU/float16 options
            # that may have caused the first attempt to fail.
            self.model_name = "facebook/bart-large-cnn"
            self.device = "cpu"  # the fallback pipeline is created with device=-1
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract the text of every non-empty page of a PDF.

        Args:
            pdf_file: Raw bytes of the PDF document.

        Returns:
            The page texts concatenated in order, each preceded by a
            ``--- Page N ---`` marker line; pages without text are skipped.

        Raises:
            RuntimeError: If the PDF cannot be read or parsed. The original
                exception is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            parts = []

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Treat a missing extraction result as an empty page instead
                # of failing on ``None.strip()``.
                page_text = page.extract_text() or ""
                if page_text.strip():
                    parts.append(f"\n--- Page {page_num} ---\n")
                    parts.append(page_text)

            return "".join(parts).strip()
        except Exception as e:
            raise RuntimeError(f"Error extracting text from PDF: {e}") from e

    def clean_text(self, text: str) -> str:
        """Normalize extracted text before summarization.

        Page markers and characters other than word characters, whitespace
        and basic punctuation are replaced by spaces. Whitespace is collapsed
        last so that the replacements cannot leave runs of spaces behind.

        Args:
            text: Raw text, typically from ``extract_text_from_pdf``.

        Returns:
            The cleaned text with single spaces between tokens.
        """
        # Remove page markers
        text = re.sub(r'--- Page \d+ ---', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        # Collapse whitespace and newlines (done last on purpose, see docstring)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512, max_chunks: int = 5) -> List[str]:
        """Split text into sentence-aligned chunks of bounded word count.

        Sentences are split after ``.``, ``!`` or ``?`` followed by
        whitespace and keep their own punctuation. A single sentence longer
        than ``max_chunk_length`` becomes a chunk of its own.

        Args:
            text: Cleaned text to split.
            max_chunk_length: Maximum number of whitespace-separated words
                per chunk.
            max_chunks: Maximum number of chunks returned; later text is
                dropped to bound processing time.

        Returns:
            Up to ``max_chunks`` chunks, or an empty list for empty text.
        """
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]
        chunks = []
        current_sentences = []
        current_words = 0

        for sentence in sentences:
            sentence_words = len(sentence.split())
            # Flush the running chunk when this sentence would exceed the limit
            if current_sentences and current_words + sentence_words > max_chunk_length:
                chunks.append(" ".join(current_sentences))
                current_sentences = []
                current_words = 0
            current_sentences.append(sentence)
            current_words += sentence_words

        if current_sentences:
            chunks.append(" ".join(current_sentences))

        # Limit number of chunks for speed
        return chunks[:max_chunks]

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize a single chunk of text.

        Args:
            chunk: Text to summarize.
            max_length: Upper bound on the generated summary length; coerced
                to ``int`` because generation lengths must be integers.
            min_length: Lower bound on the generated summary length; coerced
                to ``int`` for the same reason.

        Returns:
            The summary text. If summarization fails, a string starting with
            ``"Error summarizing chunk: "`` is returned instead of raising.
        """
        try:
            summary = self.summarizer(
                chunk,
                max_length=int(max_length),
                min_length=int(min_length),
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2  # Reduced from default 4 for speed
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """Extract, clean, chunk and summarize a PDF.

        Args:
            pdf_file: Raw bytes of the PDF document.
            summary_type: ``"Brief (Quick)"``, ``"Detailed"``, or any other
                value, which selects the comprehensive preset.

        Returns:
            A ``(summary, statistics_markdown, status)`` tuple. On failure
            the first element carries the error message and the other two
            are empty strings.
        """
        try:
            # Extract text from PDF
            raw_text = self.extract_text_from_pdf(pdf_file)

            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            # Clean the text
            cleaned_text = self.clean_text(raw_text)

            # Calculate text statistics
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)

            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            # Chunk the text for processing
            chunks = self.chunk_text(cleaned_text)

            # (max_length, min_length) per summary type; unknown types get
            # the comprehensive preset.
            length_presets = {
                "Brief (Quick)": (60, 20),
                "Detailed": (100, 40),
            }
            max_len, min_len = length_presets.get(summary_type, (150, 60))

            # Summarize each chunk (with progress tracking)
            chunk_summaries = []
            for i, chunk in enumerate(chunks, start=1):
                print(f"Processing chunk {i}/{len(chunks)}")
                chunk_summaries.append(self.summarize_chunk(chunk, max_len, min_len))

            # Combine summaries
            combined_summary = " ".join(chunk_summaries)

            # Skip final summarization for speed if we have few chunks
            if len(chunks) <= 2:
                final_summary = combined_summary
            else:
                # Quick final summary for multiple chunks; the length must be
                # an integer, so the scaled value is truncated.
                final_summary = self.summarize_chunk(
                    combined_summary,
                    max_length=min(200, int(max_len * 1.5)),
                    min_length=min_len
                )

            # Create statistics; guard the ratio against an empty summary
            summary_words = len(final_summary.split())
            if summary_words:
                compression = f"{word_count / summary_words:.1f}:1"
            else:
                compression = "n/a"

            summary_stats = f"""
📊 **Document Statistics:**
- Original word count: {word_count:,}
- Original character count: {char_count:,}
- Chunks processed: {len(chunks)}
- Summary word count: {summary_words:,}
- Compression ratio: {compression}
            """

            return final_summary, summary_stats, "✅ Summary generated successfully!"

        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}", "", ""

# Module-level summarizer instance used by summarize_pdf_interface.
# Constructing it runs PDFSummarizer.__init__, which loads the summarization
# pipeline and tokenizer, so the model is loaded once when this module is
# executed rather than on every request.
pdf_summarizer = PDFSummarizer()

def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio callback: read the uploaded PDF and return its summary.

    Args:
        pdf_file: Path of the uploaded file, or ``None`` when nothing has
            been uploaded.
        summary_type: Summary length option chosen in the UI.

    Returns:
        A ``(summary, stats, status)`` tuple. When no file is given or an
        error occurs, the first element holds the message and the other two
        are empty strings.
    """
    # Guard clause: nothing uploaded yet
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""

    try:
        # The upload arrives as a file path, so load its raw bytes first
        with open(pdf_file, 'rb') as handle:
            raw_bytes = handle.read()

        # Delegate the actual work to the shared summarizer instance
        summary_text, stats_text, status_text = pdf_summarizer.process_pdf(
            raw_bytes, summary_type
        )
    except Exception as exc:
        return f"❌ Error: {exc}", "", ""

    return summary_text, stats_text, status_text

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the PDF summarizer.

    The layout has an upload/options column (file input, summary length,
    button, status) and a results column (summary text and statistics).
    Both the button click and a change of the uploaded file call
    ``summarize_pdf_interface``.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    with gr.Blocks(
        title="📄 AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .summary-box {
            border-left: 4px solid #2196F3;
            padding: 16px;
            background-color: #f8f9fa;
        }
        """
    ) as interface:

        gr.Markdown("""
        # ✨ Your AI-Powered PDF Assistant
        
        ### Transform lengthy documents into clear, concise summaries in seconds.
        
        Upload any PDF file below and let our intelligent model do the work for you. Perfect for students, researchers, and professionals who need to quickly grasp the core ideas of any document.
        
        ---
        
        ## Key Features
        
        - ⚡ **Lightning-Fast Summaries**: Powered by a state-of-the-art AI model.
        - 🧠 **Intelligent Text Chunking**: Handles long documents with ease by breaking them down into manageable pieces.
        - 📊 **Detailed Document Stats**: Get instant insights on word count, compression ratio, and more.
        - 🎯 **Customizable Summary Length**: Choose the level of detail that works best for you.
        """)

        with gr.Row():
            # Left column: inputs and status
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📁 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )

                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be"
                )

                summarize_btn = gr.Button(
                    "🚀 Generate Summary",
                    variant="primary",
                    size="lg"
                )

                status_output = gr.Textbox(
                    label="📋 Status",
                    interactive=False,
                    max_lines=2
                )

            # Right column: results
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"]
                )

                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics"
                )

        # Tips and explanation section
        gr.Markdown("""
        ---
        
        ## 💡 Tips for Optimal Results:
        
        - **Quality Matters**: Ensure your PDF has selectable text (not just scanned images).
        - **Length**: Works best with documents between 500-10,000 words.
        - **Language**: Optimized for English content.
        - **Format**: Clean, well-formatted PDFs produce the best summaries.
        
        ## 🔧 How It Works:
        
        This application uses a fine-tuned **BART** model for summarization. The process involves:
        1.  Extracting text from your PDF.
        2.  Cleaning and preprocessing the text.
        3.  Intelligently chunking the document to handle long texts.
        4.  Generating a summary for each chunk and then combining them for a final, coherent result.
        """)

        # Connect the button to the function
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process when file is uploaded
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

    return interface

# Launch the application: build the Blocks UI and call its launch() method,
# but only when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()