Layan22 committed on
Commit
8bae34d
·
verified ·
1 Parent(s): 565b2dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +195 -1
app.py CHANGED
@@ -100,4 +100,198 @@ class PDFSummarizer:
100
  num_beams=2 # Reduced from default 4 for speed
101
  )
102
  return summary[0]['summary_text']
103
- except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  num_beams=2 # Reduced from default 4 for speed
101
  )
102
  return summary[0]['summary_text']
103
+ except Exception as e:
104
+ return f"Error summarizing chunk: {str(e)}"
105
+
106
def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
    """Extract the text of a PDF, summarize it and report statistics.

    Args:
        pdf_file: PDF input, passed unchanged to
            ``self.extract_text_from_pdf``.
        summary_type: "Brief (Quick)", "Detailed" or "Comprehensive". Any
            other value uses the "Comprehensive" lengths.

    Returns:
        A ``(summary, stats, status)`` tuple. On any failure the first
        element holds an error message and the other two are empty strings.
    """
    # (max_length, min_length) handed to summarize_chunk for each chunk.
    length_presets = {
        "Brief (Quick)": (60, 20),
        "Detailed": (100, 40),
    }
    comprehensive_lengths = (150, 60)

    try:
        raw_text = self.extract_text_from_pdf(pdf_file)
        if not raw_text.strip():
            return "❌ Error: No text could be extracted from the PDF.", "", ""

        cleaned_text = self.clean_text(raw_text)
        word_count = len(cleaned_text.split())
        char_count = len(cleaned_text)
        if word_count < 50:
            return "❌ Error: PDF contains too little text to summarize.", "", ""

        chunks = self.chunk_text(cleaned_text)
        max_len, min_len = length_presets.get(summary_type, comprehensive_lengths)

        # NOTE: summarize_chunk reports its own failures as a plain
        # "Error summarizing chunk: ..." string, so such a message ends up
        # inside the combined text rather than aborting the run.
        chunk_summaries = []
        for position, chunk in enumerate(chunks, start=1):
            print(f"Processing chunk {position}/{len(chunks)}")
            chunk_summaries.append(self.summarize_chunk(chunk, max_len, min_len))
        combined_summary = " ".join(chunk_summaries)

        if len(chunks) <= 2:
            # Few chunks: skip the second summarization pass for speed.
            final_summary = combined_summary
        else:
            final_summary = self.summarize_chunk(
                combined_summary,
                # Generation lengths are token counts and must be integers;
                # max_len * 1.5 alone would be a float.
                max_length=min(200, int(max_len * 1.5)),
                min_length=min_len,
            )

        summary_word_count = len(final_summary.split())
        if summary_word_count == 0:
            # Avoid dividing by zero in the compression ratio below.
            return "❌ Error: The model returned an empty summary.", "", ""

        summary_stats = "\n".join([
            "",
            "📊 Document Statistics:",
            f"- Original word count: {word_count:,}",
            f"- Original character count: {char_count:,}",
            # len(chunks) counts text chunks, not PDF pages.
            f"- Chunks processed: {len(chunks)}",
            f"- Summary word count: {summary_word_count:,}",
            f"- Compression ratio: {word_count / summary_word_count:.1f}:1",
            "",
        ])

        return final_summary, summary_stats, "✅ Summary generated successfully!"

    except Exception as e:
        # UI boundary: surface any failure as a message instead of raising.
        return f"❌ Error processing PDF: {str(e)}", "", ""
171
+
172
# Initialize the summarizer: one shared instance, used by
# summarize_pdf_interface for every request.
pdf_summarizer = PDFSummarizer()
174
+
175
def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio callback: summarize the uploaded PDF.

    Args:
        pdf_file: Path of the uploaded file, or ``None`` when nothing is
            uploaded (the file input is configured with ``type="filepath"``).
        summary_type: Selected summary length, forwarded to
            ``pdf_summarizer.process_pdf``.

    Returns:
        A ``(summary, stats, status)`` tuple. On failure the first element
        holds an error message and the other two are empty strings.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""

    try:
        # pdf_file is already a path on disk; hand the raw bytes on.
        with open(pdf_file, "rb") as handle:
            pdf_content = handle.read()

        summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
        return summary, stats, status

    except Exception as e:
        # UI boundary: show the failure to the user instead of raising.
        return f"❌ Error: {str(e)}", "", ""
192
# Create Gradio interface

# Page-level CSS handed to gr.Blocks.
_INTERFACE_CSS = """
.gradio-container {
    max-width: 1200px !important;
}
.summary-box {
    border-left: 4px solid #2196F3;
    padding: 16px;
    background-color: #f8f9fa;
}
"""

# Heading and feature list shown at the top of the page.
_INTRO_MARKDOWN = """
# 📄 AI-Powered PDF Summarizer

Upload any PDF document and get an intelligent summary in seconds!
Perfect for research papers, reports, articles, and books.

Features:
- ⚡ Fast processing with BART model
- 📊 Document statistics
- 🎯 Multiple summary lengths
- 🔍 Smart text chunking
"""

# Usage tips and technical notes shown below the main row.
_TIPS_MARKDOWN = """
## 💡 Tips for Best Results:

- File Quality: Ensure your PDF has selectable text (not just images)
- Length: Works best with documents between 500-10,000 words
- Language: Optimized for English content
- Format: Clean, well-formatted PDFs produce better summaries

## 🔧 Technical Details:
- Model: Facebook BART-Large-CNN (state-of-the-art summarization)
- Processing: Smart text chunking with overlap prevention
- Speed: GPU-accelerated when available
"""


def create_interface():
    """Build the Gradio Blocks UI for the PDF summarizer.

    The left column holds the file upload, the summary-length selector, the
    submit button and a status box; the right column shows the generated
    summary and the document statistics.

    Returns:
        The ``gr.Blocks`` instance, not yet launched.
    """
    with gr.Blocks(
        title="📄 AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css=_INTERFACE_CSS,
    ) as interface:

        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="📁 Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath",
                )

                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="📏 Summary Length",
                    info="Choose how detailed you want the summary to be",
                )

                summarize_btn = gr.Button(
                    "🚀 Generate Summary",
                    variant="primary",
                    size="lg",
                )

                status_output = gr.Textbox(
                    label="📋 Status",
                    interactive=False,
                    max_lines=2,
                )

            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="📝 Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"],
                )

                stats_output = gr.Markdown(
                    label="📊 Document Statistics",
                    value="Upload a PDF to see statistics",
                )

        # Tips section
        gr.Markdown(_TIPS_MARKDOWN)

        # Both triggers run the same callback with the same wiring.
        callback_inputs = [pdf_input, summary_type]
        callback_outputs = [summary_output, stats_output, status_output]

        # Connect the button to the function
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=callback_inputs,
            outputs=callback_outputs,
        )

        # Auto-process when file is uploaded
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=callback_inputs,
            outputs=callback_outputs,
        )

    return interface
293
+
294
# Launch the application only when this file is run as a script, not when
# it is imported.
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()