LovnishVerma committed on
Commit
099d4aa
·
verified ·
1 Parent(s): 7272788

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -218
app.py CHANGED
@@ -6,325 +6,293 @@ import torch
6
  import re
7
  from typing import List, Tuple
8
  import warnings
9
- import nltk
10
- from nltk.tokenize import sent_tokenize
11
- import heapq
12
- import numpy as np
13
- from collections import Counter
14
- import string
15
-
16
  warnings.filterwarnings("ignore")
17
 
18
- # Download required NLTK data
19
- try:
20
- nltk.data.find('tokenizers/punkt_tab')
21
- except LookupError:
22
- print("Downloading NLTK data...")
23
- nltk.download('punkt_tab', quiet=True)
24
- # Fallback for older NLTK versions
25
- try:
26
- nltk.data.find('tokenizers/punkt')
27
- except LookupError:
28
- nltk.download('punkt', quiet=True)
29
-
30
- class FastPDFSummarizer:
31
  def __init__(self):
32
- # Use the fastest available model for critical path
33
- self.model_name = "facebook/bart-large-cnn" # Fastest stable option
34
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
35
  print(f"Using device: {self.device}")
36
- # Initialize summarizer with maximum speed optimizations
37
- self.summarizer = None
38
- self.tokenizer = None
39
- self._initialize_model()
40
-
41
- def _initialize_model(self):
42
- """Initialize model with lazy loading and speed optimizations"""
43
  try:
44
- # Only initialize when first needed
45
- if self.summarizer is None:
46
- print("Loading model...")
47
- self.summarizer = pipeline(
48
- "summarization",
49
- model=self.model_name,
50
- device=0 if self.device == "cuda" else -1,
51
- framework="pt",
52
- model_kwargs={
53
- "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
54
- "low_cpu_mem_usage": True,
55
- "use_cache": True
56
- },
57
- tokenizer_kwargs={"padding": True, "truncation": True}
58
- )
59
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
60
- print("Model loaded successfully")
61
  except Exception as e:
62
  print(f"Error loading model: {e}")
63
- # Ultra-fast fallback - extractive summarization
64
- self.use_extractive = True
65
-
 
 
 
66
  def extract_text_from_pdf(self, pdf_file) -> str:
67
- """Extract text with better error handling and speed"""
68
  try:
69
- if isinstance(pdf_file, str):
70
- with open(pdf_file, 'rb') as f:
71
- pdf_content = f.read()
72
- else:
73
- pdf_content = pdf_file
74
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
75
- text_parts = []
76
- # Limit pages for speed (process max 20 pages)
77
- max_pages = min(20, len(pdf_reader.pages))
78
- for page_num in range(max_pages):
79
- try:
80
- page_text = pdf_reader.pages[page_num].extract_text()
81
- if page_text.strip():
82
- text_parts.append(page_text)
83
- except Exception:
84
- continue # Skip problematic pages
85
- return " ".join(text_parts)
86
  except Exception as e:
87
  raise Exception(f"Error extracting text from PDF: {str(e)}")
88
-
89
- def clean_text_fast(self, text: str) -> str:
90
- """Ultra-fast text cleaning"""
91
- # Remove excessive whitespace
92
  text = re.sub(r'\s+', ' ', text)
93
- # Remove page numbers and headers/footers (common patterns)
94
- text = re.sub(r'\b\d+\b(?=\s|$)', '', text) # Remove standalone numbers
95
  text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
96
- # Additional cleaning for PDF artifacts
97
- text = re.sub(r'\uf0b7', '', text) # Remove bullet points
98
- text = re.sub(r'\u2022', '', text) # Remove bullet points
99
- text = re.sub(r'β€’', '', text) # Remove bullet points
100
- text = re.sub(r'\u00a0', ' ', text) # Replace non-breaking spaces
101
- text = re.sub(r'\n+', ' ', text) # Replace multiple newlines with a single space
102
  return text.strip()
103
-
104
- def extractive_summary(self, text: str, num_sentences: int = 5) -> str:
105
- """Ultra-fast extractive summarization as fallback"""
106
- try:
107
- sentences = sent_tokenize(text)
108
- except LookupError:
109
- # Fallback to simple sentence splitting if NLTK fails
110
- sentences = text.split('. ')
111
- sentences = [s.strip() + '.' for s in sentences if s.strip()]
112
- if len(sentences) <= num_sentences:
113
- return text
114
- # Simple frequency-based scoring
115
- words = text.lower().split()
116
- word_freq = Counter(word for word in words
117
- if word not in string.punctuation and len(word) > 3)
118
- sentence_scores = {}
119
- for sentence in sentences:
120
- words_in_sentence = sentence.lower().split()
121
- score = sum(word_freq[word] for word in words_in_sentence
122
- if word in word_freq)
123
- sentence_scores[sentence] = score
124
- # Get top sentences
125
- top_sentences = heapq.nlargest(num_sentences, sentence_scores.keys(),
126
- key=lambda x: sentence_scores[x])
127
- # Return in original order
128
- result = []
129
- for sentence in sentences:
130
- if sentence in top_sentences:
131
- result.append(sentence)
132
- return " ".join(result)
133
-
134
- def smart_chunk_text(self, text: str, max_length: int = 1000) -> List[str]:
135
- """Smarter, faster text chunking"""
136
- # For short texts, don't chunk
137
- if len(text.split()) <= max_length:
138
- return [text]
139
- try:
140
- sentences = sent_tokenize(text)
141
- except LookupError:
142
- # Fallback to simple sentence splitting
143
- sentences = text.split('. ')
144
- sentences = [s.strip() + '.' for s in sentences if s.strip()]
145
  chunks = []
146
- current_chunk = []
147
- current_length = 0
148
  for sentence in sentences:
149
- sentence_length = len(sentence.split())
150
- if current_length + sentence_length <= max_length:
151
- current_chunk.append(sentence)
152
- current_length += sentence_length
 
153
  else:
154
  if current_chunk:
155
- chunks.append(" ".join(current_chunk))
156
- current_chunk = [sentence]
157
- current_length = sentence_length
158
  if current_chunk:
159
- chunks.append(" ".join(current_chunk))
160
- # Ensure chunks are meaningful
161
- return chunks
162
-
163
- def fast_summarize(self, text: str, max_length: int = 150) -> str:
164
- """Optimized summarization with fallbacks"""
 
165
  try:
166
- # Initialize model if not already done
167
- if self.summarizer is None:
168
- self._initialize_model()
169
- # Use extractive summarization for very long texts or as fallback
170
- if hasattr(self, 'use_extractive') or len(text.split()) > 2000:
171
- return self.extractive_summary(text, num_sentences=max_length//25)
172
- # Fast abstractive summarization
173
- result = self.summarizer(
174
- text,
175
  max_length=max_length,
176
- min_length=max_length//3,
177
  do_sample=False,
178
  truncation=True,
179
  early_stopping=True,
180
- num_beams=2, # Increase number of beams for better quality
181
- length_penalty=1.0,
182
- repetition_penalty=1.1
183
  )
184
- return result[0]['summary_text']
185
  except Exception as e:
186
- print(f"Falling back to extractive summarization: {e}")
187
- return self.extractive_summary(text, num_sentences=max_length//25)
188
-
189
- def process_pdf_fast(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
190
- """Optimized main processing function"""
191
  try:
192
- # Extract text
193
- print("Extracting text...")
194
  raw_text = self.extract_text_from_pdf(pdf_file)
 
195
  if not raw_text.strip():
196
  return "❌ Error: No text could be extracted from the PDF.", "", ""
197
- # Fast cleaning
198
- cleaned_text = self.clean_text_fast(raw_text)
 
 
 
199
  word_count = len(cleaned_text.split())
 
 
200
  if word_count < 50:
201
  return "❌ Error: PDF contains too little text to summarize.", "", ""
202
- # Determine summary length
 
 
 
 
203
  if summary_type == "Brief (Quick)":
204
- target_length = min(100, word_count // 10)
205
  elif summary_type == "Detailed":
206
- target_length = min(200, word_count // 5)
207
  else: # Comprehensive
208
- target_length = min(300, word_count // 3)
209
- print("Generating summary...")
210
- # For very short documents, use direct summarization
211
- if word_count <= 1000:
212
- summary = self.fast_summarize(cleaned_text, target_length)
 
 
 
 
 
 
 
 
 
 
213
  else:
214
- # Chunk and summarize
215
- chunks = self.smart_chunk_text(cleaned_text)
216
- chunk_summaries = []
217
- for chunk in chunks:
218
- chunk_summary = self.fast_summarize(chunk, target_length // len(chunks))
219
- chunk_summaries.append(chunk_summary)
220
- # Combine summaries
221
- if len(chunk_summaries) == 1:
222
- summary = chunk_summaries[0]
223
- else:
224
- combined = " ".join(chunk_summaries)
225
- if len(combined.split()) > target_length:
226
- summary = self.fast_summarize(combined, target_length)
227
- else:
228
- summary = combined
229
- # Statistics
230
- summary_word_count = len(summary.split())
231
- stats = f"""
232
  πŸ“Š **Document Statistics:**
233
- - Original words: {word_count:,}
234
- - Summary words: {summary_word_count:,}
235
- - Compression: {word_count/summary_word_count:.1f}:1
236
- - Processing: ⚑ Optimized mode
 
237
  """
238
- return summary, stats, "βœ… Summary generated successfully!"
 
 
239
  except Exception as e:
240
- return f"❌ Error: {str(e)}", "", ""
241
 
242
- # Global instance for reuse
243
- pdf_summarizer = FastPDFSummarizer()
244
 
245
  def summarize_pdf_interface(pdf_file, summary_type):
246
  """Gradio interface function"""
247
  if pdf_file is None:
248
  return "❌ Please upload a PDF file.", "", ""
 
249
  try:
250
- return pdf_summarizer.process_pdf_fast(pdf_file, summary_type)
 
 
 
 
 
 
 
 
251
  except Exception as e:
252
  return f"❌ Error: {str(e)}", "", ""
253
 
 
254
  def create_interface():
255
  with gr.Blocks(
256
- title="⚑ Ultra-Fast PDF Summarizer",
257
  theme=gr.themes.Soft(),
258
  css="""
259
  .gradio-container {
260
  max-width: 1200px !important;
261
  }
262
  .summary-box {
263
- border-left: 4px solid #4CAF50;
264
  padding: 16px;
265
  background-color: #f8f9fa;
266
  }
267
  """
268
  ) as interface:
 
269
  gr.Markdown("""
270
- # ⚑ Ultra-Fast AI PDF Summarizer
271
- **Optimized for Speed & Accuracy!** Get intelligent summaries in seconds.
272
- **Speed & Accuracy Optimizations:**
273
- - πŸš€ Lazy model loading
274
- - 🎯 Smart text chunking (max 3 chunks)
275
- - ⚑ Extractive fallback for large docs
276
- - πŸ”§ Multi-beam generation for better quality
277
- - πŸ“„ Page limit (20 pages max)
 
 
278
  """)
 
279
  with gr.Row():
280
  with gr.Column(scale=1):
281
  pdf_input = gr.File(
282
- label="πŸ“ Upload PDF",
283
  file_types=[".pdf"],
284
  type="filepath"
285
  )
 
286
  summary_type = gr.Radio(
287
  choices=["Brief (Quick)", "Detailed", "Comprehensive"],
288
- value="Brief (Quick)",
289
- label="πŸ“ Summary Type"
 
290
  )
 
291
  summarize_btn = gr.Button(
292
- "⚑ Generate Summary",
293
  variant="primary",
294
  size="lg"
295
  )
 
296
  status_output = gr.Textbox(
297
  label="πŸ“‹ Status",
298
- interactive=False
 
299
  )
 
300
  with gr.Column(scale=2):
301
  summary_output = gr.Textbox(
302
- label="πŸ“ Summary",
303
- lines=12,
 
304
  interactive=False,
305
  elem_classes=["summary-box"]
306
  )
307
- stats_output = gr.Markdown()
 
 
 
 
 
 
308
  gr.Markdown("""
309
- ## ⚑ Speed & Accuracy Features:
310
- - **Smart Processing**: Automatically switches to extractive summarization for large documents
311
- - **Limited Pages**: Processes max 20 pages for speed
312
- - **Optimized Models**: Uses fastest available AI models
313
- - **Chunking**: Max 3 chunks to reduce processing time
314
- - **Multi-beam Generation**: Improves summary quality
 
 
 
 
 
315
  """)
 
 
316
  summarize_btn.click(
317
  fn=summarize_pdf_interface,
318
  inputs=[pdf_input, summary_type],
319
  outputs=[summary_output, stats_output, status_output]
320
  )
 
 
321
  pdf_input.change(
322
  fn=summarize_pdf_interface,
323
  inputs=[pdf_input, summary_type],
324
  outputs=[summary_output, stats_output, status_output]
325
  )
 
326
  return interface
327
 
 
328
  if __name__ == "__main__":
329
  interface = create_interface()
330
  interface.launch()
 
6
  import re
7
  from typing import List, Tuple
8
  import warnings
 
 
 
 
 
 
 
9
  warnings.filterwarnings("ignore")
10
 
11
+ class PDFSummarizer:
 
 
 
 
 
 
 
 
 
 
 
 
12
  def __init__(self):
13
+ # Use a much faster, lighter model for summarization
14
+ self.model_name = "sshleifer/distilbart-cnn-12-6" # Much faster than BART-large
15
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
16
  print(f"Using device: {self.device}")
17
+
 
 
 
 
 
 
18
  try:
19
+ # Initialize the summarization pipeline with optimizations
20
+ self.summarizer = pipeline(
21
+ "summarization",
22
+ model=self.model_name,
23
+ device=0 if self.device == "cuda" else -1,
24
+ framework="pt",
25
+ model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
26
+ )
27
+
28
+ # Initialize tokenizer for length calculations
29
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
30
+ print("Model loaded successfully")
31
+
 
 
 
 
32
  except Exception as e:
33
  print(f"Error loading model: {e}")
34
+ # Fallback to an even faster model
35
+ self.model_name = "facebook/bart-large-cnn"
36
+ self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
37
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
38
+ print("Fallback model loaded")
39
+
40
  def extract_text_from_pdf(self, pdf_file) -> str:
41
+ """Extract text content from PDF file"""
42
  try:
43
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
44
+ text = ""
45
+
46
+ for page_num, page in enumerate(pdf_reader.pages):
47
+ page_text = page.extract_text()
48
+ if page_text.strip():
49
+ text += f"\n--- Page {page_num + 1} ---\n"
50
+ text += page_text
51
+
52
+ return text.strip()
 
 
 
 
 
 
 
53
  except Exception as e:
54
  raise Exception(f"Error extracting text from PDF: {str(e)}")
55
+
56
+ def clean_text(self, text: str) -> str:
57
+ """Clean and preprocess text"""
58
+ # Remove extra whitespaces and newlines
59
  text = re.sub(r'\s+', ' ', text)
60
+ # Remove special characters but keep punctuation
 
61
  text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
62
+ # Remove page markers
63
+ text = re.sub(r'--- Page \d+ ---', '', text)
 
 
 
 
64
  return text.strip()
65
+
66
+ def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
67
+ """Split text into smaller, more manageable chunks for faster processing"""
68
+ sentences = text.split('. ')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  chunks = []
70
+ current_chunk = ""
71
+
72
  for sentence in sentences:
73
+ # Check if adding this sentence would exceed the limit
74
+ potential_chunk = current_chunk + sentence + ". "
75
+ # Use faster length estimation
76
+ if len(potential_chunk.split()) <= max_chunk_length:
77
+ current_chunk = potential_chunk
78
  else:
79
  if current_chunk:
80
+ chunks.append(current_chunk.strip())
81
+ current_chunk = sentence + ". "
82
+
83
  if current_chunk:
84
+ chunks.append(current_chunk.strip())
85
+
86
+ # Limit number of chunks for speed
87
+ return chunks[:5] # Process max 5 chunks for speed
88
+
89
+ def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
90
+ """Summarize a single chunk of text with speed optimizations"""
91
  try:
92
+ # Speed optimizations
93
+ summary = self.summarizer(
94
+ chunk,
 
 
 
 
 
 
95
  max_length=max_length,
96
+ min_length=min_length,
97
  do_sample=False,
98
  truncation=True,
99
  early_stopping=True,
100
+ num_beams=2 # Reduced from default 4 for speed
 
 
101
  )
102
+ return summary[0]['summary_text']
103
  except Exception as e:
104
+ return f"Error summarizing chunk: {str(e)}"
105
+
106
+ def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
107
+ """Main function to process PDF and generate summary"""
 
108
  try:
109
+ # Extract text from PDF
 
110
  raw_text = self.extract_text_from_pdf(pdf_file)
111
+
112
  if not raw_text.strip():
113
  return "❌ Error: No text could be extracted from the PDF.", "", ""
114
+
115
+ # Clean the text
116
+ cleaned_text = self.clean_text(raw_text)
117
+
118
+ # Calculate text statistics
119
  word_count = len(cleaned_text.split())
120
+ char_count = len(cleaned_text)
121
+
122
  if word_count < 50:
123
  return "❌ Error: PDF contains too little text to summarize.", "", ""
124
+
125
+ # Chunk the text for processing
126
+ chunks = self.chunk_text(cleaned_text)
127
+
128
+ # Determine summary parameters based on type (optimized for speed)
129
  if summary_type == "Brief (Quick)":
130
+ max_len, min_len = 60, 20
131
  elif summary_type == "Detailed":
132
+ max_len, min_len = 100, 40
133
  else: # Comprehensive
134
+ max_len, min_len = 150, 60
135
+
136
+ # Summarize each chunk (with progress tracking)
137
+ chunk_summaries = []
138
+ for i, chunk in enumerate(chunks):
139
+ print(f"Processing chunk {i+1}/{len(chunks)}")
140
+ summary = self.summarize_chunk(chunk, max_len, min_len)
141
+ chunk_summaries.append(summary)
142
+
143
+ # Combine summaries
144
+ combined_summary = " ".join(chunk_summaries)
145
+
146
+ # Skip final summarization for speed if we have few chunks
147
+ if len(chunks) <= 2:
148
+ final_summary = combined_summary
149
  else:
150
+ # Quick final summary for multiple chunks
151
+ final_summary = self.summarize_chunk(
152
+ combined_summary,
153
+ max_length=min(200, max_len * 1.5),
154
+ min_length=min_len
155
+ )
156
+
157
+ # Create statistics
158
+ summary_stats = f"""
 
 
 
 
 
 
 
 
 
159
  πŸ“Š **Document Statistics:**
160
+ - Original word count: {word_count:,}
161
+ - Original character count: {char_count:,}
162
+ - Pages processed: {len(chunks)}
163
+ - Summary word count: {len(final_summary.split()):,}
164
+ - Compression ratio: {word_count / len(final_summary.split()):.1f}:1
165
  """
166
+
167
+ return final_summary, summary_stats, "βœ… Summary generated successfully!"
168
+
169
  except Exception as e:
170
+ return f"❌ Error processing PDF: {str(e)}", "", ""
171
 
172
+ # Initialize the summarizer
173
+ pdf_summarizer = PDFSummarizer()
174
 
175
  def summarize_pdf_interface(pdf_file, summary_type):
176
  """Gradio interface function"""
177
  if pdf_file is None:
178
  return "❌ Please upload a PDF file.", "", ""
179
+
180
  try:
181
+ # Read the uploaded file - pdf_file is already the file path
182
+ with open(pdf_file, 'rb') as f:
183
+ pdf_content = f.read()
184
+
185
+ # Process the PDF
186
+ summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
187
+
188
+ return summary, stats, status
189
+
190
  except Exception as e:
191
  return f"❌ Error: {str(e)}", "", ""
192
 
193
+ # Create Gradio interface
194
  def create_interface():
195
  with gr.Blocks(
196
+ title="πŸ“„ AI PDF Summarizer",
197
  theme=gr.themes.Soft(),
198
  css="""
199
  .gradio-container {
200
  max-width: 1200px !important;
201
  }
202
  .summary-box {
203
+ border-left: 4px solid #2196F3;
204
  padding: 16px;
205
  background-color: #f8f9fa;
206
  }
207
  """
208
  ) as interface:
209
+
210
  gr.Markdown("""
211
+ # πŸ“„ AI-Powered PDF Summarizer
212
+
213
+ Upload any PDF document and get an intelligent summary in seconds!
214
+ Perfect for research papers, reports, articles, and books.
215
+
216
+ **Features:**
217
+ - ⚑ Fast processing with BART model
218
+ - πŸ“Š Document statistics
219
+ - 🎯 Multiple summary lengths
220
+ - πŸ” Smart text chunking
221
  """)
222
+
223
  with gr.Row():
224
  with gr.Column(scale=1):
225
  pdf_input = gr.File(
226
+ label="πŸ“ Upload PDF File",
227
  file_types=[".pdf"],
228
  type="filepath"
229
  )
230
+
231
  summary_type = gr.Radio(
232
  choices=["Brief (Quick)", "Detailed", "Comprehensive"],
233
+ value="Detailed",
234
+ label="πŸ“ Summary Length",
235
+ info="Choose how detailed you want the summary to be"
236
  )
237
+
238
  summarize_btn = gr.Button(
239
+ "πŸš€ Generate Summary",
240
  variant="primary",
241
  size="lg"
242
  )
243
+
244
  status_output = gr.Textbox(
245
  label="πŸ“‹ Status",
246
+ interactive=False,
247
+ max_lines=2
248
  )
249
+
250
  with gr.Column(scale=2):
251
  summary_output = gr.Textbox(
252
+ label="πŸ“ Generated Summary",
253
+ lines=15,
254
+ max_lines=20,
255
  interactive=False,
256
  elem_classes=["summary-box"]
257
  )
258
+
259
+ stats_output = gr.Markdown(
260
+ label="πŸ“Š Document Statistics",
261
+ value="Upload a PDF to see statistics"
262
+ )
263
+
264
+ # Examples section
265
  gr.Markdown("""
266
+ ## πŸ’‘ Tips for Best Results:
267
+
268
+ - **File Quality**: Ensure your PDF has selectable text (not just images)
269
+ - **Length**: Works best with documents between 500-10,000 words
270
+ - **Language**: Optimized for English content
271
+ - **Format**: Clean, well-formatted PDFs produce better summaries
272
+
273
+ ## πŸ”§ Technical Details:
274
+ - **Model**: Facebook BART-Large-CNN (state-of-the-art summarization)
275
+ - **Processing**: Smart text chunking with overlap prevention
276
+ - **Speed**: GPU-accelerated when available
277
  """)
278
+
279
+ # Connect the button to the function
280
  summarize_btn.click(
281
  fn=summarize_pdf_interface,
282
  inputs=[pdf_input, summary_type],
283
  outputs=[summary_output, stats_output, status_output]
284
  )
285
+
286
+ # Auto-process when file is uploaded
287
  pdf_input.change(
288
  fn=summarize_pdf_interface,
289
  inputs=[pdf_input, summary_type],
290
  outputs=[summary_output, stats_output, status_output]
291
  )
292
+
293
  return interface
294
 
295
+ # Launch the application
296
  if __name__ == "__main__":
297
  interface = create_interface()
298
  interface.launch()