LovnishVerma committed on
Commit
11c716d
Β·
verified Β·
1 Parent(s): e226072

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +387 -160
app.py CHANGED
@@ -4,92 +4,178 @@ import io
4
  from transformers import pipeline, AutoTokenizer
5
  import torch
6
  import re
7
- from typing import List, Tuple
8
  import warnings
 
 
 
 
 
 
 
 
9
  warnings.filterwarnings("ignore")
10
 
11
class PDFSummarizer:
    """Summarize PDF documents with a distilled BART summarization pipeline.

    Loads a lightweight summarization model at construction time, then
    exposes helpers to extract, clean, chunk and summarize PDF text.
    """

    def __init__(self):
        # DistilBART-CNN is much faster than BART-large with similar quality.
        self.model_name = "sshleifer/distilbart-cnn-12-6"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        try:
            # fp16 on GPU halves memory; fp32 is required on CPU.
            self.summarizer = pipeline(
                "summarization",
                model=self.model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32}
            )
            # Tokenizer kept for length calculations.
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Model loaded successfully")
        except Exception as e:
            # Fall back to BART-large-CNN on CPU if the primary load fails.
            print(f"Error loading model: {e}")
            self.model_name = "facebook/bart-large-cnn"
            self.summarizer = pipeline("summarization", model=self.model_name, device=-1)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            print("Fallback model loaded")

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text content from raw PDF bytes.

        Raises:
            Exception: wrapping any underlying PyPDF2 error.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
            text = ""
            for page_num, page in enumerate(pdf_reader.pages):
                page_text = page.extract_text()
                if page_text.strip():
                    # Page markers are stripped again later by clean_text().
                    text += f"\n--- Page {page_num + 1} ---\n"
                    text += page_text
            return text.strip()
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def clean_text(self, text: str) -> str:
        """Clean and preprocess extracted text.

        Fix: page markers are removed *before* whitespace is collapsed, so
        removing a marker no longer leaves a double space behind.
        """
        # Remove the page markers inserted by extract_text_from_pdf()
        text = re.sub(r'--- Page \d+ ---', '', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
        # Collapse all whitespace runs last so no double spaces survive
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]:
        """Split text into at most five word-bounded chunks.

        Fix: a terminating '.' is only appended when a sentence does not
        already end with one, so the last sentence no longer gains '..'.
        """
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            # Restore the period lost by split('. ') without duplicating it.
            if not sentence.endswith('.'):
                sentence += '.'
            # Word count is a cheap proxy for token count.
            potential_chunk = (current_chunk + " " + sentence).strip()
            if len(potential_chunk.split()) <= max_chunk_length:
                current_chunk = potential_chunk
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
        if current_chunk:
            chunks.append(current_chunk)
        # Cap the number of chunks to bound processing time.
        return chunks[:5]

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Summarize a single chunk of text; returns an error string on failure."""
        try:
            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=2  # reduced from the default 4 for speed
            )
            return summary[0]['summary_text']
        except Exception as e:
            return f"Error summarizing chunk: {str(e)}"

    def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
        """End-to-end pipeline: extract -> clean -> chunk -> summarize.

        Returns:
            (summary, statistics_markdown, status_message); errors are
            reported inside the tuple instead of being raised.
        """
        try:
            raw_text = self.extract_text_from_pdf(pdf_file)
            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            cleaned_text = self.clean_text(raw_text)
            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)
            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize.", "", ""

            chunks = self.chunk_text(cleaned_text)

            # Per-chunk length budget for each summary style.
            if summary_type == "Brief (Quick)":
                max_len, min_len = 60, 20
            elif summary_type == "Detailed":
                max_len, min_len = 100, 40
            else:  # Comprehensive
                max_len, min_len = 150, 60

            chunk_summaries = []
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i+1}/{len(chunks)}")
                chunk_summaries.append(self.summarize_chunk(chunk, max_len, min_len))

            combined_summary = " ".join(chunk_summaries)

            if len(chunks) <= 2:
                # Few chunks: a second summarization pass adds little value.
                final_summary = combined_summary
            else:
                # Fix: the pipeline requires an int max_length; max_len * 1.5
                # is a float, so coerce before passing it in.
                final_summary = self.summarize_chunk(
                    combined_summary,
                    max_length=min(200, int(max_len * 1.5)),
                    min_length=min_len
                )

            # Fix: guard against division by zero on an empty summary, and
            # label the chunk count honestly (it is not a page count).
            summary_words = len(final_summary.split())
            compression = (word_count / summary_words) if summary_words else 0.0
            summary_stats = f"""
πŸ“Š **Document Statistics:**
- Original word count: {word_count:,}
- Original character count: {char_count:,}
- Chunks processed: {len(chunks)}
- Summary word count: {summary_words:,}
- Compression ratio: {compression:.1f}:1
            """

            return final_summary, summary_stats, "βœ… Summary generated successfully!"
        except Exception as e:
            return f"❌ Error processing PDF: {str(e)}", "", ""
 
171
 
172
# Initialize the summarizer once at import time.
# NOTE(review): constructing PDFSummarizer loads (and may download) the
# transformer model as a module side effect — presumably intended for
# app-startup on Spaces; confirm.
pdf_summarizer = PDFSummarizer()
174
 
175
def summarize_pdf_interface(pdf_file, summary_type):
    """Adapter between the Gradio widgets and PDFSummarizer.process_pdf.

    Reads the uploaded file (Gradio hands us a filesystem path) and
    returns the (summary, stats, status) triple the UI expects.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", ""
    try:
        # pdf_file is already the path of the uploaded temp file.
        with open(pdf_file, 'rb') as handle:
            raw_bytes = handle.read()
        return pdf_summarizer.process_pdf(raw_bytes, summary_type)
    except Exception as exc:
        return f"❌ Error: {str(exc)}", "", ""
 
192
 
193
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the PDF summarizer."""
    with gr.Blocks(
        title="πŸ“„ AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .summary-box {
            border-left: 4px solid #2196F3;
            padding: 16px;
            background-color: #f8f9fa;
        }
        """
    ) as interface:

        # Header / feature overview
        gr.Markdown("""
        # πŸ“„ AI-Powered PDF Summarizer

        Upload any PDF document and get an intelligent summary in seconds!
        Perfect for research papers, reports, articles, and books.

        **Features:**
        - ⚑ Fast processing with BART model
        - πŸ“Š Document statistics
        - 🎯 Multiple summary lengths
        - πŸ” Smart text chunking
        """)

        with gr.Row():
            # Left column: upload widget and controls
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="πŸ“ Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )

                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="πŸ“ Summary Length",
                    info="Choose how detailed you want the summary to be"
                )

                summarize_btn = gr.Button(
                    "πŸš€ Generate Summary",
                    variant="primary",
                    size="lg"
                )

                status_output = gr.Textbox(
                    label="πŸ“‹ Status",
                    interactive=False,
                    max_lines=2
                )

            # Right column: generated summary and statistics
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="πŸ“ Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"]
                )

                stats_output = gr.Markdown(
                    label="πŸ“Š Document Statistics",
                    value="Upload a PDF to see statistics"
                )

        # Examples section
        gr.Markdown("""
        ## πŸ’‘ Tips for Best Results:

        - **File Quality**: Ensure your PDF has selectable text (not just images)
        - **Length**: Works best with documents between 500-10,000 words
        - **Language**: Optimized for English content
        - **Format**: Clean, well-formatted PDFs produce better summaries

        ## πŸ”§ Technical Details:
        - **Model**: Facebook BART-Large-CNN (state-of-the-art summarization)
        - **Processing**: Smart text chunking with overlap prevention
        - **Speed**: GPU-accelerated when available
        """)

        # Connect the button to the function
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process when file is uploaded
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output]
        )

    return interface

# Launch the application
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()
 
 
 
 
 
 
4
  from transformers import pipeline, AutoTokenizer
5
  import torch
6
  import re
7
+ from typing import List, Tuple, Optional
8
  import warnings
9
+ import time
10
+ import logging
11
+ from pathlib import Path
12
+ import nltk
13
+ from sentence_transformers import SentenceTransformer
14
+ import numpy as np
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+
17
  warnings.filterwarnings("ignore")
18
 
19
+ # Set up logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
class EnhancedPDFSummarizer:
    """PDF summarizer with selectable models, sentence-aware chunking and
    a frequency-based extractive fallback.

    The instance holds the active `pipeline` in ``self.summarizer`` and the
    key of the loaded model in ``self.current_model``.
    """

    def __init__(self):
        # Speed/quality tradeoffs selectable from the UI.
        # Fix: the "quality" entry previously pointed at DialoGPT-large, a
        # dialogue model that cannot run in a summarization pipeline; use a
        # genuine summarization checkpoint instead.
        self.models = {
            "fast": "sshleifer/distilbart-cnn-12-6",
            "balanced": "facebook/bart-large-cnn",
            "quality": "google/pegasus-cnn_dailymail"
        }

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.current_model = "fast"
        logger.info(f"Using device: {self.device}")

        # Start with the fast model; the UI can request a switch later.
        self.load_model(self.current_model)

        # Sentence-tokenizer data used by intelligent_chunking(); best-effort.
        try:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
        except Exception:  # fix: no bare except
            logger.warning("Could not download NLTK data")

    def load_model(self, model_type: str = "fast"):
        """Load a summarization model by key, falling back to "fast" on failure.

        Raises:
            Exception: when even the "fast" model cannot be loaded.
        """
        try:
            model_name = self.models[model_type]
            logger.info(f"Loading model: {model_name}")

            # fp16 on GPU; low_cpu_mem_usage streams weights during load.
            model_kwargs = {
                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
                "low_cpu_mem_usage": True
            }

            self.summarizer = pipeline(
                "summarization",
                model=model_name,
                device=0 if self.device == "cuda" else -1,
                framework="pt",
                model_kwargs=model_kwargs
            )
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.current_model = model_type
            logger.info(f"Model {model_name} loaded successfully")
        except Exception as e:
            logger.error(f"Error loading model {model_type}: {e}")
            if model_type != "fast":
                logger.info("Falling back to fast model")
                self.load_model("fast")
            else:
                raise Exception(f"Could not load any summarization model: {e}")

    def extract_text_from_pdf(self, pdf_file) -> Tuple[str, int]:
        """Extract (text, page_count) from a file path or raw PDF bytes.

        Raises:
            Exception: when the PDF is unreadable or contains no text.
        """
        try:
            if isinstance(pdf_file, str):
                # A filesystem path (the shape Gradio hands over).
                with open(pdf_file, 'rb') as f:
                    pdf_content = f.read()
            else:
                # Already raw bytes.
                pdf_content = pdf_file

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
            text = ""
            page_count = len(pdf_reader.pages)

            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text and page_text.strip():
                        text += f"\n--- Page {page_num + 1} ---\n"
                        text += page_text
                except Exception as e:
                    # A single bad page should not abort the whole extraction.
                    logger.warning(f"Could not extract text from page {page_num + 1}: {e}")
                    continue

            if not text.strip():
                raise Exception("No readable text found in PDF. The PDF might be image-based or encrypted.")

            return text.strip(), page_count
        except Exception as e:
            raise Exception(f"Error extracting text from PDF: {str(e)}")

    def clean_and_preprocess_text(self, text: str) -> str:
        """Normalize extracted text and drop short/duplicated lines.

        Fix: only spaces/tabs are collapsed (newlines are preserved) so the
        line-level de-duplication below still sees individual lines; the old
        `\\s+` collapse flattened everything onto one line, making the
        dedup pass dead code.
        """
        # Remove page markers
        text = re.sub(r'--- Page \d+ ---', '', text)

        # Re-join words hyphenated across line breaks.
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
        # Collapse horizontal whitespace, keeping newlines for the dedup pass.
        text = re.sub(r'[ \t]+', ' ', text)
        # Remove special chars, keeping punctuation, quotes and newlines.
        text = re.sub(r'[^\w\s.,!?;:()\-"\'\n]', ' ', text)

        # Drop consecutive duplicate and very short lines (headers, footers).
        lines = text.split('\n')
        cleaned_lines = []
        prev_line = ""
        for line in lines:
            line = line.strip()
            if line and line != prev_line and len(line) > 10:
                cleaned_lines.append(line)
                prev_line = line

        return ' '.join(cleaned_lines).strip()

    def intelligent_chunking(self, text: str, max_chunk_length: int = 512) -> List[str]:
        """Chunk text on sentence boundaries up to ~max_chunk_length tokens.

        Chunks shorter than 20 words are dropped as not worth summarizing.
        """
        try:
            # NLTK gives better sentence boundaries when punkt data exists.
            sentences = nltk.sent_tokenize(text)
        except Exception:  # fix: no bare except (also covers missing punkt)
            # Fallback: split on '. ' and restore the separators.
            sentences = text.split('. ')
            sentences = [s + '.' for s in sentences[:-1]] + [sentences[-1]]

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Rough token estimate: 1 token ~ 4 characters.
            potential_chunk = current_chunk + " " + sentence
            estimated_tokens = len(potential_chunk) // 4

            if estimated_tokens <= max_chunk_length:
                current_chunk = potential_chunk.strip()
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence.strip()

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Filter out fragments too short to summarize meaningfully.
        return [chunk for chunk in chunks if len(chunk.split()) >= 20]

    def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str:
        """Abstractive summary of one chunk, with an extract-first fallback."""
        try:
            chunk_words = len(chunk.split())

            # Shrink the budget for short chunks so max > min always holds.
            if chunk_words < 100:
                max_length = min(max_length, chunk_words // 2)
                min_length = min(min_length, max_length // 2)

            summary = self.summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
                early_stopping=True,
                num_beams=3,  # balanced quality/speed
                length_penalty=1.0,
                repetition_penalty=1.1
            )
            return summary[0]['summary_text'].strip()
        except Exception as e:
            logger.error(f"Error summarizing chunk: {e}")
            # Fallback: first few sentences of the chunk verbatim.
            sentences = chunk.split('. ')[:3]
            return '. '.join(sentences) + '.' if sentences else chunk[:200] + '...'

    def generate_extractive_summary(self, text: str, num_sentences: int = 5) -> str:
        """Frequency-scored extractive summary used as fallback/complement."""
        try:
            sentences = nltk.sent_tokenize(text)
            if len(sentences) <= num_sentences:
                return text

            # Score words by frequency, ignoring very short tokens.
            words = re.findall(r'\w+', text.lower())
            word_freq = {}
            for word in words:
                if len(word) > 3:
                    word_freq[word] = word_freq.get(word, 0) + 1

            # Score each sentence by the frequencies of its words.
            sentence_scores = []
            for i, sentence in enumerate(sentences):
                score = 0
                words_in_sentence = re.findall(r'\w+', sentence.lower())
                for word in words_in_sentence:
                    if word in word_freq:
                        score += word_freq[word]

                # Sentences containing numbers often carry important facts.
                if re.search(r'\d+', sentence):
                    score *= 1.2

                sentence_scores.append((score, i, sentence))

            # Pick the top scorers, then restore document order by index.
            sentence_scores.sort(reverse=True)
            selected_sentences = sorted(sentence_scores[:num_sentences], key=lambda x: x[1])

            return ' '.join([sent[2] for sent in selected_sentences])
        except Exception as e:
            logger.error(f"Error in extractive summary: {e}")
            return text[:1000] + '...' if len(text) > 1000 else text

    def process_pdf(self, pdf_file, summary_type: str, model_choice: str = "fast") -> Tuple[str, str, str]:
        """Full pipeline: extract, clean, chunk, summarize, report stats.

        Returns:
            (summary, stats_markdown, status_message); errors are reported
            inside the tuple rather than raised.
        """
        start_time = time.time()

        try:
            # Switch model if the UI asked for a different one.
            if model_choice != self.current_model:
                self.load_model(model_choice)

            logger.info("Extracting text from PDF...")
            raw_text, page_count = self.extract_text_from_pdf(pdf_file)

            if not raw_text.strip():
                return "❌ Error: No text could be extracted from the PDF.", "", ""

            logger.info("Cleaning and preprocessing text...")
            cleaned_text = self.clean_and_preprocess_text(raw_text)

            word_count = len(cleaned_text.split())
            char_count = len(cleaned_text)

            if word_count < 50:
                return "❌ Error: PDF contains too little text to summarize (minimum 50 words required).", "", ""

            # Short documents are summarized directly; long ones get chunked.
            if word_count < 500:
                chunks = [cleaned_text]
            else:
                logger.info("Chunking text...")
                chunks = self.intelligent_chunking(cleaned_text)

            # Cap the number of chunks per summary style for performance.
            max_chunks = {"Brief (Quick)": 3, "Detailed": 6, "Comprehensive": 10}.get(summary_type, 6)
            if len(chunks) > max_chunks:
                chunks = chunks[:max_chunks]

            # Per-chunk length budgets for each summary style.
            summary_params = {
                "Brief (Quick)": {"max_len": 80, "min_len": 20},
                "Detailed": {"max_len": 130, "min_len": 40},
                "Comprehensive": {"max_len": 200, "min_len": 60}
            }
            params = summary_params.get(summary_type, summary_params["Detailed"])

            logger.info(f"Processing {len(chunks)} chunks...")
            chunk_summaries = []

            for i, chunk in enumerate(chunks):
                logger.info(f"Processing chunk {i+1}/{len(chunks)}")
                try:
                    summary = self.summarize_chunk(
                        chunk,
                        max_length=params["max_len"],
                        min_length=params["min_len"]
                    )
                    # Keep only non-trivial summaries.
                    if summary and len(summary.strip()) > 10:
                        chunk_summaries.append(summary)
                except Exception as e:
                    logger.warning(f"Failed to summarize chunk {i+1}: {e}")
                    # Extractive summary as a per-chunk fallback.
                    extractive = self.generate_extractive_summary(chunk, 2)
                    chunk_summaries.append(extractive)

            if not chunk_summaries:
                return "❌ Error: Could not generate any summaries from the PDF content.", "", ""

            combined_summary = " ".join(chunk_summaries)

            # Second abstractive pass when several chunk summaries got long.
            if len(chunks) > 2 and len(combined_summary.split()) > params["max_len"]:
                logger.info("Generating final summary...")
                try:
                    final_summary = self.summarize_chunk(
                        combined_summary,
                        max_length=min(300, params["max_len"] * 2),
                        min_length=params["min_len"]
                    )
                except Exception:  # fix: no bare except
                    final_summary = combined_summary
            else:
                final_summary = combined_summary

            processing_time = time.time() - start_time

            summary_words = len(final_summary.split())
            compression_ratio = word_count / summary_words if summary_words > 0 else 0

            stats = f"""
πŸ“Š **Document Analysis:**
- **Pages:** {page_count}
- **Original words:** {word_count:,}
- **Original characters:** {char_count:,}
- **Chunks processed:** {len(chunks)}
- **Summary words:** {summary_words:,}
- **Compression ratio:** {compression_ratio:.1f}:1
- **Processing time:** {processing_time:.1f}s
- **Model used:** {self.models[self.current_model]}

πŸ“ˆ **Quality Metrics:**
- **Readability:** {'High' if summary_words > 50 else 'Medium' if summary_words > 20 else 'Low'}
- **Coverage:** {min(100, (len(chunks) * 100) // max(1, page_count)):.0f}%
"""

            success_message = f"βœ… Summary generated successfully! ({summary_words} words in {processing_time:.1f}s)"

            return final_summary, stats, success_message
        except Exception as e:
            logger.error(f"Error processing PDF: {e}")
            return f"❌ Error processing PDF: {str(e)}", "", "❌ Processing failed"
358
 
359
# Initialize the enhanced summarizer once at import time.
# NOTE(review): constructing EnhancedPDFSummarizer loads (and may download)
# a transformer model and NLTK data as module side effects — presumably
# intended for app startup on Spaces; confirm.
pdf_summarizer = EnhancedPDFSummarizer()
361
 
362
def summarize_pdf_interface(pdf_file, summary_type, model_choice):
    """Adapter between the Gradio widgets and EnhancedPDFSummarizer.process_pdf.

    Returns the (summary, stats, status) triple expected by the three
    output widgets; all failures are converted into error strings.
    """
    if pdf_file is None:
        return "❌ Please upload a PDF file.", "", "⏳ Waiting for file upload..."
    try:
        # process_pdf accepts the file path Gradio supplies directly.
        return pdf_summarizer.process_pdf(pdf_file, summary_type, model_choice)
    except Exception as exc:
        logger.error(f"Interface error: {exc}")
        return f"❌ Error: {str(exc)}", "", "❌ Processing failed"
375
 
376
def create_enhanced_interface():
    """Create enhanced Gradio interface"""

    # Custom styling for the page width, summary panel and stats panel.
    custom_css = """
    .gradio-container {
        max-width: 1400px !important;
        margin: auto;
    }
    .summary-box {
        border-left: 4px solid #2196F3;
        padding: 20px;
        background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
        border-radius: 8px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .stats-box {
        background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
        padding: 15px;
        border-radius: 8px;
        border-left: 4px solid #1976d2;
    }
    .header-title {
        text-align: center;
        color: #1976d2;
        margin-bottom: 20px;
    }
    """

    with gr.Blocks(
        title="πŸ“„ Enhanced AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as interface:

        # Page header
        gr.HTML("""
        <div class="header-title">
            <h1>πŸ“„ Enhanced AI-Powered PDF Summarizer</h1>
            <p>Advanced document processing with multiple AI models and intelligent text analysis</p>
        </div>
        """)

        with gr.Row():
            # Left column: upload widget and controls
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="πŸ“ Upload PDF Document",
                    file_types=[".pdf"],
                    type="filepath"
                )

                with gr.Row():
                    summary_type = gr.Radio(
                        choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                        value="Detailed",
                        label="πŸ“ Summary Detail Level",
                        info="Choose the depth of analysis"
                    )

                    # Keys must match EnhancedPDFSummarizer.models.
                    model_choice = gr.Radio(
                        choices=["fast", "balanced", "quality"],
                        value="fast",
                        label="🧠 AI Model",
                        info="Speed vs Quality tradeoff"
                    )

                summarize_btn = gr.Button(
                    "πŸš€ Generate Smart Summary",
                    variant="primary",
                    size="lg"
                )

                status_output = gr.Textbox(
                    label="πŸ“‹ Processing Status",
                    value="⏳ Ready to process your PDF...",
                    interactive=False,
                    max_lines=2
                )

            # Right column: generated summary and statistics
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="πŸ“ AI-Generated Summary",
                    lines=18,
                    max_lines=25,
                    interactive=False,
                    elem_classes=["summary-box"],
                    placeholder="Your intelligent summary will appear here..."
                )

                stats_output = gr.Markdown(
                    value="πŸ“Š Upload a PDF to see detailed analysis and statistics",
                    elem_classes=["stats-box"]
                )

        # Enhanced information section
        with gr.Accordion("πŸ’‘ How to Get the Best Results", open=False):
            gr.Markdown("""
            ### πŸ“‹ Document Requirements:
            - **Text-based PDFs**: Ensure your PDF contains selectable text (not scanned images)
            - **Optimal length**: 500-50,000 words work best
            - **Language**: Optimized for English content
            - **Quality**: Well-formatted documents produce superior summaries

            ### 🎯 Summary Types:
            - **Brief (Quick)**: Fast overview, 60-80 words per section
            - **Detailed**: Balanced analysis, 100-130 words per section
            - **Comprehensive**: In-depth summary, 150-200 words per section

            ### 🧠 AI Models:
            - **Fast**: DistilBART - Quick processing, good quality
            - **Balanced**: BART-Large - Better quality, moderate speed
            - **Quality**: Premium model - Best results, slower processing

            ### πŸ”§ Advanced Features:
            - **Intelligent Chunking**: Semantic boundary detection
            - **Multi-stage Processing**: Hierarchical summarization
            - **Quality Metrics**: Readability and coverage analysis
            - **Fallback Systems**: Extractive summarization backup
            """)

        # Connect functionality
        summarize_btn.click(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type, model_choice],
            outputs=[summary_output, stats_output, status_output]
        )

        # Auto-process on file upload
        pdf_input.change(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type, model_choice],
            outputs=[summary_output, stats_output, status_output]
        )

        # Footer
        gr.HTML("""
        <div style="text-align: center; margin-top: 20px; color: #666;">
            <p>Powered by Transformers πŸ€— | Enhanced with intelligent text processing</p>
        </div>
        """)

    return interface
516
 
517
# Launch the enhanced application
if __name__ == "__main__":
    interface = create_enhanced_interface()
    # Bind on all interfaces on port 7860 — presumably the standard
    # Hugging Face Spaces configuration; confirm for other deployments.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False
    )