LovnishVerma committed on
Commit
bb6a28b
·
verified ·
1 Parent(s): fc57051

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +184 -173
app.py CHANGED
@@ -4,75 +4,70 @@ import io
4
  from transformers import pipeline, AutoTokenizer
5
  import torch
6
  import re
7
- from typing import Optional
8
- import logging
9
-
10
- # Set up logging
11
- logging.basicConfig(level=logging.INFO)
12
- logger = logging.getLogger(__name__)
13
 
14
  class PDFSummarizer:
15
  def __init__(self):
16
- """Initialize the PDF summarizer with optimized models."""
17
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
18
- logger.info(f"Using device: {self.device}")
19
-
20
  # Use a fast, efficient model for summarization
21
- model_name = "facebook/bart-large-cnn"
 
 
22
 
23
  try:
24
- # Load tokenizer and pipeline
25
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
26
  self.summarizer = pipeline(
27
  "summarization",
28
- model=model_name,
29
- tokenizer=self.tokenizer,
30
- device=0 if self.device == "cuda" else -1,
31
- torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
32
  )
33
- logger.info("Model loaded successfully")
 
 
 
 
34
  except Exception as e:
35
- logger.error(f"Error loading model: {e}")
36
- # Fallback to a smaller model
37
- self.summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
38
-
39
  def extract_text_from_pdf(self, pdf_file) -> str:
40
- """Extract text from uploaded PDF file."""
41
  try:
42
- # Read the PDF file
43
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
44
  text = ""
45
 
46
- # Extract text from all pages
47
- for page_num in range(len(pdf_reader.pages)):
48
- page = pdf_reader.pages[page_num]
49
- text += page.extract_text() + "\n"
50
-
51
- # Clean the text
52
- text = self.clean_text(text)
53
- return text
54
 
 
55
  except Exception as e:
56
- logger.error(f"Error extracting PDF text: {e}")
57
- return f"Error reading PDF: {str(e)}"
58
 
59
  def clean_text(self, text: str) -> str:
60
- """Clean and preprocess the extracted text."""
61
- # Remove extra whitespace and newlines
62
  text = re.sub(r'\s+', ' ', text)
63
  # Remove special characters but keep punctuation
64
- text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
 
 
65
  return text.strip()
66
 
67
- def chunk_text(self, text: str, max_chunk_length: int = 1000) -> list:
68
- """Split text into chunks for processing."""
69
  sentences = text.split('. ')
70
  chunks = []
71
  current_chunk = ""
72
 
73
  for sentence in sentences:
74
- if len(current_chunk) + len(sentence) < max_chunk_length:
75
- current_chunk += sentence + ". "
 
 
76
  else:
77
  if current_chunk:
78
  chunks.append(current_chunk.strip())
@@ -83,192 +78,208 @@ class PDFSummarizer:
83
 
84
  return chunks
85
 
86
- def summarize_text(self, text: str, summary_length: str = "medium") -> str:
87
- """Summarize the extracted text."""
88
- if not text or len(text.strip()) < 50:
89
- return "Text too short to summarize or empty content."
90
-
 
 
 
 
 
 
 
 
 
 
 
91
  try:
92
- # Set summary parameters based on length preference
93
- length_params = {
94
- "short": {"max_length": 100, "min_length": 30},
95
- "medium": {"max_length": 200, "min_length": 50},
96
- "long": {"max_length": 400, "min_length": 100}
97
- }
98
 
99
- params = length_params.get(summary_length, length_params["medium"])
 
100
 
101
- # Handle long texts by chunking
102
- if len(text) > 1024:
103
- chunks = self.chunk_text(text, 900)
104
- summaries = []
105
-
106
- for chunk in chunks[:5]: # Limit to first 5 chunks for speed
107
- try:
108
- summary = self.summarizer(
109
- chunk,
110
- max_length=params["max_length"] // len(chunks[:5]),
111
- min_length=params["min_length"] // len(chunks[:5]),
112
- do_sample=False
113
- )
114
- summaries.append(summary[0]['summary_text'])
115
- except Exception as e:
116
- logger.error(f"Error summarizing chunk: {e}")
117
- continue
118
-
119
- # Combine chunk summaries
120
- combined_summary = " ".join(summaries)
121
-
122
- # Final summarization if combined text is still long
123
- if len(combined_summary) > 512:
124
- final_summary = self.summarizer(
125
- combined_summary,
126
- max_length=params["max_length"],
127
- min_length=params["min_length"],
128
- do_sample=False
129
- )
130
- return final_summary[0]['summary_text']
131
- else:
132
- return combined_summary
133
- else:
134
- # Direct summarization for shorter texts
135
- summary = self.summarizer(
136
- text,
137
- max_length=params["max_length"],
138
- min_length=params["min_length"],
139
- do_sample=False
140
  )
141
- return summary[0]['summary_text']
142
-
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  except Exception as e:
144
- logger.error(f"Error during summarization: {e}")
145
- return f"Error generating summary: {str(e)}"
146
 
147
  # Initialize the summarizer
148
  pdf_summarizer = PDFSummarizer()
149
 
150
- def process_pdf(pdf_file, summary_length):
151
- """Main function to process PDF and return summary."""
152
  if pdf_file is None:
153
- return "Please upload a PDF file.", ""
154
 
155
  try:
156
- # Extract text from PDF
157
- extracted_text = pdf_summarizer.extract_text_from_pdf(pdf_file)
158
-
159
- if extracted_text.startswith("Error"):
160
- return extracted_text, ""
161
 
162
- # Generate summary
163
- summary = pdf_summarizer.summarize_text(extracted_text, summary_length)
164
 
165
- return summary, extracted_text[:1000] + "..." if len(extracted_text) > 1000 else extracted_text
166
 
167
  except Exception as e:
168
- logger.error(f"Error processing PDF: {e}")
169
- return f"Error processing PDF: {str(e)}", ""
170
 
171
  # Create Gradio interface
172
  def create_interface():
173
- """Create and configure the Gradio interface."""
174
-
175
  with gr.Blocks(
176
- title="PDF Summarizer",
177
  theme=gr.themes.Soft(),
178
  css="""
179
  .gradio-container {
180
- max-width: 1200px;
181
- margin: 0 auto;
182
  }
183
- .header {
184
- text-align: center;
185
- margin-bottom: 2rem;
 
186
  }
187
  """
188
- ) as app:
 
 
 
 
 
 
189
 
190
- gr.HTML("""
191
- <div class="header">
192
- <h1>🚀 Fast PDF Summarizer</h1>
193
- <p>Upload a PDF file and get an instant AI-powered summary!</p>
194
- </div>
195
  """)
196
 
197
  with gr.Row():
198
  with gr.Column(scale=1):
199
- # Input components
200
  pdf_input = gr.File(
201
- label="Upload PDF File",
202
  file_types=[".pdf"],
203
- file_count="single"
204
  )
205
 
206
- summary_length = gr.Radio(
207
- choices=["short", "medium", "long"],
208
- value="medium",
209
- label="Summary Length",
210
  info="Choose how detailed you want the summary to be"
211
  )
212
 
213
  summarize_btn = gr.Button(
214
- "Summarize PDF",
215
  variant="primary",
216
  size="lg"
217
  )
 
 
 
 
 
 
218
 
219
  with gr.Column(scale=2):
220
- # Output components
221
  summary_output = gr.Textbox(
222
- label="Summary",
223
- lines=10,
224
- placeholder="Your PDF summary will appear here...",
225
- max_lines=15
 
226
  )
227
 
228
- with gr.Accordion("View Extracted Text", open=False):
229
- extracted_text_output = gr.Textbox(
230
- label="Extracted Text (Preview)",
231
- lines=5,
232
- max_lines=10,
233
- placeholder="Extracted text preview will appear here..."
234
- )
 
 
 
 
 
 
235
 
236
- # Event handlers
 
 
 
 
 
 
237
  summarize_btn.click(
238
- fn=process_pdf,
239
- inputs=[pdf_input, summary_length],
240
- outputs=[summary_output, extracted_text_output],
241
- show_progress=True
242
  )
243
 
244
  # Auto-process when file is uploaded
245
  pdf_input.change(
246
- fn=process_pdf,
247
- inputs=[pdf_input, summary_length],
248
- outputs=[summary_output, extracted_text_output]
249
  )
250
-
251
- # Examples section
252
- gr.HTML("""
253
- <div style="margin-top: 2rem; padding: 1rem; background-color: #f0f0f0; border-radius: 8px;">
254
- <h3>💡 Tips for Best Results:</h3>
255
- <ul>
256
- <li>Upload clear, text-based PDFs (not scanned images)</li>
257
- <li>Choose 'short' for quick overviews, 'long' for detailed summaries</li>
258
- <li>Large PDFs are automatically chunked for faster processing</li>
259
- <li>The app works best with documents under 50 pages</li>
260
- </ul>
261
- </div>
262
- """)
263
 
264
- return app
265
 
266
- # Create and launch the app
267
  if __name__ == "__main__":
268
- app = create_interface()
269
- app.launch(
270
- share=True,
271
- server_name="0.0.0.0",
272
- server_port=7860,
273
- max_file_size="10mb"
274
- )
 
4
  from transformers import pipeline, AutoTokenizer
5
  import torch
6
  import re
7
+ from typing import List, Tuple
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
 
 
 
10
 
11
  class PDFSummarizer:
12
  def __init__(self):
 
 
 
 
13
  # Use a fast, efficient model for summarization
14
+ self.model_name = "facebook/bart-large-cnn"
15
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
16
+ print(f"Using device: {self.device}")
17
 
18
  try:
19
+ # Initialize the summarization pipeline
 
20
  self.summarizer = pipeline(
21
  "summarization",
22
+ model=self.model_name,
23
+ device=0 if self.device == "cuda" else -1
 
 
24
  )
25
+
26
+ # Initialize tokenizer for length calculations
27
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
28
+ print("Model loaded successfully")
29
+
30
  except Exception as e:
31
+ print(f"Error loading model: {e}")
32
+ raise e
33
+
 
34
  def extract_text_from_pdf(self, pdf_file) -> str:
35
+ """Extract text content from PDF file"""
36
  try:
 
37
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
38
  text = ""
39
 
40
+ for page_num, page in enumerate(pdf_reader.pages):
41
+ page_text = page.extract_text()
42
+ if page_text.strip():
43
+ text += f"\n--- Page {page_num + 1} ---\n"
44
+ text += page_text
 
 
 
45
 
46
+ return text.strip()
47
  except Exception as e:
48
+ raise Exception(f"Error extracting text from PDF: {str(e)}")
 
49
 
50
  def clean_text(self, text: str) -> str:
51
+ """Clean and preprocess text"""
52
+ # Remove extra whitespaces and newlines
53
  text = re.sub(r'\s+', ' ', text)
54
  # Remove special characters but keep punctuation
55
+ text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text)
56
+ # Remove page markers
57
+ text = re.sub(r'--- Page \d+ ---', '', text)
58
  return text.strip()
59
 
60
+ def chunk_text(self, text: str, max_chunk_length: int = 900) -> List[str]:
61
+ """Split text into manageable chunks for processing"""
62
  sentences = text.split('. ')
63
  chunks = []
64
  current_chunk = ""
65
 
66
  for sentence in sentences:
67
+ # Check if adding this sentence would exceed the limit
68
+ potential_chunk = current_chunk + sentence + ". "
69
+ if len(self.tokenizer.encode(potential_chunk)) <= max_chunk_length:
70
+ current_chunk = potential_chunk
71
  else:
72
  if current_chunk:
73
  chunks.append(current_chunk.strip())
 
78
 
79
  return chunks
80
 
81
+ def summarize_chunk(self, chunk: str, max_length: int = 150, min_length: int = 50) -> str:
82
+ """Summarize a single chunk of text"""
83
+ try:
84
+ summary = self.summarizer(
85
+ chunk,
86
+ max_length=max_length,
87
+ min_length=min_length,
88
+ do_sample=False,
89
+ truncation=True
90
+ )
91
+ return summary[0]['summary_text']
92
+ except Exception as e:
93
+ return f"Error summarizing chunk: {str(e)}"
94
+
95
+ def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]:
96
+ """Main function to process PDF and generate summary"""
97
  try:
98
+ # Extract text from PDF
99
+ raw_text = self.extract_text_from_pdf(pdf_file)
 
 
 
 
100
 
101
+ if not raw_text.strip():
102
+ return "❌ Error: No text could be extracted from the PDF.", "", ""
103
 
104
+ # Clean the text
105
+ cleaned_text = self.clean_text(raw_text)
106
+
107
+ # Calculate text statistics
108
+ word_count = len(cleaned_text.split())
109
+ char_count = len(cleaned_text)
110
+
111
+ if word_count < 50:
112
+ return "❌ Error: PDF contains too little text to summarize.", "", ""
113
+
114
+ # Chunk the text for processing
115
+ chunks = self.chunk_text(cleaned_text)
116
+
117
+ # Determine summary parameters based on type
118
+ if summary_type == "Brief (Quick)":
119
+ max_len, min_len = 100, 30
120
+ elif summary_type == "Detailed":
121
+ max_len, min_len = 200, 80
122
+ else: # Comprehensive
123
+ max_len, min_len = 300, 120
124
+
125
+ # Summarize each chunk
126
+ chunk_summaries = []
127
+ for i, chunk in enumerate(chunks):
128
+ summary = self.summarize_chunk(chunk, max_len, min_len)
129
+ chunk_summaries.append(summary)
130
+
131
+ # Combine summaries
132
+ combined_summary = " ".join(chunk_summaries)
133
+
134
+ # If we have multiple chunks, create a final summary
135
+ if len(chunks) > 1:
136
+ final_summary = self.summarize_chunk(
137
+ combined_summary,
138
+ max_length=min(500, max_len * 2),
139
+ min_length=min_len
 
 
 
140
  )
141
+ else:
142
+ final_summary = combined_summary
143
+
144
+ # Create statistics
145
+ summary_stats = f"""
146
+ πŸ“Š **Document Statistics:**
147
+ - Original word count: {word_count:,}
148
+ - Original character count: {char_count:,}
149
+ - Pages processed: {len(chunks)}
150
+ - Summary word count: {len(final_summary.split()):,}
151
+ - Compression ratio: {word_count / len(final_summary.split()):.1f}:1
152
+ """
153
+
154
+ return final_summary, summary_stats, "βœ… Summary generated successfully!"
155
+
156
  except Exception as e:
157
+ return f"❌ Error processing PDF: {str(e)}", "", ""
 
158
 
159
  # Initialize the summarizer
160
  pdf_summarizer = PDFSummarizer()
161
 
162
+ def summarize_pdf_interface(pdf_file, summary_type):
163
+ """Gradio interface function"""
164
  if pdf_file is None:
165
+ return "❌ Please upload a PDF file.", "", ""
166
 
167
  try:
168
+ # Read the uploaded file
169
+ with open(pdf_file.name, 'rb') as f:
170
+ pdf_content = f.read()
 
 
171
 
172
+ # Process the PDF
173
+ summary, stats, status = pdf_summarizer.process_pdf(pdf_content, summary_type)
174
 
175
+ return summary, stats, status
176
 
177
  except Exception as e:
178
+ return f"❌ Error: {str(e)}", "", ""
 
179
 
180
  # Create Gradio interface
181
  def create_interface():
 
 
182
  with gr.Blocks(
183
+ title="πŸ“„ AI PDF Summarizer",
184
  theme=gr.themes.Soft(),
185
  css="""
186
  .gradio-container {
187
+ max-width: 1200px !important;
 
188
  }
189
+ .summary-box {
190
+ border-left: 4px solid #2196F3;
191
+ padding: 16px;
192
+ background-color: #f8f9fa;
193
  }
194
  """
195
+ ) as interface:
196
+
197
+ gr.Markdown("""
198
+ # 📄 AI-Powered PDF Summarizer
199
+
200
+ Upload any PDF document and get an intelligent summary in seconds!
201
+ Perfect for research papers, reports, articles, and books.
202
 
203
+ **Features:**
204
+ - ⚑ Fast processing with BART model
205
+ - 📊 Document statistics
206
+ - 🎯 Multiple summary lengths
207
+ - 🔍 Smart text chunking
208
  """)
209
 
210
  with gr.Row():
211
  with gr.Column(scale=1):
 
212
  pdf_input = gr.File(
213
+ label="πŸ“ Upload PDF File",
214
  file_types=[".pdf"],
215
+ type="binary"
216
  )
217
 
218
+ summary_type = gr.Radio(
219
+ choices=["Brief (Quick)", "Detailed", "Comprehensive"],
220
+ value="Detailed",
221
+ label="πŸ“ Summary Length",
222
  info="Choose how detailed you want the summary to be"
223
  )
224
 
225
  summarize_btn = gr.Button(
226
+ "πŸš€ Generate Summary",
227
  variant="primary",
228
  size="lg"
229
  )
230
+
231
+ status_output = gr.Textbox(
232
+ label="πŸ“‹ Status",
233
+ interactive=False,
234
+ max_lines=2
235
+ )
236
 
237
  with gr.Column(scale=2):
 
238
  summary_output = gr.Textbox(
239
+ label="πŸ“ Generated Summary",
240
+ lines=15,
241
+ max_lines=20,
242
+ interactive=False,
243
+ elem_classes=["summary-box"]
244
  )
245
 
246
+ stats_output = gr.Markdown(
247
+ label="πŸ“Š Document Statistics",
248
+ value="Upload a PDF to see statistics"
249
+ )
250
+
251
+ # Examples section
252
+ gr.Markdown("""
253
+ ## 💡 Tips for Best Results:
254
+
255
+ - **File Quality**: Ensure your PDF has selectable text (not just images)
256
+ - **Length**: Works best with documents between 500-10,000 words
257
+ - **Language**: Optimized for English content
258
+ - **Format**: Clean, well-formatted PDFs produce better summaries
259
 
260
+ ## 🔧 Technical Details:
261
+ - **Model**: Facebook BART-Large-CNN (state-of-the-art summarization)
262
+ - **Processing**: Smart text chunking with overlap prevention
263
+ - **Speed**: GPU-accelerated when available
264
+ """)
265
+
266
+ # Connect the button to the function
267
  summarize_btn.click(
268
+ fn=summarize_pdf_interface,
269
+ inputs=[pdf_input, summary_type],
270
+ outputs=[summary_output, stats_output, status_output]
 
271
  )
272
 
273
  # Auto-process when file is uploaded
274
  pdf_input.change(
275
+ fn=summarize_pdf_interface,
276
+ inputs=[pdf_input, summary_type],
277
+ outputs=[summary_output, stats_output, status_output]
278
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
+ return interface
281
 
282
+ # Launch the application
283
  if __name__ == "__main__":
284
+ interface = create_interface()
285
+ interface.launch()