LovnishVerma committed on
Commit
97d1ec6
·
verified ·
1 Parent(s): 3b0bb36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +266 -39
app.py CHANGED
@@ -1,47 +1,274 @@
1
  import gradio as gr
2
- from pdfminer.high_level import extract_text
3
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
4
- import os
 
 
 
 
5
 
6
- # Load summarization model
7
- model_name = "google/pegasus-xsum"
8
- tokenizer = AutoTokenizer.from_pretrained(model_name)
9
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
10
- summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
11
 
12
- # Extract text from PDF
13
- def extract_text_from_pdf(pdf_file):
14
- with open(pdf_file.name, "rb") as f:
15
- return extract_text(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- # Summarize text (truncate to first 1024 words)
18
- def summarize_text(text):
19
- # Optional preprocessing
20
- text = text.replace("\n", " ")
21
- words = text.split()
22
- if len(words) > 1024:
23
- text = " ".join(words[:1024])
 
24
  try:
25
- summary = summarizer(text, max_length=128, min_length=30, do_sample=False)[0]["summary_text"]
26
- return summary
 
 
 
 
 
 
 
 
 
27
  except Exception as e:
28
- return f" Error: {str(e)}"
29
-
30
- # Gradio interface
31
- def summarize_pdf(pdf_file):
32
- text = extract_text_from_pdf(pdf_file)
33
- if not text.strip():
34
- return "⚠️ No extractable text found in the PDF."
35
- return summarize_text(text)
36
 
37
- # UI
38
- with gr.Blocks() as demo:
39
- gr.Markdown("# 📄 PDF Summarizer (Fast, Pegasus-XSum)\nUpload a PDF file to generate a quick and accurate summary.")
40
- with gr.Row():
41
- file_input = gr.File(label="Upload your PDF", file_types=[".pdf"])
42
- with gr.Row():
43
- output = gr.Textbox(label="Summary", lines=15)
44
- btn = gr.Button("Summarize")
45
- btn.click(summarize_pdf, inputs=file_input, outputs=output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- demo.launch()
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import PyPDF2
3
+ import io
4
+ from transformers import pipeline, AutoTokenizer
5
+ import torch
6
+ import re
7
+ from typing import Optional
8
+ import logging
9
 
10
# Set up logging: configure the root logger at INFO level and create the
# module-level logger used by the summarizer class and request handler below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
 
 
13
 
14
class PDFSummarizer:
    """Extract text from an uploaded PDF and summarize it.

    The summarization pipeline is loaded once in ``__init__``. The remaining
    methods are small text-processing steps (extract, clean, chunk,
    summarize). Extraction and summarization report failures as
    ``"Error ..."`` strings instead of raising, so the UI can display them.
    """

    # Only the first MAX_CHUNKS chunks of a long document are summarized;
    # this trades coverage of the document for response time.
    MAX_CHUNKS = 5

    def __init__(self):
        """Load the summarization pipeline, falling back to a smaller model.

        Uses the GPU (with float16 weights) when CUDA is available and the
        CPU (float32) otherwise.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        model_name = "facebook/bart-large-cnn"

        # Defined up front so the attribute always exists, even when the
        # tokenizer download below fails and the fallback branch runs.
        self.tokenizer = None

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.summarizer = pipeline(
                "summarization",
                model=model_name,
                tokenizer=self.tokenizer,
                device=0 if self.device == "cuda" else -1,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            )
            logger.info("Model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            # Fallback to a smaller model
            self.summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    @staticmethod
    def _read_pdf_bytes(pdf_file) -> bytes:
        """Return the raw bytes of an upload given as bytes, a path, or a file object."""
        if isinstance(pdf_file, (bytes, bytearray, memoryview)):
            return bytes(pdf_file)

        if isinstance(pdf_file, str) or hasattr(pdf_file, "__fspath__"):
            path = pdf_file
        else:
            # Temp-file wrappers expose the on-disk location as ``.name``.
            path = getattr(pdf_file, "name", None)

        if isinstance(path, str) or hasattr(path, "__fspath__"):
            with open(path, "rb") as handle:
                return handle.read()

        # Plain in-memory stream (e.g. io.BytesIO) without a usable name.
        return pdf_file.read()

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract and clean the text of every page of a PDF.

        Args:
            pdf_file: The upload as raw bytes, a filesystem path, or a
                file-like object exposing ``.name`` or ``.read()``.

        Returns:
            The cleaned text of all pages, or a message starting with
            ``"Error reading PDF: "`` if the file could not be parsed.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(self._read_pdf_bytes(pdf_file)))
            # extract_text() may yield None for pages without a text layer.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
            return self.clean_text("\n".join(page_texts))
        except Exception as e:
            logger.error(f"Error extracting PDF text: {e}")
            return f"Error reading PDF: {str(e)}"

    def clean_text(self, text: str) -> str:
        """Collapse whitespace runs and drop characters outside the allowed set.

        Word characters, whitespace and the punctuation ``.,!?;:-'"()`` are
        kept; everything else is removed. The result is stripped.
        """
        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
        return text.strip()

    def chunk_text(self, text: str, max_chunk_length: int = 1000) -> list:
        """Greedily pack sentences into chunks shorter than ``max_chunk_length``.

        Sentences are delimited by ``". "``. A single sentence that is longer
        than the limit is emitted as its own (over-long) chunk rather than
        being cut mid-sentence.

        Args:
            text: Text to split.
            max_chunk_length: Exclusive upper bound on chunk length, in
                characters.

        Returns:
            The list of chunk strings; empty when ``text`` has no content.
        """
        pieces = text.split(". ")

        # Splitting consumed the ". " after every piece except the last, so
        # only those pieces get their period back. The final piece keeps
        # whatever punctuation it already had (no doubled "..").
        sentences = [piece.strip() + "." for piece in pieces[:-1] if piece.strip()]
        tail = pieces[-1].strip()
        if tail:
            sentences.append(tail)

        chunks = []
        current = ""
        for sentence in sentences:
            candidate = f"{current} {sentence}" if current else sentence
            if current and len(candidate) >= max_chunk_length:
                chunks.append(current)
                current = sentence
            else:
                current = candidate

        if current:
            chunks.append(current)

        return chunks

    def _summarize(self, text: str, max_length: int, min_length: int) -> str:
        """Run the pipeline once and return the generated summary text."""
        result = self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # Guard against inputs longer than the model's context window,
            # which chunk_text can produce for a single very long sentence.
            truncation=True,
        )
        return result[0]["summary_text"]

    def summarize_text(self, text: str, summary_length: str = "medium") -> str:
        """Summarize ``text``, chunking it first when it is long.

        Args:
            text: Text to summarize.
            summary_length: ``"short"``, ``"medium"`` or ``"long"``; unknown
                values fall back to ``"medium"``.

        Returns:
            The summary, or an explanatory message when the text is too short
            or summarization failed.
        """
        if not text or len(text.strip()) < 50:
            return "Text too short to summarize or empty content."

        try:
            # Set summary parameters based on length preference
            length_params = {
                "short": {"max_length": 100, "min_length": 30},
                "medium": {"max_length": 200, "min_length": 50},
                "long": {"max_length": 400, "min_length": 100},
            }
            params = length_params.get(summary_length, length_params["medium"])

            # Direct summarization for shorter texts
            if len(text) <= 1024:
                return self._summarize(text, params["max_length"], params["min_length"])

            # Handle long texts by chunking; the length budget is split
            # evenly across the chunks that will actually be summarized.
            chunks = self.chunk_text(text, 900)[: self.MAX_CHUNKS]
            chunk_count = max(len(chunks), 1)
            chunk_max = params["max_length"] // chunk_count
            chunk_min = params["min_length"] // chunk_count

            summaries = []
            for chunk in chunks:
                try:
                    summaries.append(self._summarize(chunk, chunk_max, chunk_min))
                except Exception as e:
                    # Best effort: one bad chunk should not lose the rest.
                    logger.error(f"Error summarizing chunk: {e}")

            if not summaries:
                # Previously an empty string was returned here, which the UI
                # showed as a blank summary with no explanation.
                return "Error generating summary: no text chunk could be summarized."

            # Combine chunk summaries
            combined_summary = " ".join(summaries)

            # Final summarization if combined text is still long
            if len(combined_summary) > 512:
                return self._summarize(
                    combined_summary, params["max_length"], params["min_length"]
                )
            return combined_summary

        except Exception as e:
            logger.error(f"Error during summarization: {e}")
            return f"Error generating summary: {str(e)}"
146
 
147
# Initialize the summarizer once at import time: constructing PDFSummarizer
# loads the summarization pipeline, and this single instance is the one
# process_pdf uses for every request.
pdf_summarizer = PDFSummarizer()
149
+
150
def process_pdf(pdf_file, summary_length):
    """Extract the text of an uploaded PDF and summarize it.

    Args:
        pdf_file: The value of the Gradio file input, or ``None`` when no
            file is selected.
        summary_length: ``"short"``, ``"medium"`` or ``"long"``.

    Returns:
        A ``(summary, preview)`` tuple of strings. ``preview`` is at most the
        first 1000 characters of the extracted text (followed by ``"..."``
        when truncated). On failure ``summary`` carries the error message and
        ``preview`` is empty.
    """
    if pdf_file is None:
        return "Please upload a PDF file.", ""

    try:
        # Extract text from PDF
        extracted_text = pdf_summarizer.extract_text_from_pdf(pdf_file)

        # Extraction failures are signalled in-band with this exact prefix.
        # Matching the full prefix (not just "Error") keeps a document whose
        # own text happens to start with "Error" from being treated as one.
        if extracted_text.startswith("Error reading PDF:"):
            return extracted_text, ""

        # Generate summary
        summary = pdf_summarizer.summarize_text(extracted_text, summary_length)

        if len(extracted_text) > 1000:
            preview = extracted_text[:1000] + "..."
        else:
            preview = extracted_text
        return summary, preview

    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return f"Error processing PDF: {str(e)}", ""
 
 
 
 
 
 
170
 
171
+ # Create Gradio interface
172
def create_interface():
    """Build the Gradio Blocks UI for the PDF summarizer and return it.

    Layout: a header, then a row with the inputs (file upload, summary-length
    selector, button) on the left and the outputs (summary, collapsible
    extracted-text preview) on the right, followed by a tips panel. Both the
    button click and a change of the uploaded file run ``process_pdf``.
    """
    page_css = """
    .gradio-container {
        max-width: 1200px;
        margin: 0 auto;
    }
    .header {
        text-align: center;
        margin-bottom: 2rem;
    }
    """

    header_html = """
        <div class="header">
            <h1>🚀 Fast PDF Summarizer</h1>
            <p>Upload a PDF file and get an instant AI-powered summary!</p>
        </div>
        """

    tips_html = """
        <div style="margin-top: 2rem; padding: 1rem; background-color: #f0f0f0; border-radius: 8px;">
            <h3>💡 Tips for Best Results:</h3>
            <ul>
                <li>Upload clear, text-based PDFs (not scanned images)</li>
                <li>Choose 'short' for quick overviews, 'long' for detailed summaries</li>
                <li>Large PDFs are automatically chunked for faster processing</li>
                <li>The app works best with documents under 50 pages</li>
            </ul>
        </div>
        """

    with gr.Blocks(title="PDF Summarizer", theme=gr.themes.Soft(), css=page_css) as app:
        gr.HTML(header_html)

        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="Upload PDF File",
                    file_types=[".pdf"],
                    file_count="single",
                )
                summary_length = gr.Radio(
                    choices=["short", "medium", "long"],
                    value="medium",
                    label="Summary Length",
                    info="Choose how detailed you want the summary to be",
                )
                summarize_btn = gr.Button("Summarize PDF", variant="primary", size="lg")

            # Right column: outputs.
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=10,
                    placeholder="Your PDF summary will appear here...",
                    max_lines=15,
                )
                with gr.Accordion("View Extracted Text", open=False):
                    extracted_text_output = gr.Textbox(
                        label="Extracted Text (Preview)",
                        lines=5,
                        max_lines=10,
                        placeholder="Extracted text preview will appear here...",
                    )

        # Both triggers share the same handler, inputs and outputs.
        handler_wiring = {
            "fn": process_pdf,
            "inputs": [pdf_input, summary_length],
            "outputs": [summary_output, extracted_text_output],
        }

        # Explicit request via the button.
        summarize_btn.click(show_progress=True, **handler_wiring)

        # Auto-process when the uploaded file changes.
        pdf_input.change(**handler_wiring)

        # Tips panel shown below the main row.
        gr.HTML(tips_html)

    return app
265
 
266
+ # Create and launch the app
267
if __name__ == "__main__":
    # Build the UI and serve it only when this file is run as a script.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "max_file_size": "10mb",
    }
    app = create_interface()
    app.launch(**launch_options)