LovnishVerma commited on
Commit
28b7150
Β·
verified Β·
1 Parent(s): 52d9d60

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +331 -0
app.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio app that summarizes PDF documents and answers questions about them.

Pipeline: PyMuPDF text extraction -> word/char chunking -> sentence-transformer
embeddings with FAISS retrieval -> BART summarization / DistilBERT extractive QA.
"""
import fitz  # PyMuPDF — PDF parsing and text extraction
import gradio as gr
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import time
from typing import List, Tuple, Optional

# Load all three models once at import time so per-request calls don't pay
# the model-download/startup cost. A failure here aborts app startup —
# running without models would make every request fail anyway.
try:
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    qa_pipeline = pipeline("question-answering",
                           model="distilbert-base-uncased-distilled-squad")
except Exception as e:
    print(f"Error loading models: {e}")
    raise  # fail fast: re-raise so the process exits instead of limping on
20
+
21
+
22
def extract_text_from_pdf(file_path: str) -> Tuple[str, int, bool]:
    """Extract all text from a PDF with page markers.

    Args:
        file_path: Filesystem path to the PDF document.

    Returns:
        Tuple of ``(text, page_count, is_text_rich)``:

        * ``text`` — concatenated page text, each page prefixed with a
          ``--- Page N ---`` marker, or an error message on failure.
        * ``page_count`` — number of pages in the document (0 on failure).
        * ``is_text_rich`` — True when more than 100 stripped characters
          were extracted; heuristic for "not a pure image scan".
    """
    try:
        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            # Collect per-page strings and join once: avoids quadratic
            # string concatenation on large documents.
            parts = [
                f"\n--- Page {page_num + 1} ---\n{page.get_text()}"
                for page_num, page in enumerate(doc)
            ]
            text = "".join(parts)
        finally:
            # Close even if a page fails mid-extraction; previously the
            # handle leaked when get_text() raised inside the loop.
            doc.close()

        # Check if text extraction was successful
        is_text_rich = len(text.strip()) > 100
        return text, page_count, is_text_rich

    except Exception as e:
        # Keep the (str, int, bool) shape so callers need no special-casing.
        return f"Error extracting text: {str(e)}", 0, False
41
+
42
+
43
def chunk_text(text: str, max_chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into word-based chunks with overlap between neighbors.

    Overlapping chunks preserve context that would otherwise be cut at a
    chunk boundary, which improves downstream retrieval quality.

    Args:
        text: Input text; split on whitespace.
        max_chunk_size: Maximum number of words per chunk. Must be > 0.
        overlap: Number of words shared between consecutive chunks.
            Must satisfy ``0 <= overlap < max_chunk_size``.

    Returns:
        List of chunk strings; ``[text]`` unchanged when the text fits in
        a single chunk (including empty text).

    Raises:
        ValueError: If the parameters are invalid. In particular,
            ``overlap >= max_chunk_size`` previously caused an infinite
            loop because the window never advanced.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")
    if overlap < 0 or overlap >= max_chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chunk_size")

    words = text.split()
    if len(words) <= max_chunk_size:
        return [text]

    chunks: List[str] = []
    start = 0
    while start < len(words):
        end = min(start + max_chunk_size, len(words))
        chunks.append(" ".join(words[start:end]))
        # Step back by `overlap` words unless we just consumed the tail.
        start = end - overlap if end < len(words) else end

    return chunks
61
+
62
+
63
def build_faiss_index(chunks: List[str]) -> Tuple[faiss.IndexFlatL2, np.ndarray]:
    """Embed *chunks* and build an exact L2 FAISS index over them.

    Args:
        chunks: Text chunks to index.

    Returns:
        Tuple of ``(index, embeddings)`` where ``index`` is a flat L2
        FAISS index containing one vector per chunk and ``embeddings`` is
        the raw embedding matrix of shape ``(len(chunks), dim)``.

    Raises:
        RuntimeError: If encoding or indexing fails; chained to the
            original exception so the traceback is preserved.
    """
    try:
        # Encode all chunks in a single batch call.
        embeddings = embedding_model.encode(chunks, show_progress_bar=True)
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        # FAISS requires float32 vectors.
        index.add(embeddings.astype('float32'))
        return index, embeddings
    except Exception as e:
        # RuntimeError (a subclass of Exception, so existing callers'
        # handlers still match) chained with `from e` instead of a bare
        # Exception that discarded the original traceback.
        raise RuntimeError(f"Error building FAISS index: {str(e)}") from e
73
+
74
+
75
def retrieve_relevant_chunks(query: str, chunks: List[str], index: faiss.IndexFlatL2,
                             embeddings: np.ndarray, top_k: int = 5) -> List[str]:
    """Return the chunks most semantically similar to *query*.

    Args:
        query: Natural-language question or search phrase.
        chunks: The chunk list the index was built from.
        index: FAISS index over the chunk embeddings.
        embeddings: Chunk embedding matrix (kept for interface
            compatibility; the search itself only uses ``index``).
        top_k: Maximum number of chunks to return (capped at len(chunks)).

    Returns:
        Up to ``top_k`` chunk strings ranked by L2 distance, or a
        single-element list carrying an error message on failure.
    """
    try:
        query_vec = embedding_model.encode([query]).astype('float32')
        k = min(top_k, len(chunks))
        _, hit_ids = index.search(query_vec, k)
        # Guard against out-of-range ids before mapping back to text.
        return [chunks[idx] for idx in hit_ids[0] if idx < len(chunks)]
    except Exception as e:
        return [f"Error retrieving chunks: {str(e)}"]
86
+
87
+
88
def _summarize_chunk(chunk: str, position: int) -> str:
    """Summarize one text chunk; on failure return an inline error marker."""
    try:
        # Scale target lengths to the chunk so the model isn't asked to
        # produce a summary longer than a third of its input.
        max_len = min(150, len(chunk.split()) // 3)
        min_len = min(30, max_len // 3)

        result = summarizer(chunk, max_length=max_len,
                            min_length=min_len, do_sample=False)
        return result[0]['summary_text']
    except Exception as chunk_error:
        return f"[Error summarizing chunk {position + 1}: {str(chunk_error)}]"


def summarize_pdf(file_path: Optional[str]) -> str:
    """Generate an AI summary of a PDF document.

    Args:
        file_path: Path to an uploaded PDF, or None when nothing was
            uploaded yet.

    Returns:
        A formatted summary string (with page count and timing metadata),
        or a human-readable error/warning message. Never raises — all
        failures are reported in the returned string for the UI.
    """
    if not file_path:
        return "Please upload a PDF file first."

    if not os.path.exists(file_path):
        return "File not found. Please upload a valid PDF."

    start_time = time.time()

    # Extract text
    raw_text, page_count, is_text_rich = extract_text_from_pdf(file_path)

    if not is_text_rich:
        return f"⚠️ Warning: Limited text extracted from PDF ({page_count} pages). The file may contain mostly images or be corrupted.\n\nExtracted content:\n{raw_text[:500]}..."

    try:
        # Character-based windows with 50% overlap (step = max_chunk // 2)
        # so sentences cut at a boundary still appear whole in a neighbor.
        max_chunk = 1000
        chunks = [raw_text[i:i + max_chunk]
                  for i in range(0, len(raw_text), max_chunk // 2)]

        summary_parts = []
        for i, chunk in enumerate(chunks):
            if len(chunk.strip()) < 50:  # Skip very short chunks
                continue
            summary_parts.append(_summarize_chunk(chunk, i))

        processing_time = time.time() - start_time

        if summary_parts:
            final_summary = " ".join(summary_parts)

            # Header line with document/timing metadata above the summary.
            meta_info = f"πŸ“„ Document Summary ({page_count} pages, processed in {processing_time:.1f}s)\n" + \
                "="*60 + "\n\n"

            return meta_info + final_summary
        else:
            return "Unable to generate summary. The document may be too short or contain unsupported content."

    except Exception as e:
        return f"Error during summarization: {str(e)}"
146
+
147
+
148
def answer_question(file_path: Optional[str], question: str) -> str:
    """Answer a question about a PDF via semantic retrieval + extractive QA.

    Args:
        file_path: Path to an uploaded PDF, or None when nothing was
            uploaded yet.
        question: The user's natural-language question.

    Returns:
        A formatted answer with confidence and timing metadata, or a
        human-readable error message. Never raises — failures are
        reported in the returned string for the UI.
    """
    if not file_path:
        return "Please upload a PDF file first."

    if not question.strip():
        return "Please enter a question."

    if not os.path.exists(file_path):
        return "File not found. Please upload a valid PDF."

    try:
        start_time = time.time()

        # Extract and process text
        raw_text, page_count, is_text_rich = extract_text_from_pdf(file_path)

        # Bug fix: the original returned here even though its message said
        # "Attempting to answer based on available text...". Now we really
        # do attempt, and prefix the warning to whatever comes back.
        warning = ""
        if not is_text_rich:
            warning = "⚠️ Limited text available for Q&A. Extracted content may be insufficient.\n\nAttempting to answer based on available text...\n\n"

        # Create chunks with overlap for better context
        chunks = chunk_text(raw_text, max_chunk_size=400, overlap=50)

        if not chunks:
            return "No processable text found in the document."

        # Build search index
        index, embeddings = build_faiss_index(chunks)

        # Retrieve relevant chunks
        relevant_chunks = retrieve_relevant_chunks(
            question, chunks, index, embeddings, top_k=5)

        if not relevant_chunks:
            return "No relevant information found for your question."

        # Cap combined context length to stay within the QA model's limits.
        context = " ".join(relevant_chunks)[:2000]

        # Get answer
        try:
            result = qa_pipeline(question=question, context=context)
            answer = result['answer']
            confidence = result.get('score', 0)

            processing_time = time.time() - start_time

            # Format response with metadata
            response = f"🎯 Answer (confidence: {confidence:.2f}, {processing_time:.1f}s):\n"
            response += f"{answer}\n\n"

            if confidence < 0.5:
                response += "⚠️ Low confidence answer. The information might not be directly stated in the document."

            return warning + response

        except Exception as qa_error:
            return warning + f"Error generating answer: {str(qa_error)}\n\nRelevant context found:\n{context[:300]}..."

    except Exception as e:
        return f"Error processing question: {str(e)}"
209
+
210
+
211
def clear_inputs():
    """Reset the file input and text output to their empty states.

    Returns:
        Tuple of ``(None, "")`` — cleared file component value and an
        empty textbox value, matching Gradio's clear-event signature.
    """
    return (None, "")
214
+
215
+
216
# Enhanced Gradio UI: two tabs (summarization, Q&A) sharing the same
# upload pattern, plus a footer with usage tips. Built declaratively;
# `demo` is launched from the __main__ guard below.
with gr.Blocks(title="PDF Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸ“š Advanced PDF Analyzer
    Upload a PDF document to generate summaries or ask questions about its content.

    **Features:**
    - πŸ” Intelligent text extraction and processing
    - πŸ“ AI-powered document summarization
    - ❓ Question-answering with semantic search
    - πŸ“Š Processing metadata and confidence scores
    """)

    with gr.Tab("πŸ“ Document Summarization"):
        gr.Markdown("### Generate an AI summary of your PDF document")

        with gr.Row():
            # Left column: upload + action buttons.
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="Upload PDF Document",
                    file_types=[".pdf"],
                    type="filepath"  # handlers receive a path string, not bytes
                )
                summarize_button = gr.Button(
                    "πŸ“ Generate Summary", variant="primary")
                clear_summary_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")

            # Right column: wider read-only result area.
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="Document Summary",
                    lines=15,
                    placeholder="Upload a PDF and click 'Generate Summary' to see results here...",
                    show_copy_button=True
                )

        # Wire the summarize button to the backend handler.
        summarize_button.click(
            fn=summarize_pdf,
            inputs=pdf_input,
            outputs=summary_output
        )

        # Clear resets both the upload and the result textbox.
        clear_summary_btn.click(
            fn=lambda: (None, ""),
            outputs=[pdf_input, summary_output]
        )

    with gr.Tab("❓ Question & Answer"):
        gr.Markdown("### Ask questions about your PDF document")

        with gr.Row():
            with gr.Column(scale=1):
                # Separate upload from the summary tab — each tab keeps
                # its own file state.
                pdf_input_qa = gr.File(
                    label="Upload PDF Document",
                    file_types=[".pdf"],
                    type="filepath"
                )
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="e.g., What is the main topic of this document?",
                    lines=2
                )
                answer_button = gr.Button("🎯 Get Answer", variant="primary")
                clear_qa_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")

            with gr.Column(scale=2):
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=10,
                    placeholder="Upload a PDF, enter your question, and click 'Get Answer'...",
                    show_copy_button=True
                )

        # Example questions
        gr.Markdown("**Example questions:**")
        example_questions = [
            "What is the main topic of this document?",
            "Who are the key people mentioned?",
            "What are the main conclusions?",
            "Can you explain the methodology used?"
        ]

        with gr.Row():
            # One button per example; `q=eq` binds the question at
            # definition time (avoids the late-binding closure pitfall).
            # NOTE(review): loop index `i` is unused.
            for i, eq in enumerate(example_questions):
                gr.Button(eq, size="sm").click(
                    fn=lambda q=eq: q,
                    outputs=question_input
                )

        answer_button.click(
            fn=answer_question,
            inputs=[pdf_input_qa, question_input],
            outputs=answer_output
        )

        # Clear resets upload, question, and answer at once.
        clear_qa_btn.click(
            fn=lambda: (None, "", ""),
            outputs=[pdf_input_qa, question_input, answer_output]
        )

    # Footer with tips
    gr.Markdown("""
    ---
    ### πŸ’‘ Tips for better results:
    - **For summarization**: Works best with text-heavy documents (research papers, reports, articles)
    - **For Q&A**: Ask specific questions and ensure your PDF contains searchable text
    - **File size**: Larger documents may take longer to process
    - **Quality**: Scanned PDFs without OCR may have limited text extraction
    """)
324
+
325
if __name__ == "__main__":
    demo.launch(
        share=False,               # no public Gradio tunnel
        server_name="0.0.0.0",     # bind all interfaces (container/Spaces friendly)
        server_port=7860,          # Gradio's conventional default port
        show_error=True            # surface handler tracebacks in the UI
    )