SimranShaikh committed on
Commit
cadd6a8
·
verified ·
1 Parent(s): 48716e1
Files changed (1) hide show
  1. src/streamlit_app.py +199 -73
src/streamlit_app.py CHANGED
@@ -1,4 +1,4 @@
1
- # Solution 1: Updated SimplePDFRAG with cache directory fix
2
  import streamlit as st
3
  import PyPDF2
4
  from sentence_transformers import SentenceTransformer
@@ -22,6 +22,7 @@ class SimplePDFRAG:
22
  self.embedding_model = None
23
  self.granite_model = None
24
  self.tokenizer = None
 
25
 
26
  def setup_cache_directory(self):
27
  """Setup a custom cache directory with proper permissions"""
@@ -77,60 +78,113 @@ class SimplePDFRAG:
77
  return False
78
 
79
  def extract_pdf_text(self, pdf_file):
80
- """Extract text from PDF file"""
81
  try:
 
 
 
82
  pdf_reader = PyPDF2.PdfReader(pdf_file)
83
  text = ""
84
 
 
 
85
  for page_num, page in enumerate(pdf_reader.pages):
86
- page_text = page.extract_text()
87
- if page_text:
88
- text += page_text + "\n"
 
 
 
 
 
 
 
89
 
90
- return text
 
 
 
 
 
 
 
 
 
91
  except Exception as e:
92
- st.error(f"Error extracting PDF text: {e}")
 
93
  return None
94
 
95
  def chunk_text(self, text, chunk_size=500):
96
  """Split text into chunks"""
 
 
 
97
  words = text.split()
98
  chunks = []
99
 
100
  for i in range(0, len(words), chunk_size):
101
  chunk = " ".join(words[i:i + chunk_size])
102
- chunks.append(chunk)
 
103
 
 
104
  return chunks
105
 
106
- def process_pdf(self, pdf_file):
107
  """Process PDF and create embeddings"""
108
- # Extract text
109
- text = self.extract_pdf_text(pdf_file)
110
- if not text:
111
- return False
112
-
113
- # Chunk text
114
- chunks = self.chunk_text(text)
115
-
116
- # Create embeddings
117
- st.info(f"Creating embeddings for {len(chunks)} chunks...")
118
  try:
119
- embeddings = self.embedding_model.encode(chunks)
 
120
 
121
- # Store documents and embeddings
122
- self.documents = chunks
123
- self.embeddings = embeddings
 
 
 
124
 
125
- st.success(f"Processed PDF: {len(chunks)} chunks created")
126
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  except Exception as e:
128
- st.error(f"Error creating embeddings: {e}")
 
129
  return False
130
 
131
  def search_documents(self, query, top_k=3):
132
  """Search for relevant documents"""
133
- if not self.documents:
 
134
  return []
135
 
136
  try:
@@ -151,9 +205,12 @@ class SimplePDFRAG:
151
  'score': similarities[idx]
152
  })
153
 
 
154
  return results
 
155
  except Exception as e:
156
  st.error(f"Error searching documents: {e}")
 
157
  return []
158
 
159
  def generate_answer(self, query, context_docs):
@@ -210,6 +267,12 @@ Answer:"""
210
 
211
  def answer_question(self, query):
212
  """Main function to answer questions"""
 
 
 
 
 
 
213
  # Search for relevant documents
214
  relevant_docs = self.search_documents(query)
215
 
@@ -234,7 +297,7 @@ def main():
234
  layout="wide"
235
  )
236
 
237
- st.title("📄 Simple PDF RAG with IBM Granite (Cache Fixed)")
238
  st.write("Upload a PDF and ask questions about its content")
239
 
240
  # Initialize session state
@@ -246,75 +309,138 @@ def main():
246
 
247
  if 'pdf_processed' not in st.session_state:
248
  st.session_state.pdf_processed = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
  # Load models button
251
  if not st.session_state.models_loaded:
252
- if st.button("🤖 Load Models"):
253
  with st.spinner("Loading models... This may take a few minutes"):
254
  success = st.session_state.rag_system.load_models()
255
- st.session_state.models_loaded = success
 
 
256
 
257
  # Only show PDF upload if models are loaded
258
  if st.session_state.models_loaded:
259
- st.success("✅ Models loaded successfully!")
 
260
 
261
  # PDF Upload
262
- uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
263
 
264
- if uploaded_file and st.button("📖 Process PDF"):
265
- with st.spinner("Processing PDF..."):
266
- success = st.session_state.rag_system.process_pdf(uploaded_file)
267
- st.session_state.pdf_processed = success
 
 
 
 
 
 
 
 
 
268
 
269
- # Question answering
270
  if st.session_state.pdf_processed:
271
- st.success("✅ PDF processed successfully!")
 
 
 
 
 
272
 
273
- query = st.text_input("❓ Ask a question about the PDF:")
 
274
 
275
- if query and st.button("🔍 Get Answer"):
276
- with st.spinner("Searching and generating answer..."):
277
- result = st.session_state.rag_system.answer_question(query)
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
- # Display answer
280
- st.subheader("🤖 Answer:")
281
- st.write(result['answer'])
 
 
 
 
 
282
 
283
- # Display sources
284
- if result.get('sources'):
285
- st.subheader("📚 Relevant Sources:")
286
- for i, source in enumerate(result['sources']):
287
- with st.expander(f"Source {i+1} (Score: {source['score']:.3f})"):
288
- st.write(source['text'][:300] + "..." if len(source['text']) > 300 else source['text'])
289
 
290
- # Instructions with troubleshooting
291
  with st.sidebar:
292
  st.header("📋 Instructions")
293
  st.write("""
294
- 1. Click 'Load Models' to initialize the system
295
- 2. Upload a PDF file
296
- 3. Click 'Process PDF' to extract and index content
297
- 4. Ask questions about the PDF content
298
- 5. Get AI-generated answers with source citations
299
  """)
300
 
301
- st.header("🔧 Troubleshooting")
302
- st.write("""
303
- **Cache Permission Error Fixed:**
304
- - Uses temporary directory for model cache
305
- - Automatically handles permission issues
306
- - No manual cache cleanup needed
307
- """)
 
 
 
 
 
308
 
309
- st.header("⚙️ Alternative Solutions")
310
- st.code("""
311
- # Manual cache cleanup (if needed):
312
- rm -rf ~/.cache/huggingface/
313
- rm -rf ~/.cache/torch/
314
-
315
- # Or set environment variables:
316
- export HF_HOME=/tmp/hf_cache
317
- export TRANSFORMERS_CACHE=/tmp/transformers_cache
 
 
 
318
  """)
319
 
320
  if __name__ == "__main__":
 
1
+ # Fixed SimplePDFRAG with better state management and debugging
2
  import streamlit as st
3
  import PyPDF2
4
  from sentence_transformers import SentenceTransformer
 
22
  self.embedding_model = None
23
  self.granite_model = None
24
  self.tokenizer = None
25
+ self.pdf_name = None
26
 
27
  def setup_cache_directory(self):
28
  """Setup a custom cache directory with proper permissions"""
 
78
  return False
79
 
80
  def extract_pdf_text(self, pdf_file):
81
+ """Extract text from PDF file with better error handling"""
82
  try:
83
+ # Reset file pointer to beginning
84
+ pdf_file.seek(0)
85
+
86
  pdf_reader = PyPDF2.PdfReader(pdf_file)
87
  text = ""
88
 
89
+ st.info(f"PDF has {len(pdf_reader.pages)} pages")
90
+
91
  for page_num, page in enumerate(pdf_reader.pages):
92
+ try:
93
+ page_text = page.extract_text()
94
+ if page_text:
95
+ text += page_text + "\n"
96
+ st.write(f"✅ Extracted text from page {page_num + 1}")
97
+ else:
98
+ st.warning(f"⚠️ No text found on page {page_num + 1}")
99
+ except Exception as page_error:
100
+ st.error(f"Error extracting page {page_num + 1}: {page_error}")
101
+ continue
102
 
103
+ if text.strip():
104
+ st.success(f"Total extracted text length: {len(text)} characters")
105
+ # Show preview of extracted text
106
+ st.write("📄 **Text Preview:**")
107
+ st.text(text[:500] + "..." if len(text) > 500 else text)
108
+ return text
109
+ else:
110
+ st.error("No text could be extracted from the PDF")
111
+ return None
112
+
113
  except Exception as e:
114
+ st.error(f"Error reading PDF file: {e}")
115
+ logger.error(f"PDF extraction error: {e}")
116
  return None
117
 
118
  def chunk_text(self, text, chunk_size=500):
119
  """Split text into chunks"""
120
+ if not text or not text.strip():
121
+ return []
122
+
123
  words = text.split()
124
  chunks = []
125
 
126
  for i in range(0, len(words), chunk_size):
127
  chunk = " ".join(words[i:i + chunk_size])
128
+ if chunk.strip(): # Only add non-empty chunks
129
+ chunks.append(chunk)
130
 
131
+ st.info(f"Created {len(chunks)} text chunks")
132
  return chunks
133
 
134
+ def process_pdf(self, pdf_file, pdf_name):
135
  """Process PDF and create embeddings"""
 
 
 
 
 
 
 
 
 
 
136
  try:
137
+ # Store PDF name
138
+ self.pdf_name = pdf_name
139
 
140
+ # Extract text
141
+ st.info("🔍 Extracting text from PDF...")
142
+ text = self.extract_pdf_text(pdf_file)
143
+ if not text:
144
+ st.error("❌ Failed to extract text from PDF")
145
+ return False
146
 
147
+ # Chunk text
148
+ st.info("✂️ Splitting text into chunks...")
149
+ chunks = self.chunk_text(text)
150
+ if not chunks:
151
+ st.error("❌ No text chunks created")
152
+ return False
153
+
154
+ # Create embeddings
155
+ st.info(f"🔄 Creating embeddings for {len(chunks)} chunks...")
156
+ try:
157
+ embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
158
+
159
+ # Store documents and embeddings
160
+ self.documents = chunks
161
+ self.embeddings = embeddings
162
+
163
+ st.success(f"✅ Successfully processed PDF: {len(chunks)} chunks created with embeddings")
164
+
165
+ # Show some stats
166
+ st.info(f"📊 **Processing Summary:**")
167
+ st.write(f"- PDF Name: {pdf_name}")
168
+ st.write(f"- Text length: {len(text)} characters")
169
+ st.write(f"- Number of chunks: {len(chunks)}")
170
+ st.write(f"- Embeddings shape: {embeddings.shape}")
171
+
172
+ return True
173
+
174
+ except Exception as e:
175
+ st.error(f"❌ Error creating embeddings: {e}")
176
+ logger.error(f"Embedding error: {e}")
177
+ return False
178
+
179
  except Exception as e:
180
+ st.error(f"❌ Error processing PDF: {e}")
181
+ logger.error(f"PDF processing error: {e}")
182
  return False
183
 
184
  def search_documents(self, query, top_k=3):
185
  """Search for relevant documents"""
186
+ if not self.documents or len(self.embeddings) == 0:
187
+ st.warning("No documents available for search")
188
  return []
189
 
190
  try:
 
205
  'score': similarities[idx]
206
  })
207
 
208
+ st.info(f"Found {len(results)} relevant document chunks")
209
  return results
210
+
211
  except Exception as e:
212
  st.error(f"Error searching documents: {e}")
213
+ logger.error(f"Search error: {e}")
214
  return []
215
 
216
  def generate_answer(self, query, context_docs):
 
267
 
268
  def answer_question(self, query):
269
  """Main function to answer questions"""
270
+ if not self.documents:
271
+ return {
272
+ 'answer': "No PDF has been processed yet. Please upload and process a PDF first.",
273
+ 'sources': []
274
+ }
275
+
276
  # Search for relevant documents
277
  relevant_docs = self.search_documents(query)
278
 
 
297
  layout="wide"
298
  )
299
 
300
+ st.title("📄 Simple PDF RAG with IBM Granite (Fixed)")
301
  st.write("Upload a PDF and ask questions about its content")
302
 
303
  # Initialize session state
 
309
 
310
  if 'pdf_processed' not in st.session_state:
311
  st.session_state.pdf_processed = False
312
+
313
+ if 'current_pdf_name' not in st.session_state:
314
+ st.session_state.current_pdf_name = None
315
+
316
+ # Status display
317
+ col1, col2, col3 = st.columns(3)
318
+ with col1:
319
+ if st.session_state.models_loaded:
320
+ st.success("🤖 Models: Loaded")
321
+ else:
322
+ st.error("🤖 Models: Not Loaded")
323
+
324
+ with col2:
325
+ if st.session_state.pdf_processed and st.session_state.current_pdf_name:
326
+ st.success(f"📄 PDF: {st.session_state.current_pdf_name}")
327
+ else:
328
+ st.error("📄 PDF: Not Processed")
329
+
330
+ with col3:
331
+ if st.session_state.models_loaded and st.session_state.pdf_processed:
332
+ st.success("🟢 Ready for Questions")
333
+ else:
334
+ st.error("🔴 Not Ready")
335
 
336
  # Load models button
337
  if not st.session_state.models_loaded:
338
+ if st.button("🤖 Load Models", key="load_models"):
339
  with st.spinner("Loading models... This may take a few minutes"):
340
  success = st.session_state.rag_system.load_models()
341
+ if success:
342
+ st.session_state.models_loaded = True
343
+ st.rerun()
344
 
345
  # Only show PDF upload if models are loaded
346
  if st.session_state.models_loaded:
347
+ st.markdown("---")
348
+ st.subheader("📁 PDF Upload and Processing")
349
 
350
  # PDF Upload
351
+ uploaded_file = st.file_uploader("Upload PDF", type=['pdf'], key="pdf_uploader")
352
 
353
+ if uploaded_file is not None:
354
+ st.info(f"📄 Uploaded: {uploaded_file.name}")
355
+
356
+ if st.button("📖 Process PDF", key="process_pdf"):
357
+ with st.spinner("Processing PDF..."):
358
+ success = st.session_state.rag_system.process_pdf(uploaded_file, uploaded_file.name)
359
+ if success:
360
+ st.session_state.pdf_processed = True
361
+ st.session_state.current_pdf_name = uploaded_file.name
362
+ st.rerun()
363
+ else:
364
+ st.session_state.pdf_processed = False
365
+ st.session_state.current_pdf_name = None
366
 
367
+ # Question answering section
368
  if st.session_state.pdf_processed:
369
+ st.markdown("---")
370
+ st.subheader("❓ Ask Questions")
371
+
372
+ # Show current document info
373
+ st.info(f"📚 Current document: {st.session_state.current_pdf_name}")
374
+ st.info(f"📊 Document chunks: {len(st.session_state.rag_system.documents)}")
375
 
376
+ query = st.text_input("Ask a question about the PDF:", key="question_input",
377
+ placeholder="e.g., What is the main topic of this document?")
378
 
379
+ if query:
380
+ if st.button("🔍 Get Answer", key="get_answer"):
381
+ with st.spinner("Searching and generating answer..."):
382
+ result = st.session_state.rag_system.answer_question(query)
383
+
384
+ # Display answer
385
+ st.markdown("### 🤖 Answer:")
386
+ st.write(result['answer'])
387
+
388
+ # Display sources
389
+ if result.get('sources'):
390
+ st.markdown("### 📚 Relevant Sources:")
391
+ for i, source in enumerate(result['sources']):
392
+ with st.expander(f"Source {i+1} (Relevance Score: {source['score']:.3f})"):
393
+ st.write(source['text'][:500] + "..." if len(source['text']) > 500 else source['text'])
394
 
395
+ # Add some example questions
396
+ st.markdown("### 💡 Example Questions:")
397
+ example_questions = [
398
+ "What is the main topic of this document?",
399
+ "Can you summarize the key points?",
400
+ "What are the important details mentioned?",
401
+ "Who are the main people or entities discussed?"
402
+ ]
403
 
404
+ for i, example in enumerate(example_questions):
405
+ if st.button(f"📝 {example}", key=f"example_{i}"):
406
+ st.session_state.question_input = example
407
+ st.rerun()
 
 
408
 
409
+ # Sidebar with instructions and debugging
410
  with st.sidebar:
411
  st.header("📋 Instructions")
412
  st.write("""
413
+ 1. **Load Models**: Click to initialize AI models
414
+ 2. **Upload PDF**: Select a PDF file to analyze
415
+ 3. **Process PDF**: Extract and index PDF content
416
+ 4. **Ask Questions**: Get AI-powered answers
 
417
  """)
418
 
419
+ st.header("🔧 Debug Info")
420
+ if st.session_state.models_loaded:
421
+ st.write("✅ Models loaded")
422
+ else:
423
+ st.write("❌ Models not loaded")
424
+
425
+ if st.session_state.pdf_processed:
426
+ st.write(f"✅ PDF processed: {st.session_state.current_pdf_name}")
427
+ if hasattr(st.session_state.rag_system, 'documents'):
428
+ st.write(f"📊 Chunks: {len(st.session_state.rag_system.documents)}")
429
+ else:
430
+ st.write("❌ No PDF processed")
431
 
432
+ # Reset button
433
+ if st.button("🔄 Reset All", key="reset_all"):
434
+ for key in list(st.session_state.keys()):
435
+ del st.session_state[key]
436
+ st.rerun()
437
+
438
+ st.header("⚙️ Tips")
439
+ st.write("""
440
+ - **PDF not working?** Try a different PDF file
441
+ - **No text extracted?** PDF might be image-based
442
+ - **Poor answers?** Try more specific questions
443
+ - **Slow performance?** Use smaller PDF files
444
  """)
445
 
446
  if __name__ == "__main__":