SimranShaikh committed on
Commit
a2146e1
·
verified ·
1 Parent(s): cadd6a8
Files changed (1) hide show
  1. src/streamlit_app.py +67 -262
src/streamlit_app.py CHANGED
@@ -1,4 +1,4 @@
1
- # Fixed SimplePDFRAG with better state management and debugging
2
  import streamlit as st
3
  import PyPDF2
4
  from sentence_transformers import SentenceTransformer
@@ -9,7 +9,6 @@ from sklearn.metrics.pairwise import cosine_similarity
9
  import logging
10
  import os
11
  import tempfile
12
- import shutil
13
 
14
  # Configure logging
15
  logging.basicConfig(level=logging.INFO)
@@ -25,14 +24,11 @@ class SimplePDFRAG:
25
  self.pdf_name = None
26
 
27
  def setup_cache_directory(self):
28
- """Setup a custom cache directory with proper permissions"""
29
  try:
30
- # Create a temporary directory for models
31
  cache_dir = tempfile.mkdtemp(prefix="model_cache_")
32
  os.environ['HF_HOME'] = cache_dir
33
  os.environ['TRANSFORMERS_CACHE'] = cache_dir
34
  os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir
35
-
36
  st.info(f"Using cache directory: {cache_dir}")
37
  return cache_dir
38
  except Exception as e:
@@ -40,54 +36,33 @@ class SimplePDFRAG:
40
  return None
41
 
42
  def load_models(self):
43
- """Load embedding model and Granite model with cache fix"""
44
  try:
45
- # Setup cache directory
46
  cache_dir = self.setup_cache_directory()
47
-
48
- # Load embedding model with cache directory
49
  st.info("Loading embedding model...")
50
  self.embedding_model = SentenceTransformer(
51
- 'all-MiniLM-L6-v2',
52
- cache_folder=cache_dir
53
  )
54
-
55
- # Load IBM Granite model
56
  st.info("Loading IBM Granite model...")
57
- model_name = "ibm-granite/granite-3.0-2b-instruct" # IBM Granite model
58
-
59
- self.tokenizer = AutoTokenizer.from_pretrained(
60
- model_name,
61
- cache_dir=cache_dir
62
- )
63
  self.granite_model = AutoModelForCausalLM.from_pretrained(
64
- model_name,
65
- cache_dir=cache_dir,
66
- torch_dtype=torch.float32 # Use float32 for compatibility
67
  )
68
-
69
  if self.tokenizer.pad_token is None:
70
  self.tokenizer.pad_token = self.tokenizer.eos_token
71
-
72
  st.success("Models loaded successfully!")
73
  return True
74
-
75
  except Exception as e:
76
  st.error(f"Error loading models: {e}")
77
  logger.error(f"Model loading error: {e}")
78
  return False
79
 
80
  def extract_pdf_text(self, pdf_file):
81
- """Extract text from PDF file with better error handling"""
82
  try:
83
- # Reset file pointer to beginning
84
  pdf_file.seek(0)
85
-
86
  pdf_reader = PyPDF2.PdfReader(pdf_file)
87
  text = ""
88
-
89
  st.info(f"PDF has {len(pdf_reader.pages)} pages")
90
-
91
  for page_num, page in enumerate(pdf_reader.pages):
92
  try:
93
  page_text = page.extract_text()
@@ -98,130 +73,66 @@ class SimplePDFRAG:
98
  st.warning(f"⚠️ No text found on page {page_num + 1}")
99
  except Exception as page_error:
100
  st.error(f"Error extracting page {page_num + 1}: {page_error}")
101
- continue
102
-
103
  if text.strip():
104
- st.success(f"Total extracted text length: {len(text)} characters")
105
- # Show preview of extracted text
106
  st.write("πŸ“„ **Text Preview:**")
107
  st.text(text[:500] + "..." if len(text) > 500 else text)
108
  return text
109
  else:
110
  st.error("No text could be extracted from the PDF")
111
  return None
112
-
113
  except Exception as e:
114
  st.error(f"Error reading PDF file: {e}")
115
  logger.error(f"PDF extraction error: {e}")
116
  return None
117
 
118
  def chunk_text(self, text, chunk_size=500):
119
- """Split text into chunks"""
120
  if not text or not text.strip():
121
  return []
122
-
123
  words = text.split()
124
- chunks = []
125
-
126
- for i in range(0, len(words), chunk_size):
127
- chunk = " ".join(words[i:i + chunk_size])
128
- if chunk.strip(): # Only add non-empty chunks
129
- chunks.append(chunk)
130
-
131
- st.info(f"Created {len(chunks)} text chunks")
132
- return chunks
133
 
134
  def process_pdf(self, pdf_file, pdf_name):
135
- """Process PDF and create embeddings"""
136
  try:
137
- # Store PDF name
138
  self.pdf_name = pdf_name
139
-
140
- # Extract text
141
  st.info("πŸ” Extracting text from PDF...")
142
  text = self.extract_pdf_text(pdf_file)
143
  if not text:
144
- st.error("❌ Failed to extract text from PDF")
145
  return False
146
-
147
- # Chunk text
148
  st.info("βœ‚οΈ Splitting text into chunks...")
149
  chunks = self.chunk_text(text)
150
  if not chunks:
151
- st.error("❌ No text chunks created")
152
  return False
153
-
154
- # Create embeddings
155
  st.info(f"πŸ”„ Creating embeddings for {len(chunks)} chunks...")
156
- try:
157
- embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
158
-
159
- # Store documents and embeddings
160
- self.documents = chunks
161
- self.embeddings = embeddings
162
-
163
- st.success(f"βœ… Successfully processed PDF: {len(chunks)} chunks created with embeddings")
164
-
165
- # Show some stats
166
- st.info(f"πŸ“Š **Processing Summary:**")
167
- st.write(f"- PDF Name: {pdf_name}")
168
- st.write(f"- Text length: {len(text)} characters")
169
- st.write(f"- Number of chunks: {len(chunks)}")
170
- st.write(f"- Embeddings shape: {embeddings.shape}")
171
-
172
- return True
173
-
174
- except Exception as e:
175
- st.error(f"❌ Error creating embeddings: {e}")
176
- logger.error(f"Embedding error: {e}")
177
- return False
178
-
179
  except Exception as e:
180
  st.error(f"❌ Error processing PDF: {e}")
181
  logger.error(f"PDF processing error: {e}")
182
  return False
183
 
184
  def search_documents(self, query, top_k=3):
185
- """Search for relevant documents"""
186
  if not self.documents or len(self.embeddings) == 0:
187
  st.warning("No documents available for search")
188
  return []
189
-
190
  try:
191
- # Get query embedding
192
  query_embedding = self.embedding_model.encode([query])
193
-
194
- # Calculate similarities
195
  similarities = cosine_similarity(query_embedding, self.embeddings)[0]
196
-
197
- # Get top k results
198
  top_indices = np.argsort(similarities)[-top_k:][::-1]
199
-
200
- results = []
201
- for idx in top_indices:
202
- if similarities[idx] > 0.1: # Minimum similarity threshold
203
- results.append({
204
- 'text': self.documents[idx],
205
- 'score': similarities[idx]
206
- })
207
-
208
- st.info(f"Found {len(results)} relevant document chunks")
209
- return results
210
-
211
  except Exception as e:
212
  st.error(f"Error searching documents: {e}")
213
  logger.error(f"Search error: {e}")
214
  return []
215
 
216
  def generate_answer(self, query, context_docs):
217
- """Generate answer using the language model"""
218
  if not self.granite_model or not context_docs:
219
  return "I don't have enough information to answer your question."
220
-
221
- # Prepare context
222
- context = "\n\n".join([doc['text'][:200] for doc in context_docs]) # Limit context
223
-
224
- # Create a more sophisticated prompt for Granite
225
  prompt = f"""You are a helpful AI assistant. Based on the following context, provide a clear and accurate answer to the question.
226
 
227
  Context:
@@ -230,218 +141,112 @@ Context:
230
  Question: {query}
231
 
232
  Answer:"""
233
-
234
  try:
235
- # Tokenize
236
- inputs = self.tokenizer.encode(
237
- prompt,
238
- return_tensors='pt',
239
- max_length=512, # Reduced length
240
- truncation=True
241
- )
242
-
243
- # Generate response
244
  with torch.no_grad():
245
  outputs = self.granite_model.generate(
246
  inputs,
247
- max_length=inputs.shape[1] + 100, # Shorter response
248
  temperature=0.7,
249
  do_sample=True,
250
  pad_token_id=self.tokenizer.eos_token_id,
251
  eos_token_id=self.tokenizer.eos_token_id
252
  )
253
-
254
- # Decode response
255
  response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
256
-
257
- # If response is empty or too short, provide context-based answer
258
- if not response or len(response.strip()) < 10:
259
- response = f"Based on the document: {context[:300]}..."
260
-
261
- return response.strip()
262
-
263
  except Exception as e:
264
  logger.error(f"Generation error: {e}")
265
- # Fallback to simple context-based answer
266
- return f"Based on the available information: {context[:300]}..."
267
 
268
  def answer_question(self, query):
269
- """Main function to answer questions"""
270
  if not self.documents:
271
- return {
272
- 'answer': "No PDF has been processed yet. Please upload and process a PDF first.",
273
- 'sources': []
274
- }
275
-
276
- # Search for relevant documents
277
  relevant_docs = self.search_documents(query)
278
-
279
  if not relevant_docs:
280
- return {
281
- 'answer': "I couldn't find relevant information in the PDF to answer your question.",
282
- 'sources': []
283
- }
284
-
285
- # Generate answer
286
- answer = self.generate_answer(query, relevant_docs)
287
-
288
  return {
289
- 'answer': answer,
290
  'sources': relevant_docs
291
  }
292
 
293
  def main():
294
- st.set_page_config(
295
- page_title="Simple PDF RAG with IBM Granite (Fixed)",
296
- page_icon="πŸ“„",
297
- layout="wide"
298
- )
299
-
300
  st.title("πŸ“„ Simple PDF RAG with IBM Granite (Fixed)")
301
  st.write("Upload a PDF and ask questions about its content")
302
-
303
- # Initialize session state
304
  if 'rag_system' not in st.session_state:
305
  st.session_state.rag_system = SimplePDFRAG()
306
-
307
  if 'models_loaded' not in st.session_state:
308
  st.session_state.models_loaded = False
309
-
310
  if 'pdf_processed' not in st.session_state:
311
  st.session_state.pdf_processed = False
312
-
313
  if 'current_pdf_name' not in st.session_state:
314
  st.session_state.current_pdf_name = None
315
-
316
- # Status display
317
  col1, col2, col3 = st.columns(3)
318
  with col1:
319
- if st.session_state.models_loaded:
320
- st.success("πŸ€– Models: Loaded")
321
- else:
322
- st.error("πŸ€– Models: Not Loaded")
323
-
324
  with col2:
325
- if st.session_state.pdf_processed and st.session_state.current_pdf_name:
326
- st.success(f"πŸ“„ PDF: {st.session_state.current_pdf_name}")
327
- else:
328
- st.error("πŸ“„ PDF: Not Processed")
329
-
330
  with col3:
331
- if st.session_state.models_loaded and st.session_state.pdf_processed:
332
- st.success("🟒 Ready for Questions")
333
- else:
334
- st.error("πŸ”΄ Not Ready")
335
-
336
- # Load models button
337
  if not st.session_state.models_loaded:
338
- if st.button("πŸ€– Load Models", key="load_models"):
339
- with st.spinner("Loading models... This may take a few minutes"):
340
  success = st.session_state.rag_system.load_models()
341
- if success:
342
- st.session_state.models_loaded = True
343
- st.rerun()
344
-
345
- # Only show PDF upload if models are loaded
346
  if st.session_state.models_loaded:
347
  st.markdown("---")
348
  st.subheader("πŸ“ PDF Upload and Processing")
349
-
350
- # PDF Upload
351
- uploaded_file = st.file_uploader("Upload PDF", type=['pdf'], key="pdf_uploader")
352
-
353
- if uploaded_file is not None:
354
- st.info(f"πŸ“„ Uploaded: {uploaded_file.name}")
355
-
356
- if st.button("πŸ“– Process PDF", key="process_pdf"):
 
 
 
 
357
  with st.spinner("Processing PDF..."):
358
- success = st.session_state.rag_system.process_pdf(uploaded_file, uploaded_file.name)
 
359
  if success:
360
  st.session_state.pdf_processed = True
361
- st.session_state.current_pdf_name = uploaded_file.name
362
  st.rerun()
363
- else:
364
- st.session_state.pdf_processed = False
365
- st.session_state.current_pdf_name = None
366
-
367
- # Question answering section
368
  if st.session_state.pdf_processed:
369
  st.markdown("---")
370
  st.subheader("❓ Ask Questions")
371
-
372
- # Show current document info
373
  st.info(f"πŸ“š Current document: {st.session_state.current_pdf_name}")
374
- st.info(f"πŸ“Š Document chunks: {len(st.session_state.rag_system.documents)}")
375
-
376
- query = st.text_input("Ask a question about the PDF:", key="question_input",
377
- placeholder="e.g., What is the main topic of this document?")
378
-
379
- if query:
380
- if st.button("πŸ” Get Answer", key="get_answer"):
381
- with st.spinner("Searching and generating answer..."):
382
- result = st.session_state.rag_system.answer_question(query)
383
-
384
- # Display answer
385
- st.markdown("### πŸ€– Answer:")
386
- st.write(result['answer'])
387
-
388
- # Display sources
389
- if result.get('sources'):
390
- st.markdown("### πŸ“š Relevant Sources:")
391
- for i, source in enumerate(result['sources']):
392
- with st.expander(f"Source {i+1} (Relevance Score: {source['score']:.3f})"):
393
- st.write(source['text'][:500] + "..." if len(source['text']) > 500 else source['text'])
394
-
395
- # Add some example questions
396
- st.markdown("### πŸ’‘ Example Questions:")
397
- example_questions = [
398
- "What is the main topic of this document?",
399
- "Can you summarize the key points?",
400
- "What are the important details mentioned?",
401
- "Who are the main people or entities discussed?"
402
- ]
403
-
404
- for i, example in enumerate(example_questions):
405
- if st.button(f"πŸ“ {example}", key=f"example_{i}"):
406
- st.session_state.question_input = example
407
- st.rerun()
408
-
409
- # Sidebar with instructions and debugging
410
  with st.sidebar:
411
  st.header("πŸ“‹ Instructions")
412
- st.write("""
413
- 1. **Load Models**: Click to initialize AI models
414
- 2. **Upload PDF**: Select a PDF file to analyze
415
- 3. **Process PDF**: Extract and index PDF content
416
- 4. **Ask Questions**: Get AI-powered answers
417
- """)
418
-
419
  st.header("πŸ”§ Debug Info")
420
- if st.session_state.models_loaded:
421
- st.write("βœ… Models loaded")
422
- else:
423
- st.write("❌ Models not loaded")
424
-
425
- if st.session_state.pdf_processed:
426
- st.write(f"βœ… PDF processed: {st.session_state.current_pdf_name}")
427
- if hasattr(st.session_state.rag_system, 'documents'):
428
- st.write(f"πŸ“Š Chunks: {len(st.session_state.rag_system.documents)}")
429
- else:
430
- st.write("❌ No PDF processed")
431
-
432
- # Reset button
433
- if st.button("πŸ”„ Reset All", key="reset_all"):
434
  for key in list(st.session_state.keys()):
435
  del st.session_state[key]
436
  st.rerun()
437
-
438
- st.header("βš™οΈ Tips")
439
- st.write("""
440
- - **PDF not working?** Try a different PDF file
441
- - **No text extracted?** PDF might be image-based
442
- - **Poor answers?** Try more specific questions
443
- - **Slow performance?** Use smaller PDF files
444
- """)
445
 
446
  if __name__ == "__main__":
447
- main()
 
1
+ # Fixed SimplePDFRAG with better state management and PDF caching
2
  import streamlit as st
3
  import PyPDF2
4
  from sentence_transformers import SentenceTransformer
 
9
  import logging
10
  import os
11
  import tempfile
 
12
 
13
  # Configure logging
14
  logging.basicConfig(level=logging.INFO)
 
24
  self.pdf_name = None
25
 
26
  def setup_cache_directory(self):
 
27
  try:
 
28
  cache_dir = tempfile.mkdtemp(prefix="model_cache_")
29
  os.environ['HF_HOME'] = cache_dir
30
  os.environ['TRANSFORMERS_CACHE'] = cache_dir
31
  os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir
 
32
  st.info(f"Using cache directory: {cache_dir}")
33
  return cache_dir
34
  except Exception as e:
 
36
  return None
37
 
38
  def load_models(self):
 
39
  try:
 
40
  cache_dir = self.setup_cache_directory()
 
 
41
  st.info("Loading embedding model...")
42
  self.embedding_model = SentenceTransformer(
43
+ 'all-MiniLM-L6-v2', cache_folder=cache_dir
 
44
  )
 
 
45
  st.info("Loading IBM Granite model...")
46
+ model_name = "ibm-granite/granite-3.0-2b-instruct"
47
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
 
 
 
 
48
  self.granite_model = AutoModelForCausalLM.from_pretrained(
49
+ model_name, cache_dir=cache_dir, torch_dtype=torch.float32
 
 
50
  )
 
51
  if self.tokenizer.pad_token is None:
52
  self.tokenizer.pad_token = self.tokenizer.eos_token
 
53
  st.success("Models loaded successfully!")
54
  return True
 
55
  except Exception as e:
56
  st.error(f"Error loading models: {e}")
57
  logger.error(f"Model loading error: {e}")
58
  return False
59
 
60
  def extract_pdf_text(self, pdf_file):
 
61
  try:
 
62
  pdf_file.seek(0)
 
63
  pdf_reader = PyPDF2.PdfReader(pdf_file)
64
  text = ""
 
65
  st.info(f"PDF has {len(pdf_reader.pages)} pages")
 
66
  for page_num, page in enumerate(pdf_reader.pages):
67
  try:
68
  page_text = page.extract_text()
 
73
  st.warning(f"⚠️ No text found on page {page_num + 1}")
74
  except Exception as page_error:
75
  st.error(f"Error extracting page {page_num + 1}: {page_error}")
 
 
76
  if text.strip():
77
+ st.success(f"Extracted {len(text)} characters")
 
78
  st.write("πŸ“„ **Text Preview:**")
79
  st.text(text[:500] + "..." if len(text) > 500 else text)
80
  return text
81
  else:
82
  st.error("No text could be extracted from the PDF")
83
  return None
 
84
  except Exception as e:
85
  st.error(f"Error reading PDF file: {e}")
86
  logger.error(f"PDF extraction error: {e}")
87
  return None
88
 
89
  def chunk_text(self, text, chunk_size=500):
 
90
  if not text or not text.strip():
91
  return []
 
92
  words = text.split()
93
+ return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
 
 
 
 
 
 
 
 
94
 
95
  def process_pdf(self, pdf_file, pdf_name):
 
96
  try:
 
97
  self.pdf_name = pdf_name
 
 
98
  st.info("πŸ” Extracting text from PDF...")
99
  text = self.extract_pdf_text(pdf_file)
100
  if not text:
 
101
  return False
 
 
102
  st.info("βœ‚οΈ Splitting text into chunks...")
103
  chunks = self.chunk_text(text)
104
  if not chunks:
 
105
  return False
 
 
106
  st.info(f"πŸ”„ Creating embeddings for {len(chunks)} chunks...")
107
+ embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
108
+ self.documents = chunks
109
+ self.embeddings = embeddings
110
+ st.success(f"βœ… Successfully processed PDF: {len(chunks)} chunks created with embeddings")
111
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  except Exception as e:
113
  st.error(f"❌ Error processing PDF: {e}")
114
  logger.error(f"PDF processing error: {e}")
115
  return False
116
 
117
  def search_documents(self, query, top_k=3):
 
118
  if not self.documents or len(self.embeddings) == 0:
119
  st.warning("No documents available for search")
120
  return []
 
121
  try:
 
122
  query_embedding = self.embedding_model.encode([query])
 
 
123
  similarities = cosine_similarity(query_embedding, self.embeddings)[0]
 
 
124
  top_indices = np.argsort(similarities)[-top_k:][::-1]
125
+ return [{'text': self.documents[i], 'score': similarities[i]}
126
+ for i in top_indices if similarities[i] > 0.1]
 
 
 
 
 
 
 
 
 
 
127
  except Exception as e:
128
  st.error(f"Error searching documents: {e}")
129
  logger.error(f"Search error: {e}")
130
  return []
131
 
132
  def generate_answer(self, query, context_docs):
 
133
  if not self.granite_model or not context_docs:
134
  return "I don't have enough information to answer your question."
135
+ context = "\n\n".join([doc['text'][:200] for doc in context_docs])
 
 
 
 
136
  prompt = f"""You are a helpful AI assistant. Based on the following context, provide a clear and accurate answer to the question.
137
 
138
  Context:
 
141
  Question: {query}
142
 
143
  Answer:"""
 
144
  try:
145
+ inputs = self.tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True)
 
 
 
 
 
 
 
 
146
  with torch.no_grad():
147
  outputs = self.granite_model.generate(
148
  inputs,
149
+ max_length=inputs.shape[1] + 100,
150
  temperature=0.7,
151
  do_sample=True,
152
  pad_token_id=self.tokenizer.eos_token_id,
153
  eos_token_id=self.tokenizer.eos_token_id
154
  )
 
 
155
  response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
156
+ return response.strip() if len(response.strip()) >= 10 else context[:300] + "..."
 
 
 
 
 
 
157
  except Exception as e:
158
  logger.error(f"Generation error: {e}")
159
+ return context[:300] + "..."
 
160
 
161
  def answer_question(self, query):
 
162
  if not self.documents:
163
+ return {'answer': "No PDF has been processed yet.", 'sources': []}
 
 
 
 
 
164
  relevant_docs = self.search_documents(query)
 
165
  if not relevant_docs:
166
+ return {'answer': "No relevant information found.", 'sources': []}
 
 
 
 
 
 
 
167
  return {
168
+ 'answer': self.generate_answer(query, relevant_docs),
169
  'sources': relevant_docs
170
  }
171
 
172
  def main():
173
+ st.set_page_config(page_title="Simple PDF RAG with IBM Granite (Fixed)", page_icon="πŸ“„", layout="wide")
 
 
 
 
 
174
  st.title("πŸ“„ Simple PDF RAG with IBM Granite (Fixed)")
175
  st.write("Upload a PDF and ask questions about its content")
176
+
 
177
  if 'rag_system' not in st.session_state:
178
  st.session_state.rag_system = SimplePDFRAG()
 
179
  if 'models_loaded' not in st.session_state:
180
  st.session_state.models_loaded = False
 
181
  if 'pdf_processed' not in st.session_state:
182
  st.session_state.pdf_processed = False
 
183
  if 'current_pdf_name' not in st.session_state:
184
  st.session_state.current_pdf_name = None
185
+
 
186
  col1, col2, col3 = st.columns(3)
187
  with col1:
188
+ st.success("πŸ€– Models: Loaded" if st.session_state.models_loaded else "πŸ€– Models: Not Loaded")
 
 
 
 
189
  with col2:
190
+ st.success(f"πŸ“„ PDF: {st.session_state.current_pdf_name}" if st.session_state.pdf_processed else "πŸ“„ PDF: Not Processed")
 
 
 
 
191
  with col3:
192
+ st.success("🟒 Ready" if st.session_state.models_loaded and st.session_state.pdf_processed else "πŸ”΄ Not Ready")
193
+
 
 
 
 
194
  if not st.session_state.models_loaded:
195
+ if st.button("πŸ€– Load Models"):
196
+ with st.spinner("Loading models..."):
197
  success = st.session_state.rag_system.load_models()
198
+ st.session_state.models_loaded = success
199
+ st.rerun()
200
+
 
 
201
  if st.session_state.models_loaded:
202
  st.markdown("---")
203
  st.subheader("πŸ“ PDF Upload and Processing")
204
+ uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
205
+
206
+ if uploaded_file and 'uploaded_file_path' not in st.session_state:
207
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
208
+ tmp.write(uploaded_file.read())
209
+ st.session_state.uploaded_file_path = tmp.name
210
+ st.session_state.uploaded_file_name = uploaded_file.name
211
+ st.rerun()
212
+
213
+ if 'uploaded_file_path' in st.session_state:
214
+ st.info(f"πŸ“„ Uploaded: {st.session_state.uploaded_file_name}")
215
+ if st.button("πŸ“– Process PDF"):
216
  with st.spinner("Processing PDF..."):
217
+ with open(st.session_state.uploaded_file_path, "rb") as f:
218
+ success = st.session_state.rag_system.process_pdf(f, st.session_state.uploaded_file_name)
219
  if success:
220
  st.session_state.pdf_processed = True
221
+ st.session_state.current_pdf_name = st.session_state.uploaded_file_name
222
  st.rerun()
223
+
 
 
 
 
224
  if st.session_state.pdf_processed:
225
  st.markdown("---")
226
  st.subheader("❓ Ask Questions")
 
 
227
  st.info(f"πŸ“š Current document: {st.session_state.current_pdf_name}")
228
+ query = st.text_input("Ask a question:", placeholder="e.g., What is the main topic?")
229
+ if query and st.button("πŸ” Get Answer"):
230
+ with st.spinner("Searching and generating answer..."):
231
+ result = st.session_state.rag_system.answer_question(query)
232
+ st.markdown("### πŸ€– Answer:")
233
+ st.write(result['answer'])
234
+ if result.get('sources'):
235
+ st.markdown("### πŸ“š Sources:")
236
+ for i, src in enumerate(result['sources']):
237
+ with st.expander(f"Source {i+1} (Score: {src['score']:.3f})"):
238
+ st.write(src['text'][:500] + "..." if len(src['text']) > 500 else src['text'])
239
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  with st.sidebar:
241
  st.header("πŸ“‹ Instructions")
242
+ st.markdown("1. Load Models\n2. Upload PDF\n3. Process PDF\n4. Ask Questions")
 
 
 
 
 
 
243
  st.header("πŸ”§ Debug Info")
244
+ st.write("βœ… Models loaded" if st.session_state.models_loaded else "❌ Models not loaded")
245
+ st.write(f"βœ… PDF: {st.session_state.current_pdf_name}" if st.session_state.pdf_processed else "❌ No PDF processed")
246
+ if st.button("πŸ”„ Reset All"):
 
 
 
 
 
 
 
 
 
 
 
247
  for key in list(st.session_state.keys()):
248
  del st.session_state[key]
249
  st.rerun()
 
 
 
 
 
 
 
 
250
 
251
  if __name__ == "__main__":
252
+ main()