Shubham170793 committed on
Commit
c9a83aa
·
verified ·
1 Parent(s): 2239986

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +53 -187
src/streamlit_app.py CHANGED
@@ -13,6 +13,17 @@ from document_registry import DocumentRegistry
13
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
14
  print("CUDA available:", torch.cuda.is_available())
15
 
 
 
 
 
 
 
 
 
 
 
 
16
  # ==========================================================
17
  # βš™οΈ CACHE SETUP
18
  # ==========================================================
@@ -83,11 +94,12 @@ Output: Write each question on a new line. Do not invent facts β€” base question
83
  questions = []
84
  for ln in lines:
85
  q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
86
- if not q.endswith("?") and len(q.split()) < 18 and re.match(r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q):
 
 
87
  q += "?"
88
  if 8 <= len(q) <= 140:
89
  questions.append(q)
90
- # dedupe
91
  final = []
92
  seen = set()
93
  for q in questions:
@@ -101,7 +113,7 @@ Output: Write each question on a new line. Do not invent facts β€” base question
101
  return ["How do I start using this guide?", "What does this document cover?"]
102
 
103
  # ==========================================================
104
- # 🎨 STYLING — REVERT TO ORIGINAL
105
  # ==========================================================
106
  st.markdown("""
107
  <style>
@@ -138,11 +150,7 @@ h1, h2, h3 {color: #f3f4f6; font-weight: 600;}
138
  font-size: 15px !important;
139
  }
140
  .stTextInput > label {font-weight: 500;}
141
- .small-link {
142
- font-size: 13px;
143
- color: #60a5fa;
144
- cursor: pointer;
145
- }
146
  </style>
147
  """, unsafe_allow_html=True)
148
 
@@ -155,27 +163,20 @@ with st.sidebar:
155
  "",
156
  ("Strict (Document-only)", "Extended (Document + General)"),
157
  index=0,
158
- help="Strict = answers only from the uploaded document. Extended = may include related general info.",
159
  )
160
-
161
  st.markdown("---")
162
 
163
- # 🧩 Document Registry Viewer
164
  if "registry" in st.session_state:
165
  registry = st.session_state["registry"]
166
  registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
167
-
168
  if registered_docs:
169
  with st.expander("πŸ“š Registered Documents", expanded=False):
170
  for i, doc in enumerate(registered_docs, start=1):
171
  doc_name = doc.get("name", "Unknown")
172
  chunks = doc.get("num_chunks", "?")
173
  toc_source = doc.get("toc_source", "β€”")
174
- st.markdown(
175
- f"**{i}. {doc_name}** β€” {chunks} chunks *(TOC: {toc_source})*"
176
- )
177
 
178
- # 🧭 Active Document Selector (Commit #3)
179
  st.markdown("---")
180
  active_doc_name = st.selectbox(
181
  "πŸ“„ Select Active Document",
@@ -183,7 +184,6 @@ with st.sidebar:
183
  index=0,
184
  key="active_doc_selector"
185
  )
186
-
187
  selected_doc = registry.get_doc(active_doc_name)
188
  if selected_doc:
189
  st.session_state.update({
@@ -194,70 +194,8 @@ with st.sidebar:
194
  "doc_ready": True,
195
  "status_text": f"πŸ“„ {active_doc_name} loaded from registry β€” ready for queries."
196
  })
197
- else:
198
- st.caption("πŸ“­ No documents registered yet.")
199
- else:
200
- st.caption("πŸ“ Upload or process a document to see registered files here.")
201
-
202
- st.markdown("---")
203
-
204
- show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
205
- if show_dev:
206
- st.markdown("### βš™οΈ Developer Options")
207
- chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
208
- overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
209
- top_k = st.slider("Top K Results", 1, 10, 7)
210
- else:
211
- chunk_size, overlap, top_k = 1000, 120, 5
212
-
213
- st.markdown("---")
214
  st.caption("✨ Built by Shubham Sharma")
215
 
216
- # 🧩 Developer Insights (Moved up here from main block)
217
- if show_dev:
218
- st.markdown("---")
219
- with st.expander("🧩 Developer Insights", expanded=False):
220
- st.markdown("**Retrieved Chunks (Context):**")
221
- retrieved_chunks = st.session_state.get("retrieved", [])
222
- for i, r in enumerate(retrieved_chunks, start=1):
223
- st.markdown(f"- **Chunk {i}:** {r}")
224
-
225
- toc_data = st.session_state.get("toc", [])
226
- if toc_data:
227
- st.markdown("---")
228
- st.markdown("**Document Sections (TOC):**")
229
- toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc_data])
230
- st.text_area("", toc_text, height=120)
231
-
232
- doc_text = st.session_state.get("text", "")
233
- if doc_text:
234
- st.markdown("---")
235
- st.markdown("**Document Preview:**")
236
- st.text_area("", doc_text[:1000], height=120)
237
- st.caption(f"{len(st.session_state.get('chunks', []))} chunks processed.")
238
-
239
-
240
- # ==========================================================
241
- # 🧠 SESSION STATE
242
- # ==========================================================
243
- for key, val in {
244
- "user_query_input": "",
245
- "show_more": False,
246
- "selected_suggestion": None,
247
- "query_suggestions_fixed": None,
248
- "last_doc": None,
249
- "doc_lang": "en", # πŸ†• optional: store document language
250
- }.items():
251
- if key not in st.session_state:
252
- st.session_state[key] = val
253
-
254
-
255
def set_user_query(q, idx):
    """Load a suggested question into the query box and rerun the script.

    Args:
        q: Suggestion text stored under the ``user_query_input`` key, which
            is also the key of the question text input.
        idx: Position of the chosen suggestion, stored under
            ``selected_suggestion``.
    """
    st.session_state["user_query_input"] = q
    st.session_state["selected_suggestion"] = idx
    # st.experimental_rerun() is deprecated; use st.rerun() like the other
    # rerun calls in this script.
    st.rerun()
259
-
260
-
261
  # ==========================================================
262
  # 📄 MAIN SECTION
263
  # ==========================================================
@@ -267,45 +205,26 @@ st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning
267
  doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
268
 
269
  # ==========================================================
270
- # 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
271
  # ==========================================================
272
  import hashlib
273
 
274
  def _hash_content(file_path):
275
- """Generate a short SHA256 hash of the file's actual binary content."""
276
  hasher = hashlib.sha256()
277
  with open(file_path, "rb") as f:
278
  while chunk := f.read(8192):
279
  hasher.update(chunk)
280
- return hasher.hexdigest()[:12] # short unique hash for same-name files
281
 
282
-
283
def refresh_suggestions(doc_name, toc, chunks):
    """Regenerate query suggestions and clear suggestion-related UI state.

    Stores the result of ``generate_dynamic_suggestions_from_toc(toc, chunks,
    doc_name)`` under ``query_suggestions_fixed``, then blanks the query
    input, clears the selected suggestion and collapses the "show more" list.
    """
    suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
    # A single update keeps the same key order as four separate assignments.
    st.session_state.update({
        "query_suggestions_fixed": suggestions,
        "user_query_input": "",
        "selected_suggestion": None,
        "show_more": False,
    })
291
-
292
-
293
- # --- Document selection ---
294
  if doc_choice == "-- Select --":
295
  st.info("⬅️ Select or upload a document to begin.")
296
  else:
297
  temp_path = None
298
-
299
- # --- File selection ---
300
  if doc_choice == "Sample PDF":
301
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
302
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
303
  else:
304
- uploaded_file = st.file_uploader(
305
- "Upload a PDF document (max 200MB):",
306
- type="pdf",
307
- label_visibility="collapsed"
308
- )
309
  if uploaded_file:
310
  temp_path = os.path.join("/tmp", uploaded_file.name)
311
  with open(temp_path, "wb") as f:
@@ -313,18 +232,15 @@ else:
313
  else:
314
  st.stop()
315
 
316
- # --- Start processing if file exists ---
317
  if temp_path:
318
  doc_name = os.path.basename(temp_path)
319
  file_hash = _hash_content(temp_path)
320
- doc_identifier = f"{doc_name}_{file_hash}" # unique per content
321
 
322
- # βœ… Step 0: Initialize registry
323
  if "registry" not in st.session_state:
324
  st.session_state["registry"] = DocumentRegistry()
325
  registry = st.session_state["registry"]
326
 
327
- # βœ… Step 1: Check if document already in registry
328
  existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
329
  if existing_doc:
330
  doc_data = registry.get_doc(existing_doc["name"])
@@ -338,43 +254,21 @@ else:
338
  "active_doc": existing_doc["name"],
339
  "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
340
  })
 
 
341
 
342
- # βœ… Refresh suggestions when switching
343
- refresh_suggestions(
344
- existing_doc["name"],
345
- st.session_state["toc"],
346
- st.session_state["chunks"]
347
- )
348
-
349
- if show_dev:
350
- st.info(f"🧠 Loaded from registry: {doc_name}")
351
- st.rerun()
352
-
353
- # βœ… Step 2: If new document β†’ process normally
354
  status = st.empty()
355
  status.info("πŸ“€ Upload complete β€” reading document...")
356
-
357
- # 🧩 Step 2.1: Extract text and TOC
358
  text, toc, toc_source = extract_text_from_pdf(temp_path)
359
-
360
- # 🧩 Step 2.2: Chunk the text
361
  status.info("πŸ“‘ Parsing and chunking document...")
362
- chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
363
-
364
- # 🧩 Step 2.3: Embed and index
365
  status.info("🧠 Building embeddings and search index...")
366
  embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
367
  index = build_faiss_index(embeddings)
368
-
369
- # 🧩 Step 2.4: Register document
370
  doc_id = registry.register(temp_path, chunks, embeddings, index)
371
  st.session_state["active_doc"] = doc_id
372
-
373
- # 🧩 Step 2.5: Success message + suggestions
374
- status.success("βœ… Document processed successfully β€” all set to query your assistant!")
375
  refresh_suggestions(doc_name, toc, chunks)
376
-
377
- # 🧠 Update session
378
  st.session_state.update({
379
  "text": text,
380
  "toc": toc,
@@ -383,60 +277,32 @@ else:
383
  "index": index,
384
  "doc_ready": True,
385
  "last_doc": doc_identifier,
386
- "status_text": "βœ… Document processed successfully β€” all set to query your assistant!"
387
  })
388
- st.rerun()
389
-
390
- # --- Display Ready Message + Ask Section ---
391
- if st.session_state.get("doc_ready"):
392
- active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
393
- st.info(st.session_state.get("status_text", f"πŸ“„ {active_name or 'Document'} is ready for queries."))
394
-
395
- st.markdown("### πŸ’¬ Ask the Assistant")
396
- query_suggestions = st.session_state.get("query_suggestions_fixed", [])
397
- if query_suggestions:
398
- visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
399
- cols = st.columns(min(3, len(visible)))
400
- for i, q in enumerate(visible):
401
- if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
402
- set_user_query(q, i)
403
-
404
- toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
405
- if st.button(toggle_text, help="Show or hide more suggestions"):
406
- st.session_state["show_more"] = not st.session_state["show_more"]
407
- st.rerun()
408
-
409
- user_query = st.text_input(
410
- "Type your question or click one above:",
411
- key="user_query_input",
412
- label_visibility="visible"
413
- )
414
-
415
- if user_query.strip():
416
- reasoning_mode = mode == "Extended (Document + General)"
417
- with st.spinner("πŸ’­ Generating your answer..."):
418
- retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
419
- answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
420
- st.session_state["retrieved"] = retrieved
421
-
422
- st.markdown("### πŸ€– Assistant’s Answer")
423
-
424
- if not reasoning_mode and not answer.startswith("⚠️"):
425
- answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
426
- answer = re.sub(r"(^|\n)-\s*", r"\1<br>β€’ ", answer)
427
-
428
- st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
429
-
430
-
431
- # ==========================================================
432
- # 🎨 Optional Sidebar Scroll Styling (keeps it clean)
433
- # ==========================================================
434
- st.markdown("""
435
- <style>
436
- section[data-testid="stSidebar"] div.stExpander {
437
- max-height: 480px;
438
- overflow-y: auto;
439
- }
440
- </style>
441
- """, unsafe_allow_html=True)
442
-
 
13
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
14
  print("CUDA available:", torch.cuda.is_available())
15
 
16
+ # ==========================================================
17
+ # βš™οΈ SAFE RERUN HANDLER
18
+ # ==========================================================
19
def trigger_safe_rerun():
    """Request a deferred rerun by raising the ``_safe_rerun`` flag.

    Nothing is rerun here. The function only stores ``True`` under
    ``st.session_state["_safe_rerun"]``; the module-level check that follows
    this definition reads the flag, resets it and calls ``st.rerun()``.
    NOTE(review): that check executes only when the script runs again, so the
    rerun appears to wait for the next script execution — confirm intended.
    """
    st.session_state.update({"_safe_rerun": True})
22
+
23
+ if st.session_state.get("_safe_rerun"):
24
+ st.session_state["_safe_rerun"] = False
25
+ st.rerun()
26
+
27
  # ==========================================================
28
  # βš™οΈ CACHE SETUP
29
  # ==========================================================
 
94
  questions = []
95
  for ln in lines:
96
  q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
97
+ if not q.endswith("?") and len(q.split()) < 18 and re.match(
98
+ r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q
99
+ ):
100
  q += "?"
101
  if 8 <= len(q) <= 140:
102
  questions.append(q)
 
103
  final = []
104
  seen = set()
105
  for q in questions:
 
113
  return ["How do I start using this guide?", "What does this document cover?"]
114
 
115
  # ==========================================================
116
+ # 🎨 STYLING
117
  # ==========================================================
118
  st.markdown("""
119
  <style>
 
150
  font-size: 15px !important;
151
  }
152
  .stTextInput > label {font-weight: 500;}
153
+ .small-link {font-size: 13px; color: #60a5fa; cursor: pointer;}
 
 
 
 
154
  </style>
155
  """, unsafe_allow_html=True)
156
 
 
163
  "",
164
  ("Strict (Document-only)", "Extended (Document + General)"),
165
  index=0,
 
166
  )
 
167
  st.markdown("---")
168
 
 
169
  if "registry" in st.session_state:
170
  registry = st.session_state["registry"]
171
  registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
 
172
  if registered_docs:
173
  with st.expander("πŸ“š Registered Documents", expanded=False):
174
  for i, doc in enumerate(registered_docs, start=1):
175
  doc_name = doc.get("name", "Unknown")
176
  chunks = doc.get("num_chunks", "?")
177
  toc_source = doc.get("toc_source", "β€”")
178
+ st.markdown(f"**{i}. {doc_name}** β€” {chunks} chunks *(TOC: {toc_source})*")
 
 
179
 
 
180
  st.markdown("---")
181
  active_doc_name = st.selectbox(
182
  "πŸ“„ Select Active Document",
 
184
  index=0,
185
  key="active_doc_selector"
186
  )
 
187
  selected_doc = registry.get_doc(active_doc_name)
188
  if selected_doc:
189
  st.session_state.update({
 
194
  "doc_ready": True,
195
  "status_text": f"πŸ“„ {active_doc_name} loaded from registry β€” ready for queries."
196
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  st.caption("✨ Built by Shubham Sharma")
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  # ==========================================================
200
  # 📄 MAIN SECTION
201
  # ==========================================================
 
205
  doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
206
 
207
  # ==========================================================
208
+ # 📂 DOCUMENT HANDLING — SAFE VERSION
209
  # ==========================================================
210
  import hashlib
211
 
212
  def _hash_content(file_path):
 
213
  hasher = hashlib.sha256()
214
  with open(file_path, "rb") as f:
215
  while chunk := f.read(8192):
216
  hasher.update(chunk)
217
+ return hasher.hexdigest()[:12]
218
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  if doc_choice == "-- Select --":
220
  st.info("⬅️ Select or upload a document to begin.")
221
  else:
222
  temp_path = None
 
 
223
  if doc_choice == "Sample PDF":
224
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
225
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
226
  else:
227
+ uploaded_file = st.file_uploader("Upload a PDF document (max 200MB):", type="pdf", label_visibility="collapsed")
 
 
 
 
228
  if uploaded_file:
229
  temp_path = os.path.join("/tmp", uploaded_file.name)
230
  with open(temp_path, "wb") as f:
 
232
  else:
233
  st.stop()
234
 
 
235
  if temp_path:
236
  doc_name = os.path.basename(temp_path)
237
  file_hash = _hash_content(temp_path)
238
+ doc_identifier = f"{doc_name}_{file_hash}"
239
 
 
240
  if "registry" not in st.session_state:
241
  st.session_state["registry"] = DocumentRegistry()
242
  registry = st.session_state["registry"]
243
 
 
244
  existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
245
  if existing_doc:
246
  doc_data = registry.get_doc(existing_doc["name"])
 
254
  "active_doc": existing_doc["name"],
255
  "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
256
  })
257
+ refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
258
+ trigger_safe_rerun()
259
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  status = st.empty()
261
  status.info("πŸ“€ Upload complete β€” reading document...")
 
 
262
  text, toc, toc_source = extract_text_from_pdf(temp_path)
 
 
263
  status.info("πŸ“‘ Parsing and chunking document...")
264
+ chunks = chunk_text(text, chunk_size=1000, overlap=120)
 
 
265
  status.info("🧠 Building embeddings and search index...")
266
  embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
267
  index = build_faiss_index(embeddings)
 
 
268
  doc_id = registry.register(temp_path, chunks, embeddings, index)
269
  st.session_state["active_doc"] = doc_id
270
+ status.success("βœ… Document processed successfully β€” ready to query!")
 
 
271
  refresh_suggestions(doc_name, toc, chunks)
 
 
272
  st.session_state.update({
273
  "text": text,
274
  "toc": toc,
 
277
  "index": index,
278
  "doc_ready": True,
279
  "last_doc": doc_identifier,
280
+ "status_text": "βœ… Document processed successfully β€” ready to query!"
281
  })
282
+ trigger_safe_rerun()
283
+
284
+ if st.session_state.get("doc_ready"):
285
+ st.info(st.session_state.get("status_text"))
286
+ st.markdown("### πŸ’¬ Ask the Assistant")
287
+ query_suggestions = st.session_state.get("query_suggestions_fixed", [])
288
+ if query_suggestions:
289
+ visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
290
+ cols = st.columns(min(3, len(visible)))
291
+ for i, q in enumerate(visible):
292
+ if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
293
+ st.session_state["user_query_input"] = q
294
+ st.session_state["selected_suggestion"] = i
295
+ trigger_safe_rerun()
296
+ toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
297
+ if st.button(toggle_text):
298
+ st.session_state["show_more"] = not st.session_state["show_more"]
299
+ trigger_safe_rerun()
300
+
301
+ user_query = st.text_input("Your Question:", key="user_query_input", label_visibility="visible")
302
+ if user_query.strip():
303
+ reasoning_mode = mode == "Extended (Document + General)"
304
+ with st.spinner("πŸ’­ Generating your answer..."):
305
+ retrieved = retrieve_chunks(user_query, st.session_state["index"], st.session_state["chunks"], top_k=5, embeddings=st.session_state["embeddings"])
306
+ answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
307
+ st.markdown("### πŸ€– Assistant’s Answer")
308
+ st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)