Shubham170793 committed on
Commit
5d37e56
·
verified ·
1 Parent(s): aa034da

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +198 -78
src/streamlit_app.py CHANGED
@@ -1,18 +1,14 @@
1
  # ==========================================================
2
- # streamlit_app.py — Commit 2 (Stable)
3
  # ==========================================================
4
  import os
5
  import re
6
- import hashlib
7
  import streamlit as st
8
  import torch
9
  from document_registry import DocumentRegistry
10
- from ingestion import extract_text_from_pdf, chunk_text
11
- from vectorstore import build_faiss_index
12
- from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
13
 
14
  # ==========================================================
15
- # ✅ PAGE CONFIG
16
  # ==========================================================
17
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
18
  print("CUDA available:", torch.cuda.is_available())
@@ -30,18 +26,108 @@ os.environ.update({
30
  })
31
 
32
  # ==========================================================
33
- # 🎨 STYLING
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # ==========================================================
35
  st.markdown("""
36
  <style>
37
- div.block-container {padding-top:1.2rem;max-width:1080px;}
38
- h1,h2,h3{color:#f3f4f6;font-weight:600;}
39
- .answer-box{background:linear-gradient(180deg,#0b1220,#071027);
40
- border-left:4px solid #3b82f6;border-radius:8px;padding:16px 18px;color:#e6eef8;
41
- margin-top:12px;box-shadow:0 4px 14px rgba(0,0,0,0.35);}
42
- .stTextInput>div>div>input{background-color:#0f172a!important;color:#f1f5f9!important;
43
- border-radius:6px!important;border:1px solid #334155!important;padding:8px 10px!important;
44
- font-size:15px!important;}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  </style>
46
  """, unsafe_allow_html=True)
47
 
@@ -54,45 +140,81 @@ with st.sidebar:
54
  "",
55
  ("Strict (Document-only)", "Extended (Document + General)"),
56
  index=0,
57
- help="Strict = answers only from the uploaded document."
58
  )
 
59
  st.markdown("---")
60
 
 
61
  if "registry" in st.session_state:
62
  registry = st.session_state["registry"]
63
  registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
64
  if registered_docs:
65
  with st.expander("πŸ“š Registered Documents", expanded=False):
66
  for i, doc in enumerate(registered_docs, start=1):
67
- st.markdown(f"**{i}. {doc.get('name','?')}** β€” {doc.get('num_chunks','?')} chunks *(TOC: {doc.get('toc_source','β€”')})*")
 
 
 
68
  else:
69
  st.caption("πŸ“­ No documents registered yet.")
70
  else:
71
  st.caption("πŸ“­ No registry initialized yet.")
72
 
73
  st.markdown("---")
 
74
  show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
75
  if show_dev:
 
76
  chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
77
  overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
78
- top_k = st.slider("Top K Results", 1, 10, 5)
79
  else:
80
  chunk_size, overlap, top_k = 1000, 120, 5
 
81
  st.markdown("---")
82
  st.caption("✨ Built by Shubham Sharma")
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # ==========================================================
85
- # 🧠 SESSION STATE INITIALIZATION (added for safety)
86
  # ==========================================================
87
  for key, val in {
88
- "show_more": False,
89
  "user_query_input": "",
 
90
  "selected_suggestion": None,
 
 
 
91
  "doc_ready": False,
92
  }.items():
93
  if key not in st.session_state:
94
  st.session_state[key] = val
95
 
 
 
 
 
96
 
97
  # ==========================================================
98
  # 📄 MAIN SECTION
@@ -100,22 +222,20 @@ for key, val in {
100
  st.title("πŸ“„ Enterprise Knowledge Assistant")
101
  st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
102
 
103
- doc_choice = st.radio(
104
- "Select a document:",
105
- ["-- Select --", "Sample PDF", "Upload Custom PDF"],
106
- index=0
107
- )
108
 
109
  # ==========================================================
110
- # 📂 DOCUMENT HANDLING (Commit 2)
111
  # ==========================================================
 
 
112
  def _hash_content(file_path):
113
- h = hashlib.sha256()
 
114
  with open(file_path, "rb") as f:
115
  while chunk := f.read(8192):
116
- h.update(chunk)
117
- return h.hexdigest()[:12]
118
-
119
 
120
  if doc_choice == "-- Select --":
121
  st.info("⬅️ Select or upload a document to begin.")
@@ -125,11 +245,11 @@ else:
125
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
126
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
127
  else:
128
- uploaded = st.file_uploader("Upload a PDF document (max 200 MB):", type="pdf", label_visibility="collapsed")
129
- if uploaded:
130
- temp_path = os.path.join("/tmp", uploaded.name)
131
  with open(temp_path, "wb") as f:
132
- f.write(uploaded.getbuffer())
133
  else:
134
  st.stop()
135
 
@@ -138,24 +258,7 @@ else:
138
  file_hash = _hash_content(temp_path)
139
  doc_identifier = f"{doc_name}_{file_hash}"
140
 
141
- # ✅ Registry initialization
142
- if "registry" not in st.session_state:
143
- st.session_state["registry"] = DocumentRegistry()
144
- registry = st.session_state["registry"]
145
-
146
- # ✅ Reuse if already processed
147
- if doc_name in [d["name"] for d in registry.list_docs()]:
148
- st.session_state["status_text"] = f"βœ… {doc_name} already processed β€” loaded from registry."
149
- doc_data = registry.get_doc(doc_name)
150
- st.session_state.update({
151
- "text": doc_data.get("text", ""),
152
- "toc": doc_data.get("toc", []),
153
- "chunks": doc_data.get("chunks", []),
154
- "embeddings": doc_data.get("embeddings"),
155
- "index": doc_data.get("index"),
156
- "doc_ready": True
157
- })
158
- else:
159
  status = st.empty()
160
  status.info("πŸ“€ Upload complete β€” reading document...")
161
 
@@ -166,9 +269,14 @@ else:
166
  embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
167
  index = build_faiss_index(embeddings)
168
 
 
 
 
 
169
  doc_id = registry.register(temp_path, chunks, embeddings, index)
170
  st.session_state["active_doc"] = doc_id
171
- status.success("βœ… Document processed successfully β€” ready to query!")
 
172
 
173
  st.session_state.update({
174
  "text": text,
@@ -178,50 +286,62 @@ else:
178
  "index": index,
179
  "doc_ready": True,
180
  "last_doc": doc_identifier,
181
- "status_text": "βœ… Document processed successfully β€” ready to query!"
182
  })
183
 
184
- # --- Ask section ---
185
- if st.session_state.get("doc_ready"):
186
- st.info(st.session_state.get("status_text", "πŸ“„ Ready for queries."))
187
- st.markdown("### πŸ’¬ Ask the Assistant")
 
 
188
 
189
- query_suggestions = ["How do I start using this guide?", "What are the prerequisites?", "What is covered in this document?"]
190
- visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
191
- cols = st.columns(min(3, len(visible)))
192
- for i, q in enumerate(visible):
193
- if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
194
- st.session_state["user_query_input"] = q
195
- st.experimental_rerun()
 
 
 
 
 
 
 
 
 
 
196
 
197
- toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
198
- if st.button(toggle_text):
199
- st.session_state["show_more"] = not st.session_state["show_more"]
200
- st.experimental_rerun()
201
 
202
  user_query = st.text_input("Type your question or click one above:", key="user_query_input")
203
 
204
  if user_query.strip():
205
  reasoning_mode = mode == "Extended (Document + General)"
206
  with st.spinner("πŸ’­ Generating your answer..."):
207
- retrieved = retrieve_chunks(
208
- user_query,
209
- st.session_state["index"],
210
- st.session_state["chunks"],
211
- top_k=top_k,
212
- embeddings=st.session_state["embeddings"]
213
- )
214
  answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
215
  st.session_state["retrieved"] = retrieved
216
 
217
  st.markdown("### πŸ€– Assistant’s Answer")
 
 
 
218
  st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
219
 
220
  # ==========================================================
221
- # 🎨 Sidebar scroll style
222
  # ==========================================================
223
  st.markdown("""
224
  <style>
225
- section[data-testid="stSidebar"] div.stExpander {max-height:480px;overflow-y:auto;}
 
 
 
226
  </style>
227
  """, unsafe_allow_html=True)
 
1
  # ==========================================================
2
+ # streamlit_app.py — Stable Layout (English Only) + Session Fix
3
  # ==========================================================
4
  import os
5
  import re
 
6
  import streamlit as st
7
  import torch
8
  from document_registry import DocumentRegistry
 
 
 
9
 
10
  # ==========================================================
11
+ # ✅ PAGE CONFIGS
12
  # ==========================================================
13
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
14
  print("CUDA available:", torch.cuda.is_available())
 
26
  })
27
 
28
  # ==========================================================
29
+ # 📦 IMPORTS
30
+ # ==========================================================
31
+ from ingestion import extract_text_from_pdf, chunk_text
32
+ from vectorstore import build_faiss_index
33
+ from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
34
+
35
+ # ==========================================================
36
+ # 🧠 SMART SUGGESTION GENERATOR (English Only)
37
+ # ==========================================================
38
+ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
39
+ """Generates 5–7 short, natural English questions based on TOC and document text."""
40
+ if not toc or not chunks:
41
+ return ["How do I start using this guide?", "What does this document cover?"]
42
+
43
+ titles = []
44
+ for sec, raw_title in toc:
45
+ title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
46
+ title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
47
+ if 4 < len(title) < 120:
48
+ titles.append(title)
49
+
50
+ context_sample = " ".join(chunks[:3])[:4000]
51
+ prompt = f"""
52
+ You are a content assistant. Based on the Table of Contents and the sample document text below,
53
+ generate 5–7 short, natural user-facing questions.
54
+ Each question should be under 18 words, end with a question mark, and sound human.
55
+ Document: "{doc_name}"
56
+
57
+ TABLE OF CONTENTS:
58
+ {chr(10).join(['- ' + t for t in titles[:8]])}
59
+
60
+ SAMPLE TEXT:
61
+ {context_sample}
62
+
63
+ Output: Write each question on a new line. Do not invent facts β€” base questions only on the document.
64
+ """
65
+
66
+ try:
67
+ ai_response = genai_generate(prompt)
68
+ lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
69
+ questions = []
70
+ for ln in lines:
71
+ q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
72
+ if not q.endswith("?") and len(q.split()) < 18 and re.match(r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q):
73
+ q += "?"
74
+ if 8 <= len(q) <= 140:
75
+ questions.append(q)
76
+ # dedupe
77
+ final, seen = [], set()
78
+ for q in questions:
79
+ if q.lower() not in seen:
80
+ seen.add(q.lower())
81
+ final.append(q)
82
+ if not final:
83
+ final = [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
84
+ return final[:7]
85
+ except Exception:
86
+ return ["How do I start using this guide?", "What does this document cover?"]
87
+
88
+ # ==========================================================
89
+ # 🎨 STYLING β€” REVERT TO ORIGINAL
90
  # ==========================================================
91
  st.markdown("""
92
  <style>
93
+ div.block-container {padding-top: 1.2rem; max-width: 1080px;}
94
+ h1, h2, h3 {color: #f3f4f6; font-weight: 600;}
95
+ .suggest-chip {
96
+ background: #0f1724;
97
+ border: 1px solid #374151;
98
+ border-radius: 14px;
99
+ color: #e6eef8;
100
+ padding: 8px 12px;
101
+ cursor: pointer;
102
+ font-size: 13px;
103
+ margin: 6px 6px 6px 0;
104
+ display: inline-block;
105
+ transition: background 0.2s, transform 0.1s;
106
+ }
107
+ .suggest-chip:hover {background: #1e3a8a; transform: translateY(-2px);}
108
+ .answer-box {
109
+ background: linear-gradient(180deg,#0b1220,#071027);
110
+ border-left: 4px solid #3b82f6;
111
+ border-radius: 8px;
112
+ padding: 16px 18px;
113
+ color: #e6eef8;
114
+ margin-top: 12px;
115
+ box-shadow: 0 4px 14px rgba(0,0,0,0.35);
116
+ }
117
+ .stTextInput > div > div > input {
118
+ background-color: #0f172a !important;
119
+ color: #f1f5f9 !important;
120
+ border-radius: 6px !important;
121
+ border: 1px solid #334155 !important;
122
+ padding: 8px 10px !important;
123
+ font-size: 15px !important;
124
+ }
125
+ .stTextInput > label {font-weight: 500;}
126
+ .small-link {
127
+ font-size: 13px;
128
+ color: #60a5fa;
129
+ cursor: pointer;
130
+ }
131
  </style>
132
  """, unsafe_allow_html=True)
133
 
 
140
  "",
141
  ("Strict (Document-only)", "Extended (Document + General)"),
142
  index=0,
143
+ help="Strict = answers only from the uploaded document. Extended = may include related general info.",
144
  )
145
+
146
  st.markdown("---")
147
 
148
+ # 🧩 Document Registry Viewer
149
  if "registry" in st.session_state:
150
  registry = st.session_state["registry"]
151
  registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
152
  if registered_docs:
153
  with st.expander("πŸ“š Registered Documents", expanded=False):
154
  for i, doc in enumerate(registered_docs, start=1):
155
+ doc_name = doc.get("name", "Unknown")
156
+ chunks = doc.get("num_chunks", "?")
157
+ toc_source = doc.get("toc_source", "β€”")
158
+ st.markdown(f"**{i}. {doc_name}** β€” {chunks} chunks *(TOC: {toc_source})*")
159
  else:
160
  st.caption("πŸ“­ No documents registered yet.")
161
  else:
162
  st.caption("πŸ“­ No registry initialized yet.")
163
 
164
  st.markdown("---")
165
+
166
  show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
167
  if show_dev:
168
+ st.markdown("### βš™οΈ Developer Options")
169
  chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
170
  overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
171
+ top_k = st.slider("Top K Results", 1, 10, 7)
172
  else:
173
  chunk_size, overlap, top_k = 1000, 120, 5
174
+
175
  st.markdown("---")
176
  st.caption("✨ Built by Shubham Sharma")
177
 
178
+ if show_dev:
179
+ st.markdown("---")
180
+ with st.expander("🧩 Developer Insights", expanded=False):
181
+ st.markdown("**Retrieved Chunks (Context):**")
182
+ for i, r in enumerate(st.session_state.get("retrieved", []), start=1):
183
+ st.markdown(f"- **Chunk {i}:** {r}")
184
+
185
+ toc_data = st.session_state.get("toc", [])
186
+ if toc_data:
187
+ st.markdown("---")
188
+ st.markdown("**Document Sections (TOC):**")
189
+ toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc_data])
190
+ st.text_area("", toc_text, height=120)
191
+
192
+ doc_text = st.session_state.get("text", "")
193
+ if doc_text:
194
+ st.markdown("---")
195
+ st.markdown("**Document Preview:**")
196
+ st.text_area("", doc_text[:1000], height=120)
197
+ st.caption(f"{len(st.session_state.get('chunks', []))} chunks processed.")
198
+
199
  # ==========================================================
200
+ # 🧠 SESSION STATE SAFETY INITIALIZATION
201
  # ==========================================================
202
  for key, val in {
 
203
  "user_query_input": "",
204
+ "show_more": False,
205
  "selected_suggestion": None,
206
+ "query_suggestions_fixed": None,
207
+ "last_doc": None,
208
+ "doc_lang": "en",
209
  "doc_ready": False,
210
  }.items():
211
  if key not in st.session_state:
212
  st.session_state[key] = val
213
 
214
+ def set_user_query(q, idx):
215
+ st.session_state["user_query_input"] = q
216
+ st.session_state["selected_suggestion"] = idx
217
+ st.experimental_rerun()
218
 
219
  # ==========================================================
220
  # 📄 MAIN SECTION
 
222
  st.title("πŸ“„ Enterprise Knowledge Assistant")
223
  st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
224
 
225
+ doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
 
 
 
 
226
 
227
  # ==========================================================
228
+ # 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
229
  # ==========================================================
230
+ import hashlib
231
+
232
  def _hash_content(file_path):
233
+ """Generate a short SHA256 hash of the file's actual binary content."""
234
+ hasher = hashlib.sha256()
235
  with open(file_path, "rb") as f:
236
  while chunk := f.read(8192):
237
+ hasher.update(chunk)
238
+ return hasher.hexdigest()[:12]
 
239
 
240
  if doc_choice == "-- Select --":
241
  st.info("⬅️ Select or upload a document to begin.")
 
245
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
246
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
247
  else:
248
+ uploaded_file = st.file_uploader("Upload a PDF document (max 200MB):", type="pdf", label_visibility="collapsed")
249
+ if uploaded_file:
250
+ temp_path = os.path.join("/tmp", uploaded_file.name)
251
  with open(temp_path, "wb") as f:
252
+ f.write(uploaded_file.getbuffer())
253
  else:
254
  st.stop()
255
 
 
258
  file_hash = _hash_content(temp_path)
259
  doc_identifier = f"{doc_name}_{file_hash}"
260
 
261
+ if "doc_ready" not in st.session_state or st.session_state.get("last_doc") != doc_identifier:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  status = st.empty()
263
  status.info("πŸ“€ Upload complete β€” reading document...")
264
 
 
269
  embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
270
  index = build_faiss_index(embeddings)
271
 
272
+ if "registry" not in st.session_state:
273
+ st.session_state["registry"] = DocumentRegistry()
274
+
275
+ registry = st.session_state["registry"]
276
  doc_id = registry.register(temp_path, chunks, embeddings, index)
277
  st.session_state["active_doc"] = doc_id
278
+
279
+ status.success("βœ… Document processed successfully β€” all set to query your assistant!")
280
 
281
  st.session_state.update({
282
  "text": text,
 
286
  "index": index,
287
  "doc_ready": True,
288
  "last_doc": doc_identifier,
289
+ "status_text": "βœ… Document processed successfully β€” all set to query your assistant!"
290
  })
291
 
292
+ query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
293
+ st.session_state["query_suggestions_fixed"] = query_suggestions
294
+ st.session_state["user_query_input"] = ""
295
+ st.session_state["selected_suggestion"] = None
296
+ st.session_state["show_more"] = False
297
+ st.rerun()
298
 
299
+ else:
300
+ text = st.session_state["text"]
301
+ toc = st.session_state["toc"]
302
+ chunks = st.session_state["chunks"]
303
+ embeddings = st.session_state["embeddings"]
304
+ index = st.session_state["index"]
305
+ query_suggestions = st.session_state.get("query_suggestions_fixed", [])
306
+ st.info(st.session_state.get("status_text", f"πŸ“„ {doc_name} is ready for queries."))
307
+
308
+ # --- Ask section ---
309
+ st.markdown("### πŸ’¬ Ask the Assistant")
310
+ if query_suggestions:
311
+ visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
312
+ cols = st.columns(min(3, len(visible)))
313
+ for i, q in enumerate(visible):
314
+ if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
315
+ set_user_query(q, i)
316
 
317
+ toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
318
+ if st.button(toggle_text, help="Show or hide more suggestions"):
319
+ st.session_state["show_more"] = not st.session_state["show_more"]
320
+ st.rerun()
321
 
322
  user_query = st.text_input("Type your question or click one above:", key="user_query_input")
323
 
324
  if user_query.strip():
325
  reasoning_mode = mode == "Extended (Document + General)"
326
  with st.spinner("πŸ’­ Generating your answer..."):
327
+ retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
 
 
 
 
 
 
328
  answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
329
  st.session_state["retrieved"] = retrieved
330
 
331
  st.markdown("### πŸ€– Assistant’s Answer")
332
+ if not reasoning_mode and not answer.startswith("⚠️"):
333
+ answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
334
+ answer = re.sub(r"(^|\n)-\s*", r"\1<br>β€’ ", answer)
335
  st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
336
 
337
  # ==========================================================
338
+ # 🎨 Optional Sidebar Scroll Styling (keeps it clean)
339
  # ==========================================================
340
  st.markdown("""
341
  <style>
342
+ section[data-testid="stSidebar"] div.stExpander {
343
+ max-height: 480px;
344
+ overflow-y: auto;
345
+ }
346
  </style>
347
  """, unsafe_allow_html=True)