Shubham170793 committed on
Commit
3d6631f
·
verified ·
1 Parent(s): f978245

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +61 -149
src/streamlit_app.py CHANGED
@@ -1,5 +1,5 @@
1
  # ==========================================================
2
- # streamlit_app.py — Stable Layout (English Only)
3
  # ==========================================================
4
  import os
5
  import re
@@ -29,50 +29,6 @@ os.environ.update({
29
  "HF_MODULES_CACHE": CACHE_DIR,
30
  })
31
 
32
- # ==========================================================
33
- # 🧠 SMART SUGGESTION GENERATOR
34
- # ==========================================================
35
- def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
36
- if not toc or not chunks:
37
- return ["How do I start using this guide?", "What does this document cover?"]
38
-
39
- titles = []
40
- for sec, raw_title in toc:
41
- title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
42
- title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
43
- if 4 < len(title) < 120:
44
- titles.append(title)
45
-
46
- context_sample = " ".join(chunks[:3])[:4000]
47
- prompt = f"""
48
- You are a content assistant. Based on the TOC and text, generate 5–7 short natural questions.
49
- Each question <18 words, ends with '?', sounds human. Document: "{doc_name}"
50
- TOC:
51
- {chr(10).join(['- ' + t for t in titles[:8]])}
52
- Sample:
53
- {context_sample}
54
- """
55
-
56
- try:
57
- ai_response = genai_generate(prompt)
58
- lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
59
- out = []
60
- for ln in lines:
61
- q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
62
- if not q.endswith("?") and re.match(r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q):
63
- q += "?"
64
- if 8 <= len(q) <= 140:
65
- out.append(q)
66
- uniq = []
67
- seen = set()
68
- for q in out:
69
- if q.lower() not in seen:
70
- seen.add(q.lower())
71
- uniq.append(q)
72
- return uniq[:7] or [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
73
- except Exception:
74
- return ["How do I start using this guide?", "What does this document cover?"]
75
-
76
  # ==========================================================
77
  # 🎨 STYLING
78
  # ==========================================================
@@ -80,17 +36,12 @@ st.markdown("""
80
  <style>
81
  div.block-container {padding-top:1.2rem;max-width:1080px;}
82
  h1,h2,h3{color:#f3f4f6;font-weight:600;}
83
- .suggest-chip{background:#0f1724;border:1px solid #374151;border-radius:14px;
84
- color:#e6eef8;padding:8px 12px;cursor:pointer;font-size:13px;margin:6px 6px 6px 0;
85
- display:inline-block;transition:background 0.2s,transform 0.1s;}
86
- .suggest-chip:hover{background:#1e3a8a;transform:translateY(-2px);}
87
  .answer-box{background:linear-gradient(180deg,#0b1220,#071027);
88
  border-left:4px solid #3b82f6;border-radius:8px;padding:16px 18px;color:#e6eef8;
89
  margin-top:12px;box-shadow:0 4px 14px rgba(0,0,0,0.35);}
90
  .stTextInput>div>div>input{background-color:#0f172a!important;color:#f1f5f9!important;
91
  border-radius:6px!important;border:1px solid #334155!important;padding:8px 10px!important;
92
  font-size:15px!important;}
93
- .small-link{font-size:13px;color:#60a5fa;cursor:pointer;}
94
  </style>
95
  """, unsafe_allow_html=True)
96
 
@@ -107,14 +58,13 @@ with st.sidebar:
107
  )
108
  st.markdown("---")
109
 
110
- # 📚 Registry display
111
  if "registry" in st.session_state:
112
  registry = st.session_state["registry"]
113
- registered = registry.list_docs() if hasattr(registry, "list_docs") else []
114
- if registered:
115
  with st.expander("πŸ“š Registered Documents", expanded=False):
116
- for i, d in enumerate(registered, 1):
117
- st.markdown(f"**{i}. {d.get('name','?')}** β€” {d.get('num_chunks','?')} chunks *(TOC: {d.get('toc_source','β€”')})*")
118
  else:
119
  st.caption("πŸ“­ No documents registered yet.")
120
  else:
@@ -123,7 +73,6 @@ with st.sidebar:
123
  st.markdown("---")
124
  show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
125
  if show_dev:
126
- st.markdown("### βš™οΈ Developer Options")
127
  chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
128
  overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
129
  top_k = st.slider("Top K Results", 1, 10, 5)
@@ -132,32 +81,12 @@ with st.sidebar:
132
  st.markdown("---")
133
  st.caption("✨ Built by Shubham Sharma")
134
 
135
- # ==========================================================
136
- # 🧠 SESSION STATE
137
- # ==========================================================
138
- for key, val in {
139
- "user_query_input": "",
140
- "show_more": False,
141
- "selected_suggestion": None,
142
- "query_suggestions_fixed": None,
143
- "last_doc": None,
144
- "doc_lang": "en",
145
- }.items():
146
- if key not in st.session_state:
147
- st.session_state[key] = val
148
-
149
- def set_user_query(q, idx):
150
- st.session_state["user_query_input"] = q
151
- st.session_state["selected_suggestion"] = idx
152
- st.experimental_rerun()
153
-
154
  # ==========================================================
155
  # 📄 MAIN SECTION
156
  # ==========================================================
157
  st.title("πŸ“„ Enterprise Knowledge Assistant")
158
  st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
159
 
160
- # ✅ FIXED: must be defined before document-handling logic
161
  doc_choice = st.radio(
162
  "Select a document:",
163
  ["-- Select --", "Sample PDF", "Upload Custom PDF"],
@@ -165,7 +94,7 @@ doc_choice = st.radio(
165
  )
166
 
167
  # ==========================================================
168
- # 📂 DOCUMENT HANDLING
169
  # ==========================================================
170
  def _hash_content(file_path):
171
  h = hashlib.sha256()
@@ -174,11 +103,6 @@ def _hash_content(file_path):
174
  h.update(chunk)
175
  return h.hexdigest()[:12]
176
 
177
- def refresh_suggestions(doc_name, toc, chunks):
178
- st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
179
- toc, chunks, doc_name
180
- )
181
- st.session_state.update({"user_query_input": "", "selected_suggestion": None, "show_more": False})
182
 
183
  if doc_choice == "-- Select --":
184
  st.info("⬅️ Select or upload a document to begin.")
@@ -198,96 +122,84 @@ else:
198
 
199
  if temp_path:
200
  doc_name = os.path.basename(temp_path)
201
- doc_hash = _hash_content(temp_path)
202
- doc_identifier = f"{doc_name}_{doc_hash}"
203
 
204
- # --- registry check
205
  if "registry" not in st.session_state:
206
  st.session_state["registry"] = DocumentRegistry()
207
  registry = st.session_state["registry"]
208
 
209
- existing = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
210
- if existing:
211
- doc_data = registry.get_doc(existing["name"])
 
212
  st.session_state.update({
213
  "text": doc_data.get("text", ""),
214
  "toc": doc_data.get("toc", []),
215
  "chunks": doc_data.get("chunks", []),
216
  "embeddings": doc_data.get("embeddings"),
217
  "index": doc_data.get("index"),
218
- "doc_ready": True,
219
- "active_doc": existing["name"],
220
- "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
221
  })
222
- refresh_suggestions(existing["name"], st.session_state["toc"], st.session_state["chunks"])
223
- st.experimental_rerun()
224
-
225
- status = st.empty()
226
- status.info("πŸ“€ Upload complete β€” reading document...")
227
-
228
- text, toc, toc_source = extract_text_from_pdf(temp_path)
229
- status.info("πŸ“‘ Parsing and chunking document...")
230
- chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
231
 
232
- status.info("🧠 Building embeddings and search index...")
233
- embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
234
- index = build_faiss_index(embeddings)
 
 
 
235
 
236
- doc_id = registry.register(temp_path, chunks, embeddings, index)
237
- st.session_state["active_doc"] = doc_id
238
- status.success("βœ… Document processed successfully β€” ready to query!")
239
- refresh_suggestions(doc_name, toc, chunks)
240
 
241
- st.session_state.update({
242
- "text": text,
243
- "toc": toc,
244
- "chunks": chunks,
245
- "embeddings": embeddings,
246
- "index": index,
247
- "doc_ready": True,
248
- "last_doc": doc_identifier,
249
- "status_text": "βœ… Document processed successfully β€” ready to query!"
250
- })
251
- st.experimental_rerun()
252
 
253
- if st.session_state.get("doc_ready"):
254
- st.info(st.session_state.get("status_text", "πŸ“„ Ready for queries."))
255
- st.markdown("### πŸ’¬ Ask the Assistant")
 
256
 
257
- suggs = st.session_state.get("query_suggestions_fixed", [])
258
- if suggs:
259
- visible = suggs if st.session_state["show_more"] else suggs[:3]
260
  cols = st.columns(min(3, len(visible)))
261
  for i, q in enumerate(visible):
262
  if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
263
- set_user_query(q, i)
 
264
 
265
  toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
266
  if st.button(toggle_text):
267
  st.session_state["show_more"] = not st.session_state["show_more"]
268
  st.experimental_rerun()
269
 
270
- user_query = st.text_input("Type your question or click one above:", key="user_query_input")
271
- if user_query.strip():
272
- reasoning_mode = mode == "Extended (Document + General)"
273
- with st.spinner("πŸ’­ Generating your answer..."):
274
- retrieved = retrieve_chunks(
275
- user_query,
276
- st.session_state["index"],
277
- st.session_state["chunks"],
278
- top_k=top_k,
279
- embeddings=st.session_state["embeddings"]
280
- )
281
- answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
282
- st.session_state["retrieved"] = retrieved
283
- st.markdown("### πŸ€– Assistant’s Answer")
284
- st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
285
-
286
- # ==========================================================
287
- # 🎨 Sidebar scroll style
288
- # ==========================================================
289
- st.markdown("""
290
- <style>
291
- section[data-testid="stSidebar"] div.stExpander {max-height:480px;overflow-y:auto;}
292
- </style>
293
- """, unsafe_allow_html=True)
 
1
  # ==========================================================
2
+ # streamlit_app.py — Commit 2 (Stable)
3
  # ==========================================================
4
  import os
5
  import re
 
29
  "HF_MODULES_CACHE": CACHE_DIR,
30
  })
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # ==========================================================
33
  # 🎨 STYLING
34
  # ==========================================================
 
36
  <style>
37
  div.block-container {padding-top:1.2rem;max-width:1080px;}
38
  h1,h2,h3{color:#f3f4f6;font-weight:600;}
 
 
 
 
39
  .answer-box{background:linear-gradient(180deg,#0b1220,#071027);
40
  border-left:4px solid #3b82f6;border-radius:8px;padding:16px 18px;color:#e6eef8;
41
  margin-top:12px;box-shadow:0 4px 14px rgba(0,0,0,0.35);}
42
  .stTextInput>div>div>input{background-color:#0f172a!important;color:#f1f5f9!important;
43
  border-radius:6px!important;border:1px solid #334155!important;padding:8px 10px!important;
44
  font-size:15px!important;}
 
45
  </style>
46
  """, unsafe_allow_html=True)
47
 
 
58
  )
59
  st.markdown("---")
60
 
 
61
  if "registry" in st.session_state:
62
  registry = st.session_state["registry"]
63
+ registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
64
+ if registered_docs:
65
  with st.expander("πŸ“š Registered Documents", expanded=False):
66
+ for i, doc in enumerate(registered_docs, start=1):
67
+ st.markdown(f"**{i}. {doc.get('name','?')}** β€” {doc.get('num_chunks','?')} chunks *(TOC: {doc.get('toc_source','β€”')})*")
68
  else:
69
  st.caption("πŸ“­ No documents registered yet.")
70
  else:
 
73
  st.markdown("---")
74
  show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
75
  if show_dev:
 
76
  chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
77
  overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
78
  top_k = st.slider("Top K Results", 1, 10, 5)
 
81
  st.markdown("---")
82
  st.caption("✨ Built by Shubham Sharma")
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # ==========================================================
85
  # 📄 MAIN SECTION
86
  # ==========================================================
87
  st.title("πŸ“„ Enterprise Knowledge Assistant")
88
  st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
89
 
 
90
  doc_choice = st.radio(
91
  "Select a document:",
92
  ["-- Select --", "Sample PDF", "Upload Custom PDF"],
 
94
  )
95
 
96
  # ==========================================================
97
+ # 📂 DOCUMENT HANDLING (Commit 2)
98
  # ==========================================================
99
  def _hash_content(file_path):
100
  h = hashlib.sha256()
 
103
  h.update(chunk)
104
  return h.hexdigest()[:12]
105
 
 
 
 
 
 
106
 
107
  if doc_choice == "-- Select --":
108
  st.info("⬅️ Select or upload a document to begin.")
 
122
 
123
  if temp_path:
124
  doc_name = os.path.basename(temp_path)
125
+ file_hash = _hash_content(temp_path)
126
+ doc_identifier = f"{doc_name}_{file_hash}"
127
 
128
+ # ✅ Registry initialization
129
  if "registry" not in st.session_state:
130
  st.session_state["registry"] = DocumentRegistry()
131
  registry = st.session_state["registry"]
132
 
133
+ # ✅ Reuse if already processed
134
+ if doc_name in [d["name"] for d in registry.list_docs()]:
135
+ st.session_state["status_text"] = f"βœ… {doc_name} already processed β€” loaded from registry."
136
+ doc_data = registry.get_doc(doc_name)
137
  st.session_state.update({
138
  "text": doc_data.get("text", ""),
139
  "toc": doc_data.get("toc", []),
140
  "chunks": doc_data.get("chunks", []),
141
  "embeddings": doc_data.get("embeddings"),
142
  "index": doc_data.get("index"),
143
+ "doc_ready": True
 
 
144
  })
145
+ else:
146
+ status = st.empty()
147
+ status.info("πŸ“€ Upload complete β€” reading document...")
 
 
 
 
 
 
148
 
149
+ text, toc, toc_source = extract_text_from_pdf(temp_path)
150
+ status.info("πŸ“‘ Parsing and chunking document...")
151
+ chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
152
+ status.info("🧠 Building embeddings and search index...")
153
+ embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
154
+ index = build_faiss_index(embeddings)
155
 
156
+ doc_id = registry.register(temp_path, chunks, embeddings, index)
157
+ st.session_state["active_doc"] = doc_id
158
+ status.success("βœ… Document processed successfully β€” ready to query!")
 
159
 
160
+ st.session_state.update({
161
+ "text": text,
162
+ "toc": toc,
163
+ "chunks": chunks,
164
+ "embeddings": embeddings,
165
+ "index": index,
166
+ "doc_ready": True,
167
+ "last_doc": doc_identifier,
168
+ "status_text": "βœ… Document processed successfully β€” ready to query!"
169
+ })
 
170
 
171
+ # --- Ask section ---
172
+ if st.session_state.get("doc_ready"):
173
+ st.info(st.session_state.get("status_text", "πŸ“„ Ready for queries."))
174
+ st.markdown("### πŸ’¬ Ask the Assistant")
175
 
176
+ query_suggestions = ["How do I start using this guide?", "What are the prerequisites?", "What is covered in this document?"]
177
+ visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
 
178
  cols = st.columns(min(3, len(visible)))
179
  for i, q in enumerate(visible):
180
  if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
181
+ st.session_state["user_query_input"] = q
182
+ st.experimental_rerun()
183
 
184
  toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
185
  if st.button(toggle_text):
186
  st.session_state["show_more"] = not st.session_state["show_more"]
187
  st.experimental_rerun()
188
 
189
+ user_query = st.text_input("Type your question or click one above:", key="user_query_input")
190
+
191
+ if user_query.strip():
192
+ reasoning_mode = mode == "Extended (Document + General)"
193
+ with st.spinner("πŸ’­ Generating your answer..."):
194
+ retrieved = retrieve_chunks(
195
+ user_query,
196
+ st.session_state["index"],
197
+ st.session_state["chunks"],
198
+ top_k=top_k,
199
+ embeddings=st.session_state["embeddings"]
200
+ )
201
+ answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
202
+ st.session_state["retrieved"] = retrieved
203
+
204
+ st.markdown("### πŸ€– Assistant’s Answer")
205
+ st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)