Shubham170793 committed on
Commit
bdf26f2
·
verified ·
1 Parent(s): f5088d3

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +59 -59
src/streamlit_app.py CHANGED
@@ -27,10 +27,7 @@ st.set_page_config(
27
  # 🧹 Cache Management (prevent HF overflow)
28
  # ==========================================================
29
  def clean_cache(max_size_gb: float = 2.0):
30
- """
31
- Cleans large cache folders (> max_size_gb),
32
- preserving /tmp/hf_cache (used for model weights).
33
- """
34
  folders = [
35
  "/root/.cache/huggingface",
36
  "/root/.cache/transformers",
@@ -45,14 +42,10 @@ def clean_cache(max_size_gb: float = 2.0):
45
  for dp, _, files in os.walk(folder)
46
  for f in files
47
  ) / (1024**3)
48
-
49
  if size_gb > max_size_gb or "torch" in folder:
50
  shutil.rmtree(folder, ignore_errors=True)
51
  total_deleted += size_gb
52
  print(f"🗑️ Deleted {folder} ({size_gb:.2f} GB)")
53
- else:
54
- print(f"✅ Preserved {folder} ({size_gb:.2f} GB)")
55
-
56
  os.makedirs("/tmp/hf_cache", exist_ok=True)
57
  print(f"🧹 Cache cleanup done. ~{total_deleted:.2f} GB removed.")
58
 
@@ -91,13 +84,13 @@ from vectorstore import build_faiss_index
91
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
92
 
93
  # ==========================================================
94
- # 🧠 TOC-Based Smart Question Generator
95
  # ==========================================================
96
  def clean_toc_titles(toc):
97
  """Removes section numbers and keeps only meaningful text."""
98
  clean_titles = []
99
  for _, title in toc:
100
- title = re.sub(r"^\d+(\.\d+)*\s*", "", title) # remove numbering like 3.1
101
  title = title.strip()
102
  if len(title) > 3:
103
  clean_titles.append(title)
@@ -135,7 +128,41 @@ def generate_query_suggestions(toc_titles):
135
  if s not in seen:
136
  seen.add(s)
137
  final.append(s)
138
- return final[:6] # Show top 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
 
141
  # ==========================================================
@@ -152,14 +179,12 @@ st.title("📄 Enterprise Knowledge Assistant")
152
  st.caption("Query SAP documentation and enterprise PDFs using natural language and reasoning.")
153
 
154
  # ==========================================================
155
- # 🧭 Sidebar — Library, Settings, Diagnostics
156
  # ==========================================================
157
  with st.sidebar:
158
- # 🖼️ App Logo
159
  if os.path.exists(LOGO_PATH):
160
  st.image(LOGO_PATH, width=150)
161
 
162
- # 🧠 Reasoning Mode Toggle
163
  if "reasoning_mode" not in st.session_state:
164
  st.session_state.reasoning_mode = False
165
 
@@ -170,8 +195,6 @@ with st.sidebar:
170
  )
171
 
172
  st.markdown("---")
173
-
174
- # 📚 Document Library
175
  st.header("📚 Document Library")
176
  doc_choice = st.radio(
177
  "Choose a document:",
@@ -180,13 +203,10 @@ with st.sidebar:
180
  )
181
 
182
  st.markdown("---")
183
-
184
- # ⚙️ Settings
185
  st.header("⚙️ Settings")
186
  chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
187
  overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
188
  top_k = st.slider("Top K Results", 1, 10, 5)
189
-
190
  st.markdown("---")
191
  st.caption("👨‍💻 Built by Shubham Sharma")
192
 
@@ -212,26 +232,21 @@ elif doc_choice == "Sample PDF":
212
  toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
213
  st.text_area("TOC Preview", toc_text, height=200)
214
 
215
- # 💡 Generate and display smart suggestions
216
  clean_titles = clean_toc_titles(toc)
217
  query_suggestions = generate_query_suggestions(clean_titles)
218
- if query_suggestions:
219
- st.markdown("#### 💡 Suggested Questions")
220
- cols = st.columns(2)
221
- for i, q in enumerate(query_suggestions):
222
- if cols[i % 2].button(f"🔍 {q}"):
223
- st.session_state["user_query"] = q
 
 
 
 
224
 
225
- # ✅ Cached Embeddings
226
  with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
227
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
228
- hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
229
- cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
230
- if os.path.exists(cache_file):
231
- st.info(f"🧠 Using cached embeddings for {os.path.basename(temp_path)}")
232
- else:
233
- st.warning(f"💡 Generated new embeddings for {os.path.basename(temp_path)}")
234
-
235
  index = build_faiss_index(embeddings)
236
 
237
  elif doc_choice == "Upload Custom PDF":
@@ -252,37 +267,24 @@ elif doc_choice == "Upload Custom PDF":
252
  toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
253
  st.text_area("TOC Preview", toc_text, height=200)
254
 
255
- # 💡 Generate and display smart suggestions
256
  clean_titles = clean_toc_titles(toc)
257
  query_suggestions = generate_query_suggestions(clean_titles)
258
- if query_suggestions:
259
- st.markdown("#### 💡 Suggested Questions")
260
- cols = st.columns(2)
261
- for i, q in enumerate(query_suggestions):
262
- if cols[i % 2].button(f"🔍 {q}"):
263
- st.session_state["user_query"] = q
 
 
 
 
264
 
265
  with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
266
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
267
- hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
268
- cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
269
- if os.path.exists(cache_file):
270
- st.info(f"🧠 Using cached embeddings for {os.path.basename(temp_path)}")
271
- else:
272
- st.warning(f"💡 Generated new embeddings for {os.path.basename(temp_path)}")
273
-
274
  index = build_faiss_index(embeddings)
275
  st.success("🚀 Document processed successfully!")
276
 
277
- # ==========================================================
278
- # 📑 Document Preview
279
- # ==========================================================
280
- if chunks:
281
- st.subheader("📑 Document Preview")
282
- st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
283
- avg_len = int(sum(len(c) for c in chunks) / len(chunks))
284
- st.caption(f"📦 {len(chunks)} chunks | Avg length: {avg_len} chars")
285
-
286
  # ==========================================================
287
  # 💬 Query Section
288
  # ==========================================================
@@ -307,14 +309,12 @@ if index and chunks:
307
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
308
  answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
309
 
310
- # ✅ Display Answer
311
  st.markdown("### ✅ Assistant’s Answer")
312
  st.markdown(
313
  f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;color:white;'>{answer}</div>",
314
  unsafe_allow_html=True
315
  )
316
 
317
- # 📄 Supporting Chunks
318
  with st.expander("📄 Supporting Chunks (Context Used)"):
319
  for i, r in enumerate(retrieved, start=1):
320
  st.markdown(
 
27
  # 🧹 Cache Management (prevent HF overflow)
28
  # ==========================================================
29
  def clean_cache(max_size_gb: float = 2.0):
30
+ """Cleans large cache folders (> max_size_gb)."""
 
 
 
31
  folders = [
32
  "/root/.cache/huggingface",
33
  "/root/.cache/transformers",
 
42
  for dp, _, files in os.walk(folder)
43
  for f in files
44
  ) / (1024**3)
 
45
  if size_gb > max_size_gb or "torch" in folder:
46
  shutil.rmtree(folder, ignore_errors=True)
47
  total_deleted += size_gb
48
  print(f"🗑️ Deleted {folder} ({size_gb:.2f} GB)")
 
 
 
49
  os.makedirs("/tmp/hf_cache", exist_ok=True)
50
  print(f"🧹 Cache cleanup done. ~{total_deleted:.2f} GB removed.")
51
 
 
84
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
85
 
86
  # ==========================================================
87
+ # 🧠 TOC-Based Smart Question Generator + AI Fallback
88
  # ==========================================================
89
  def clean_toc_titles(toc):
90
  """Removes section numbers and keeps only meaningful text."""
91
  clean_titles = []
92
  for _, title in toc:
93
+ title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
94
  title = title.strip()
95
  if len(title) > 3:
96
  clean_titles.append(title)
 
128
  if s not in seen:
129
  seen.add(s)
130
  final.append(s)
131
+ return final[:6]
132
+
133
+
134
def generate_ai_fallback_suggestions(chunks, max_suggestions=6, head_chunks=3):
    """Suggest starter questions from document content when no TOC is detected.

    The first ``head_chunks`` chunks are joined, lower-cased and scanned for
    topic keywords. Matching is a plain substring test, so "step" also matches
    "steps" and "error" also matches "errors". Every keyword group that matches
    contributes one canned question, in a fixed order. If no group matches, a
    generic set of questions is returned instead.

    Args:
        chunks: Sequence of text chunks extracted from the document. An empty
            or ``None`` value yields no suggestions.
        max_suggestions: Upper bound on the number of questions returned
            (defaults to 6, the previously hard-coded limit).
        head_chunks: Number of leading chunks to inspect (defaults to 3, the
            previously hard-coded value).

    Returns:
        A list of at most ``max_suggestions`` question strings.
    """
    if not chunks:
        return []

    # (keywords, question) pairs; order here is the order of the output.
    keyword_questions = (
        (("overview", "introduction"),
         "Can you summarize the overview of this document?"),
        (("setup", "configuration"),
         "How do I configure or set this up?"),
        (("prerequisite",),
         "What are the prerequisites before using this process?"),
        (("troubleshoot", "error"),
         "How do I troubleshoot common errors?"),
        (("step", "procedure"),
         "Can you list the steps involved in this process?"),
        (("benefit", "objective"),
         "What is the objective or benefit of this guide?"),
    )
    generic_questions = (
        "Can you summarize the main topic of this document?",
        "What process does this guide explain?",
        "How can I get started with the described setup?",
        "What are the important details to remember?",
    )

    # Only the leading chunks are scanned (presumably the intro/overview part
    # of the document). Non-string entries are skipped so that a stray None
    # chunk cannot make str.join raise TypeError.
    head_text = " ".join(
        chunk for chunk in chunks[:head_chunks] if isinstance(chunk, str)
    ).lower()

    suggestions = [
        question
        for keywords, question in keyword_questions
        if any(keyword in head_text for keyword in keywords)
    ]

    # Fall back to generic questions when no keyword was found.
    if not suggestions:
        suggestions = list(generic_questions)

    return suggestions[:max(max_suggestions, 0)]
166
 
167
 
168
  # ==========================================================
 
179
  st.caption("Query SAP documentation and enterprise PDFs using natural language and reasoning.")
180
 
181
  # ==========================================================
182
+ # 🧭 Sidebar
183
  # ==========================================================
184
  with st.sidebar:
 
185
  if os.path.exists(LOGO_PATH):
186
  st.image(LOGO_PATH, width=150)
187
 
 
188
  if "reasoning_mode" not in st.session_state:
189
  st.session_state.reasoning_mode = False
190
 
 
195
  )
196
 
197
  st.markdown("---")
 
 
198
  st.header("📚 Document Library")
199
  doc_choice = st.radio(
200
  "Choose a document:",
 
203
  )
204
 
205
  st.markdown("---")
 
 
206
  st.header("⚙️ Settings")
207
  chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
208
  overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
209
  top_k = st.slider("Top K Results", 1, 10, 5)
 
210
  st.markdown("---")
211
  st.caption("👨‍💻 Built by Shubham Sharma")
212
 
 
232
  toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
233
  st.text_area("TOC Preview", toc_text, height=200)
234
 
 
235
  clean_titles = clean_toc_titles(toc)
236
  query_suggestions = generate_query_suggestions(clean_titles)
237
+ else:
238
+ st.warning("⚠️ No TOC detected — generating smart suggestions using content...")
239
+ query_suggestions = generate_ai_fallback_suggestions(chunks)
240
+
241
+ if query_suggestions:
242
+ st.markdown("#### 💡 Suggested Questions")
243
+ cols = st.columns(2)
244
+ for i, q in enumerate(query_suggestions):
245
+ if cols[i % 2].button(f"🔍 {q}"):
246
+ st.session_state["user_query"] = q
247
 
 
248
  with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
249
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
 
 
 
 
 
 
 
250
  index = build_faiss_index(embeddings)
251
 
252
  elif doc_choice == "Upload Custom PDF":
 
267
  toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
268
  st.text_area("TOC Preview", toc_text, height=200)
269
 
 
270
  clean_titles = clean_toc_titles(toc)
271
  query_suggestions = generate_query_suggestions(clean_titles)
272
+ else:
273
+ st.warning("⚠️ No TOC detected — generating smart suggestions using content...")
274
+ query_suggestions = generate_ai_fallback_suggestions(chunks)
275
+
276
+ if query_suggestions:
277
+ st.markdown("#### 💡 Suggested Questions")
278
+ cols = st.columns(2)
279
+ for i, q in enumerate(query_suggestions):
280
+ if cols[i % 2].button(f"🔍 {q}"):
281
+ st.session_state["user_query"] = q
282
 
283
  with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
284
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
 
 
 
 
 
 
 
285
  index = build_faiss_index(embeddings)
286
  st.success("🚀 Document processed successfully!")
287
 
 
 
 
 
 
 
 
 
 
288
  # ==========================================================
289
  # 💬 Query Section
290
  # ==========================================================
 
309
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
310
  answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
311
 
 
312
  st.markdown("### ✅ Assistant’s Answer")
313
  st.markdown(
314
  f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;color:white;'>{answer}</div>",
315
  unsafe_allow_html=True
316
  )
317
 
 
318
  with st.expander("📄 Supporting Chunks (Context Used)"):
319
  for i, r in enumerate(retrieved, start=1):
320
  st.markdown(