Shubham170793 committed on
Commit 59078ee · verified · 1 Parent(s): bdf26f2

Update src/streamlit_app.py

Files changed (1): src/streamlit_app.py (+61 -98)
src/streamlit_app.py CHANGED
@@ -24,17 +24,15 @@ st.set_page_config(
 )
 
 # ==========================================================
-# 🧹 Cache Management (prevent HF overflow)
+# 🧹 Cache Management
 # ==========================================================
 def clean_cache(max_size_gb: float = 2.0):
-    """Cleans large cache folders (> max_size_gb)."""
     folders = [
         "/root/.cache/huggingface",
         "/root/.cache/transformers",
         "/root/.cache/torch",
     ]
     total_deleted = 0.0
-
     for folder in folders:
         if os.path.exists(folder):
             size_gb = sum(
@@ -45,13 +43,10 @@ def clean_cache(max_size_gb: float = 2.0):
             if size_gb > max_size_gb or "torch" in folder:
                 shutil.rmtree(folder, ignore_errors=True)
                 total_deleted += size_gb
-                print(f"🗑️ Deleted {folder} ({size_gb:.2f} GB)")
     os.makedirs("/tmp/hf_cache", exist_ok=True)
     print(f"🧹 Cache cleanup done. ~{total_deleted:.2f} GB removed.")
 
-
 def check_disk_usage():
-    """Display disk usage info in sidebar."""
     st.sidebar.markdown("### 💾 Disk Usage (Debug)")
     try:
         usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
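Note: the `size_gb = sum(` expression is cut off at the hunk boundary above, so the exact size computation is not visible in this diff. A minimal sketch of a folder-size helper of that shape, with all names illustrative rather than taken from the commit:

import os

def folder_size_gb(folder: str) -> float:
    # Walk the directory tree and total file sizes, reported in GB (illustrative only).
    total_bytes = 0
    for root, _dirs, files in os.walk(folder):
        for name in files:
            path = os.path.join(root, name)
            if os.path.exists(path):  # skip broken symlinks / files removed mid-walk
                total_bytes += os.path.getsize(path)
    return total_bytes / (1024 ** 3)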
@@ -59,13 +54,11 @@ def check_disk_usage():
     except Exception as e:
         st.sidebar.text(f"⚠️ Disk usage check failed: {e}")
 
-
-# Run cache cleanup once at startup
 clean_cache()
 check_disk_usage()
 
 # ==========================================================
-# ⚙️ Hugging Face Cache Configuration
+# ⚙️ HF Cache Configuration
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
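The dict passed to `os.environ.update({` (visible only in the next hunk's header) is elided from this diff. One plausible shape, assuming the standard Hugging Face cache variables are being pointed at `/tmp/hf_cache` (the commit's actual keys are not shown):

import os

CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# Assumed keys; the real dict in streamlit_app.py sits outside the hunk window.
os.environ.update({
    "HF_HOME": CACHE_DIR,
    "TRANSFORMERS_CACHE": CACHE_DIR,
    "SENTENCE_TRANSFORMERS_HOME": CACHE_DIR,
})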
@@ -81,13 +74,12 @@ os.environ.update({
 # ==========================================================
 from ingestion import extract_text_from_pdf, chunk_text
 from vectorstore import build_faiss_index
-from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
+from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate  # add genai_generate!
 
 # ==========================================================
-# 🧠 TOC-Based Smart Question Generator + AI Fallback
+# 🧠 TOC & Dynamic AI Suggestion System
 # ==========================================================
 def clean_toc_titles(toc):
-    """Removes section numbers and keeps only meaningful text."""
     clean_titles = []
     for _, title in toc:
         title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
@@ -98,14 +90,12 @@ def clean_toc_titles(toc):
 
 
 def generate_query_suggestions(toc_titles):
-    """Converts section titles into conversational question suggestions."""
     suggestions = []
     for t in toc_titles:
         lower = t.lower()
-
         if "prerequisite" in lower:
             suggestions.append("What are the prerequisites for setting this up?")
-        elif "restriction" in lower or "limitation" in lower:
+        elif "restriction" in lower:
             suggestions.append("What are the key restrictions or limitations?")
         elif "configuration" in lower or "setup" in lower:
             suggestions.append(f"How do I {t.lower()}?")
@@ -117,12 +107,8 @@ def generate_query_suggestions(toc_titles):
             suggestions.append("Can you show an example from this document?")
         elif "process" in lower:
             suggestions.append(f"Can you explain the {t.lower()} process?")
-        elif "use" in lower:
-            suggestions.append(f"How do I {t.lower()}?")
         else:
             suggestions.append(f"Explain the section about {t.lower()}.")
-
-    # Deduplicate & limit
     seen, final = set(), []
     for s in suggestions:
         if s not in seen:
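For orientation, this is how the two TOC helpers above compose on a made-up TOC, assuming the elided tail of `clean_toc_titles` simply filters and returns the cleaned list:

toc = [("1", "1 Prerequisites"), ("2", "2.1 Configuration Steps"), ("3", "3 Restrictions")]
titles = clean_toc_titles(toc)            # ["Prerequisites", "Configuration Steps", "Restrictions"]
print(generate_query_suggestions(titles))
# ['What are the prerequisites for setting this up?',
#  'How do I configuration steps?',
#  'What are the key restrictions or limitations?']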
@@ -131,38 +117,48 @@ def generate_query_suggestions(toc_titles):
     return final[:6]
 
 
-def generate_ai_fallback_suggestions(chunks):
-    """When no TOC is detected, use document content to guess interactive suggestions."""
+def generate_ai_dynamic_suggestions(chunks, doc_name="Document"):
+    """
+    🤖 Uses GPT-4o via SAP GenAI Hub to analyze first few chunks
+    and generate dynamic, context-aware question suggestions.
+    """
     if not chunks:
         return []
 
-    # Take the first few chunks (intro + overview usually)
-    head_text = " ".join(chunks[:3]).lower()
+    # Take top 3 chunks as context
+    sample_text = " ".join(chunks[:3])[:3000]
+    prompt = f"""
+You are an intelligent assistant helping users explore enterprise documentation titled '{doc_name}'.
 
-    suggestions = []
-    if "overview" in head_text or "introduction" in head_text:
-        suggestions.append("Can you summarize the overview of this document?")
-    if "setup" in head_text or "configuration" in head_text:
-        suggestions.append("How do I configure or set this up?")
-    if "prerequisite" in head_text:
-        suggestions.append("What are the prerequisites before using this process?")
-    if "troubleshoot" in head_text or "error" in head_text:
-        suggestions.append("How do I troubleshoot common errors?")
-    if "step" in head_text or "procedure" in head_text:
-        suggestions.append("Can you list the steps involved in this process?")
-    if "benefit" in head_text or "objective" in head_text:
-        suggestions.append("What is the objective or benefit of this guide?")
-
-    # Fallback generic questions if no keywords found
-    if not suggestions:
-        suggestions = [
-            "Can you summarize the main topic of this document?",
-            "What process does this guide explain?",
-            "How can I get started with the described setup?",
-            "What are the important details to remember?",
-        ]
+Based on the content below, generate 5 short, interactive, human-like questions
+that a curious user might ask to understand this document better.
+Avoid section numbers, and sound conversational.
 
-    return suggestions[:6]
+---
+Content Sample:
+{sample_text}
+---
+Questions:
+"""
+
+    try:
+        ai_response = genai_generate(prompt)  # Uses your existing GPT-4o connector
+        questions = re.findall(r"[-•]?\s*(.+)", ai_response)
+        clean_q = [q.strip("•-— ").strip() for q in questions if 8 < len(q) < 120]
+        clean_q = [q for q in clean_q if q.endswith("?")]
+        return clean_q[:6] if clean_q else [
+            "What is this document about?",
+            "How do I start using the process described here?",
+            "What key setup steps are involved?",
+            "What benefits or objectives are explained?",
+        ]
+    except Exception as e:
+        print(f"⚠️ AI suggestion generation failed: {e}")
+        return [
+            "Can you summarize the document?",
+            "What is the main idea here?",
+            "How does this guide help me?",
+        ]
 
 
 # ==========================================================
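`genai_generate` is imported from qa.py and is not part of this commit; the docstring above only says it goes through GPT-4o on SAP GenAI Hub. As a stand-in with the same shape (prompt string in, completion text out), a sketch against the plain openai client (an assumption; the real connector presumably targets the GenAI Hub proxy instead):

from openai import OpenAI

client = OpenAI()  # assumption: qa.py's real connector goes through SAP GenAI Hub

def genai_generate(prompt: str) -> str:
    # Single-turn chat completion; returns the model's text.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.4,
    )
    return response.choices[0].message.content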
@@ -191,21 +187,17 @@ with st.sidebar:
     st.session_state.reasoning_mode = st.toggle(
         "🧠 Enable Reasoning Mode",
         value=st.session_state.reasoning_mode,
-        help="When ON: GPT-4o uses reasoning + web-like synthesis.\nWhen OFF: Strictly factual from PDF."
+        help="When ON: GPT-4o uses reasoning + synthesis.\nWhen OFF: strictly factual."
     )
 
     st.markdown("---")
     st.header("📚 Document Library")
-    doc_choice = st.radio(
-        "Choose a document:",
-        ["-- Select --", "Sample PDF", "Upload Custom PDF"],
-        index=0
-    )
+    doc_choice = st.radio("Choose a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
 
     st.markdown("---")
     st.header("⚙️ Settings")
-    chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
-    overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
+    chunk_size = st.slider("Chunk Size", 200, 1500, 800, step=50)
+    overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
     top_k = st.slider("Top K Results", 1, 10, 5)
     st.markdown("---")
     st.caption("👨‍💻 Built by Shubham Sharma")
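One thing to watch in this hunk: `overlap` is still read from the sidebar, but every call visible below is `chunk_text(text, chunk_size=chunk_size)` with no overlap argument. If `chunk_text` is meant to honor both sliders, a character-window chunker would look roughly like this (signature assumed; the real one lives in ingestion.py):

def chunk_text(text: str, chunk_size: int = 800, overlap: int = 120) -> list[str]:
    # Fixed-size character windows that overlap by `overlap` characters.
    if chunk_size <= overlap:
        raise ValueError("chunk_size must exceed overlap")
    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        piece = text[start:start + chunk_size]
        if piece.strip():
            chunks.append(piece)
    return chunks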
@@ -218,49 +210,21 @@ text, chunks, index, embeddings, toc = None, None, None, None, None
 if doc_choice == "-- Select --":
     st.info("⬅️ Please choose a document from the sidebar.")
 
-elif doc_choice == "Sample PDF":
-    temp_path = SAMPLE_PATH
-    st.success("📘 Using built-in Sample PDF")
-
-    with st.spinner("🔍 Extracting and processing document..."):
-        text, toc = extract_text_from_pdf(temp_path)
-        chunks = chunk_text(text, chunk_size=chunk_size)
-        st.write(f"📑 Extracted {len(chunks)} chunks.")
-
-    if toc:
-        st.markdown("### 🧭 Detected Table of Contents")
-        toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
-        st.text_area("TOC Preview", toc_text, height=200)
-
-        clean_titles = clean_toc_titles(toc)
-        query_suggestions = generate_query_suggestions(clean_titles)
-    else:
-        st.warning("⚠️ No TOC detected — generating smart suggestions using content...")
-        query_suggestions = generate_ai_fallback_suggestions(chunks)
-
-    if query_suggestions:
-        st.markdown("#### 💡 Suggested Questions")
-        cols = st.columns(2)
-        for i, q in enumerate(query_suggestions):
-            if cols[i % 2].button(f"🔍 {q}"):
-                st.session_state["user_query"] = q
-
-    with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
-        embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
-        index = build_faiss_index(embeddings)
-
-elif doc_choice == "Upload Custom PDF":
-    uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
-    if uploaded_file:
-        temp_path = os.path.join("/tmp", uploaded_file.name)
-        with open(temp_path, "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        st.success(f"✅ File '{uploaded_file.name}' uploaded successfully")
-
-    with st.spinner("⚙️ Extracting and processing your document..."):
+elif doc_choice in ["Sample PDF", "Upload Custom PDF"]:
+    temp_path = SAMPLE_PATH if doc_choice == "Sample PDF" else None
+    if doc_choice == "Upload Custom PDF":
+        uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
+        if uploaded_file:
+            temp_path = os.path.join("/tmp", uploaded_file.name)
+            with open(temp_path, "wb") as f:
+                f.write(uploaded_file.getbuffer())
+            st.success(f"✅ File '{uploaded_file.name}' uploaded successfully")
+
+    if temp_path:
+        with st.spinner("🔍 Extracting and processing document..."):
             text, toc = extract_text_from_pdf(temp_path)
             chunks = chunk_text(text, chunk_size=chunk_size)
-            st.write(f"📄 Extracted {len(chunks)} chunks.")
+            st.write(f"📑 Extracted {len(chunks)} chunks.")
 
         if toc:
             st.markdown("### 🧭 Detected Table of Contents")
@@ -270,8 +234,8 @@ elif doc_choice == "Upload Custom PDF":
             clean_titles = clean_toc_titles(toc)
             query_suggestions = generate_query_suggestions(clean_titles)
         else:
-            st.warning("⚠️ No TOC detected — generating smart suggestions using content...")
-            query_suggestions = generate_ai_fallback_suggestions(chunks)
+            st.warning("⚠️ No TOC detected — generating dynamic suggestions using AI...")
+            query_suggestions = generate_ai_dynamic_suggestions(chunks, doc_name=os.path.basename(temp_path))
 
         if query_suggestions:
             st.markdown("#### 💡 Suggested Questions")
@@ -325,6 +289,5 @@ if index and chunks:
         """,
         unsafe_allow_html=True,
     )
-
 else:
     st.info("📥 Upload or select a document to start exploring.")
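The retrieval plumbing this flow leans on (`embed_chunks`, `cache_embeddings`, `build_faiss_index`) is defined in qa.py and vectorstore.py, outside this diff. A minimal sketch consistent with the call sites above, assuming sentence-transformers embeddings and a flat L2 FAISS index (model name and cache path are illustrative):

import os
import pickle

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed embedding model

def embed_chunks(chunks: list[str]) -> np.ndarray:
    # Encode chunks to float32 vectors, as FAISS expects.
    return _model.encode(chunks, convert_to_numpy=True).astype("float32")

def cache_embeddings(doc_name: str, chunks: list[str], embed_fn) -> np.ndarray:
    # Reuse embeddings from /tmp when the same document is processed again.
    cache_path = os.path.join("/tmp", f"{doc_name}.emb.pkl")
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    embeddings = embed_fn(chunks)
    with open(cache_path, "wb") as f:
        pickle.dump(embeddings, f)
    return embeddings

def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatL2:
    # Exact L2 index over the chunk vectors.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index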
 