Shubham170793 committed on
Commit
cabbcce
·
verified ·
1 Parent(s): 34f4cf4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +194 -219
src/streamlit_app.py CHANGED
@@ -2,271 +2,246 @@
2
  import os
3
  import re
4
  import streamlit as st
5
- import torch
 
 
6
 
7
- # ==========================================================
8
- # ✅ PAGE CONFIG
9
- # ==========================================================
10
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
11
- print("CUDA available:", torch.cuda.is_available())
12
 
13
- # ==========================================================
14
- # ⚙️ CACHE DIR
15
- # ==========================================================
16
- CACHE_DIR = "/tmp/hf_cache"
17
- os.makedirs(CACHE_DIR, exist_ok=True)
18
- os.environ.update({
19
- "HF_HOME": CACHE_DIR,
20
- "TRANSFORMERS_CACHE": CACHE_DIR,
21
- "HF_DATASETS_CACHE": CACHE_DIR,
22
- "HF_MODULES_CACHE": CACHE_DIR,
23
- })
24
-
25
- # ==========================================================
26
- # 📦 IMPORTS
 
 
 
 
 
 
 
 
 
 
27
  # ==========================================================
28
  from ingestion import extract_text_from_pdf, chunk_text
29
  from vectorstore import build_faiss_index
30
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
31
 
32
  # ==========================================================
33
- # 🧠 SMART SUGGESTION GENERATOR
34
- # ==========================================================
35
- def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
36
- if not toc or not chunks:
37
- return []
38
- titles = []
39
- for sec, raw_title in toc:
40
- title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
41
- title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
42
- if 4 < len(title) < 120:
43
- titles.append(title)
44
- context_sample = " ".join(chunks[:3])[:4000]
45
- prompt = f"""
46
- You are generating concise, context-aware questions based on the document "{doc_name}".
47
- Use this Table of Contents and sample content for inspiration.
48
-
49
- TABLE OF CONTENTS:
50
- {chr(10).join(['- ' + t for t in titles[:8]])}
51
-
52
- TEXT SAMPLE:
53
- {context_sample}
54
-
55
- Generate 5–7 short, relevant, strictly document-based questions.
56
- Each question should be under 18 words.
57
- """
58
- try:
59
- ai_response = genai_generate(prompt)
60
- questions = re.findall(r"[-β€’]?\s*(.+?)\?", ai_response)
61
- clean_qs = [q.strip("β€’-β€” ").strip() + "?" for q in questions if 8 < len(q) < 120]
62
- seen, final = set(), []
63
- for q in clean_qs:
64
- if q.lower() not in seen:
65
- seen.add(q.lower())
66
- final.append(q)
67
- return final[:7]
68
- except Exception:
69
- return ["What is this document about?", "How do I begin using this guide?"]
70
-
71
- # ==========================================================
72
- # 🎨 STYLING — MINIMAL, ENTERPRISE UI
73
  # ==========================================================
74
- st.markdown("""
75
- <style>
76
- div.block-container {padding-top: 1.2rem; max-width: 1080px;}
77
- h1, h2, h3 {color: #f3f4f6; font-weight: 600;}
78
- .suggest-chip {
79
- background: #0f1724;
80
- border: 1px solid #374151;
81
- border-radius: 14px;
82
- color: #e6eef8;
83
- padding: 8px 12px;
84
- cursor: pointer;
85
- font-size: 13px;
86
- margin: 6px 6px 6px 0;
87
- display: inline-block;
88
- transition: background 0.2s, transform 0.1s;
89
- }
90
- .suggest-chip:hover {background: #1e3a8a; transform: translateY(-2px);}
91
- .answer-box {
92
- background: linear-gradient(180deg,#0b1220,#071027);
93
- border-left: 4px solid #3b82f6;
94
- border-radius: 8px;
95
- padding: 16px 18px;
96
- color: #e6eef8;
97
- margin-top: 12px;
98
- box-shadow: 0 4px 14px rgba(0,0,0,0.35);
99
- }
100
- .stTextInput > div > div > input {
101
- background-color: #0f172a !important;
102
- color: #f1f5f9 !important;
103
- border-radius: 6px !important;
104
- border: 1px solid #334155 !important;
105
- padding: 8px 10px !important;
106
- }
107
- </style>
108
- """, unsafe_allow_html=True)
109
-
110
- # ==========================================================
111
- # 🧠 SESSION STATE (initialize before widgets)
112
- # ==========================================================
113
- for key, val in {
114
  "user_query_input": "",
115
  "show_more": False,
116
  "selected_suggestion": None,
117
  "query_suggestions_fixed": None,
118
  "last_doc": None,
119
- "show_advanced": False, # keep persistent checkbox state
120
- }.items():
121
- if key not in st.session_state:
122
- st.session_state[key] = val
123
-
 
 
 
 
124
 
125
  # ==========================================================
126
- # 🧭 SIDEBAR — STABLE VERSION (no rerun reloads)
127
  # ==========================================================
128
  with st.sidebar:
129
  st.markdown("### 🧭 Response Mode")
130
- mode = st.radio(
131
  "",
132
  ("Strict (Document-only)", "Extended (Document + general)"),
133
  index=0,
134
- help="Strict = answers only from the document. Extended = may include general context.",
135
- key="response_mode"
136
- )
137
 
138
  st.markdown("---")
 
 
139
 
140
- # ✅ Checkbox linked to session state (no rerun logic, Streamlit handles it automatically)
141
- st.checkbox(
142
- "Show advanced settings (for developers)",
143
- key="show_advanced",
144
- help="Toggle developer settings without reloading document or clearing outputs."
145
- )
146
-
147
- # ✅ Sliders appear conditionally, state persists
148
  if st.session_state.show_advanced:
149
  st.markdown("### Developer Settings")
150
- chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 1000, step=50, key="chunk_slider")
151
- overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10, key="overlap_slider")
152
- top_k = st.slider("Top K Results", 1, 10, 5, key="topk_slider")
153
- else:
154
- chunk_size, overlap, top_k = 1000, 120, 5
155
-
 
 
 
 
156
  st.markdown("---")
157
  st.caption("✨ Built by Shubham Sharma")
158
 
159
-
160
  # ==========================================================
161
- # 🧩 QUERY HANDLER
162
  # ==========================================================
163
- def set_user_query(q, idx):
164
- st.session_state["user_query_input"] = q
165
- st.session_state["selected_suggestion"] = idx
166
- # ⚠️ No explicit rerun — Streamlit does this automatically
167
-
168
 
169
  # ==========================================================
170
- # 🧠 USER INPUT
171
  # ==========================================================
172
- user_query = st.text_input(
173
- "Type your question or click one above:",
174
- key="user_query_input"
 
 
175
  )
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
  # ==========================================================
179
- # 📄 MAIN LAYOUT
180
  # ==========================================================
181
- st.title("πŸ“„ Enterprise Knowledge Assistant")
182
- st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
183
-
184
- doc_choice = st.radio(
185
- "Select a document:",
186
- ["-- Select --", "Sample PDF", "Upload Custom PDF"],
187
- index=0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  )
189
 
190
  # ==========================================================
191
- # 📂 DOCUMENT HANDLING
192
  # ==========================================================
193
- if doc_choice == "-- Select --":
194
- st.info("⬅️ Select a document to begin.")
195
- else:
196
- if doc_choice == "Sample PDF":
197
- temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
198
- st.success("πŸ“˜ Sample PDF loaded successfully. Ask questions below.")
199
- else:
200
- uploaded_file = st.file_uploader("", type="pdf", label_visibility="collapsed")
201
- if uploaded_file:
202
- temp_path = os.path.join("/tmp", uploaded_file.name)
203
- with open(temp_path, "wb") as f:
204
- f.write(uploaded_file.getbuffer())
205
- st.success("βœ… Document loaded successfully. You can now ask questions below.")
206
- else:
207
- temp_path = None
208
-
209
- if temp_path:
210
- with st.spinner("πŸ” Processing document..."):
211
- text, toc = extract_text_from_pdf(temp_path)
212
- chunks = chunk_text(text, chunk_size=chunk_size)
213
-
214
- with st.spinner("βš™οΈ Preparing index..."):
215
- embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks, chunk_size=chunk_size, overlap=overlap)
216
- index = build_faiss_index(embeddings)
217
-
218
- st.success("βœ… Ready. Ask below!")
219
-
220
- doc_name = os.path.basename(temp_path)
221
- if st.session_state["last_doc"] != doc_name:
222
- query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
223
- st.session_state["query_suggestions_fixed"] = query_suggestions
224
- st.session_state["last_doc"] = doc_name
225
- else:
226
- query_suggestions = st.session_state["query_suggestions_fixed"]
227
-
228
- # ----------------------------------------------------------
229
- # 💬 ASK ASSISTANT SECTION
230
- # ----------------------------------------------------------
231
- st.markdown("### Ask the Assistant")
232
-
233
- if query_suggestions:
234
- visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
235
- cols = st.columns(min(3, len(visible)))
236
- for i, q in enumerate(visible):
237
- if cols[i % 3].button(f"πŸ” {q}", key=f"sugg_{i}"):
238
- set_user_query(q, i)
239
 
240
- toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
241
- if st.button(toggle_text):
242
- st.session_state["show_more"] = not st.session_state["show_more"]
243
- st.experimental_rerun()
244
 
245
- user_query = st.text_input("Type your question or click one above:", key="user_query_input")
 
 
246
 
247
- # ----------------------------------------------------------
248
- # 💡 RESPONSE
249
- # ----------------------------------------------------------
250
- if user_query.strip():
251
- with st.spinner("πŸ’­ Thinking..."):
252
- retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
253
- answer = generate_answer(user_query, retrieved)
254
- st.markdown("### Assistant")
255
- st.markdown(f"<div class='answer-box'>πŸ’‘ {answer}</div>", unsafe_allow_html=True)
256
-
257
- st.markdown("""
258
- <script>window.scrollTo({top: document.body.scrollHeight, behavior: 'smooth'});</script>
259
- """, unsafe_allow_html=True)
260
-
261
- with st.expander("πŸ“„ Supporting Context"):
262
- for i, r in enumerate(retrieved, start=1):
263
- st.markdown(f"**Chunk {i}:** {r}")
264
-
265
- if toc:
266
- with st.expander("πŸ“š Table of Contents"):
267
- toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
268
- st.text_area("", toc_text, height=140)
269
 
270
- with st.expander("πŸ“„ Document Preview"):
271
- st.text_area("", text[:1000], height=140)
272
- st.caption(f"{len(chunks)} chunks processed.")
 
2
  import os
3
  import re
4
  import streamlit as st
5
+ import shutil
6
+ import hashlib
7
+ import pickle
8
 
9
+ # Configure the page first, before the heavier project imports below are loaded
 
 
10
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 
11
 
12
+ # --- Local helpers / small cache cleanup (removes /root/.cache model caches once they exceed the size limit) ---
13
+ def clean_cache(max_size_gb: float = 2.0):
14
+ folders = [
15
+ "/root/.cache/huggingface",
16
+ "/root/.cache/transformers",
17
+ "/root/.cache/torch",
18
+ ]
19
+ for folder in folders:
20
+ if os.path.exists(folder):
21
+ try:
22
+ size_gb = sum(
23
+ os.path.getsize(os.path.join(dp, f))
24
+ for dp, _, files in os.walk(folder)
25
+ for f in files
26
+ ) / (1024**3)
27
+ if size_gb > max_size_gb:
28
+ shutil.rmtree(folder, ignore_errors=True)
29
+ except Exception:
30
+ pass
31
+
32
+ clean_cache()
33
+
34
+ # ==========================================================
35
+ # Project imports (run after the cache cleanup above)
36
  # ==========================================================
37
  from ingestion import extract_text_from_pdf, chunk_text
38
  from vectorstore import build_faiss_index
39
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
40
 
41
  # ==========================================================
42
+ # SESSION - initialize keys early to avoid widget duplication
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # ==========================================================
44
+ initial_state = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "user_query_input": "",
46
  "show_more": False,
47
  "selected_suggestion": None,
48
  "query_suggestions_fixed": None,
49
  "last_doc": None,
50
+ "show_advanced": False,
51
+ "chunk_size": 1000,
52
+ "overlap": 120,
53
+ "top_k": 5,
54
+ "reasoning_mode": False,
55
+ }
56
+ for k, v in initial_state.items():
57
+ if k not in st.session_state:
58
+ st.session_state[k] = v
59
 
60
  # ==========================================================
61
+ # Sidebar (stable, no unexpected reruns)
62
  # ==========================================================
63
  with st.sidebar:
64
  st.markdown("### 🧭 Response Mode")
65
+ st.session_state["reasoning_mode"] = st.radio(
66
  "",
67
  ("Strict (Document-only)", "Extended (Document + general)"),
68
  index=0,
69
+ help="Strict = answers only from the document. Extended = may include general context."
70
+ ) == "Extended (Document + general)"
 
71
 
72
  st.markdown("---")
73
+ # Avoid forcing reruns when toggled: Streamlit manages state
74
+ st.checkbox("Show advanced settings (for developers)", key="show_advanced")
75
 
 
 
 
 
 
 
 
 
76
  if st.session_state.show_advanced:
77
  st.markdown("### Developer Settings")
78
+ # Persist slider states to session_state keys to avoid duplication
79
+ st.session_state["chunk_size"] = st.slider(
80
+ "Chunk Size (characters)", 200, 1500, st.session_state["chunk_size"], step=50, key="chunk_slider"
81
+ )
82
+ st.session_state["overlap"] = st.slider(
83
+ "Chunk Overlap (characters)", 50, 200, st.session_state["overlap"], step=10, key="overlap_slider"
84
+ )
85
+ st.session_state["top_k"] = st.slider(
86
+ "Top K Results", 1, 10, st.session_state["top_k"], key="topk_slider"
87
+ )
88
  st.markdown("---")
89
  st.caption("✨ Built by Shubham Sharma")
90
 
 
91
  # ==========================================================
92
+ # Page header
93
  # ==========================================================
94
+ st.title("πŸ“„ Enterprise Knowledge Assistant")
95
+ st.caption("Ask questions about SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
 
 
 
96
 
97
  # ==========================================================
98
+ # Document selection + upload
99
  # ==========================================================
100
+ doc_choice = st.radio(
101
+ "Select a document:",
102
+ ("-- Select --", "Sample PDF", "Upload Custom PDF"),
103
+ index=0,
104
+ key="doc_choice"
105
  )
106
 
107
+ temp_path = None
108
+ if doc_choice == "-- Select --":
109
+ st.info("⬅️ Please choose a document from the sidebar to begin.")
110
+ elif doc_choice == "Sample PDF":
111
+ temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
112
+ st.success("πŸ“˜ Using built-in Sample PDF.")
113
+ else:
114
+ uploaded_file = st.file_uploader("πŸ“‚ Upload your PDF", type="pdf")
115
+ if uploaded_file:
116
+ temp_path = os.path.join("/tmp", uploaded_file.name)
117
+ with open(temp_path, "wb") as f:
118
+ f.write(uploaded_file.getbuffer())
119
+ st.success(f"βœ… '{uploaded_file.name}' uploaded successfully.")
120
 
121
  # ==========================================================
122
+ # Utility: set query (used by suggestion buttons)
123
  # ==========================================================
124
+ def set_user_query(q: str, idx: int = None):
125
+ # Only set values and rerun once to ensure text_input reflects it
126
+ st.session_state["user_query_input"] = q
127
+ st.session_state["selected_suggestion"] = idx
128
+ # Trigger a rerun so the single text_input shows the new value and downstream code runs
129
+ st.rerun()
130
+
131
+ # ==========================================================
132
+ # Document processing and suggestion generation
133
+ # ==========================================================
134
+ text, chunks, index, embeddings, toc = None, None, None, None, None
135
+ if temp_path:
136
+ # If user switched files, clear cached suggestions so they regenerate once for this doc
137
+ if st.session_state["last_doc"] != os.path.basename(temp_path):
138
+ st.session_state["query_suggestions_fixed"] = None
139
+ st.session_state["selected_suggestion"] = None
140
+ st.session_state["last_doc"] = os.path.basename(temp_path)
141
+
142
+ with st.spinner("πŸ” Processing your document..."):
143
+ # Extract -> chunk -> suggestions
144
+ text, toc = extract_text_from_pdf(temp_path)
145
+ chunks = chunk_text(text, chunk_size=st.session_state["chunk_size"], overlap=st.session_state["overlap"])
146
+ st.success("βœ… Document loaded successfully.")
147
+ # Generate suggestions only once per document and store in session_state
148
+ if st.session_state["query_suggestions_fixed"] is None:
149
+ # Prefer genai (GPT) suggestions if available in qa.genai_generate
150
+ try:
151
+ # build prompt from toc or chunks (keeps consistent)
152
+ titles = []
153
+ if toc:
154
+ for sec, raw_title in toc:
155
+ t = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
156
+ t = re.sub(r"\.{2,}\s*\d+$", "", t).strip()
157
+ if 4 < len(t) < 120:
158
+ titles.append(t)
159
+ sample_text = " ".join(chunks[:4])[:4000]
160
+ prompt = f"Generate 6 short (<=18 words), document-focused questions based on TOC: {titles[:6]} and sample: {sample_text}"
161
+ ai_out = genai_generate(prompt)
162
+ # Parse lines/questions from model response robustly
163
+ qs = re.findall(r"(?m)^\s*[-β€’\d\)]*\s*(.+?)\?$", ai_out)
164
+ qs = [q.strip() + "?" for q in qs if 8 < len(q.strip()) < 120]
165
+ if not qs:
166
+ # fallback: simple heuristics from TOC or chunk sentences
167
+ qs = []
168
+ if titles:
169
+ for t in titles[:6]:
170
+ qs.append(f"What is described in '{t}'?")
171
+ else:
172
+ sents = re.split(r'(?<=[.?!])\s+', sample_text)
173
+ for s in sents[:6]:
174
+ if len(s) > 20:
175
+ qs.append((s.strip()[:80] + "...").strip() + "?")
176
+ qs = qs[:6]
177
+ st.session_state["query_suggestions_fixed"] = qs
178
+ except Exception:
179
+ # deterministic fallback
180
+ st.session_state["query_suggestions_fixed"] = [
181
+ "What is this document about?",
182
+ "How do I get started with the process?",
183
+ "What are the prerequisites?",
184
+ "What steps are required?",
185
+ "How to troubleshoot common issues?",
186
+ "Where can I find the configuration?"
187
+ ]
188
+
189
+ # Build or load embeddings and index (embedding cache uses chunk-aware key)
190
+ with st.spinner("βš™οΈ Preparing search index..."):
191
+ embeddings = cache_embeddings(os.path.basename(temp_path), chunks, lambda c: embed_chunks(c))
192
+ index = build_faiss_index(embeddings)
193
+ st.success("πŸš€ Document ready β€” you can now ask questions below.")
194
+
195
+ # ==========================================================
196
+ # UI: Suggested Questions + Query input (only one text_input)
197
+ # ==========================================================
198
+ st.markdown("## πŸ€– Ask the Assistant")
199
+
200
+ if temp_path and st.session_state["query_suggestions_fixed"]:
201
+ st.markdown("#### πŸ’‘ Suggested Questions")
202
+ suggestions = st.session_state["query_suggestions_fixed"]
203
+ visible = suggestions if st.session_state["show_more"] else suggestions[:3]
204
+ cols = st.columns(min(3, len(visible)))
205
+ for i, q in enumerate(visible):
206
+ col = cols[i % 3]
207
+ # unique keys for each button
208
+ if col.button(f"πŸ” {q}", key=f"suggest_{i}"):
209
+ set_user_query(q, idx=i)
210
+
211
+ toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
212
+ if st.button(toggle_text, key="toggle_show_more"):
213
+ st.session_state["show_more"] = not st.session_state["show_more"]
214
+
215
+ # --- single text input (only place we define it) ---
216
+ user_query = st.text_input(
217
+ "Type your question or pick one above:",
218
+ value=st.session_state["user_query_input"],
219
+ key="user_query_input",
220
  )
221
 
222
  # ==========================================================
223
+ # Answer generation (runs only when user_query non-empty)
224
  # ==========================================================
225
+ if temp_path and user_query and user_query.strip():
226
+ st.caption("Mode: πŸ“„ Strict Document" if not st.session_state["reasoning_mode"] else "Mode: 🧠 Extended")
227
+ with st.spinner("πŸ’­ Generating your answer..."):
228
+ retrieved = retrieve_chunks(user_query, index, chunks, top_k=st.session_state["top_k"], embeddings=embeddings)
229
+ answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state["reasoning_mode"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
+ st.markdown("### βœ… Assistant’s Answer")
232
+ st.markdown(f"<div style='background:#0f172a;border-left:4px solid #3b82f6;padding:12px;border-radius:8px;color:#f1f5f9'>{answer}</div>", unsafe_allow_html=True)
 
 
233
 
234
+ with st.expander("πŸ“„ Supporting Context", expanded=False):
235
+ for i, r in enumerate(retrieved, start=1):
236
+ st.markdown(f"**Chunk {i}:** {r}")
237
 
238
+ # ==========================================================
239
+ # Document preview (collapsed)
240
+ # ==========================================================
241
+ if chunks:
242
+ st.markdown("---")
243
+ with st.expander("πŸ“‘ Document Preview", expanded=False):
244
+ st.text_area("Extracted text (first 1000 chars)", text[:1000], height=180)
245
+ st.caption(f"πŸ“¦ {len(chunks)} chunks processed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
+ # End of file