Shubham170793 committed on
Commit
4ec6a61
·
verified ·
1 Parent(s): cabbcce

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +105 -199
src/streamlit_app.py CHANGED
@@ -1,247 +1,153 @@
1
- # streamlit_app.py
 
 
2
  import os
3
  import re
4
  import streamlit as st
5
- import shutil
6
- import hashlib
7
- import pickle
8
-
9
- # Protect against pages that call heavy imports before environment set
10
- st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
11
-
12
- # --- Local helpers / small cache cleanup (keeps /tmp sane) ---
13
- def clean_cache(max_size_gb: float = 2.0):
14
- folders = [
15
- "/root/.cache/huggingface",
16
- "/root/.cache/transformers",
17
- "/root/.cache/torch",
18
- ]
19
- for folder in folders:
20
- if os.path.exists(folder):
21
- try:
22
- size_gb = sum(
23
- os.path.getsize(os.path.join(dp, f))
24
- for dp, _, files in os.walk(folder)
25
- for f in files
26
- ) / (1024**3)
27
- if size_gb > max_size_gb:
28
- shutil.rmtree(folder, ignore_errors=True)
29
- except Exception:
30
- pass
31
-
32
- clean_cache()
33
-
34
- # ==========================================================
35
- # Imports (after HF cache set to avoid extra downloads)
36
- # ==========================================================
37
  from ingestion import extract_text_from_pdf, chunk_text
38
  from vectorstore import build_faiss_index
39
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
40
 
41
- # ==========================================================
42
- # SESSION - initialize keys early to avoid widget duplication
43
- # ==========================================================
44
- initial_state = {
45
- "user_query_input": "",
46
- "show_more": False,
47
- "selected_suggestion": None,
48
- "query_suggestions_fixed": None,
49
- "last_doc": None,
50
- "show_advanced": False,
51
- "chunk_size": 1000,
52
- "overlap": 120,
53
- "top_k": 5,
54
- "reasoning_mode": False,
55
- }
56
- for k, v in initial_state.items():
57
- if k not in st.session_state:
58
- st.session_state[k] = v
59
 
60
  # ==========================================================
61
- # Sidebar (stable, no unexpected reruns)
62
  # ==========================================================
63
  with st.sidebar:
64
  st.markdown("### 🧭 Response Mode")
65
- st.session_state["reasoning_mode"] = st.radio(
66
  "",
67
  ("Strict (Document-only)", "Extended (Document + general)"),
68
  index=0,
69
- help="Strict = answers only from the document. Extended = may include general context."
70
- ) == "Extended (Document + general)"
71
 
72
  st.markdown("---")
73
- # Avoid forcing reruns when toggled: Streamlit manages state
74
- st.checkbox("Show advanced settings (for developers)", key="show_advanced")
75
-
76
- if st.session_state.show_advanced:
77
  st.markdown("### Developer Settings")
78
- # Persist slider states to session_state keys to avoid duplication
79
- st.session_state["chunk_size"] = st.slider(
80
- "Chunk Size (characters)", 200, 1500, st.session_state["chunk_size"], step=50, key="chunk_slider"
81
- )
82
- st.session_state["overlap"] = st.slider(
83
- "Chunk Overlap (characters)", 50, 200, st.session_state["overlap"], step=10, key="overlap_slider"
84
- )
85
- st.session_state["top_k"] = st.slider(
86
- "Top K Results", 1, 10, st.session_state["top_k"], key="topk_slider"
87
- )
88
  st.markdown("---")
89
  st.caption("✨ Built by Shubham Sharma")
90
 
91
  # ==========================================================
92
- # Page header
93
  # ==========================================================
94
- st.title("πŸ“„ Enterprise Knowledge Assistant")
95
- st.caption("Ask questions about SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # ==========================================================
98
- # Document selection + upload
99
  # ==========================================================
100
- doc_choice = st.radio(
101
- "Select a document:",
102
- ("-- Select --", "Sample PDF", "Upload Custom PDF"),
103
- index=0,
104
- key="doc_choice"
 
 
 
105
  )
106
 
 
 
 
 
 
 
107
  temp_path = None
108
- if doc_choice == "-- Select --":
109
- st.info("⬅️ Please choose a document from the sidebar to begin.")
110
- elif doc_choice == "Sample PDF":
111
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
112
- st.success("πŸ“˜ Using built-in Sample PDF.")
113
- else:
114
- uploaded_file = st.file_uploader("πŸ“‚ Upload your PDF", type="pdf")
115
  if uploaded_file:
116
  temp_path = os.path.join("/tmp", uploaded_file.name)
117
  with open(temp_path, "wb") as f:
118
  f.write(uploaded_file.getbuffer())
119
- st.success(f"βœ… '{uploaded_file.name}' uploaded successfully.")
120
-
121
- # ==========================================================
122
- # Utility: set query (used by suggestion buttons)
123
- # ==========================================================
124
- def set_user_query(q: str, idx: int = None):
125
- # Only set values and rerun once to ensure text_input reflects it
126
- st.session_state["user_query_input"] = q
127
- st.session_state["selected_suggestion"] = idx
128
- # Trigger a rerun so the single text_input shows the new value and downstream code runs
129
- st.rerun()
130
 
131
  # ==========================================================
132
- # Document processing and suggestion generation
133
  # ==========================================================
134
- text, chunks, index, embeddings, toc = None, None, None, None, None
135
  if temp_path:
136
- # If user switched files, clear cached suggestions so they regenerate once for this doc
137
- if st.session_state["last_doc"] != os.path.basename(temp_path):
138
- st.session_state["query_suggestions_fixed"] = None
139
- st.session_state["selected_suggestion"] = None
140
- st.session_state["last_doc"] = os.path.basename(temp_path)
141
-
142
- with st.spinner("πŸ” Processing your document..."):
143
- # Extract -> chunk -> suggestions
144
- text, toc = extract_text_from_pdf(temp_path)
145
- chunks = chunk_text(text, chunk_size=st.session_state["chunk_size"], overlap=st.session_state["overlap"])
146
- st.success("βœ… Document loaded successfully.")
147
- # Generate suggestions only once per document and store in session_state
148
- if st.session_state["query_suggestions_fixed"] is None:
149
- # Prefer genai (GPT) suggestions if available in qa.genai_generate
150
- try:
151
- # build prompt from toc or chunks (keeps consistent)
152
- titles = []
153
- if toc:
154
- for sec, raw_title in toc:
155
- t = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
156
- t = re.sub(r"\.{2,}\s*\d+$", "", t).strip()
157
- if 4 < len(t) < 120:
158
- titles.append(t)
159
- sample_text = " ".join(chunks[:4])[:4000]
160
- prompt = f"Generate 6 short (<=18 words), document-focused questions based on TOC: {titles[:6]} and sample: {sample_text}"
161
- ai_out = genai_generate(prompt)
162
- # Parse lines/questions from model response robustly
163
- qs = re.findall(r"(?m)^\s*[-β€’\d\)]*\s*(.+?)\?$", ai_out)
164
- qs = [q.strip() + "?" for q in qs if 8 < len(q.strip()) < 120]
165
- if not qs:
166
- # fallback: simple heuristics from TOC or chunk sentences
167
- qs = []
168
- if titles:
169
- for t in titles[:6]:
170
- qs.append(f"What is described in '{t}'?")
171
- else:
172
- sents = re.split(r'(?<=[.?!])\s+', sample_text)
173
- for s in sents[:6]:
174
- if len(s) > 20:
175
- qs.append((s.strip()[:80] + "...").strip() + "?")
176
- qs = qs[:6]
177
- st.session_state["query_suggestions_fixed"] = qs
178
- except Exception:
179
- # deterministic fallback
180
- st.session_state["query_suggestions_fixed"] = [
181
- "What is this document about?",
182
- "How do I get started with the process?",
183
- "What are the prerequisites?",
184
- "What steps are required?",
185
- "How to troubleshoot common issues?",
186
- "Where can I find the configuration?"
187
- ]
188
-
189
- # Build or load embeddings and index (embedding cache uses chunk-aware key)
190
- with st.spinner("βš™οΈ Preparing search index..."):
191
- embeddings = cache_embeddings(os.path.basename(temp_path), chunks, lambda c: embed_chunks(c))
192
- index = build_faiss_index(embeddings)
193
- st.success("πŸš€ Document ready β€” you can now ask questions below.")
194
-
195
- # ==========================================================
196
- # UI: Suggested Questions + Query input (only one text_input)
197
- # ==========================================================
198
- st.markdown("## πŸ€– Ask the Assistant")
199
-
200
- if temp_path and st.session_state["query_suggestions_fixed"]:
201
- st.markdown("#### πŸ’‘ Suggested Questions")
202
- suggestions = st.session_state["query_suggestions_fixed"]
203
- visible = suggestions if st.session_state["show_more"] else suggestions[:3]
204
- cols = st.columns(min(3, len(visible)))
205
- for i, q in enumerate(visible):
206
- col = cols[i % 3]
207
- # unique keys for each button
208
- if col.button(f"πŸ” {q}", key=f"suggest_{i}"):
209
- set_user_query(q, idx=i)
210
-
211
- toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
212
- if st.button(toggle_text, key="toggle_show_more"):
213
  st.session_state["show_more"] = not st.session_state["show_more"]
214
 
215
- # --- single text input (only place we define it) ---
216
- user_query = st.text_input(
217
- "Type your question or pick one above:",
218
- value=st.session_state["user_query_input"],
219
- key="user_query_input",
220
- )
221
 
222
- # ==========================================================
223
- # Answer generation (runs only when user_query non-empty)
224
- # ==========================================================
225
- if temp_path and user_query and user_query.strip():
226
- st.caption("Mode: πŸ“„ Strict Document" if not st.session_state["reasoning_mode"] else "Mode: 🧠 Extended")
227
- with st.spinner("πŸ’­ Generating your answer..."):
228
- retrieved = retrieve_chunks(user_query, index, chunks, top_k=st.session_state["top_k"], embeddings=embeddings)
229
- answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state["reasoning_mode"])
 
230
 
231
- st.markdown("### βœ… Assistant’s Answer")
232
- st.markdown(f"<div style='background:#0f172a;border-left:4px solid #3b82f6;padding:12px;border-radius:8px;color:#f1f5f9'>{answer}</div>", unsafe_allow_html=True)
233
 
234
- with st.expander("πŸ“„ Supporting Context", expanded=False):
235
- for i, r in enumerate(retrieved, start=1):
236
- st.markdown(f"**Chunk {i}:** {r}")
237
 
238
  # ==========================================================
239
- # Document preview (collapsed)
240
  # ==========================================================
241
- if chunks:
242
  st.markdown("---")
243
- with st.expander("πŸ“‘ Document Preview", expanded=False):
244
- st.text_area("Extracted text (first 1000 chars)", text[:1000], height=180)
245
  st.caption(f"πŸ“¦ {len(chunks)} chunks processed.")
246
-
247
- # End of file
 
1
+ # ==========================================================
2
+ # streamlit_app.py β€” Restored Stable UI (Pre-rerun Fix)
3
+ # ==========================================================
4
  import os
5
  import re
6
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from ingestion import extract_text_from_pdf, chunk_text
8
  from vectorstore import build_faiss_index
9
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
10
 
11
+ st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # ==========================================================
14
+ # 🧭 SIDEBAR β€” Original Clean Version
15
  # ==========================================================
16
  with st.sidebar:
17
  st.markdown("### 🧭 Response Mode")
18
+ mode = st.radio(
19
  "",
20
  ("Strict (Document-only)", "Extended (Document + general)"),
21
  index=0,
22
+ help="Strict = answers only from the document. Extended = may include general context.",
23
+ )
24
 
25
  st.markdown("---")
26
+ show_advanced = st.checkbox("Show advanced settings (for developers)", value=False)
27
+ if show_advanced:
 
 
28
  st.markdown("### Developer Settings")
29
+ chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 1000, step=50)
30
+ overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
31
+ top_k = st.slider("Top K Results", 1, 10, 5)
32
+ else:
33
+ chunk_size, overlap, top_k = 1000, 120, 5
34
+
 
 
 
 
35
  st.markdown("---")
36
  st.caption("✨ Built by Shubham Sharma")
37
 
38
  # ==========================================================
39
+ # 🧠 SESSION STATE β€” Clean and Minimal
40
  # ==========================================================
41
+ for key, val in {
42
+ "user_query_input": "",
43
+ "show_more": False,
44
+ "selected_suggestion": None,
45
+ "query_suggestions_fixed": None,
46
+ "last_doc": None,
47
+ }.items():
48
+ if key not in st.session_state:
49
+ st.session_state[key] = val
50
+
51
+ def set_user_query(q, idx):
52
+ st.session_state["user_query_input"] = q
53
+ st.session_state["selected_suggestion"] = idx
54
+ st.experimental_rerun()
55
 
56
  # ==========================================================
57
+ # πŸ“˜ APP HEADER
58
  # ==========================================================
59
+ st.markdown(
60
+ """
61
+ <h1 style="text-align:center;">πŸ“„ Enterprise Knowledge Assistant</h1>
62
+ <p style="text-align:center;color:gray;">
63
+ Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.
64
+ </p>
65
+ """,
66
+ unsafe_allow_html=True,
67
  )
68
 
69
+ # ==========================================================
70
+ # πŸ“‚ DOCUMENT UPLOAD / SELECTION
71
+ # ==========================================================
72
+ st.markdown("### Select a document:")
73
+ doc_choice = st.radio("", ("-- Select --", "Sample PDF", "Upload Custom PDF"))
74
+
75
  temp_path = None
76
+ if doc_choice == "Sample PDF":
 
 
77
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
78
+ st.success("πŸ“˜ Sample PDF loaded successfully. Ask questions below.")
79
+ elif doc_choice == "Upload Custom PDF":
80
+ uploaded_file = st.file_uploader("Upload your PDF", type="pdf")
81
  if uploaded_file:
82
  temp_path = os.path.join("/tmp", uploaded_file.name)
83
  with open(temp_path, "wb") as f:
84
  f.write(uploaded_file.getbuffer())
85
+ st.success(f"βœ… '{uploaded_file.name}' uploaded successfully β€” ready to query below.")
86
+ else:
87
+ st.info("⬅️ Please select or upload a document to begin.")
 
 
 
 
 
 
 
 
88
 
89
  # ==========================================================
90
+ # 🧠 PROCESS DOCUMENT (only when selected)
91
  # ==========================================================
 
92
  if temp_path:
93
+ text, toc = extract_text_from_pdf(temp_path)
94
+ chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
95
+ st.success("βœ… Document ready β€” you can now ask questions below.")
96
+
97
+ embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
98
+ index = build_faiss_index(embeddings)
99
+
100
+ # ==========================================================
101
+ # πŸ’¬ SUGGESTED QUESTIONS (Smart Defaults)
102
+ # ==========================================================
103
+ if not st.session_state["query_suggestions_fixed"]:
104
+ st.session_state["query_suggestions_fixed"] = [
105
+ "What is the purpose of this document?",
106
+ "How can integration be set up in SAP Cloud?",
107
+ "What are the prerequisites mentioned?",
108
+ "What steps are involved in configuration?",
109
+ "How to troubleshoot integration issues?",
110
+ "What is the key functionality covered?"
111
+ ]
112
+
113
+ st.markdown("### Ask the Assistant")
114
+
115
+ visible_qs = st.session_state["query_suggestions_fixed"][:3] if not st.session_state["show_more"] else st.session_state["query_suggestions_fixed"]
116
+ cols = st.columns(3)
117
+ for i, q in enumerate(visible_qs):
118
+ if cols[i % 3].button(f"πŸ”Ή {q}", key=f"suggest_{i}"):
119
+ set_user_query(q, i)
120
+
121
+ if st.button("Show more β–Ό" if not st.session_state["show_more"] else "Show less β–²"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  st.session_state["show_more"] = not st.session_state["show_more"]
123
 
124
+ # ==========================================================
125
+ # 🧩 QUERY INPUT
126
+ # ==========================================================
127
+ user_query = st.text_input("Type your question or click one above:", value=st.session_state["user_query_input"], key="user_query_input")
 
 
128
 
129
+ # ==========================================================
130
+ # πŸ€– GENERATE ANSWER
131
+ # ==========================================================
132
+ if user_query:
133
+ reasoning_mode = mode == "Extended (Document + general)"
134
+ st.markdown(f"**Mode:** {'🧠 Extended' if reasoning_mode else 'πŸ“„ Strict Document'}")
135
+ with st.spinner("Thinking..."):
136
+ retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
137
+ answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
138
 
139
+ st.markdown("### Assistant’s Answer")
140
+ st.markdown(f"<div style='background:#0f172a;border-left:4px solid #3b82f6;padding:12px;border-radius:8px;color:#f1f5f9'>{answer}</div>", unsafe_allow_html=True)
141
 
142
+ with st.expander("πŸ“˜ Supporting Context", expanded=False):
143
+ for i, chunk in enumerate(retrieved, 1):
144
+ st.markdown(f"**Chunk {i}:** {chunk.strip()}")
145
 
146
  # ==========================================================
147
+ # πŸ—‚οΈ DOCUMENT PREVIEW
148
  # ==========================================================
149
+ if temp_path:
150
  st.markdown("---")
151
+ with st.expander("πŸ“„ Document Preview", expanded=False):
152
+ st.text_area("Extracted text (first 1000 chars):", text[:1000], height=180)
153
  st.caption(f"πŸ“¦ {len(chunks)} chunks processed.")