Shubham170793 committed on
Commit
f978245
·
verified ·
1 Parent(s): 3fbd2b9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +224 -81
src/streamlit_app.py CHANGED
@@ -1,65 +1,214 @@
1
  # ==========================================================
2
- # 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
3
  # ==========================================================
 
 
4
  import hashlib
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def _hash_content(file_path):
7
- """Generate a short SHA256 hash of the file's actual binary content."""
8
- hasher = hashlib.sha256()
9
  with open(file_path, "rb") as f:
10
  while chunk := f.read(8192):
11
- hasher.update(chunk)
12
- return hasher.hexdigest()[:12] # short unique hash for same-name files
13
-
14
 
15
def refresh_suggestions(doc_name, toc, chunks):
    """Rebuild the suggested questions and reset the query-related UI state.

    Stores the freshly generated suggestions under
    ``query_suggestions_fixed`` and clears the typed query, the selected
    suggestion and the "show more" flag in ``st.session_state``.
    """
    fresh = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
    st.session_state.update({
        "query_suggestions_fixed": fresh,
        "user_query_input": "",
        "selected_suggestion": None,
        "show_more": False,
    })
23
-
24
 
25
- # --- Document selection ---
26
  if doc_choice == "-- Select --":
27
  st.info("⬅️ Select or upload a document to begin.")
28
  else:
29
  temp_path = None
30
-
31
- # --- File selection ---
32
  if doc_choice == "Sample PDF":
33
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
34
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
35
  else:
36
- uploaded_file = st.file_uploader(
37
- "Upload a PDF document (max 200MB):",
38
- type="pdf",
39
- label_visibility="collapsed"
40
- )
41
- if uploaded_file:
42
- temp_path = os.path.join("/tmp", uploaded_file.name)
43
  with open(temp_path, "wb") as f:
44
- f.write(uploaded_file.getbuffer())
45
  else:
46
  st.stop()
47
 
48
- # --- Start processing if file exists ---
49
  if temp_path:
50
  doc_name = os.path.basename(temp_path)
51
- file_hash = _hash_content(temp_path)
52
- doc_identifier = f"{doc_name}_{file_hash}"
53
 
54
- # βœ… Step 0: Ensure registry exists
55
  if "registry" not in st.session_state:
56
  st.session_state["registry"] = DocumentRegistry()
57
  registry = st.session_state["registry"]
58
 
59
- # βœ… Step 1: Check if already registered
60
- existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
61
- if existing_doc:
62
- doc_data = registry.get_doc(existing_doc["name"])
63
  st.session_state.update({
64
  "text": doc_data.get("text", ""),
65
  "toc": doc_data.get("toc", []),
@@ -67,14 +216,12 @@ else:
67
  "embeddings": doc_data.get("embeddings"),
68
  "index": doc_data.get("index"),
69
  "doc_ready": True,
70
- "active_doc": existing_doc["name"],
71
  "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
72
  })
73
-
74
- refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
75
  st.experimental_rerun()
76
 
77
- # βœ… Step 2: New document β†’ process
78
  status = st.empty()
79
  status.info("πŸ“€ Upload complete β€” reading document...")
80
 
@@ -88,9 +235,7 @@ else:
88
 
89
  doc_id = registry.register(temp_path, chunks, embeddings, index)
90
  st.session_state["active_doc"] = doc_id
91
-
92
- status.success("βœ… Document processed successfully β€” all set to query your assistant!")
93
-
94
  refresh_suggestions(doc_name, toc, chunks)
95
 
96
  st.session_state.update({
@@ -101,50 +246,48 @@ else:
101
  "index": index,
102
  "doc_ready": True,
103
  "last_doc": doc_identifier,
104
- "status_text": "βœ… Document processed successfully β€” all set to query your assistant!"
105
  })
106
-
107
  st.experimental_rerun()
108
 
109
# --- Display Ready Message + Ask Section ---
if st.session_state.get("doc_ready"):
    import html  # local import: escape model output before embedding it in HTML

    # `st.experimental_rerun` is deprecated in favour of `st.rerun`; prefer the
    # new API and fall back so older Streamlit releases keep working.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun

    active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
    st.info(st.session_state.get("status_text", f"📄 {active_name or 'Document'} is ready for queries."))

    st.markdown("### 💬 Ask the Assistant")
    query_suggestions = st.session_state.get("query_suggestions_fixed", [])
    if query_suggestions:
        # Show the first three suggestions unless the user expanded the list.
        visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
        cols = st.columns(min(3, len(visible)))
        for i, q in enumerate(visible):
            if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
                # Pre-fill the text box (created further down) and rerun so
                # the query is picked up on the next pass.
                st.session_state["user_query_input"] = q
                st.session_state["selected_suggestion"] = i
                _rerun()

        # The toggle is only useful when there is something to expand.
        if len(query_suggestions) > 3:
            toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
            if st.button(toggle_text, help="Show or hide more suggestions"):
                st.session_state["show_more"] = not st.session_state["show_more"]
                _rerun()

    user_query = st.text_input(
        "Type your question or click one above:",
        key="user_query_input",
        label_visibility="visible"
    )

    if user_query.strip():
        reasoning_mode = mode == "Extended (Document + General)"
        with st.spinner("💭 Generating your answer..."):
            retrieved = retrieve_chunks(
                user_query,
                st.session_state["index"],
                st.session_state["chunks"],
                top_k=top_k,
                embeddings=st.session_state["embeddings"]
            )
            answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
            st.session_state["retrieved"] = retrieved

        st.markdown("### 🤖 Assistant’s Answer")
        # The answer is derived from uploaded document text and model output,
        # so escape it before injecting it into raw HTML.
        safe_answer = html.escape(str(answer), quote=False)
        st.markdown(f"<div class='answer-box'>{safe_answer}</div>", unsafe_allow_html=True)
 
1
  # ==========================================================
2
+ # streamlit_app.py β€” Stable Layout (English Only)
3
  # ==========================================================
4
+ import os
5
+ import re
6
  import hashlib
7
+ import streamlit as st
8
+ import torch
9
+ from document_registry import DocumentRegistry
10
+ from ingestion import extract_text_from_pdf, chunk_text
11
+ from vectorstore import build_faiss_index
12
+ from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
13
 
14
# ==========================================================
# PAGE CONFIG
# ==========================================================
# Configure the page before any other Streamlit call in this file.
st.set_page_config(layout="wide", page_title="Enterprise Knowledge Assistant")
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)

# ==========================================================
# CACHE SETUP
# ==========================================================
# Point every Hugging Face cache location at one directory under /tmp.
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
for _cache_var in (
    "HF_HOME",
    "TRANSFORMERS_CACHE",
    "HF_DATASETS_CACHE",
    "HF_MODULES_CACHE",
):
    os.environ[_cache_var] = CACHE_DIR
# NOTE(review): these variables are set after the project modules above have
# been imported; if those modules import Hugging Face libraries at import
# time, the settings may arrive too late to take effect -- confirm.
31
+
32
+ # ==========================================================
33
+ # 🧠 SMART SUGGESTION GENERATOR
34
+ # ==========================================================
35
# Generic questions used when nothing document-specific can be produced.
_DEFAULT_SUGGESTIONS = (
    "How do I start using this guide?",
    "What does this document cover?",
)

# A leading section label such as "1", "1.2.3", "2-1", "3)", "A.1" or "B.".
# Plain words are deliberately NOT matched, so unnumbered titles keep their
# first word.
_SECTION_LABEL_RE = re.compile(
    r"^\s*(?:\d+(?:[.\-]\d+)*[.)]?|[A-Za-z](?:[.\-]\d+)+[.)]?|[A-Za-z][.)])\s+"
)
# Trailing dot leaders plus page number, e.g. " ........ 12".
_PAGE_LEADER_RE = re.compile(r"\s*\.{2,}\s*\d+\s*$")
# Bullet / numbering prefix on a line of model output.
_BULLET_PREFIX_RE = re.compile(r"^[\-\u2022\*\d\.\)\s]+")
_QUESTION_WORD_RE = re.compile(
    r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b"
)


def _clean_toc_title(raw_title):
    """Strip a leading section label and a trailing page leader from a title."""
    title = _SECTION_LABEL_RE.sub("", str(raw_title), count=1)
    return _PAGE_LEADER_RE.sub("", title).strip()


def _toc_titles(toc):
    """Collect cleaned, reasonably sized titles from TOC entries."""
    titles = []
    for entry in toc:
        if isinstance(entry, str):
            raw_title = entry
        else:
            try:
                raw_title = entry[1]  # entries look like (section, title, ...)
            except (IndexError, KeyError, TypeError):
                continue
        title = _clean_toc_title(raw_title)
        if 4 < len(title) < 120:
            titles.append(title)
    return titles


def _parse_questions(ai_response):
    """Turn raw model output into at most seven unique question strings."""
    questions = []
    seen = set()
    for line in ai_response.splitlines():
        q = _BULLET_PREFIX_RE.sub("", line.strip()).strip()
        if not q.endswith("?") and _QUESTION_WORD_RE.match(q):
            q += "?"
        # Drop preamble such as "Here are some questions:" and anything that
        # is too short or too long to be a usable suggestion.
        if not q.endswith("?") or not 8 <= len(q) <= 140:
            continue
        key = q.lower()
        if key not in seen:
            seen.add(key)
            questions.append(q)
    return questions[:7]


def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document", generate=None):
    """Suggest short natural-language questions for a document.

    Builds a prompt from the cleaned TOC titles and a sample of the first
    chunks, asks the text generator for questions and post-processes the
    reply. Falls back to title-based questions when the reply yields nothing
    usable, and to two generic questions when there is no TOC/chunk data,
    no usable title, or the generator fails.

    Args:
        toc: Sequence of TOC entries; each entry is a ``(section, title)``
            pair (extra items are ignored) or a bare title string.
        chunks: Sequence of text chunks; the first three are sampled.
        doc_name: Name shown to the generator in the prompt.
        generate: Optional callable ``prompt -> str``. Defaults to the
            module-level ``genai_generate``.

    Returns:
        A non-empty list of question strings (at most seven).
    """
    import logging

    if not toc or not chunks:
        return list(_DEFAULT_SUGGESTIONS)

    titles = _toc_titles(toc)
    context_sample = " ".join(chunks[:3])[:4000]
    toc_lines = "\n".join(f"- {t}" for t in titles[:8])
    prompt = (
        "\n"
        "You are a content assistant. Based on the TOC and text, generate 5–7 short natural questions.\n"
        f"Each question <18 words, ends with '?', sounds human. Document: \"{doc_name}\"\n"
        "TOC:\n"
        f"{toc_lines}\n"
        "Sample:\n"
        f"{context_sample}\n"
    )

    try:
        if generate is None:
            generate = genai_generate
        questions = _parse_questions(generate(prompt))
    except Exception:
        # Suggestions are best-effort: a failing model call must never break
        # document loading, but it should leave a trace.
        logging.getLogger(__name__).warning(
            "Suggestion generation failed; using default suggestions.",
            exc_info=True,
        )
        return list(_DEFAULT_SUGGESTIONS)

    if questions:
        return questions
    title_questions = [
        f"What should I know about {t.rstrip('.')}?" for t in titles[:7]
    ]
    return title_questions or list(_DEFAULT_SUGGESTIONS)
75
+
76
+ # ==========================================================
77
+ # 🎨 STYLING
78
+ # ==========================================================
79
# Page-wide CSS: container width, heading colour, suggestion chips, the
# answer box, text-input styling and small links.
_PAGE_CSS = """
<style>
div.block-container {padding-top:1.2rem;max-width:1080px;}
h1,h2,h3{color:#f3f4f6;font-weight:600;}
.suggest-chip{background:#0f1724;border:1px solid #374151;border-radius:14px;
color:#e6eef8;padding:8px 12px;cursor:pointer;font-size:13px;margin:6px 6px 6px 0;
display:inline-block;transition:background 0.2s,transform 0.1s;}
.suggest-chip:hover{background:#1e3a8a;transform:translateY(-2px);}
.answer-box{background:linear-gradient(180deg,#0b1220,#071027);
border-left:4px solid #3b82f6;border-radius:8px;padding:16px 18px;color:#e6eef8;
margin-top:12px;box-shadow:0 4px 14px rgba(0,0,0,0.35);}
.stTextInput>div>div>input{background-color:#0f172a!important;color:#f1f5f9!important;
border-radius:6px!important;border:1px solid #334155!important;padding:8px 10px!important;
font-size:15px!important;}
.small-link{font-size:13px;color:#60a5fa;cursor:pointer;}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
96
+
97
+ # ==========================================================
98
+ # 🧭 SIDEBAR
99
+ # ==========================================================
100
# Sidebar: response-style selector, list of registered documents, and the
# optional developer settings (chunk size, overlap, top-k).
with st.sidebar:
    st.markdown("### 🧭 Response Style")
    # NOTE(review): the radio uses an empty label because the heading above
    # acts as its caption; Streamlit's docs discourage empty labels -- confirm
    # whether a hidden label keeps the help tooltip before changing it.
    mode = st.radio(
        "",
        ("Strict (Document-only)", "Extended (Document + General)"),
        index=0,
        help="Strict = answers only from the uploaded document."
    )
    st.markdown("---")

    # 📚 Registry display
    if "registry" in st.session_state:
        registry = st.session_state["registry"]
        registered = registry.list_docs() if hasattr(registry, "list_docs") else []
        if registered:
            with st.expander("📚 Registered Documents", expanded=False):
                for i, d in enumerate(registered, 1):
                    st.markdown(f"**{i}. {d.get('name','?')}** — {d.get('num_chunks','?')} chunks *(TOC: {d.get('toc_source','—')})*")
        else:
            st.caption("📭 No documents registered yet.")
    else:
        st.caption("📭 No registry initialized yet.")

    st.markdown("---")
    show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
    if show_dev:
        st.markdown("### ⚙️ Developer Options")
        chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
        # NOTE(review): the overlap maximum (200) equals the chunk-size
        # minimum (200); verify the chunker tolerates overlap == chunk size.
        overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
        top_k = st.slider("Top K Results", 1, 10, 5)
    else:
        # Same values the sliders default to.
        chunk_size, overlap, top_k = 1000, 120, 5
    st.markdown("---")
    st.caption("✨ Built by Shubham Sharma")
134
+
135
+ # ==========================================================
136
+ # 🧠 SESSION STATE
137
+ # ==========================================================
138
# Seed the session-state keys the rest of the script reads, without
# overwriting values that already exist from an earlier rerun.
_SESSION_DEFAULTS = {
    "user_query_input": "",
    "show_more": False,
    "selected_suggestion": None,
    "query_suggestions_fixed": None,
    "last_doc": None,
    "doc_lang": "en",
}
for _state_key in _SESSION_DEFAULTS:
    if _state_key in st.session_state:
        continue
    st.session_state[_state_key] = _SESSION_DEFAULTS[_state_key]
148
+
149
def set_user_query(q, idx):
    """Put a suggested question into the query box and rerun the script.

    Args:
        q: Question text stored under ``user_query_input``.
        idx: Index of the clicked suggestion, stored under
            ``selected_suggestion``.
    """
    st.session_state["user_query_input"] = q
    st.session_state["selected_suggestion"] = idx
    # `st.experimental_rerun` is deprecated in favour of `st.rerun`; prefer the
    # new API and fall back so older Streamlit releases keep working.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()
153
+
154
# ==========================================================
# 📄 MAIN SECTION
# ==========================================================
st.title("📄 Enterprise Knowledge Assistant")
st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")

# doc_choice must be defined before the document-handling logic reads it.
doc_choice = st.radio(
    "Select a document:",
    ["-- Select --", "Sample PDF", "Upload Custom PDF"],
    index=0
)
166
+
167
+ # ==========================================================
168
+ # 📂 DOCUMENT HANDLING
169
+ # ==========================================================
170
  def _hash_content(file_path):
171
+ h = hashlib.sha256()
 
172
  with open(file_path, "rb") as f:
173
  while chunk := f.read(8192):
174
+ h.update(chunk)
175
+ return h.hexdigest()[:12]
 
176
 
177
def refresh_suggestions(doc_name, toc, chunks):
    """Regenerate the suggested questions and reset the query UI state.

    Writes the new suggestions to ``query_suggestions_fixed`` and clears the
    typed query, the selected suggestion and the "show more" flag.
    """
    suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
    state = st.session_state
    state["query_suggestions_fixed"] = suggestions
    state.update({
        "user_query_input": "",
        "selected_suggestion": None,
        "show_more": False,
    })
 
 
 
182
 
 
183
  if doc_choice == "-- Select --":
184
  st.info("⬅️ Select or upload a document to begin.")
185
  else:
186
  temp_path = None
 
 
187
  if doc_choice == "Sample PDF":
188
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
189
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
190
  else:
191
+ uploaded = st.file_uploader("Upload a PDF document (max 200 MB):", type="pdf", label_visibility="collapsed")
192
+ if uploaded:
193
+ temp_path = os.path.join("/tmp", uploaded.name)
 
 
 
 
194
  with open(temp_path, "wb") as f:
195
+ f.write(uploaded.getbuffer())
196
  else:
197
  st.stop()
198
 
 
199
  if temp_path:
200
  doc_name = os.path.basename(temp_path)
201
+ doc_hash = _hash_content(temp_path)
202
+ doc_identifier = f"{doc_name}_{doc_hash}"
203
 
204
+ # --- registry check
205
  if "registry" not in st.session_state:
206
  st.session_state["registry"] = DocumentRegistry()
207
  registry = st.session_state["registry"]
208
 
209
+ existing = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
210
+ if existing:
211
+ doc_data = registry.get_doc(existing["name"])
 
212
  st.session_state.update({
213
  "text": doc_data.get("text", ""),
214
  "toc": doc_data.get("toc", []),
 
216
  "embeddings": doc_data.get("embeddings"),
217
  "index": doc_data.get("index"),
218
  "doc_ready": True,
219
+ "active_doc": existing["name"],
220
  "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
221
  })
222
+ refresh_suggestions(existing["name"], st.session_state["toc"], st.session_state["chunks"])
 
223
  st.experimental_rerun()
224
 
 
225
  status = st.empty()
226
  status.info("πŸ“€ Upload complete β€” reading document...")
227
 
 
235
 
236
  doc_id = registry.register(temp_path, chunks, embeddings, index)
237
  st.session_state["active_doc"] = doc_id
238
+ status.success("βœ… Document processed successfully β€” ready to query!")
 
 
239
  refresh_suggestions(doc_name, toc, chunks)
240
 
241
  st.session_state.update({
 
246
  "index": index,
247
  "doc_ready": True,
248
  "last_doc": doc_identifier,
249
+ "status_text": "βœ… Document processed successfully β€” ready to query!"
250
  })
 
251
  st.experimental_rerun()
252
 
253
# Ask section: status line, clickable suggestions, query box and answer.
if st.session_state.get("doc_ready"):
    import html  # local import: escape model output before embedding it in HTML

    # `st.experimental_rerun` is deprecated in favour of `st.rerun`; prefer the
    # new API and fall back so older Streamlit releases keep working.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun

    st.info(st.session_state.get("status_text", "📄 Ready for queries."))
    st.markdown("### 💬 Ask the Assistant")

    suggs = st.session_state.get("query_suggestions_fixed", [])
    if suggs:
        # Show the first three suggestions unless the user expanded the list.
        visible = suggs if st.session_state["show_more"] else suggs[:3]
        cols = st.columns(min(3, len(visible)))
        for i, q in enumerate(visible):
            if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
                set_user_query(q, i)

        # The toggle is only useful when there is something to expand.
        if len(suggs) > 3:
            toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
            if st.button(toggle_text):
                st.session_state["show_more"] = not st.session_state["show_more"]
                _rerun()

    user_query = st.text_input("Type your question or click one above:", key="user_query_input")
    if user_query.strip():
        reasoning_mode = mode == "Extended (Document + General)"
        with st.spinner("💭 Generating your answer..."):
            retrieved = retrieve_chunks(
                user_query,
                st.session_state["index"],
                st.session_state["chunks"],
                top_k=top_k,
                embeddings=st.session_state["embeddings"]
            )
            answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
            st.session_state["retrieved"] = retrieved
        st.markdown("### 🤖 Assistant’s Answer")
        # The answer is derived from uploaded document text and model output,
        # so escape it before injecting it into raw HTML.
        safe_answer = html.escape(str(answer), quote=False)
        st.markdown(f"<div class='answer-box'>{safe_answer}</div>", unsafe_allow_html=True)
285
+
286
+ # ==========================================================
287
+ # 🎨 Sidebar scroll style
288
+ # ==========================================================
289
# Cap the height of sidebar expanders and let their content scroll.
_SIDEBAR_SCROLL_CSS = """
<style>
section[data-testid="stSidebar"] div.stExpander {max-height:480px;overflow-y:auto;}
</style>
"""
st.markdown(_SIDEBAR_SCROLL_CSS, unsafe_allow_html=True)