Shubham170793 committed on
Commit
3fbd2b9
·
verified ·
1 Parent(s): c9a83aa

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +76 -234
src/streamlit_app.py CHANGED
@@ -1,51 +1,17 @@
1
  # ==========================================================
2
- # streamlit_app.py — Stable Layout (English Only)
3
  # ==========================================================
4
- import os
5
- import re
6
- import streamlit as st
7
- import torch
8
- from document_registry import DocumentRegistry
9
-
10
- # ==========================================================
11
- # ✅ PAGE CONFIGS
12
- # ==========================================================
13
- st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
14
- print("CUDA available:", torch.cuda.is_available())
15
-
16
- # ==========================================================
17
- # ⚙️ SAFE RERUN HANDLER
18
- # ==========================================================
19
- def trigger_safe_rerun():
20
- """Mark rerun flag for next render instead of rerunning immediately."""
21
- st.session_state["_safe_rerun"] = True
22
-
23
- if st.session_state.get("_safe_rerun"):
24
- st.session_state["_safe_rerun"] = False
25
- st.rerun()
26
 
27
- # ==========================================================
28
- # ⚙️ CACHE SETUP
29
- # ==========================================================
30
- CACHE_DIR = "/tmp/hf_cache"
31
- os.makedirs(CACHE_DIR, exist_ok=True)
32
- os.environ.update({
33
- "HF_HOME": CACHE_DIR,
34
- "TRANSFORMERS_CACHE": CACHE_DIR,
35
- "HF_DATASETS_CACHE": CACHE_DIR,
36
- "HF_MODULES_CACHE": CACHE_DIR,
37
- })
38
 
39
- # ==========================================================
40
- # 📦 IMPORTS
41
- # ==========================================================
42
- from ingestion import extract_text_from_pdf, chunk_text
43
- from vectorstore import build_faiss_index
44
- from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
45
 
46
- # ==========================================================
47
- # 🧠 HELPER: Suggestion Refresher
48
- # ==========================================================
49
  def refresh_suggestions(doc_name, toc, chunks):
50
  """Refresh dynamic suggestions and reset related states."""
51
  st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
@@ -55,176 +21,23 @@ def refresh_suggestions(doc_name, toc, chunks):
55
  st.session_state["selected_suggestion"] = None
56
  st.session_state["show_more"] = False
57
 
58
- # ==========================================================
59
- # 🧠 SMART SUGGESTION GENERATOR (English Only)
60
- # ==========================================================
61
- def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
62
- """
63
- Generates 5–7 short, natural English questions based on TOC and document text.
64
- """
65
- if not toc or not chunks:
66
- return ["How do I start using this guide?", "What does this document cover?"]
67
-
68
- titles = []
69
- for sec, raw_title in toc:
70
- title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
71
- title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
72
- if 4 < len(title) < 120:
73
- titles.append(title)
74
-
75
- context_sample = " ".join(chunks[:3])[:4000]
76
- prompt = f"""
77
- You are a content assistant. Based on the Table of Contents and the sample document text below,
78
- generate 5–7 short, natural user-facing questions.
79
- Each question should be under 18 words, end with a question mark, and sound human.
80
- Document: "{doc_name}"
81
-
82
- TABLE OF CONTENTS:
83
- {chr(10).join(['- ' + t for t in titles[:8]])}
84
-
85
- SAMPLE TEXT:
86
- {context_sample}
87
-
88
- Output: Write each question on a new line. Do not invent facts β€” base questions only on the document.
89
- """
90
-
91
- try:
92
- ai_response = genai_generate(prompt)
93
- lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
94
- questions = []
95
- for ln in lines:
96
- q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
97
- if not q.endswith("?") and len(q.split()) < 18 and re.match(
98
- r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q
99
- ):
100
- q += "?"
101
- if 8 <= len(q) <= 140:
102
- questions.append(q)
103
- final = []
104
- seen = set()
105
- for q in questions:
106
- if q.lower() not in seen:
107
- seen.add(q.lower())
108
- final.append(q)
109
- if not final:
110
- final = [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
111
- return final[:7]
112
- except Exception:
113
- return ["How do I start using this guide?", "What does this document cover?"]
114
-
115
- # ==========================================================
116
- # 🎨 STYLING
117
- # ==========================================================
118
- st.markdown("""
119
- <style>
120
- div.block-container {padding-top: 1.2rem; max-width: 1080px;}
121
- h1, h2, h3 {color: #f3f4f6; font-weight: 600;}
122
- .suggest-chip {
123
- background: #0f1724;
124
- border: 1px solid #374151;
125
- border-radius: 14px;
126
- color: #e6eef8;
127
- padding: 8px 12px;
128
- cursor: pointer;
129
- font-size: 13px;
130
- margin: 6px 6px 6px 0;
131
- display: inline-block;
132
- transition: background 0.2s, transform 0.1s;
133
- }
134
- .suggest-chip:hover {background: #1e3a8a; transform: translateY(-2px);}
135
- .answer-box {
136
- background: linear-gradient(180deg,#0b1220,#071027);
137
- border-left: 4px solid #3b82f6;
138
- border-radius: 8px;
139
- padding: 16px 18px;
140
- color: #e6eef8;
141
- margin-top: 12px;
142
- box-shadow: 0 4px 14px rgba(0,0,0,0.35);
143
- }
144
- .stTextInput > div > div > input {
145
- background-color: #0f172a !important;
146
- color: #f1f5f9 !important;
147
- border-radius: 6px !important;
148
- border: 1px solid #334155 !important;
149
- padding: 8px 10px !important;
150
- font-size: 15px !important;
151
- }
152
- .stTextInput > label {font-weight: 500;}
153
- .small-link {font-size: 13px; color: #60a5fa; cursor: pointer;}
154
- </style>
155
- """, unsafe_allow_html=True)
156
-
157
- # ==========================================================
158
- # 🧭 SIDEBAR
159
- # ==========================================================
160
- with st.sidebar:
161
- st.markdown("### 🧭 Response Style")
162
- mode = st.radio(
163
- "",
164
- ("Strict (Document-only)", "Extended (Document + General)"),
165
- index=0,
166
- )
167
- st.markdown("---")
168
-
169
- if "registry" in st.session_state:
170
- registry = st.session_state["registry"]
171
- registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
172
- if registered_docs:
173
- with st.expander("πŸ“š Registered Documents", expanded=False):
174
- for i, doc in enumerate(registered_docs, start=1):
175
- doc_name = doc.get("name", "Unknown")
176
- chunks = doc.get("num_chunks", "?")
177
- toc_source = doc.get("toc_source", "β€”")
178
- st.markdown(f"**{i}. {doc_name}** β€” {chunks} chunks *(TOC: {toc_source})*")
179
-
180
- st.markdown("---")
181
- active_doc_name = st.selectbox(
182
- "πŸ“„ Select Active Document",
183
- [doc["name"] for doc in registered_docs],
184
- index=0,
185
- key="active_doc_selector"
186
- )
187
- selected_doc = registry.get_doc(active_doc_name)
188
- if selected_doc:
189
- st.session_state.update({
190
- "active_doc": active_doc_name,
191
- "chunks": selected_doc["chunks"],
192
- "embeddings": selected_doc["embeddings"],
193
- "index": selected_doc["index"],
194
- "doc_ready": True,
195
- "status_text": f"πŸ“„ {active_doc_name} loaded from registry β€” ready for queries."
196
- })
197
- st.caption("✨ Built by Shubham Sharma")
198
-
199
- # ==========================================================
200
- # 📄 MAIN SECTION
201
- # ==========================================================
202
- st.title("πŸ“„ Enterprise Knowledge Assistant")
203
- st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
204
-
205
- doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
206
-
207
- # ==========================================================
208
- # 📂 DOCUMENT HANDLING — SAFE VERSION
209
- # ==========================================================
210
- import hashlib
211
-
212
- def _hash_content(file_path):
213
- hasher = hashlib.sha256()
214
- with open(file_path, "rb") as f:
215
- while chunk := f.read(8192):
216
- hasher.update(chunk)
217
- return hasher.hexdigest()[:12]
218
 
 
219
  if doc_choice == "-- Select --":
220
  st.info("⬅️ Select or upload a document to begin.")
221
  else:
222
  temp_path = None
 
 
223
  if doc_choice == "Sample PDF":
224
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
225
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
226
  else:
227
- uploaded_file = st.file_uploader("Upload a PDF document (max 200MB):", type="pdf", label_visibility="collapsed")
 
 
 
 
228
  if uploaded_file:
229
  temp_path = os.path.join("/tmp", uploaded_file.name)
230
  with open(temp_path, "wb") as f:
@@ -232,15 +45,18 @@ else:
232
  else:
233
  st.stop()
234
 
 
235
  if temp_path:
236
  doc_name = os.path.basename(temp_path)
237
  file_hash = _hash_content(temp_path)
238
  doc_identifier = f"{doc_name}_{file_hash}"
239
 
 
240
  if "registry" not in st.session_state:
241
  st.session_state["registry"] = DocumentRegistry()
242
  registry = st.session_state["registry"]
243
 
 
244
  existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
245
  if existing_doc:
246
  doc_data = registry.get_doc(existing_doc["name"])
@@ -254,21 +70,29 @@ else:
254
  "active_doc": existing_doc["name"],
255
  "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
256
  })
 
257
  refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
258
- trigger_safe_rerun()
259
 
 
260
  status = st.empty()
261
  status.info("πŸ“€ Upload complete β€” reading document...")
 
262
  text, toc, toc_source = extract_text_from_pdf(temp_path)
263
  status.info("πŸ“‘ Parsing and chunking document...")
264
- chunks = chunk_text(text, chunk_size=1000, overlap=120)
 
265
  status.info("🧠 Building embeddings and search index...")
266
  embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
267
  index = build_faiss_index(embeddings)
 
268
  doc_id = registry.register(temp_path, chunks, embeddings, index)
269
  st.session_state["active_doc"] = doc_id
270
- status.success("βœ… Document processed successfully β€” ready to query!")
 
 
271
  refresh_suggestions(doc_name, toc, chunks)
 
272
  st.session_state.update({
273
  "text": text,
274
  "toc": toc,
@@ -277,32 +101,50 @@ else:
277
  "index": index,
278
  "doc_ready": True,
279
  "last_doc": doc_identifier,
280
- "status_text": "βœ… Document processed successfully β€” ready to query!"
281
  })
282
- trigger_safe_rerun()
283
-
284
- if st.session_state.get("doc_ready"):
285
- st.info(st.session_state.get("status_text"))
286
- st.markdown("### πŸ’¬ Ask the Assistant")
287
- query_suggestions = st.session_state.get("query_suggestions_fixed", [])
288
- if query_suggestions:
289
- visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
290
- cols = st.columns(min(3, len(visible)))
291
- for i, q in enumerate(visible):
292
- if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
293
- st.session_state["user_query_input"] = q
294
- st.session_state["selected_suggestion"] = i
295
- trigger_safe_rerun()
296
- toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
297
- if st.button(toggle_text):
298
- st.session_state["show_more"] = not st.session_state["show_more"]
299
- trigger_safe_rerun()
300
 
301
- user_query = st.text_input("Your Question:", key="user_query_input", label_visibility="visible")
302
- if user_query.strip():
303
- reasoning_mode = mode == "Extended (Document + General)"
304
- with st.spinner("πŸ’­ Generating your answer..."):
305
- retrieved = retrieve_chunks(user_query, st.session_state["index"], st.session_state["chunks"], top_k=5, embeddings=st.session_state["embeddings"])
306
- answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
307
- st.markdown("### πŸ€– Assistant’s Answer")
308
- st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # ==========================================================
2
+ # 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
3
  # ==========================================================
4
+ import hashlib
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ def _hash_content(file_path):
7
+ """Generate a short SHA256 hash of the file's actual binary content."""
8
+ hasher = hashlib.sha256()
9
+ with open(file_path, "rb") as f:
10
+ while chunk := f.read(8192):
11
+ hasher.update(chunk)
12
+ return hasher.hexdigest()[:12] # short unique hash for same-name files
 
 
 
 
13
 
 
 
 
 
 
 
14
 
 
 
 
15
  def refresh_suggestions(doc_name, toc, chunks):
16
  """Refresh dynamic suggestions and reset related states."""
17
  st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
 
21
  st.session_state["selected_suggestion"] = None
22
  st.session_state["show_more"] = False
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # --- Document selection ---
26
  if doc_choice == "-- Select --":
27
  st.info("⬅️ Select or upload a document to begin.")
28
  else:
29
  temp_path = None
30
+
31
+ # --- File selection ---
32
  if doc_choice == "Sample PDF":
33
  temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
34
  st.markdown("βœ… **Sample PDF selected.** Preparing document...")
35
  else:
36
+ uploaded_file = st.file_uploader(
37
+ "Upload a PDF document (max 200MB):",
38
+ type="pdf",
39
+ label_visibility="collapsed"
40
+ )
41
  if uploaded_file:
42
  temp_path = os.path.join("/tmp", uploaded_file.name)
43
  with open(temp_path, "wb") as f:
 
45
  else:
46
  st.stop()
47
 
48
+ # --- Start processing if file exists ---
49
  if temp_path:
50
  doc_name = os.path.basename(temp_path)
51
  file_hash = _hash_content(temp_path)
52
  doc_identifier = f"{doc_name}_{file_hash}"
53
 
54
+ # ✅ Step 0: Ensure registry exists
55
  if "registry" not in st.session_state:
56
  st.session_state["registry"] = DocumentRegistry()
57
  registry = st.session_state["registry"]
58
 
59
+ # ✅ Step 1: Check if already registered
60
  existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
61
  if existing_doc:
62
  doc_data = registry.get_doc(existing_doc["name"])
 
70
  "active_doc": existing_doc["name"],
71
  "status_text": f"βœ… {doc_name} already processed β€” loaded from registry."
72
  })
73
+
74
  refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
75
+ st.experimental_rerun()
76
 
77
+ # ✅ Step 2: New document → process
78
  status = st.empty()
79
  status.info("πŸ“€ Upload complete β€” reading document...")
80
+
81
  text, toc, toc_source = extract_text_from_pdf(temp_path)
82
  status.info("πŸ“‘ Parsing and chunking document...")
83
+ chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
84
+
85
  status.info("🧠 Building embeddings and search index...")
86
  embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
87
  index = build_faiss_index(embeddings)
88
+
89
  doc_id = registry.register(temp_path, chunks, embeddings, index)
90
  st.session_state["active_doc"] = doc_id
91
+
92
+ status.success("βœ… Document processed successfully β€” all set to query your assistant!")
93
+
94
  refresh_suggestions(doc_name, toc, chunks)
95
+
96
  st.session_state.update({
97
  "text": text,
98
  "toc": toc,
 
101
  "index": index,
102
  "doc_ready": True,
103
  "last_doc": doc_identifier,
104
+ "status_text": "βœ… Document processed successfully β€” all set to query your assistant!"
105
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
+ st.experimental_rerun()
108
+
109
+ # --- Display Ready Message + Ask Section ---
110
+ if st.session_state.get("doc_ready"):
111
+ active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
112
+ st.info(st.session_state.get("status_text", f"πŸ“„ {active_name or 'Document'} is ready for queries."))
113
+
114
+ st.markdown("### πŸ’¬ Ask the Assistant")
115
+ query_suggestions = st.session_state.get("query_suggestions_fixed", [])
116
+ if query_suggestions:
117
+ visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
118
+ cols = st.columns(min(3, len(visible)))
119
+ for i, q in enumerate(visible):
120
+ if cols[i % 3].button(f"πŸ’¬ {q}", key=f"sugg_{i}"):
121
+ st.session_state["user_query_input"] = q
122
+ st.session_state["selected_suggestion"] = i
123
+ st.experimental_rerun()
124
+
125
+ toggle_text = "Show less β–²" if st.session_state["show_more"] else "Show more β–Ό"
126
+ if st.button(toggle_text, help="Show or hide more suggestions"):
127
+ st.session_state["show_more"] = not st.session_state["show_more"]
128
+ st.experimental_rerun()
129
+
130
+ user_query = st.text_input(
131
+ "Type your question or click one above:",
132
+ key="user_query_input",
133
+ label_visibility="visible"
134
+ )
135
+
136
+ if user_query.strip():
137
+ reasoning_mode = mode == "Extended (Document + General)"
138
+ with st.spinner("πŸ’­ Generating your answer..."):
139
+ retrieved = retrieve_chunks(
140
+ user_query,
141
+ st.session_state["index"],
142
+ st.session_state["chunks"],
143
+ top_k=top_k,
144
+ embeddings=st.session_state["embeddings"]
145
+ )
146
+ answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
147
+ st.session_state["retrieved"] = retrieved
148
+
149
+ st.markdown("### πŸ€– Assistant’s Answer")
150
+ st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)