Shubham170793 committed on
Commit
b9919bb
·
verified ·
1 Parent(s): f9285e7

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +74 -90
src/streamlit_app.py CHANGED
@@ -1,8 +1,6 @@
1
  import os
2
  import re
3
  import shutil
4
- import pickle
5
- import faiss
6
  import streamlit as st
7
  import torch
8
 
@@ -24,7 +22,11 @@ st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
24
  # 🧹 Cache Management
25
  # ==========================================================
26
  def clean_cache(max_size_gb: float = 2.0):
27
- folders = ["/root/.cache/huggingface", "/root/.cache/transformers", "/root/.cache/torch"]
 
 
 
 
28
  total_deleted = 0.0
29
  for folder in folders:
30
  if os.path.exists(folder):
@@ -58,74 +60,80 @@ os.environ.update({
58
  # ==========================================================
59
  from ingestion import extract_text_from_pdf, chunk_text
60
  from vectorstore import build_faiss_index
61
- from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # ==========================================================
64
- # 💾 Persistent Embedding Cache for Hugging Face
65
  # ==========================================================
66
- EMBED_STORE_DIR = "/data/embedding_cache"
67
- os.makedirs(EMBED_STORE_DIR, exist_ok=True)
68
-
69
- # ==========================================================
70
- # 🧠 Smart Dynamic Suggestion Generator (doc-based only)
71
- # ==========================================================
72
- def generate_dynamic_suggestions_from_doc(toc, chunks, doc_name="Document"):
73
- """Generate clean, doc-based questions — no preset or AI hallucination."""
74
- suggestions = []
75
-
76
- # Case 1: Use TOC if available
77
- if toc:
78
- toc_titles = [re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", title).strip() for _, title in toc]
79
- toc_titles = [re.sub(r"\.{2,}\s*\d+$", "", t).strip() for t in toc_titles if len(t.strip()) > 3]
80
-
81
- for title in toc_titles[:8]:
82
- title_clean = re.sub(r"[\d\-.]", "", title).strip()
83
- if re.search(r"\b(how|what|why|when|where)\b", title_clean.lower()):
84
- suggestions.append(title_clean.rstrip("?") + "?")
85
- else:
86
- suggestions.append(f"What is described in '{title_clean}'?")
87
- return suggestions[:6]
88
-
89
- # Case 2: Fallback to section headers or recurring patterns
90
- text_sample = " ".join(chunks[:6])
91
- headers = re.findall(r"(?<=\n)[A-Z][A-Za-z0-9\s:,-]{5,80}(?=\n)", text_sample)
92
- for h in headers[:8]:
93
- h = h.strip().rstrip(":")
94
- suggestions.append(f"What is explained in '{h}'?")
95
-
96
- # Case 3: If none found
97
- if not suggestions:
98
- suggestions = [
99
- "What is the purpose of this document?",
100
- "How can this process be started?",
101
- "What are the key configurations explained?",
102
- ]
103
-
104
- # Deduplicate and format
105
- seen, clean_final = set(), []
106
- for q in suggestions:
107
- q = q.strip().capitalize()
108
- if q not in seen and 10 < len(q) < 120:
109
- seen.add(q)
110
- clean_final.append(q)
111
- return clean_final[:6]
112
 
113
  # ==========================================================
114
  # 🖥️ Header
115
  # ==========================================================
116
  st.title("📄 Enterprise Knowledge Assistant")
117
- st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
118
 
119
  # ==========================================================
120
  # 🧭 Sidebar
121
  # ==========================================================
122
  with st.sidebar:
 
 
 
123
  if "reasoning_mode" not in st.session_state:
124
  st.session_state.reasoning_mode = False
125
  st.session_state.reasoning_mode = st.toggle(
126
  "🧠 Enable Reasoning Mode",
127
  value=st.session_state.reasoning_mode,
128
- help="ON = reasoning · OFF = strict factual lookup"
129
  )
130
 
131
  st.markdown("---")
@@ -141,7 +149,7 @@ with st.sidebar:
141
  st.caption("✨ Built by Shubham Sharma")
142
 
143
  # ==========================================================
144
- # 🧾 Document Handling
145
  # ==========================================================
146
  text, chunks, index, embeddings, toc = None, None, None, None, None
147
 
@@ -154,15 +162,12 @@ for key, default in {
154
  if key not in st.session_state:
155
  st.session_state[key] = default
156
 
157
- # ----------------------------------------------------------
158
- # 📄 Main Logic
159
- # ----------------------------------------------------------
160
  if doc_choice == "-- Select --":
161
  st.info("⬅️ Please choose a document from the sidebar to begin.")
162
  else:
163
- # Load document
164
  if doc_choice == "Sample PDF":
165
- temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
166
  st.success("📘 Using built-in Sample PDF")
167
  else:
168
  uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
@@ -175,49 +180,31 @@ else:
175
  temp_path = None
176
 
177
  if temp_path:
178
- # ----------------------------------------------------------
179
- # 🔍 Extract + Chunk + Suggest
180
- # ----------------------------------------------------------
181
  with st.spinner("🔍 Processing your document..."):
182
  text, toc = extract_text_from_pdf(temp_path)
183
  chunks = chunk_text(text, chunk_size=chunk_size)
184
  st.markdown("✅ Document loaded successfully.")
185
- query_suggestions = generate_dynamic_suggestions_from_doc(toc, chunks, os.path.basename(temp_path))
186
 
187
- # ----------------------------------------------------------
188
- # 💾 Persistent Embedding Cache (Hugging Face Safe)
189
- # ----------------------------------------------------------
190
  with st.spinner("⚙️ Preparing search index..."):
191
- doc_id = os.path.basename(temp_path).replace(".pdf", "")
192
- emb_path = os.path.join(EMBED_STORE_DIR, f"{doc_id}_embeddings.pkl")
193
- index_path = os.path.join(EMBED_STORE_DIR, f"{doc_id}_faiss.index")
194
-
195
- if os.path.exists(emb_path) and os.path.exists(index_path):
196
- with open(emb_path, "rb") as f:
197
- embeddings = pickle.load(f)
198
- index = faiss.read_index(index_path)
199
- st.success(f"📦 Loaded cached embeddings for '{doc_id}'.")
200
- else:
201
- embeddings = cache_embeddings(doc_id, chunks, embed_chunks)
202
- index = build_faiss_index(embeddings)
203
- with open(emb_path, "wb") as f:
204
- pickle.dump(embeddings, f)
205
- faiss.write_index(index, index_path)
206
- st.success(f"💾 Saved new embeddings for '{doc_id}'.")
207
 
208
  # ----------------------------------------------------------
209
- # 💬 Ask a Question Section
210
  # ----------------------------------------------------------
211
  st.markdown("## 🤖 Ask a Question")
212
 
213
  if query_suggestions:
214
  st.markdown("#### 💡 Suggested Questions")
 
215
  visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
216
  cols = st.columns(min(3, len(visible)))
217
 
218
  for i, q in enumerate(visible):
219
  col = cols[i % 3]
220
- if col.button(f"🔍 {q}", key=f"suggest_{i}"):
221
  st.session_state.selected_suggestion = i
222
  st.session_state.user_query_input = q
223
 
@@ -226,17 +213,16 @@ else:
226
  st.session_state.show_more = not st.session_state.show_more
227
  st.experimental_rerun()
228
 
229
- # --- Input box (synced with suggestion) ---
230
  user_query = st.text_input(
231
  "Type your question or pick one above:",
232
  value=st.session_state.user_query_input,
233
  key="user_query_input",
234
  )
235
 
236
- # --- Generate answer ---
237
  if user_query.strip():
238
  st.caption("Mode: 🧠 Reasoning" if st.session_state.reasoning_mode else "Mode: 📄 Strict Document")
239
-
240
  with st.spinner("💭 Analyzing your document..."):
241
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
242
  answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
@@ -248,9 +234,7 @@ else:
248
  for i, r in enumerate(retrieved, start=1):
249
  st.markdown(f"**Chunk {i}:** {r}")
250
 
251
- # ----------------------------------------------------------
252
- # 📑 Document Preview
253
- # ----------------------------------------------------------
254
  if chunks:
255
  st.markdown("---")
256
  st.subheader("📑 Document Preview")
 
1
  import os
2
  import re
3
  import shutil
 
 
4
  import streamlit as st
5
  import torch
6
 
 
22
  # 🧹 Cache Management
23
  # ==========================================================
24
  def clean_cache(max_size_gb: float = 2.0):
25
+ folders = [
26
+ "/root/.cache/huggingface",
27
+ "/root/.cache/transformers",
28
+ "/root/.cache/torch",
29
+ ]
30
  total_deleted = 0.0
31
  for folder in folders:
32
  if os.path.exists(folder):
 
60
  # ==========================================================
61
  from ingestion import extract_text_from_pdf, chunk_text
62
  from vectorstore import build_faiss_index
63
+ from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
64
+
65
+ # ==========================================================
66
+ # 🧠 Smart Suggestion Generator (Strong Version)
67
+ # ==========================================================
68
+ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
69
+ """Generate 5–7 short, smart, natural suggestions based on document context."""
70
+ if not toc or not chunks:
71
+ return []
72
+
73
+ titles = []
74
+ for sec, raw_title in toc:
75
+ title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
76
+ title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
77
+ if 4 < len(title) < 120:
78
+ titles.append(title)
79
+
80
+ context_sample = " ".join(chunks[:3])[:4000]
81
+ prompt = f"""
82
+ You are generating intelligent, concise questions for an enterprise assistant.
83
+ Document name: "{doc_name}"
84
+ TABLE OF CONTENTS:
85
+ {chr(10).join(['- ' + t for t in titles[:10]])}
86
+
87
+ CONTENT SAMPLE:
88
+ {context_sample}
89
+
90
+ Generate 5–7 smart, helpful, and professional questions that a user might ask about this document.
91
+ Keep them short (<20 words), clear, and well-formed.
92
+ """
93
+
94
+ try:
95
+ ai_response = genai_generate(prompt)
96
+ questions = re.findall(r"[-•]?\s*(.+?)\?", ai_response)
97
+ clean_qs = [q.strip("•-— ").strip() + "?" for q in questions if 8 < len(q) < 120]
98
+ seen, final = set(), []
99
+ for q in clean_qs:
100
+ if q.lower() not in seen:
101
+ seen.add(q.lower())
102
+ final.append(q)
103
+ return final[:7]
104
+ except Exception:
105
+ return [
106
+ "What is this document about?",
107
+ "How do I start using this process?",
108
+ "What key configurations are included?",
109
+ ]
110
 
111
  # ==========================================================
112
+ # 📁 Paths
113
  # ==========================================================
114
+ BASE_DIR = os.path.dirname(__file__)
115
+ LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
116
+ SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  # ==========================================================
119
  # 🖥️ Header
120
  # ==========================================================
121
  st.title("📄 Enterprise Knowledge Assistant")
122
+ st.caption("Ask questions about SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
123
 
124
  # ==========================================================
125
  # 🧭 Sidebar
126
  # ==========================================================
127
  with st.sidebar:
128
+ if os.path.exists(LOGO_PATH):
129
+ st.image(LOGO_PATH, width=150)
130
+
131
  if "reasoning_mode" not in st.session_state:
132
  st.session_state.reasoning_mode = False
133
  st.session_state.reasoning_mode = st.toggle(
134
  "🧠 Enable Reasoning Mode",
135
  value=st.session_state.reasoning_mode,
136
+ help="ON = expanded reasoning · OFF = factual answers"
137
  )
138
 
139
  st.markdown("---")
 
149
  st.caption("✨ Built by Shubham Sharma")
150
 
151
  # ==========================================================
152
+ # 🧾 Document Handling + Question UI (Perfect Sync)
153
  # ==========================================================
154
  text, chunks, index, embeddings, toc = None, None, None, None, None
155
 
 
162
  if key not in st.session_state:
163
  st.session_state[key] = default
164
 
165
+ # --- Document Handling ---
 
 
166
  if doc_choice == "-- Select --":
167
  st.info("⬅️ Please choose a document from the sidebar to begin.")
168
  else:
 
169
  if doc_choice == "Sample PDF":
170
+ temp_path = SAMPLE_PATH
171
  st.success("📘 Using built-in Sample PDF")
172
  else:
173
  uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
 
180
  temp_path = None
181
 
182
  if temp_path:
 
 
 
183
  with st.spinner("🔍 Processing your document..."):
184
  text, toc = extract_text_from_pdf(temp_path)
185
  chunks = chunk_text(text, chunk_size=chunk_size)
186
  st.markdown("✅ Document loaded successfully.")
187
+ query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, os.path.basename(temp_path))
188
 
 
 
 
189
  with st.spinner("⚙️ Preparing search index..."):
190
+ embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
191
+ index = build_faiss_index(embeddings)
192
+ st.markdown("🚀 Document ready — you can now ask questions below.")
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  # ----------------------------------------------------------
195
+ # 💬 Ask a Question
196
  # ----------------------------------------------------------
197
  st.markdown("## 🤖 Ask a Question")
198
 
199
  if query_suggestions:
200
  st.markdown("#### 💡 Suggested Questions")
201
+
202
  visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
203
  cols = st.columns(min(3, len(visible)))
204
 
205
  for i, q in enumerate(visible):
206
  col = cols[i % 3]
207
+ if col.button(f"🔍 {q}", key=f"q_{i}"):
208
  st.session_state.selected_suggestion = i
209
  st.session_state.user_query_input = q
210
 
 
213
  st.session_state.show_more = not st.session_state.show_more
214
  st.experimental_rerun()
215
 
216
+ # --- Text Input synced with suggestion ---
217
  user_query = st.text_input(
218
  "Type your question or pick one above:",
219
  value=st.session_state.user_query_input,
220
  key="user_query_input",
221
  )
222
 
223
+ # --- Answer Generation ---
224
  if user_query.strip():
225
  st.caption("Mode: 🧠 Reasoning" if st.session_state.reasoning_mode else "Mode: 📄 Strict Document")
 
226
  with st.spinner("💭 Analyzing your document..."):
227
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
228
  answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
 
234
  for i, r in enumerate(retrieved, start=1):
235
  st.markdown(f"**Chunk {i}:** {r}")
236
 
237
+ # --- Document Preview ---
 
 
238
  if chunks:
239
  st.markdown("---")
240
  st.subheader("📑 Document Preview")