Shubham170793 committed on
Commit
fb3091e
·
verified ·
1 Parent(s): cce4565

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +129 -134
src/streamlit_app.py CHANGED
@@ -3,28 +3,24 @@ import re
3
  import shutil
4
  import streamlit as st
5
  import torch
6
- from typing import List, Tuple
7
 
8
- # -------------------------
9
- # Environment / diagnostics
10
- # -------------------------
11
  print("CUDA available:", torch.cuda.is_available())
12
  if torch.cuda.is_available():
13
- try:
14
- print("GPU:", torch.cuda.get_device_name(0))
15
- except Exception:
16
- pass
17
  else:
18
  print("Running on CPU")
19
 
20
- # -------------------------
21
- # Page config
22
- # -------------------------
23
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
24
 
25
- # -------------------------
26
- # Cache cleanup (prevent HF cache overflow on spaces)
27
- # -------------------------
28
  def clean_cache(max_size_gb: float = 2.0):
29
  folders = [
30
  "/root/.cache/huggingface",
@@ -33,27 +29,23 @@ def clean_cache(max_size_gb: float = 2.0):
33
  ]
34
  total_deleted = 0.0
35
  for folder in folders:
36
- try:
37
- if os.path.exists(folder):
38
- size_bytes = sum(
39
- os.path.getsize(os.path.join(dp, f))
40
- for dp, _, files in os.walk(folder)
41
- for f in files
42
- )
43
- size_gb = size_bytes / (1024 ** 3)
44
- if size_gb > max_size_gb or "torch" in folder:
45
- shutil.rmtree(folder, ignore_errors=True)
46
- total_deleted += size_gb
47
- except Exception:
48
- pass
49
  os.makedirs("/tmp/hf_cache", exist_ok=True)
50
  print(f"🧹 Cache cleanup done. Removed ~{total_deleted:.2f} GB.")
51
 
52
  clean_cache()
53
 
54
- # -------------------------
55
- # HF cache env
56
- # -------------------------
57
  CACHE_DIR = "/tmp/hf_cache"
58
  os.makedirs(CACHE_DIR, exist_ok=True)
59
  os.environ.update({
@@ -63,106 +55,73 @@ os.environ.update({
63
  "HF_MODULES_CACHE": CACHE_DIR
64
  })
65
 
66
- # -------------------------
67
- # App-specific imports
68
- # -------------------------
69
  from ingestion import extract_text_from_pdf, chunk_text
70
  from vectorstore import build_faiss_index
71
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
72
 
73
- # -------------------------
74
- # Smart suggestion generator (fixed f-string)
75
- # -------------------------
76
- def generate_dynamic_suggestions_from_toc(toc: List[Tuple[str, str]], chunks: List[str], doc_name="Document") -> List[str]:
77
- if not chunks:
78
  return []
79
 
80
  titles = []
81
  for sec, raw_title in toc or []:
82
  title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
83
  title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
84
- if 4 < len(title) < 140:
85
  titles.append(title)
86
 
87
- if not titles:
88
- sample = " ".join(chunks[:4])
89
- sents = re.split(r'(?<=[.?!])\s+', sample)
90
- suggestions = []
91
- for s in sents:
92
- if re.search(r'\b(set up|configure|install|enable|procedure|process|how to|step)\b', s, re.I):
93
- s = re.sub(r'[.?!]+$', '', s.strip())
94
- q = s[0].upper() + s[1:]
95
- if len(q) < 140:
96
- suggestions.append(q if q.endswith('?') else q + '?')
97
- return suggestions[:7]
98
 
99
- try:
100
- prompt = f"""Generate 5-7 user-friendly, short questions based on the document's table of contents:
101
- Document: {doc_name}
102
- TOC:
103
- {chr(10).join(['- ' + t for t in titles[:15]])}"""
104
- ai_resp = genai_generate(prompt)
105
- qs = re.findall(r'([^\n?.!]+\?)', ai_resp)
106
- clean_qs = [q.strip() for q in qs if 8 < len(q) < 140]
107
- if clean_qs:
108
- return list(dict.fromkeys(clean_qs))[:7]
109
- except Exception:
110
- pass
111
 
112
- suggestions = []
113
- for t in titles[:15]:
114
- low = t.lower()
115
- if re.search(r'\b(set up|install|configure|enable|define|create|prepare)\b', low):
116
- cleaned = re.sub(r'[^a-zA-Z0-9 \-]', '', low)
117
- suggestions.append(f"How do I {cleaned}?")
118
- elif re.search(r'\b(purpose|overview|objective|scope|what is)\b', low):
119
- suggestions.append(f"What is {t.strip().rstrip('.')}?")
120
- elif re.search(r'\b(step|procedure|process|task)\b', low):
121
- suggestions.append(f"What are the steps for {t.strip().rstrip('.')}?")
122
- else:
123
- suggestions.append(f"What is described in '{t}'?")
124
-
125
- seen, final = set(), []
126
- for s in suggestions:
127
- s = re.sub(r'\s+', ' ', s).strip()
128
- if s.lower() not in seen:
129
- seen.add(s.lower())
130
- final.append(s)
131
- return final[:7]
132
 
133
- # -------------------------
134
- # CSS for modern UI
135
- # -------------------------
136
- st.markdown("""
137
- <style>
138
- .section-header {font-weight:700;font-size:1.2rem;margin-top:22px;margin-bottom:8px;color:#f3f4f6;}
139
- .info-card {background:linear-gradient(180deg,rgba(16,24,39,0.9),rgba(6,10,14,0.9));padding:12px 16px;border-radius:10px;color:#e6eef6;box-shadow:0 6px 20px rgba(2,6,23,0.5);}
140
- .suggest-chip {display:inline-block;margin:6px 8px;padding:10px 16px;border-radius:999px;background:rgba(31,41,55,0.65);border:1px solid rgba(148,163,184,0.08);color:#e6eef6;font-size:14px;cursor:pointer;transition:transform .12s ease,box-shadow .12s ease;}
141
- .suggest-chip:hover {transform:translateY(-4px);box-shadow:0 8px 20px rgba(37,99,235,0.12);background:rgba(37,99,235,0.12);}
142
- .suggest-chip.active {background:linear-gradient(90deg,rgba(59,130,246,0.12),rgba(99,102,241,0.06));border:1px solid rgba(99,102,241,0.22);color:#eaf2ff;box-shadow:0 8px 30px rgba(37,99,235,0.12);}
143
- .stTextInput>div>div>input {background:rgba(17,24,39,0.75);border-radius:8px;padding:12px 14px;color:#e6eef6;border:1px solid rgba(255,255,255,0.03);}
144
- .stTextInput>div>div>input:focus {box-shadow:0 6px 20px rgba(37,99,235,0.06);border:1px solid rgba(37,99,235,0.3);}
145
- .assistant-card {background:linear-gradient(180deg,rgba(6,10,14,0.7),rgba(10,15,20,0.6));border-left:4px solid rgba(59,130,246,0.9);padding:18px 20px;border-radius:10px;color:#f8fbff;margin-bottom:10px;box-shadow:0 10px 40px rgba(2,6,23,0.5);}
146
- </style>
147
- """, unsafe_allow_html=True)
 
148
 
149
- # -------------------------
150
- # App Header
151
- # -------------------------
152
  st.title("πŸ“„ Enterprise Knowledge Assistant")
153
  st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
154
 
155
- # -------------------------
156
- # Sidebar
157
- # -------------------------
158
  with st.sidebar:
159
  if "reasoning_mode" not in st.session_state:
160
  st.session_state.reasoning_mode = False
161
-
162
  st.session_state.reasoning_mode = st.toggle(
163
  "🧠 Enable Reasoning Mode",
164
  value=st.session_state.reasoning_mode,
165
- help="ON: expanded reasoning / OFF: strict doc lookup"
166
  )
167
 
168
  st.markdown("---")
@@ -171,32 +130,34 @@ with st.sidebar:
171
 
172
  st.markdown("---")
173
  st.header("βš™οΈ Settings")
174
- chunk_size = st.slider("Chunk Size (chars)", 200, 1500, 1000, step=50)
175
- overlap = st.slider("Chunk Overlap (chars)", 50, 200, 120, step=10)
176
  top_k = st.slider("Top K Results", 1, 10, 5)
177
  st.markdown("---")
178
  st.caption("✨ Built by Shubham Sharma")
179
 
180
- # -------------------------
181
- # Session state
182
- # -------------------------
183
- for k, v in {
 
 
 
184
  "show_more": False,
185
  "user_query_input": "",
186
- "selected_suggestion_idx": None,
187
  }.items():
188
- if k not in st.session_state:
189
- st.session_state[k] = v
190
 
191
- # -------------------------
192
- # Document processing
193
- # -------------------------
194
  BASE_DIR = os.path.dirname(__file__)
195
  SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
196
- text, chunks, index, embeddings, toc = None, None, None, None, None
197
 
198
  if doc_choice == "-- Select --":
199
- st.info("⬅️ Please choose a document from the sidebar to begin.")
200
  else:
201
  if doc_choice == "Sample PDF":
202
  temp_path = SAMPLE_PATH
@@ -211,53 +172,87 @@ else:
211
  else:
212
  temp_path = None
213
 
 
 
 
214
  if temp_path:
215
- with st.spinner("πŸ” Processing document..."):
216
  text, toc = extract_text_from_pdf(temp_path)
217
  chunks = chunk_text(text, chunk_size=chunk_size)
218
- st.markdown("<div class='info-card'>βœ… Document loaded successfully.</div>", unsafe_allow_html=True)
219
 
220
- query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, os.path.basename(temp_path))
 
 
221
 
222
  with st.spinner("βš™οΈ Preparing embeddings..."):
223
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
224
  index = build_faiss_index(embeddings)
225
- st.markdown("<div class='info-card'>πŸš€ Document ready β€” ask questions below.</div>", unsafe_allow_html=True)
226
 
227
- # -------------------------
228
- # Ask a Question
229
- # -------------------------
230
  st.markdown("## πŸ€– Ask a Question")
231
-
232
  if query_suggestions:
233
  st.markdown("#### πŸ’‘ Suggested Questions")
 
234
  visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
235
  cols = st.columns(min(3, len(visible)))
236
  for i, q in enumerate(visible):
237
  col = cols[i % 3]
238
  if col.button(f"πŸ” {q}", key=f"q_{i}"):
 
239
  st.session_state.user_query_input = q
240
- st.session_state.selected_suggestion_idx = i
241
  toggle_text = "Show less β–²" if st.session_state.show_more else "Show more β–Ό"
242
  if st.button(toggle_text):
243
  st.session_state.show_more = not st.session_state.show_more
244
  st.experimental_rerun()
245
 
246
- user_query = st.text_input("Type your question or pick one above:", value=st.session_state.user_query_input)
 
 
 
 
 
 
 
247
 
 
 
 
248
  if user_query.strip():
249
- st.caption("Mode: 🧠 Reasoning" if st.session_state.reasoning_mode else "Mode: πŸ“„ Strict Document")
250
- with st.spinner("πŸ’­ Generating answer..."):
 
 
 
 
251
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
252
- answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
 
 
253
 
 
254
  st.markdown("### βœ… Assistant’s Answer")
255
- st.markdown(f"<div class='assistant-card'>{answer}</div>", unsafe_allow_html=True)
 
 
 
 
256
 
257
  with st.expander("πŸ“„ Supporting Context"):
258
  for i, r in enumerate(retrieved, start=1):
259
- st.markdown(f"**Chunk {i}:** {r}")
 
 
 
 
260
 
 
 
 
261
  st.markdown("---")
262
  st.subheader("πŸ“‘ Document Preview")
263
  st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
 
3
  import shutil
4
  import streamlit as st
5
  import torch
 
6
 
7
+ # ==========================================================
8
+ # ✅ Environment Diagnostics
9
+ # ==========================================================
10
  print("CUDA available:", torch.cuda.is_available())
11
  if torch.cuda.is_available():
12
+ print("GPU:", torch.cuda.get_device_name(0))
 
 
 
13
  else:
14
  print("Running on CPU")
15
 
16
+ # ==========================================================
17
+ # ✅ Page Configuration
18
+ # ==========================================================
19
  st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
20
 
21
+ # ==========================================================
22
+ # 🧹 Cache Management
23
+ # ==========================================================
24
  def clean_cache(max_size_gb: float = 2.0):
25
  folders = [
26
  "/root/.cache/huggingface",
 
29
  ]
30
  total_deleted = 0.0
31
  for folder in folders:
32
+ if os.path.exists(folder):
33
+ size_gb = sum(
34
+ os.path.getsize(os.path.join(dp, f))
35
+ for dp, _, files in os.walk(folder)
36
+ for f in files
37
+ ) / (1024**3)
38
+ if size_gb > max_size_gb or "torch" in folder:
39
+ shutil.rmtree(folder, ignore_errors=True)
40
+ total_deleted += size_gb
 
 
 
 
41
  os.makedirs("/tmp/hf_cache", exist_ok=True)
42
  print(f"🧹 Cache cleanup done. Removed ~{total_deleted:.2f} GB.")
43
 
44
  clean_cache()
45
 
46
+ # ==========================================================
47
+ # ⚙️ Hugging Face Cache Configuration
48
+ # ==========================================================
49
  CACHE_DIR = "/tmp/hf_cache"
50
  os.makedirs(CACHE_DIR, exist_ok=True)
51
  os.environ.update({
 
55
  "HF_MODULES_CACHE": CACHE_DIR
56
  })
57
 
58
+ # ==========================================================
59
+ # 📦 Imports AFTER Environment Setup
60
+ # ==========================================================
61
  from ingestion import extract_text_from_pdf, chunk_text
62
  from vectorstore import build_faiss_index
63
  from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
64
 
65
+ # ==========================================================
66
+ # 🧠 Smart Suggestion Generator (Based on TOC + Content)
67
+ # ==========================================================
68
+ def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
69
+ if not toc and not chunks:
70
  return []
71
 
72
  titles = []
73
  for sec, raw_title in toc or []:
74
  title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
75
  title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
76
+ if 4 < len(title) < 120:
77
  titles.append(title)
78
 
79
+ context_sample = " ".join(chunks[:3])[:3000]
80
+ prompt = f"""
81
+ You are helping a user explore a document titled "{doc_name}".
82
+ TABLE OF CONTENTS:
83
+ {chr(10).join(['- ' + t for t in titles[:10]])}
 
 
 
 
 
 
84
 
85
+ CONTENT SAMPLE:
86
+ {context_sample}
 
 
 
 
 
 
 
 
 
 
87
 
88
+ Generate 5–7 short, specific professional questions.
89
+ Each should be under 20 words, well-phrased, and relevant to the document.
90
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ try:
93
+ ai_response = genai_generate(prompt)
94
+ questions = re.findall(r"[-β€’]?\s*(.+?)\?", ai_response)
95
+ clean_qs = [q.strip("β€’-β€” ").strip() + "?" for q in questions if 8 < len(q) < 120]
96
+ seen, final = set(), []
97
+ for q in clean_qs:
98
+ if q.lower() not in seen:
99
+ seen.add(q.lower())
100
+ final.append(q)
101
+ return final[:7]
102
+ except Exception:
103
+ return [
104
+ "What is this document about?",
105
+ "How do I configure this process?",
106
+ "What key steps are described?",
107
+ ]
108
 
109
+ # ==========================================================
110
+ # 🖥️ Header
111
+ # ==========================================================
112
  st.title("πŸ“„ Enterprise Knowledge Assistant")
113
  st.caption("Query SAP documentation and enterprise PDFs β€” powered by reasoning and retrieval.")
114
 
115
+ # ==========================================================
116
+ # 🧭 Sidebar
117
+ # ==========================================================
118
  with st.sidebar:
119
  if "reasoning_mode" not in st.session_state:
120
  st.session_state.reasoning_mode = False
 
121
  st.session_state.reasoning_mode = st.toggle(
122
  "🧠 Enable Reasoning Mode",
123
  value=st.session_state.reasoning_mode,
124
+ help="ON = detailed reasoning Β· OFF = concise factual answers"
125
  )
126
 
127
  st.markdown("---")
 
130
 
131
  st.markdown("---")
132
  st.header("βš™οΈ Settings")
133
+ chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 1000, step=50)
134
+ overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
135
  top_k = st.slider("Top K Results", 1, 10, 5)
136
  st.markdown("---")
137
  st.caption("✨ Built by Shubham Sharma")
138
 
139
+ # ==========================================================
140
+ # 🧾 Document Handling
141
+ # ==========================================================
142
+ text, chunks, index, embeddings, toc = None, None, None, None, None
143
+
144
+ # --- Initialize Session State ---
145
+ for key, default in {
146
  "show_more": False,
147
  "user_query_input": "",
148
+ "selected_suggestion": None,
149
  }.items():
150
+ if key not in st.session_state:
151
+ st.session_state[key] = default
152
 
153
+ # ==========================================================
154
+ # 📄 Main Flow
155
+ # ==========================================================
156
  BASE_DIR = os.path.dirname(__file__)
157
  SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
 
158
 
159
  if doc_choice == "-- Select --":
160
+ st.info("⬅️ Please choose a document to begin.")
161
  else:
162
  if doc_choice == "Sample PDF":
163
  temp_path = SAMPLE_PATH
 
172
  else:
173
  temp_path = None
174
 
175
+ # ------------------------------
176
+ # Document Processing
177
+ # ------------------------------
178
  if temp_path:
179
+ with st.spinner("πŸ” Processing your document..."):
180
  text, toc = extract_text_from_pdf(temp_path)
181
  chunks = chunk_text(text, chunk_size=chunk_size)
182
+ st.markdown("βœ… Document loaded successfully.")
183
 
184
+ query_suggestions = generate_dynamic_suggestions_from_toc(
185
+ toc, chunks, os.path.basename(temp_path)
186
+ )
187
 
188
  with st.spinner("βš™οΈ Preparing embeddings..."):
189
  embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
190
  index = build_faiss_index(embeddings)
191
+ st.markdown("πŸš€ Document ready β€” you can now ask questions below.")
192
 
193
+ # ==========================================================
194
+ # 💬 Ask a Question
195
+ # ==========================================================
196
  st.markdown("## πŸ€– Ask a Question")
 
197
  if query_suggestions:
198
  st.markdown("#### πŸ’‘ Suggested Questions")
199
+
200
  visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
201
  cols = st.columns(min(3, len(visible)))
202
  for i, q in enumerate(visible):
203
  col = cols[i % 3]
204
  if col.button(f"πŸ” {q}", key=f"q_{i}"):
205
+ st.session_state.selected_suggestion = i
206
  st.session_state.user_query_input = q
207
+
208
  toggle_text = "Show less β–²" if st.session_state.show_more else "Show more β–Ό"
209
  if st.button(toggle_text):
210
  st.session_state.show_more = not st.session_state.show_more
211
  st.experimental_rerun()
212
 
213
+ # ----------------------------------------------------------
214
+ # 🧠 Input Field
215
+ # ----------------------------------------------------------
216
+ user_query = st.text_input(
217
+ "Type your question or pick one above:",
218
+ value=st.session_state.user_query_input,
219
+ key="user_query_input"
220
+ )
221
 
222
+ # ----------------------------------------------------------
223
+ # 🧩 Generate Answer
224
+ # ----------------------------------------------------------
225
  if user_query.strip():
226
+ st.caption(
227
+ "Mode: 🧠 Reasoning"
228
+ if st.session_state.reasoning_mode
229
+ else "Mode: πŸ“„ Strict Document"
230
+ )
231
+ with st.spinner("πŸ’­ Analyzing your document..."):
232
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
233
+ answer = generate_answer(
234
+ user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode
235
+ )
236
 
237
+ # Assistant Answer
238
  st.markdown("### βœ… Assistant’s Answer")
239
+ st.markdown(
240
+ f"<div style='background-color:#111827;padding:14px;border-radius:8px;color:#f1f5f9;'>"
241
+ f"πŸ’‘ {answer}</div>",
242
+ unsafe_allow_html=True,
243
+ )
244
 
245
  with st.expander("πŸ“„ Supporting Context"):
246
  for i, r in enumerate(retrieved, start=1):
247
+ st.markdown(
248
+ f"<div style='background-color:#1f2937;padding:10px;border-radius:6px;color:#e5e7eb;margin-bottom:8px;'>"
249
+ f"<b>Chunk {i}:</b> {r}</div>",
250
+ unsafe_allow_html=True,
251
+ )
252
 
253
+ # ----------------------------------------------------------
254
+ # 📑 Document Preview
255
+ # ----------------------------------------------------------
256
  st.markdown("---")
257
  st.subheader("πŸ“‘ Document Preview")
258
  st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)