Shubham170793 committed on
Commit
585fec8
·
verified ·
1 Parent(s): f86b15f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +59 -49
src/streamlit_app.py CHANGED
@@ -1,8 +1,13 @@
1
  import os
 
2
  import shutil
 
3
  import streamlit as st
4
-
5
  import torch
 
 
 
 
6
  print("CUDA available:", torch.cuda.is_available())
7
  print("Device count:", torch.cuda.device_count())
8
  if torch.cuda.is_available():
@@ -10,9 +15,8 @@ if torch.cuda.is_available():
10
  else:
11
  print("Running on CPU")
12
 
13
-
14
  # ==========================================================
15
- # βœ… Page Configuration (must be first Streamlit command)
16
  # ==========================================================
17
  st.set_page_config(
18
  page_title="Enterprise Knowledge Assistant",
@@ -20,30 +24,28 @@ st.set_page_config(
20
  )
21
 
22
  # ==========================================================
23
- # 🧹 Cache Management (prevents Hugging Face 50GB overflow)
24
  # ==========================================================
25
  def clean_cache(max_size_gb: float = 2.0):
26
  """
27
- Cleans large cache folders (> max_size_gb), preserving /tmp/hf_cache if small.
 
28
  """
29
  folders = [
30
- "/root/.cache/huggingface",
31
- "/root/.cache/transformers",
32
- "/root/.cache/torch",
33
- # "/tmp/hf_cache", # 🚫 DO NOT DELETE: used by Mistral for offloading
34
- ]
35
  total_deleted = 0.0
36
 
37
  for folder in folders:
38
  if os.path.exists(folder):
39
- # estimate folder size
40
  size_gb = sum(
41
  os.path.getsize(os.path.join(dp, f))
42
  for dp, _, files in os.walk(folder)
43
  for f in files
44
  ) / (1024**3)
45
 
46
- # only delete if large
47
  if size_gb > max_size_gb or "torch" in folder:
48
  shutil.rmtree(folder, ignore_errors=True)
49
  total_deleted += size_gb
@@ -56,7 +58,7 @@ def clean_cache(max_size_gb: float = 2.0):
56
 
57
 
58
  def check_disk_usage():
59
- """Show disk usage info in sidebar."""
60
  st.sidebar.markdown("### πŸ’Ύ Disk Usage (Debug)")
61
  try:
62
  usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
@@ -65,12 +67,12 @@ def check_disk_usage():
65
  st.sidebar.text(f"⚠️ Disk usage check failed: {e}")
66
 
67
 
68
- # Run cleanup & diagnostics
69
  clean_cache()
70
  check_disk_usage()
71
 
72
  # ==========================================================
73
- # βš™οΈ Hugging Face Cache Configuration (/tmp for writable path)
74
  # ==========================================================
75
  CACHE_DIR = "/tmp/hf_cache"
76
  os.makedirs(CACHE_DIR, exist_ok=True)
@@ -82,18 +84,16 @@ os.environ.update({
82
  })
83
 
84
  # ==========================================================
85
- # πŸ“¦ Imports AFTER environment setup
86
  # ==========================================================
87
  from ingestion import extract_text_from_pdf, chunk_text
88
- from embeddings import generate_embeddings
89
  from vectorstore import build_faiss_index
90
- from qa import retrieve_chunks, generate_answer
91
-
92
 
93
  # ==========================================================
94
  # πŸ“ Paths
95
  # ==========================================================
96
- BASE_DIR = os.path.dirname(__file__) # /app/src
97
  LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
98
  SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
99
 
@@ -101,28 +101,24 @@ SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
101
  # πŸ–₯️ UI Header
102
  # ==========================================================
103
  st.title("πŸ“„ Enterprise Knowledge Assistant")
104
- st.caption("Upload a PDF or use the sample file to explore intelligent document Q&A.")
105
 
106
  # ==========================================================
107
- # 🧭 Sidebar (Document Library + Settings + Diagnostics)
108
  # ==========================================================
109
  with st.sidebar:
110
- # πŸ–ΌοΈ App Logo (if available)
111
  if os.path.exists(LOGO_PATH):
112
  st.image(LOGO_PATH, width=150)
113
 
114
- # 🧠 Reasoning Mode Toggle (Persistent)
115
  if "reasoning_mode" not in st.session_state:
116
- st.session_state.reasoning_mode = False # Default OFF
117
 
118
  st.session_state.reasoning_mode = st.toggle(
119
  "🧠 Enable Reasoning Mode",
120
  value=st.session_state.reasoning_mode,
121
- help=(
122
- "When ON, the assistant can use its world knowledge and reasoning ability "
123
- "to generate richer, more explanatory answers.\n\n"
124
- "When OFF, it sticks strictly to the document text for factual accuracy."
125
- )
126
  )
127
 
128
  st.markdown("---")
@@ -141,30 +137,39 @@ with st.sidebar:
141
  st.header("βš™οΈ Settings")
142
  chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
143
  overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
144
- top_k = st.slider("Top K Results (retrieved chunks)", 1, 10, 5)
145
 
146
  st.markdown("---")
147
-
148
- # πŸ‘¨β€πŸ’» Branding
149
  st.caption("πŸ‘¨β€πŸ’» Built by Shubham Sharma")
150
- st.markdown("[πŸ“‚ GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
151
 
152
  # ==========================================================
153
  # 🧾 Document Handling
154
  # ==========================================================
155
- text, chunks, index = None, None, None
156
 
157
  if doc_choice == "-- Select --":
158
- st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar.")
159
 
160
  elif doc_choice == "Sample PDF":
161
  temp_path = SAMPLE_PATH
162
  st.success("πŸ“˜ Using built-in Sample PDF")
 
163
  with st.spinner("πŸ” Extracting and processing document..."):
164
  text = extract_text_from_pdf(temp_path)
165
  chunks = chunk_text(text, chunk_size=chunk_size)
166
- embeddings = generate_embeddings(chunks)
167
- index = build_faiss_index(embeddings)
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  elif doc_choice == "Upload Custom PDF":
170
  uploaded_file = st.file_uploader("πŸ“‚ Upload your PDF", type="pdf")
@@ -177,8 +182,18 @@ elif doc_choice == "Upload Custom PDF":
177
  with st.spinner("βš™οΈ Extracting and processing your document..."):
178
  text = extract_text_from_pdf(temp_path)
179
  chunks = chunk_text(text, chunk_size=chunk_size)
180
- embeddings = generate_embeddings(chunks)
181
- index = build_faiss_index(embeddings)
 
 
 
 
 
 
 
 
 
 
182
  st.success("πŸš€ Document processed successfully!")
183
 
184
  # ==========================================================
@@ -188,11 +203,11 @@ if chunks:
188
  st.subheader("πŸ“‘ Document Preview")
189
  st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
190
  avg_len = int(sum(len(c) for c in chunks) / len(chunks))
191
- st.caption(f"πŸ“¦ {len(chunks)} chunks created | Avg chunk length: {avg_len} chars")
192
 
193
- # ---------------------------
194
- # Query Section
195
- # ---------------------------
196
  if index and chunks:
197
  st.markdown("---")
198
  st.subheader("πŸ€– Ask a Question")
@@ -200,7 +215,6 @@ if index and chunks:
200
  user_query = st.text_input("πŸ” Your question about the document:")
201
 
202
  if user_query:
203
- # Show which mode is active
204
  mode_label = (
205
  "🧠 Reasoning Mode (expanded thinking)"
206
  if st.session_state.reasoning_mode
@@ -208,13 +222,10 @@ if index and chunks:
208
  )
209
  st.caption(f"Mode: {mode_label}")
210
 
211
- # Generate the answer
212
  with st.spinner("🧠 Thinking... retrieving context and generating answer..."):
213
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
214
  answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
215
 
216
-
217
-
218
  # βœ… Display Answer
219
  st.markdown("### βœ… Assistant’s Answer")
220
  st.markdown(
@@ -236,4 +247,3 @@ if index and chunks:
236
 
237
  else:
238
  st.info("πŸ“₯ Upload or select a document to start exploring.")
239
-
 
1
  import os
2
+ import re
3
  import shutil
4
+ import hashlib
5
  import streamlit as st
 
6
  import torch
7
+
8
+ # ==========================================================
9
+ # βœ… Environment Diagnostics
10
+ # ==========================================================
11
  print("CUDA available:", torch.cuda.is_available())
12
  print("Device count:", torch.cuda.device_count())
13
  if torch.cuda.is_available():
 
15
  else:
16
  print("Running on CPU")
17
 
 
18
  # ==========================================================
19
+ # βœ… Page Configuration
20
  # ==========================================================
21
  st.set_page_config(
22
  page_title="Enterprise Knowledge Assistant",
 
24
  )
25
 
26
  # ==========================================================
27
+ # 🧹 Cache Management (prevent HF overflow)
28
  # ==========================================================
29
  def clean_cache(max_size_gb: float = 2.0):
30
  """
31
+ Cleans large cache folders (> max_size_gb),
32
+ preserving /tmp/hf_cache (used for model weights).
33
  """
34
  folders = [
35
+ "/root/.cache/huggingface",
36
+ "/root/.cache/transformers",
37
+ "/root/.cache/torch",
38
+ ]
 
39
  total_deleted = 0.0
40
 
41
  for folder in folders:
42
  if os.path.exists(folder):
 
43
  size_gb = sum(
44
  os.path.getsize(os.path.join(dp, f))
45
  for dp, _, files in os.walk(folder)
46
  for f in files
47
  ) / (1024**3)
48
 
 
49
  if size_gb > max_size_gb or "torch" in folder:
50
  shutil.rmtree(folder, ignore_errors=True)
51
  total_deleted += size_gb
 
58
 
59
 
60
  def check_disk_usage():
61
+ """Display disk usage info in sidebar."""
62
  st.sidebar.markdown("### πŸ’Ύ Disk Usage (Debug)")
63
  try:
64
  usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
 
67
  st.sidebar.text(f"⚠️ Disk usage check failed: {e}")
68
 
69
 
70
+ # Run cache cleanup once at startup
71
  clean_cache()
72
  check_disk_usage()
73
 
74
  # ==========================================================
75
+ # βš™οΈ Hugging Face Cache Configuration
76
  # ==========================================================
77
  CACHE_DIR = "/tmp/hf_cache"
78
  os.makedirs(CACHE_DIR, exist_ok=True)
 
84
  })
85
 
86
  # ==========================================================
87
+ # πŸ“¦ Imports AFTER Environment Setup
88
  # ==========================================================
89
  from ingestion import extract_text_from_pdf, chunk_text
 
90
  from vectorstore import build_faiss_index
91
+ from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
 
92
 
93
  # ==========================================================
94
  # πŸ“ Paths
95
  # ==========================================================
96
+ BASE_DIR = os.path.dirname(__file__)
97
  LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
98
  SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
99
 
 
101
  # πŸ–₯️ UI Header
102
  # ==========================================================
103
  st.title("πŸ“„ Enterprise Knowledge Assistant")
104
+ st.caption("Query SAP documentation and enterprise PDFs using natural language and reasoning.")
105
 
106
  # ==========================================================
107
+ # 🧭 Sidebar β€” Library, Settings, Diagnostics
108
  # ==========================================================
109
  with st.sidebar:
110
+ # πŸ–ΌοΈ App Logo
111
  if os.path.exists(LOGO_PATH):
112
  st.image(LOGO_PATH, width=150)
113
 
114
+ # 🧠 Reasoning Mode Toggle
115
  if "reasoning_mode" not in st.session_state:
116
+ st.session_state.reasoning_mode = False
117
 
118
  st.session_state.reasoning_mode = st.toggle(
119
  "🧠 Enable Reasoning Mode",
120
  value=st.session_state.reasoning_mode,
121
+ help="When ON: GPT-4o uses reasoning + web-like synthesis.\nWhen OFF: Strictly factual from PDF."
 
 
 
 
122
  )
123
 
124
  st.markdown("---")
 
137
  st.header("βš™οΈ Settings")
138
  chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
139
  overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
140
+ top_k = st.slider("Top K Results", 1, 10, 5)
141
 
142
  st.markdown("---")
 
 
143
  st.caption("πŸ‘¨β€πŸ’» Built by Shubham Sharma")
 
144
 
145
  # ==========================================================
146
  # 🧾 Document Handling
147
  # ==========================================================
148
+ text, chunks, index, embeddings = None, None, None, None
149
 
150
  if doc_choice == "-- Select --":
151
+ st.info("⬅️ Please choose a document from the sidebar.")
152
 
153
  elif doc_choice == "Sample PDF":
154
  temp_path = SAMPLE_PATH
155
  st.success("πŸ“˜ Using built-in Sample PDF")
156
+
157
  with st.spinner("πŸ” Extracting and processing document..."):
158
  text = extract_text_from_pdf(temp_path)
159
  chunks = chunk_text(text, chunk_size=chunk_size)
160
+ st.write(f"πŸ“‘ Extracted {len(chunks)} chunks.")
161
+
162
+ # βœ… Cached Embeddings
163
+ with st.spinner("βš™οΈ Loading cached embeddings or generating new ones..."):
164
+ embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
165
+ hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
166
+ cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
167
+ if os.path.exists(cache_file):
168
+ st.info(f"🧠 Using cached embeddings for {os.path.basename(temp_path)}")
169
+ else:
170
+ st.warning(f"πŸ’‘ Generated new embeddings for {os.path.basename(temp_path)}")
171
+
172
+ index = build_faiss_index(embeddings)
173
 
174
  elif doc_choice == "Upload Custom PDF":
175
  uploaded_file = st.file_uploader("πŸ“‚ Upload your PDF", type="pdf")
 
182
  with st.spinner("βš™οΈ Extracting and processing your document..."):
183
  text = extract_text_from_pdf(temp_path)
184
  chunks = chunk_text(text, chunk_size=chunk_size)
185
+ st.write(f"πŸ“„ Extracted {len(chunks)} chunks.")
186
+
187
+ with st.spinner("βš™οΈ Loading cached embeddings or generating new ones..."):
188
+ embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
189
+ hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
190
+ cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
191
+ if os.path.exists(cache_file):
192
+ st.info(f"🧠 Using cached embeddings for {os.path.basename(temp_path)}")
193
+ else:
194
+ st.warning(f"πŸ’‘ Generated new embeddings for {os.path.basename(temp_path)}")
195
+
196
+ index = build_faiss_index(embeddings)
197
  st.success("πŸš€ Document processed successfully!")
198
 
199
  # ==========================================================
 
203
  st.subheader("πŸ“‘ Document Preview")
204
  st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
205
  avg_len = int(sum(len(c) for c in chunks) / len(chunks))
206
+ st.caption(f"πŸ“¦ {len(chunks)} chunks | Avg length: {avg_len} chars")
207
 
208
+ # ==========================================================
209
+ # πŸ’¬ Query Section
210
+ # ==========================================================
211
  if index and chunks:
212
  st.markdown("---")
213
  st.subheader("πŸ€– Ask a Question")
 
215
  user_query = st.text_input("πŸ” Your question about the document:")
216
 
217
  if user_query:
 
218
  mode_label = (
219
  "🧠 Reasoning Mode (expanded thinking)"
220
  if st.session_state.reasoning_mode
 
222
  )
223
  st.caption(f"Mode: {mode_label}")
224
 
 
225
  with st.spinner("🧠 Thinking... retrieving context and generating answer..."):
226
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
227
  answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
228
 
 
 
229
  # βœ… Display Answer
230
  st.markdown("### βœ… Assistant’s Answer")
231
  st.markdown(
 
247
 
248
  else:
249
  st.info("πŸ“₯ Upload or select a document to start exploring.")