Shubham170793 committed on
Commit
c220dec
·
verified ·
1 Parent(s): fea3890

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +74 -58
src/streamlit_app.py CHANGED
@@ -1,51 +1,68 @@
1
- import shutil
2
  import os
 
3
  import streamlit as st
4
 
5
- # ---------------------------
6
- # 🧹 One-time cache cleaner (prevents 50 GB overflow)
7
- # ---------------------------
8
- def clean_cache():
 
 
 
 
 
 
 
 
 
 
 
9
  folders = [
10
  "/root/.cache/huggingface",
11
  "/root/.cache/transformers",
12
  "/root/.cache/torch",
13
  "/tmp/hf_cache",
14
  ]
15
- total_deleted = 0
 
16
  for folder in folders:
17
  if os.path.exists(folder):
18
- # estimate size before deleting
19
- size = sum(
20
- os.path.getsize(os.path.join(dp, f)) for dp, _, files in os.walk(folder) for f in files
 
 
21
  ) / (1024**3)
22
- total_deleted += size
23
- shutil.rmtree(folder, ignore_errors=True)
 
 
 
 
 
 
 
24
  os.makedirs("/tmp/hf_cache", exist_ok=True)
25
- print(f"🧹 Cleaned cache folders (~{total_deleted:.2f} GB removed)")
 
26
 
27
  def check_disk_usage():
28
- """Log how much disk space is used (for debugging storage issues)."""
29
- st.sidebar.markdown("### πŸ’Ύ Disk Usage (for debugging)")
30
  try:
31
  usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
32
  st.sidebar.text(usage if usage else "No cache directories found.")
33
  except Exception as e:
34
- st.sidebar.text(f"⚠️ Could not fetch disk usage: {e}")
35
 
36
- # Run cleanup and diagnostics at startup
 
37
  clean_cache()
38
  check_disk_usage()
39
 
40
- import os
41
- import streamlit as st
42
-
43
- # --- Streamlit Safe Options (Hugging Face Spaces upload fix) ---
44
- st.set_option("client.showErrorDetails", True)
45
-
46
- # ---------------------------
47
- # Hugging Face Cache Fix (/tmp for writable)
48
- # ---------------------------
49
  CACHE_DIR = "/tmp/hf_cache"
50
  os.makedirs(CACHE_DIR, exist_ok=True)
51
  os.environ.update({
@@ -55,36 +72,34 @@ os.environ.update({
55
  "HF_MODULES_CACHE": CACHE_DIR
56
  })
57
 
58
- # ---------------------------
59
- # Imports AFTER environment setup
60
- # ---------------------------
61
  from ingestion import extract_text_from_pdf, chunk_text
62
  from embeddings import generate_embeddings
63
  from vectorstore import build_faiss_index
64
  from qa import retrieve_chunks, generate_answer
65
 
66
- # ---------------------------
67
- # Paths
68
- # ---------------------------
69
- BASE_DIR = os.path.dirname(__file__) # /app/src
70
  LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
71
  SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
72
 
73
- # ---------------------------
74
- # App Config
75
- # ---------------------------
76
- st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
77
  st.title("πŸ“„ Enterprise Knowledge Assistant")
78
  st.caption("Upload a PDF or use the sample file to explore intelligent document Q&A.")
79
 
80
- # ---------------------------
81
- # Sidebar (Library + Settings + Credits)
82
- # ---------------------------
83
  with st.sidebar:
84
  if os.path.exists(LOGO_PATH):
85
  st.image(LOGO_PATH, width=150)
86
 
87
- # 1️⃣ Document Library
88
  st.header("πŸ“š Document Library")
89
  doc_choice = st.radio(
90
  "Choose a document:",
@@ -94,24 +109,21 @@ with st.sidebar:
94
 
95
  st.markdown("---")
96
 
97
- # 2️⃣ Settings
98
  st.header("βš™οΈ Settings")
99
  chunk_size = st.slider("Chunk Size (characters)", 300, 1200, 800, step=100)
100
  top_k = st.slider("Top K Results (retrieved chunks)", 1, 10, 5)
101
 
102
  st.markdown("---")
103
-
104
- # 3️⃣ Branding
105
  st.caption("πŸ‘¨β€πŸ’» Built by Shubham Sharma")
106
  st.markdown("[πŸ“‚ GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
107
 
108
- # ---------------------------
109
- # Document Handling
110
- # ---------------------------
111
  text, chunks, index = None, None, None
112
 
113
  if doc_choice == "-- Select --":
114
- st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar to get started.")
115
 
116
  elif doc_choice == "Sample PDF":
117
  temp_path = SAMPLE_PATH
@@ -128,7 +140,7 @@ elif doc_choice == "Upload Custom PDF":
128
  temp_path = os.path.join("/tmp", uploaded_file.name)
129
  with open(temp_path, "wb") as f:
130
  f.write(uploaded_file.getbuffer())
131
- st.success(f"βœ… File '{uploaded_file.name}' uploaded and saved to /tmp")
132
 
133
  with st.spinner("βš™οΈ Extracting and processing your document..."):
134
  text = extract_text_from_pdf(temp_path)
@@ -137,17 +149,18 @@ elif doc_choice == "Upload Custom PDF":
137
  index = build_faiss_index(embeddings)
138
  st.success("πŸš€ Document processed successfully!")
139
 
140
- # ---------------------------
141
- # Document Preview
142
- # ---------------------------
143
  if chunks:
144
  st.subheader("πŸ“‘ Document Preview")
145
  st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
146
- st.caption(f"πŸ“¦ {len(chunks)} chunks created | Avg chunk length: {int(sum(len(c) for c in chunks) / len(chunks))} chars")
 
147
 
148
- # ---------------------------
149
- # Query Section
150
- # ---------------------------
151
  if index and chunks:
152
  st.markdown("---")
153
  st.subheader("πŸ€– Ask a Question")
@@ -158,11 +171,14 @@ if index and chunks:
158
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
159
  answer = generate_answer(user_query, retrieved)
160
 
161
- # Answer Section
162
  st.markdown("### βœ… Assistant’s Answer")
163
- st.markdown(f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;'>{answer}</div>", unsafe_allow_html=True)
 
 
 
164
 
165
- # Supporting Chunks Section
166
  with st.expander("πŸ“„ Supporting Chunks (Context Used)"):
167
  for i, r in enumerate(retrieved, start=1):
168
  st.markdown(
 
 
1
  import os
2
+ import shutil
3
  import streamlit as st
4
 
5
+ # ==========================================================
6
+ # ✅ Page Configuration (must be the first Streamlit command)
7
+ # ==========================================================
8
+ st.set_page_config(
9
+ page_title="Enterprise Knowledge Assistant",
10
+ layout="wide"
11
+ )
12
+
13
+ # ==========================================================
14
+ # 🧹 Cache Management (prevents Hugging Face 50GB overflow)
15
+ # ==========================================================
def _folder_size_bytes(folder):
    """Return the total size in bytes of the regular files under *folder*."""
    total = 0
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            # Skip symlinks: os.path.getsize follows them, so a dangling link
            # raises and a link to a file in the same tree is counted twice.
            if os.path.islink(path):
                continue
            try:
                total += os.path.getsize(path)
            except OSError:
                # File vanished or is unreadable; this is only an estimate.
                continue
    return total


def clean_cache(max_size_gb: float = 2.0, folders=None):
    """
    Delete cache folders that have grown larger than ``max_size_gb``.

    A folder whose final path component is ``torch`` is always deleted,
    regardless of its size. Folders that do not exist are ignored.
    ``/tmp/hf_cache`` is (re)created afterwards so it exists even when it
    was just removed.

    Args:
        max_size_gb: Size threshold in GiB; folders strictly larger than
            this are removed.
        folders: Optional iterable of folder paths to inspect. Defaults to
            the Hugging Face / transformers / torch caches under
            ``/root/.cache`` plus ``/tmp/hf_cache``.

    Returns:
        The estimated total size, in GiB, of the folders that were removed.
    """
    if folders is None:
        folders = [
            "/root/.cache/huggingface",
            "/root/.cache/transformers",
            "/root/.cache/torch",
            "/tmp/hf_cache",
        ]
    total_deleted = 0.0

    for folder in folders:
        if not os.path.exists(folder):
            continue

        # estimate folder size
        size_gb = _folder_size_bytes(folder) / (1024**3)

        # Match on the last path component so an unrelated path that merely
        # contains the substring "torch" is not force-deleted.
        is_torch_cache = os.path.basename(os.path.normpath(folder)) == "torch"

        # only delete if large (the torch cache is always dropped)
        if size_gb > max_size_gb or is_torch_cache:
            shutil.rmtree(folder, ignore_errors=True)
            total_deleted += size_gb
            print(f"🗑️ Deleted {folder} ({size_gb:.2f} GB)")
        else:
            print(f"✅ Preserved {folder} ({size_gb:.2f} GB)")

    os.makedirs("/tmp/hf_cache", exist_ok=True)
    print(f"🧹 Cache cleanup done. ~{total_deleted:.2f} GB removed.")
    return total_deleted
47
+
48
 
def check_disk_usage():
    """Show the output of ``du -sh /root/.cache /tmp`` in the sidebar.

    This is a best-effort debugging aid: if the command cannot be run, the
    failure is shown in the sidebar instead of being raised.
    """
    st.sidebar.markdown("### 💾 Disk Usage (Debug)")
    try:
        # The with-block closes the pipe returned by os.popen, so the pipe
        # is not left open until garbage collection.
        with os.popen("du -sh /root/.cache /tmp 2>/dev/null") as pipe:
            usage = pipe.read()
    except Exception as e:  # diagnostics must never break app startup
        st.sidebar.text(f"⚠️ Disk usage check failed: {e}")
    else:
        st.sidebar.text(usage if usage else "No cache directories found.")
57
 
58
+
59
+ # Run cleanup & diagnostics
60
  clean_cache()
61
  check_disk_usage()
62
 
63
+ # ==========================================================
64
+ # ⚙️ Hugging Face Cache Configuration (/tmp is used because it is writable)
65
+ # ==========================================================
 
 
 
 
 
 
66
  CACHE_DIR = "/tmp/hf_cache"
67
  os.makedirs(CACHE_DIR, exist_ok=True)
68
  os.environ.update({
 
72
  "HF_MODULES_CACHE": CACHE_DIR
73
  })
74
 
75
+ # ==========================================================
76
+ # 📦 Imports AFTER environment setup
77
+ # ==========================================================
78
  from ingestion import extract_text_from_pdf, chunk_text
79
  from embeddings import generate_embeddings
80
  from vectorstore import build_faiss_index
81
  from qa import retrieve_chunks, generate_answer
82
 
83
+ # ==========================================================
84
+ # 📁 Paths
85
+ # ==========================================================
86
+ BASE_DIR = os.path.dirname(__file__) # /app/src
87
  LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
88
  SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
89
 
90
+ # ==========================================================
91
+ # 🖥️ UI Header
92
+ # ==========================================================
 
93
  st.title("πŸ“„ Enterprise Knowledge Assistant")
94
  st.caption("Upload a PDF or use the sample file to explore intelligent document Q&A.")
95
 
96
+ # ==========================================================
97
+ # 🧭 Sidebar (Document Library + Settings + Diagnostics)
98
+ # ==========================================================
99
  with st.sidebar:
100
  if os.path.exists(LOGO_PATH):
101
  st.image(LOGO_PATH, width=150)
102
 
 
103
  st.header("πŸ“š Document Library")
104
  doc_choice = st.radio(
105
  "Choose a document:",
 
109
 
110
  st.markdown("---")
111
 
 
112
  st.header("βš™οΈ Settings")
113
  chunk_size = st.slider("Chunk Size (characters)", 300, 1200, 800, step=100)
114
  top_k = st.slider("Top K Results (retrieved chunks)", 1, 10, 5)
115
 
116
  st.markdown("---")
 
 
117
  st.caption("πŸ‘¨β€πŸ’» Built by Shubham Sharma")
118
  st.markdown("[πŸ“‚ GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
119
 
120
+ # ==========================================================
121
+ # 🧾 Document Handling
122
+ # ==========================================================
123
  text, chunks, index = None, None, None
124
 
125
  if doc_choice == "-- Select --":
126
+ st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar.")
127
 
128
  elif doc_choice == "Sample PDF":
129
  temp_path = SAMPLE_PATH
 
140
  temp_path = os.path.join("/tmp", uploaded_file.name)
141
  with open(temp_path, "wb") as f:
142
  f.write(uploaded_file.getbuffer())
143
+ st.success(f"βœ… File '{uploaded_file.name}' uploaded successfully")
144
 
145
  with st.spinner("βš™οΈ Extracting and processing your document..."):
146
  text = extract_text_from_pdf(temp_path)
 
149
  index = build_faiss_index(embeddings)
150
  st.success("πŸš€ Document processed successfully!")
151
 
152
+ # ==========================================================
153
+ # 📑 Document Preview
154
+ # ==========================================================
155
  if chunks:
156
  st.subheader("πŸ“‘ Document Preview")
157
  st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
158
+ avg_len = int(sum(len(c) for c in chunks) / len(chunks))
159
+ st.caption(f"πŸ“¦ {len(chunks)} chunks created | Avg chunk length: {avg_len} chars")
160
 
161
+ # ==========================================================
162
+ # 💬 Query Section
163
+ # ==========================================================
164
  if index and chunks:
165
  st.markdown("---")
166
  st.subheader("πŸ€– Ask a Question")
 
171
  retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
172
  answer = generate_answer(user_query, retrieved)
173
 
174
+ # ✅ Answer Display
175
  st.markdown("### βœ… Assistant’s Answer")
176
+ st.markdown(
177
+ f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;color:white;'>{answer}</div>",
178
+ unsafe_allow_html=True
179
+ )
180
 
181
+ # 📄 Supporting Chunks
182
  with st.expander("πŸ“„ Supporting Chunks (Context Used)"):
183
  for i, r in enumerate(retrieved, start=1):
184
  st.markdown(