Commit: Update src/streamlit_app.py (+59 additions, −49 deletions).
Changed file: src/streamlit_app.py
|
@@ -1,8 +1,13 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import shutil
|
|
|
|
| 3 |
import streamlit as st
|
| 4 |
-
|
| 5 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
print("CUDA available:", torch.cuda.is_available())
|
| 7 |
print("Device count:", torch.cuda.device_count())
|
| 8 |
if torch.cuda.is_available():
|
|
@@ -10,9 +15,8 @@ if torch.cuda.is_available():
|
|
| 10 |
else:
|
| 11 |
print("Running on CPU")
|
| 12 |
|
| 13 |
-
|
| 14 |
# ==========================================================
|
| 15 |
-
# β
Page Configuration
|
| 16 |
# ==========================================================
|
| 17 |
st.set_page_config(
|
| 18 |
page_title="Enterprise Knowledge Assistant",
|
|
@@ -20,30 +24,28 @@ st.set_page_config(
|
|
| 20 |
)
|
| 21 |
|
| 22 |
# ==========================================================
|
| 23 |
-
# π§Ή Cache Management (
|
| 24 |
# ==========================================================
|
| 25 |
def clean_cache(max_size_gb: float = 2.0):
|
| 26 |
"""
|
| 27 |
-
Cleans large cache folders (> max_size_gb),
|
|
|
|
| 28 |
"""
|
| 29 |
folders = [
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
]
|
| 35 |
total_deleted = 0.0
|
| 36 |
|
| 37 |
for folder in folders:
|
| 38 |
if os.path.exists(folder):
|
| 39 |
-
# estimate folder size
|
| 40 |
size_gb = sum(
|
| 41 |
os.path.getsize(os.path.join(dp, f))
|
| 42 |
for dp, _, files in os.walk(folder)
|
| 43 |
for f in files
|
| 44 |
) / (1024**3)
|
| 45 |
|
| 46 |
-
# only delete if large
|
| 47 |
if size_gb > max_size_gb or "torch" in folder:
|
| 48 |
shutil.rmtree(folder, ignore_errors=True)
|
| 49 |
total_deleted += size_gb
|
|
@@ -56,7 +58,7 @@ def clean_cache(max_size_gb: float = 2.0):
|
|
| 56 |
|
| 57 |
|
| 58 |
def check_disk_usage():
|
| 59 |
-
"""
|
| 60 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 61 |
try:
|
| 62 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
@@ -65,12 +67,12 @@ def check_disk_usage():
|
|
| 65 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 66 |
|
| 67 |
|
| 68 |
-
# Run cleanup
|
| 69 |
clean_cache()
|
| 70 |
check_disk_usage()
|
| 71 |
|
| 72 |
# ==========================================================
|
| 73 |
-
# βοΈ Hugging Face Cache Configuration
|
| 74 |
# ==========================================================
|
| 75 |
CACHE_DIR = "/tmp/hf_cache"
|
| 76 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
@@ -82,18 +84,16 @@ os.environ.update({
|
|
| 82 |
})
|
| 83 |
|
| 84 |
# ==========================================================
|
| 85 |
-
# π¦ Imports AFTER
|
| 86 |
# ==========================================================
|
| 87 |
from ingestion import extract_text_from_pdf, chunk_text
|
| 88 |
-
from embeddings import generate_embeddings
|
| 89 |
from vectorstore import build_faiss_index
|
| 90 |
-
from qa import retrieve_chunks, generate_answer
|
| 91 |
-
|
| 92 |
|
| 93 |
# ==========================================================
|
| 94 |
# π Paths
|
| 95 |
# ==========================================================
|
| 96 |
-
BASE_DIR = os.path.dirname(__file__)
|
| 97 |
LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
|
| 98 |
SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
|
| 99 |
|
|
@@ -101,28 +101,24 @@ SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
|
|
| 101 |
# π₯οΈ UI Header
|
| 102 |
# ==========================================================
|
| 103 |
st.title("π Enterprise Knowledge Assistant")
|
| 104 |
-
st.caption("
|
| 105 |
|
| 106 |
# ==========================================================
|
| 107 |
-
# π§ Sidebar
|
| 108 |
# ==========================================================
|
| 109 |
with st.sidebar:
|
| 110 |
-
# πΌοΈ App Logo
|
| 111 |
if os.path.exists(LOGO_PATH):
|
| 112 |
st.image(LOGO_PATH, width=150)
|
| 113 |
|
| 114 |
-
# π§ Reasoning Mode Toggle
|
| 115 |
if "reasoning_mode" not in st.session_state:
|
| 116 |
-
st.session_state.reasoning_mode = False
|
| 117 |
|
| 118 |
st.session_state.reasoning_mode = st.toggle(
|
| 119 |
"π§ Enable Reasoning Mode",
|
| 120 |
value=st.session_state.reasoning_mode,
|
| 121 |
-
help=
|
| 122 |
-
"When ON, the assistant can use its world knowledge and reasoning ability "
|
| 123 |
-
"to generate richer, more explanatory answers.\n\n"
|
| 124 |
-
"When OFF, it sticks strictly to the document text for factual accuracy."
|
| 125 |
-
)
|
| 126 |
)
|
| 127 |
|
| 128 |
st.markdown("---")
|
|
@@ -141,30 +137,39 @@ with st.sidebar:
|
|
| 141 |
st.header("βοΈ Settings")
|
| 142 |
chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
|
| 143 |
overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
|
| 144 |
-
top_k = st.slider("Top K Results
|
| 145 |
|
| 146 |
st.markdown("---")
|
| 147 |
-
|
| 148 |
-
# π¨βπ» Branding
|
| 149 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
| 150 |
-
st.markdown("[π GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
|
| 151 |
|
| 152 |
# ==========================================================
|
| 153 |
# π§Ύ Document Handling
|
| 154 |
# ==========================================================
|
| 155 |
-
text, chunks, index = None, None, None
|
| 156 |
|
| 157 |
if doc_choice == "-- Select --":
|
| 158 |
-
st.info("β¬
οΈ Please choose
|
| 159 |
|
| 160 |
elif doc_choice == "Sample PDF":
|
| 161 |
temp_path = SAMPLE_PATH
|
| 162 |
st.success("π Using built-in Sample PDF")
|
|
|
|
| 163 |
with st.spinner("π Extracting and processing document..."):
|
| 164 |
text = extract_text_from_pdf(temp_path)
|
| 165 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
elif doc_choice == "Upload Custom PDF":
|
| 170 |
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
|
@@ -177,8 +182,18 @@ elif doc_choice == "Upload Custom PDF":
|
|
| 177 |
with st.spinner("βοΈ Extracting and processing your document..."):
|
| 178 |
text = extract_text_from_pdf(temp_path)
|
| 179 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
st.success("π Document processed successfully!")
|
| 183 |
|
| 184 |
# ==========================================================
|
|
@@ -188,11 +203,11 @@ if chunks:
|
|
| 188 |
st.subheader("π Document Preview")
|
| 189 |
st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
|
| 190 |
avg_len = int(sum(len(c) for c in chunks) / len(chunks))
|
| 191 |
-
st.caption(f"π¦ {len(chunks)} chunks
|
| 192 |
|
| 193 |
-
#
|
| 194 |
-
# Query Section
|
| 195 |
-
#
|
| 196 |
if index and chunks:
|
| 197 |
st.markdown("---")
|
| 198 |
st.subheader("π€ Ask a Question")
|
|
@@ -200,7 +215,6 @@ if index and chunks:
|
|
| 200 |
user_query = st.text_input("π Your question about the document:")
|
| 201 |
|
| 202 |
if user_query:
|
| 203 |
-
# Show which mode is active
|
| 204 |
mode_label = (
|
| 205 |
"π§ Reasoning Mode (expanded thinking)"
|
| 206 |
if st.session_state.reasoning_mode
|
|
@@ -208,13 +222,10 @@ if index and chunks:
|
|
| 208 |
)
|
| 209 |
st.caption(f"Mode: {mode_label}")
|
| 210 |
|
| 211 |
-
# Generate the answer
|
| 212 |
with st.spinner("π§ Thinking... retrieving context and generating answer..."):
|
| 213 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 214 |
answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
# β
Display Answer
|
| 219 |
st.markdown("### β
Assistantβs Answer")
|
| 220 |
st.markdown(
|
|
@@ -236,4 +247,3 @@ if index and chunks:
|
|
| 236 |
|
| 237 |
else:
|
| 238 |
st.info("π₯ Upload or select a document to start exploring.")
|
| 239 |
-
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
import shutil
|
| 4 |
+
import hashlib
|
| 5 |
import streamlit as st
|
|
|
|
| 6 |
import torch
|
| 7 |
+
|
| 8 |
+
# ==========================================================
|
| 9 |
+
# β
Environment Diagnostics
|
| 10 |
+
# ==========================================================
|
| 11 |
print("CUDA available:", torch.cuda.is_available())
|
| 12 |
print("Device count:", torch.cuda.device_count())
|
| 13 |
if torch.cuda.is_available():
|
|
|
|
| 15 |
else:
|
| 16 |
print("Running on CPU")
|
| 17 |
|
|
|
|
| 18 |
# ==========================================================
|
| 19 |
+
# β
Page Configuration
|
| 20 |
# ==========================================================
|
| 21 |
st.set_page_config(
|
| 22 |
page_title="Enterprise Knowledge Assistant",
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ==========================================================
|
| 27 |
+
# π§Ή Cache Management (prevent HF overflow)
|
| 28 |
# ==========================================================
|
| 29 |
def clean_cache(max_size_gb: float = 2.0):
|
| 30 |
"""
|
| 31 |
+
Cleans large cache folders (> max_size_gb),
|
| 32 |
+
preserving /tmp/hf_cache (used for model weights).
|
| 33 |
"""
|
| 34 |
folders = [
|
| 35 |
+
"/root/.cache/huggingface",
|
| 36 |
+
"/root/.cache/transformers",
|
| 37 |
+
"/root/.cache/torch",
|
| 38 |
+
]
|
|
|
|
| 39 |
total_deleted = 0.0
|
| 40 |
|
| 41 |
for folder in folders:
|
| 42 |
if os.path.exists(folder):
|
|
|
|
| 43 |
size_gb = sum(
|
| 44 |
os.path.getsize(os.path.join(dp, f))
|
| 45 |
for dp, _, files in os.walk(folder)
|
| 46 |
for f in files
|
| 47 |
) / (1024**3)
|
| 48 |
|
|
|
|
| 49 |
if size_gb > max_size_gb or "torch" in folder:
|
| 50 |
shutil.rmtree(folder, ignore_errors=True)
|
| 51 |
total_deleted += size_gb
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
def check_disk_usage():
|
| 61 |
+
"""Display disk usage info in sidebar."""
|
| 62 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 63 |
try:
|
| 64 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
|
|
| 67 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 68 |
|
| 69 |
|
| 70 |
+
# Run the cache cleanup once at import/startup time, then surface
# current disk usage in the sidebar so space problems are visible
# while debugging the deployment.
clean_cache()
check_disk_usage()
|
| 73 |
|
| 74 |
# ==========================================================
# Hugging Face Cache Configuration
# ==========================================================
# Model downloads are redirected here; clean_cache() above deliberately
# preserves /tmp/hf_cache so cached model weights survive the cleanup
# pass (see its docstring).
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)  # idempotent on warm restarts
|
|
|
|
| 84 |
})
|
| 85 |
|
| 86 |
# ==========================================================
|
| 87 |
+
# π¦ Imports AFTER Environment Setup
|
| 88 |
# ==========================================================
|
| 89 |
from ingestion import extract_text_from_pdf, chunk_text
|
|
|
|
| 90 |
from vectorstore import build_faiss_index
|
| 91 |
+
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
|
|
|
|
| 92 |
|
| 93 |
# ==========================================================
# Paths
# ==========================================================
# All bundled assets are expected to live alongside this script.
BASE_DIR = os.path.dirname(__file__)
SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")  # built-in demo document
LOGO_PATH = os.path.join(BASE_DIR, "logo.png")      # sidebar branding image
|
| 99 |
|
|
|
|
| 101 |
# π₯οΈ UI Header
|
| 102 |
# ==========================================================
|
| 103 |
st.title("π Enterprise Knowledge Assistant")
|
| 104 |
+
st.caption("Query SAP documentation and enterprise PDFs using natural language and reasoning.")
|
| 105 |
|
| 106 |
# ==========================================================
|
| 107 |
+
# π§ Sidebar β Library, Settings, Diagnostics
|
| 108 |
# ==========================================================
|
| 109 |
with st.sidebar:
|
| 110 |
+
# πΌοΈ App Logo
|
| 111 |
if os.path.exists(LOGO_PATH):
|
| 112 |
st.image(LOGO_PATH, width=150)
|
| 113 |
|
| 114 |
+
# π§ Reasoning Mode Toggle
|
| 115 |
if "reasoning_mode" not in st.session_state:
|
| 116 |
+
st.session_state.reasoning_mode = False
|
| 117 |
|
| 118 |
st.session_state.reasoning_mode = st.toggle(
|
| 119 |
"π§ Enable Reasoning Mode",
|
| 120 |
value=st.session_state.reasoning_mode,
|
| 121 |
+
help="When ON: GPT-4o uses reasoning + web-like synthesis.\nWhen OFF: Strictly factual from PDF."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
| 123 |
|
| 124 |
st.markdown("---")
|
|
|
|
| 137 |
st.header("βοΈ Settings")
|
| 138 |
chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
|
| 139 |
overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
|
| 140 |
+
top_k = st.slider("Top K Results", 1, 10, 5)
|
| 141 |
|
| 142 |
st.markdown("---")
|
|
|
|
|
|
|
| 143 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
|
|
|
| 144 |
|
| 145 |
# ==========================================================
# Document Handling
# ==========================================================
# Reset the per-rerun document state; these are populated below once
# a document has been selected and processed.
text = None
chunks = None
index = None
embeddings = None
|
| 149 |
|
| 150 |
if doc_choice == "-- Select --":
|
| 151 |
+
st.info("β¬
οΈ Please choose a document from the sidebar.")
|
| 152 |
|
| 153 |
elif doc_choice == "Sample PDF":
|
| 154 |
temp_path = SAMPLE_PATH
|
| 155 |
st.success("π Using built-in Sample PDF")
|
| 156 |
+
|
| 157 |
with st.spinner("π Extracting and processing document..."):
|
| 158 |
text = extract_text_from_pdf(temp_path)
|
| 159 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 160 |
+
st.write(f"π Extracted {len(chunks)} chunks.")
|
| 161 |
+
|
| 162 |
+
# β
Cached Embeddings
|
| 163 |
+
with st.spinner("βοΈ Loading cached embeddings or generating new ones..."):
|
| 164 |
+
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 165 |
+
hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
|
| 166 |
+
cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
|
| 167 |
+
if os.path.exists(cache_file):
|
| 168 |
+
st.info(f"π§ Using cached embeddings for {os.path.basename(temp_path)}")
|
| 169 |
+
else:
|
| 170 |
+
st.warning(f"π‘ Generated new embeddings for {os.path.basename(temp_path)}")
|
| 171 |
+
|
| 172 |
+
index = build_faiss_index(embeddings)
|
| 173 |
|
| 174 |
elif doc_choice == "Upload Custom PDF":
|
| 175 |
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
|
|
|
| 182 |
with st.spinner("βοΈ Extracting and processing your document..."):
|
| 183 |
text = extract_text_from_pdf(temp_path)
|
| 184 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 185 |
+
st.write(f"π Extracted {len(chunks)} chunks.")
|
| 186 |
+
|
| 187 |
+
with st.spinner("βοΈ Loading cached embeddings or generating new ones..."):
|
| 188 |
+
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 189 |
+
hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
|
| 190 |
+
cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
|
| 191 |
+
if os.path.exists(cache_file):
|
| 192 |
+
st.info(f"π§ Using cached embeddings for {os.path.basename(temp_path)}")
|
| 193 |
+
else:
|
| 194 |
+
st.warning(f"π‘ Generated new embeddings for {os.path.basename(temp_path)}")
|
| 195 |
+
|
| 196 |
+
index = build_faiss_index(embeddings)
|
| 197 |
st.success("π Document processed successfully!")
|
| 198 |
|
| 199 |
# ==========================================================
|
|
|
|
| 203 |
st.subheader("π Document Preview")
|
| 204 |
st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
|
| 205 |
avg_len = int(sum(len(c) for c in chunks) / len(chunks))
|
| 206 |
+
st.caption(f"π¦ {len(chunks)} chunks | Avg length: {avg_len} chars")
|
| 207 |
|
| 208 |
+
# ==========================================================
|
| 209 |
+
# π¬ Query Section
|
| 210 |
+
# ==========================================================
|
| 211 |
if index and chunks:
|
| 212 |
st.markdown("---")
|
| 213 |
st.subheader("π€ Ask a Question")
|
|
|
|
| 215 |
user_query = st.text_input("π Your question about the document:")
|
| 216 |
|
| 217 |
if user_query:
|
|
|
|
| 218 |
mode_label = (
|
| 219 |
"π§ Reasoning Mode (expanded thinking)"
|
| 220 |
if st.session_state.reasoning_mode
|
|
|
|
| 222 |
)
|
| 223 |
st.caption(f"Mode: {mode_label}")
|
| 224 |
|
|
|
|
| 225 |
with st.spinner("π§ Thinking... retrieving context and generating answer..."):
|
| 226 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 227 |
answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
|
| 228 |
|
|
|
|
|
|
|
| 229 |
# β
Display Answer
|
| 230 |
st.markdown("### β
Assistantβs Answer")
|
| 231 |
st.markdown(
|
|
|
|
| 247 |
|
| 248 |
else:
|
| 249 |
st.info("π₯ Upload or select a document to start exploring.")
|
|
|