Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 6

Commit

c220dec

verified ·

1 Parent(s): fea3890

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +74 -58

src/streamlit_app.py CHANGED Viewed

@@ -1,51 +1,68 @@
-import shutil
 import os
 import streamlit as st
-# ---------------------------
-# 🧹 One-time cache cleaner (prevents 50 GB overflow)
-# ---------------------------
def _cache_folder_size_gb(folder):
    """Return the approximate on-disk size of *folder* in GiB.

    Entries that cannot be stat'ed (removed concurrently, permission
    errors) are skipped, because the result is only used as an estimate.
    """
    total_bytes = 0
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            try:
                # lstat does not follow symlinks: a dangling link cannot
                # raise, and a link's target is not counted a second time.
                total_bytes += os.lstat(os.path.join(dirpath, filename)).st_size
            except OSError:
                continue
    return total_bytes / (1024**3)


def clean_cache(folders=None):
    """Delete model cache folders and recreate an empty ``/tmp/hf_cache``.

    Args:
        folders: Directories to remove. Defaults to the Hugging Face,
            transformers and torch caches under ``/root/.cache`` plus
            ``/tmp/hf_cache``.

    Returns:
        None. A summary with the estimated amount removed is printed.
    """
    if folders is None:
        folders = [
            "/root/.cache/huggingface",
            "/root/.cache/transformers",
            "/root/.cache/torch",
            "/tmp/hf_cache",
        ]
    total_deleted = 0
    for folder in folders:
        if os.path.exists(folder):
            # Estimate the size before deleting so the log line is meaningful.
            total_deleted += _cache_folder_size_gb(folder)
            # Best-effort removal: a partially locked cache must not crash startup.
            shutil.rmtree(folder, ignore_errors=True)
    os.makedirs("/tmp/hf_cache", exist_ok=True)
    print(f"🧹 Cleaned cache folders (~{total_deleted:.2f} GB removed)")
 def check_disk_usage():
-    """Log how much disk space is used (for debugging storage issues)."""
-    st.sidebar.markdown("### 💾 Disk Usage (for debugging)")
     try:
         usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
         st.sidebar.text(usage if usage else "No cache directories found.")
     except Exception as e:
-        st.sidebar.text(f"⚠️ Could not fetch disk usage: {e}")
-# Run cleanup and diagnostics at startup
 clean_cache()
 check_disk_usage()
-import os
-import streamlit as st
-# --- Streamlit Safe Options (Hugging Face Spaces upload fix) ---
-st.set_option("client.showErrorDetails", True)
-# ---------------------------
-# Hugging Face Cache Fix (/tmp for writable)
-# ---------------------------
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ.update({
@@ -55,36 +72,34 @@ os.environ.update({
     "HF_MODULES_CACHE": CACHE_DIR
 })
-# ---------------------------
-# Imports AFTER environment setup
-# ---------------------------
 from ingestion import extract_text_from_pdf, chunk_text
 from embeddings import generate_embeddings
 from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer
-# ---------------------------
-# Paths
-# ---------------------------
-BASE_DIR = os.path.dirname(__file__)         # /app/src
 LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
 SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
-# ---------------------------
-# App Config
-# ---------------------------
-st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 st.title("📄 Enterprise Knowledge Assistant")
 st.caption("Upload a PDF or use the sample file to explore intelligent document Q&A.")
-# ---------------------------
-# Sidebar (Library + Settings + Credits)
-# ---------------------------
 with st.sidebar:
     if os.path.exists(LOGO_PATH):
         st.image(LOGO_PATH, width=150)
-    # 1️⃣ Document Library
     st.header("📚 Document Library")
     doc_choice = st.radio(
         "Choose a document:",
@@ -94,24 +109,21 @@ with st.sidebar:
     st.markdown("---")
-    # 2️⃣ Settings
     st.header("⚙️ Settings")
     chunk_size = st.slider("Chunk Size (characters)", 300, 1200, 800, step=100)
     top_k = st.slider("Top K Results (retrieved chunks)", 1, 10, 5)
     st.markdown("---")
-    # 3️⃣ Branding
     st.caption("👨‍💻 Built by Shubham Sharma")
     st.markdown("[📂 GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
-# ---------------------------
-# Document Handling
-# ---------------------------
 text, chunks, index = None, None, None
 if doc_choice == "-- Select --":
-    st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar to get started.")
 elif doc_choice == "Sample PDF":
     temp_path = SAMPLE_PATH
@@ -128,7 +140,7 @@ elif doc_choice == "Upload Custom PDF":
         temp_path = os.path.join("/tmp", uploaded_file.name)
         with open(temp_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
-        st.success(f"✅ File '{uploaded_file.name}' uploaded and saved to /tmp")
         with st.spinner("⚙️ Extracting and processing your document..."):
             text = extract_text_from_pdf(temp_path)
@@ -137,17 +149,18 @@ elif doc_choice == "Upload Custom PDF":
             index = build_faiss_index(embeddings)
         st.success("🚀 Document processed successfully!")
-# ---------------------------
-# Document Preview
-# ---------------------------
 if chunks:
     st.subheader("📑 Document Preview")
     st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
-    st.caption(f"📦 {len(chunks)} chunks created | Avg chunk length: {int(sum(len(c) for c in chunks) / len(chunks))} chars")
-# ---------------------------
-# Query Section
-# ---------------------------
 if index and chunks:
     st.markdown("---")
     st.subheader("🤖 Ask a Question")
@@ -158,11 +171,14 @@ if index and chunks:
             retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
             answer = generate_answer(user_query, retrieved)
-        # Answer Section
         st.markdown("### ✅ Assistant’s Answer")
-        st.markdown(f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;'>{answer}</div>", unsafe_allow_html=True)
-        # Supporting Chunks Section
         with st.expander("📄 Supporting Chunks (Context Used)"):
             for i, r in enumerate(retrieved, start=1):
                 st.markdown(

 import os
+import shutil
 import streamlit as st
+# ==========================================================
+# ✅ Page Configuration (must be first Streamlit command)
+# ==========================================================
+st.set_page_config(
+    page_title="Enterprise Knowledge Assistant",
+    layout="wide"
+)
+# ==========================================================
+# 🧹 Cache Management (prevents Hugging Face 50GB overflow)
+# ==========================================================
def _cache_folder_size_gb(folder):
    """Return the approximate on-disk size of *folder* in GiB.

    Entries that cannot be stat'ed (removed concurrently, permission
    errors) are skipped, because the result is only used as an estimate.
    """
    total_bytes = 0
    for dirpath, _, filenames in os.walk(folder):
        for filename in filenames:
            try:
                # lstat does not follow symlinks: a dangling link cannot
                # raise, and a link's target is not counted a second time.
                total_bytes += os.lstat(os.path.join(dirpath, filename)).st_size
            except OSError:
                continue
    return total_bytes / (1024**3)


def clean_cache(max_size_gb: float = 2.0, folders=None):
    """Delete oversized cache folders and make sure ``/tmp/hf_cache`` exists.

    A folder is removed when its estimated size exceeds ``max_size_gb``.
    Any folder whose path contains ``"torch"`` is removed regardless of size.
    Smaller folders are preserved.

    Args:
        max_size_gb: Size threshold in GiB above which a folder is deleted.
        folders: Directories to inspect. Defaults to the Hugging Face,
            transformers and torch caches under ``/root/.cache`` plus
            ``/tmp/hf_cache``.

    Returns:
        None. Per-folder decisions and a summary are printed.
    """
    if folders is None:
        folders = [
            "/root/.cache/huggingface",
            "/root/.cache/transformers",
            "/root/.cache/torch",
            "/tmp/hf_cache",
        ]
    total_deleted = 0.0
    for folder in folders:
        if not os.path.exists(folder):
            continue
        size_gb = _cache_folder_size_gb(folder)
        # Only delete if large; torch caches are always purged.
        if size_gb > max_size_gb or "torch" in folder:
            # Best-effort removal: a partially locked cache must not crash startup.
            shutil.rmtree(folder, ignore_errors=True)
            total_deleted += size_gb
            print(f"🗑️ Deleted {folder} ({size_gb:.2f} GB)")
        else:
            print(f"✅ Preserved {folder} ({size_gb:.2f} GB)")
    os.makedirs("/tmp/hf_cache", exist_ok=True)
    print(f"🧹 Cache cleanup done. ~{total_deleted:.2f} GB removed.")
 def check_disk_usage():
+    """Show disk usage info in sidebar."""
+    st.sidebar.markdown("### 💾 Disk Usage (Debug)")
     try:
         usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
         st.sidebar.text(usage if usage else "No cache directories found.")
     except Exception as e:
+        st.sidebar.text(f"⚠️ Disk usage check failed: {e}")
+# Run cleanup & diagnostics
 clean_cache()
 check_disk_usage()
+# ==========================================================
+# ⚙️ Hugging Face Cache Configuration (/tmp for writable path)
+# ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 os.environ.update({
     "HF_MODULES_CACHE": CACHE_DIR
 })
+# ==========================================================
+# 📦 Imports AFTER environment setup
+# ==========================================================
 from ingestion import extract_text_from_pdf, chunk_text
 from embeddings import generate_embeddings
 from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer
+# ==========================================================
+# 📁 Paths
+# ==========================================================
+BASE_DIR = os.path.dirname(__file__)  # /app/src
 LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
 SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
+# ==========================================================
+# 🖥️ UI Header
+# ==========================================================
 st.title("📄 Enterprise Knowledge Assistant")
 st.caption("Upload a PDF or use the sample file to explore intelligent document Q&A.")
+# ==========================================================
+# 🧭 Sidebar (Document Library + Settings + Diagnostics)
+# ==========================================================
 with st.sidebar:
     if os.path.exists(LOGO_PATH):
         st.image(LOGO_PATH, width=150)
     st.header("📚 Document Library")
     doc_choice = st.radio(
         "Choose a document:",
     st.markdown("---")
     st.header("⚙️ Settings")
     chunk_size = st.slider("Chunk Size (characters)", 300, 1200, 800, step=100)
     top_k = st.slider("Top K Results (retrieved chunks)", 1, 10, 5)
     st.markdown("---")
     st.caption("👨‍💻 Built by Shubham Sharma")
     st.markdown("[📂 GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
+# ==========================================================
+# 🧾 Document Handling
+# ==========================================================
 text, chunks, index = None, None, None
 if doc_choice == "-- Select --":
+    st.info("⬅️ Please choose **Sample PDF** or **Upload Custom PDF** from the sidebar.")
 elif doc_choice == "Sample PDF":
     temp_path = SAMPLE_PATH
         temp_path = os.path.join("/tmp", uploaded_file.name)
         with open(temp_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
+        st.success(f"✅ File '{uploaded_file.name}' uploaded successfully")
         with st.spinner("⚙️ Extracting and processing your document..."):
             text = extract_text_from_pdf(temp_path)
             index = build_faiss_index(embeddings)
         st.success("🚀 Document processed successfully!")
+# ==========================================================
+# 📑 Document Preview
+# ==========================================================
 if chunks:
     st.subheader("📑 Document Preview")
     st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
+    avg_len = int(sum(len(c) for c in chunks) / len(chunks))
+    st.caption(f"📦 {len(chunks)} chunks created | Avg chunk length: {avg_len} chars")
+# ==========================================================
+# 💬 Query Section
+# ==========================================================
 if index and chunks:
     st.markdown("---")
     st.subheader("🤖 Ask a Question")
             retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k)
             answer = generate_answer(user_query, retrieved)
+        # ✅ Answer Display
         st.markdown("### ✅ Assistant’s Answer")
+        st.markdown(
+            f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;color:white;'>{answer}</div>",
+            unsafe_allow_html=True
+        )
+        # 📄 Supporting Chunks
         with st.expander("📄 Supporting Chunks (Context Used)"):
             for i, r in enumerate(retrieved, start=1):
                 st.markdown(