Commit: Update src/streamlit_app.py (+59 additions, −49 deletions).
Changed file: src/streamlit_app.py
|
@@ -1,8 +1,13 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import shutil
|
|
|
|
| 3 |
import streamlit as st
|
| 4 |
-
|
| 5 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
print("CUDA available:", torch.cuda.is_available())
|
| 7 |
print("Device count:", torch.cuda.device_count())
|
| 8 |
if torch.cuda.is_available():
|
|
@@ -10,9 +15,8 @@ if torch.cuda.is_available():
|
|
| 10 |
else:
|
| 11 |
print("Running on CPU")
|
| 12 |
|
| 13 |
-
|
| 14 |
# ==========================================================
|
| 15 |
-
# β
Page Configuration
|
| 16 |
# ==========================================================
|
| 17 |
st.set_page_config(
|
| 18 |
page_title="Enterprise Knowledge Assistant",
|
|
@@ -20,30 +24,28 @@ st.set_page_config(
|
|
| 20 |
)
|
| 21 |
|
| 22 |
# ==========================================================
|
| 23 |
-
# π§Ή Cache Management (
|
| 24 |
# ==========================================================
|
| 25 |
def clean_cache(max_size_gb: float = 2.0):
|
| 26 |
"""
|
| 27 |
-
Cleans large cache folders (> max_size_gb),
|
|
|
|
| 28 |
"""
|
| 29 |
folders = [
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
]
|
| 35 |
total_deleted = 0.0
|
| 36 |
|
| 37 |
for folder in folders:
|
| 38 |
if os.path.exists(folder):
|
| 39 |
-
# estimate folder size
|
| 40 |
size_gb = sum(
|
| 41 |
os.path.getsize(os.path.join(dp, f))
|
| 42 |
for dp, _, files in os.walk(folder)
|
| 43 |
for f in files
|
| 44 |
) / (1024**3)
|
| 45 |
|
| 46 |
-
# only delete if large
|
| 47 |
if size_gb > max_size_gb or "torch" in folder:
|
| 48 |
shutil.rmtree(folder, ignore_errors=True)
|
| 49 |
total_deleted += size_gb
|
|
@@ -56,7 +58,7 @@ def clean_cache(max_size_gb: float = 2.0):
|
|
| 56 |
|
| 57 |
|
| 58 |
def check_disk_usage():
|
| 59 |
-
"""
|
| 60 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 61 |
try:
|
| 62 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
@@ -65,12 +67,12 @@ def check_disk_usage():
|
|
| 65 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 66 |
|
| 67 |
|
| 68 |
-
# Run cleanup
|
| 69 |
clean_cache()
|
| 70 |
check_disk_usage()
|
| 71 |
|
| 72 |
# ==========================================================
|
| 73 |
-
# βοΈ Hugging Face Cache Configuration
|
| 74 |
# ==========================================================
|
| 75 |
CACHE_DIR = "/tmp/hf_cache"
|
| 76 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
@@ -82,18 +84,16 @@ os.environ.update({
|
|
| 82 |
})
|
| 83 |
|
| 84 |
# ==========================================================
|
| 85 |
-
# π¦ Imports AFTER
|
| 86 |
# ==========================================================
|
| 87 |
from ingestion import extract_text_from_pdf, chunk_text
|
| 88 |
-
from embeddings import generate_embeddings
|
| 89 |
from vectorstore import build_faiss_index
|
| 90 |
-
from qa import retrieve_chunks, generate_answer
|
| 91 |
-
|
| 92 |
|
| 93 |
# ==========================================================
|
| 94 |
# π Paths
|
| 95 |
# ==========================================================
|
| 96 |
-
BASE_DIR = os.path.dirname(__file__)
|
| 97 |
LOGO_PATH = os.path.join(BASE_DIR, "logo.png")
|
| 98 |
SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
|
| 99 |
|
|
@@ -101,28 +101,24 @@ SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")
|
|
| 101 |
# π₯οΈ UI Header
|
| 102 |
# ==========================================================
|
| 103 |
st.title("π Enterprise Knowledge Assistant")
|
| 104 |
-
st.caption("
|
| 105 |
|
| 106 |
# ==========================================================
|
| 107 |
-
# π§ Sidebar
|
| 108 |
# ==========================================================
|
| 109 |
with st.sidebar:
|
| 110 |
-
# πΌοΈ App Logo
|
| 111 |
if os.path.exists(LOGO_PATH):
|
| 112 |
st.image(LOGO_PATH, width=150)
|
| 113 |
|
| 114 |
-
# π§ Reasoning Mode Toggle
|
| 115 |
if "reasoning_mode" not in st.session_state:
|
| 116 |
-
st.session_state.reasoning_mode = False
|
| 117 |
|
| 118 |
st.session_state.reasoning_mode = st.toggle(
|
| 119 |
"π§ Enable Reasoning Mode",
|
| 120 |
value=st.session_state.reasoning_mode,
|
| 121 |
-
help=
|
| 122 |
-
"When ON, the assistant can use its world knowledge and reasoning ability "
|
| 123 |
-
"to generate richer, more explanatory answers.\n\n"
|
| 124 |
-
"When OFF, it sticks strictly to the document text for factual accuracy."
|
| 125 |
-
)
|
| 126 |
)
|
| 127 |
|
| 128 |
st.markdown("---")
|
|
@@ -141,30 +137,39 @@ with st.sidebar:
|
|
| 141 |
st.header("βοΈ Settings")
|
| 142 |
chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
|
| 143 |
overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
|
| 144 |
-
top_k = st.slider("Top K Results
|
| 145 |
|
| 146 |
st.markdown("---")
|
| 147 |
-
|
| 148 |
-
# π¨βπ» Branding
|
| 149 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
| 150 |
-
st.markdown("[π GitHub Repo](https://github.com/shubhamsharma170793-cpu/enterprise-knowledge-assistant)")
|
| 151 |
|
| 152 |
# ==========================================================
|
| 153 |
# π§Ύ Document Handling
|
| 154 |
# ==========================================================
|
| 155 |
-
text, chunks, index = None, None, None
|
| 156 |
|
| 157 |
if doc_choice == "-- Select --":
|
| 158 |
-
st.info("β¬
οΈ Please choose
|
| 159 |
|
| 160 |
elif doc_choice == "Sample PDF":
|
| 161 |
temp_path = SAMPLE_PATH
|
| 162 |
st.success("π Using built-in Sample PDF")
|
|
|
|
| 163 |
with st.spinner("π Extracting and processing document..."):
|
| 164 |
text = extract_text_from_pdf(temp_path)
|
| 165 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
elif doc_choice == "Upload Custom PDF":
|
| 170 |
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
|
@@ -177,8 +182,18 @@ elif doc_choice == "Upload Custom PDF":
|
|
| 177 |
with st.spinner("βοΈ Extracting and processing your document..."):
|
| 178 |
text = extract_text_from_pdf(temp_path)
|
| 179 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
st.success("π Document processed successfully!")
|
| 183 |
|
| 184 |
# ==========================================================
|
|
@@ -188,11 +203,11 @@ if chunks:
|
|
| 188 |
st.subheader("π Document Preview")
|
| 189 |
st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
|
| 190 |
avg_len = int(sum(len(c) for c in chunks) / len(chunks))
|
| 191 |
-
st.caption(f"π¦ {len(chunks)} chunks
|
| 192 |
|
| 193 |
-
#
|
| 194 |
-
# Query Section
|
| 195 |
-
#
|
| 196 |
if index and chunks:
|
| 197 |
st.markdown("---")
|
| 198 |
st.subheader("π€ Ask a Question")
|
|
@@ -200,7 +215,6 @@ if index and chunks:
|
|
| 200 |
user_query = st.text_input("π Your question about the document:")
|
| 201 |
|
| 202 |
if user_query:
|
| 203 |
-
# Show which mode is active
|
| 204 |
mode_label = (
|
| 205 |
"π§ Reasoning Mode (expanded thinking)"
|
| 206 |
if st.session_state.reasoning_mode
|
|
@@ -208,13 +222,10 @@ if index and chunks:
|
|
| 208 |
)
|
| 209 |
st.caption(f"Mode: {mode_label}")
|
| 210 |
|
| 211 |
-
# Generate the answer
|
| 212 |
with st.spinner("π§ Thinking... retrieving context and generating answer..."):
|
| 213 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 214 |
answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
# β
Display Answer
|
| 219 |
st.markdown("### β
Assistantβs Answer")
|
| 220 |
st.markdown(
|
|
@@ -236,4 +247,3 @@ if index and chunks:
|
|
| 236 |
|
| 237 |
else:
|
| 238 |
st.info("π₯ Upload or select a document to start exploring.")
|
| 239 |
-
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
import shutil
|
| 4 |
+
import hashlib
|
| 5 |
import streamlit as st
|
|
|
|
| 6 |
import torch
|
| 7 |
+
|
| 8 |
+
# ==========================================================
|
| 9 |
+
# β
Environment Diagnostics
|
| 10 |
+
# ==========================================================
|
| 11 |
print("CUDA available:", torch.cuda.is_available())
|
| 12 |
print("Device count:", torch.cuda.device_count())
|
| 13 |
if torch.cuda.is_available():
|
|
|
|
| 15 |
else:
|
| 16 |
print("Running on CPU")
|
| 17 |
|
|
|
|
| 18 |
# ==========================================================
|
| 19 |
+
# β
Page Configuration
|
| 20 |
# ==========================================================
|
| 21 |
st.set_page_config(
|
| 22 |
page_title="Enterprise Knowledge Assistant",
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ==========================================================
|
| 27 |
+
# π§Ή Cache Management (prevent HF overflow)
|
| 28 |
# ==========================================================
|
| 29 |
def clean_cache(max_size_gb: float = 2.0):
|
| 30 |
"""
|
| 31 |
+
Cleans large cache folders (> max_size_gb),
|
| 32 |
+
preserving /tmp/hf_cache (used for model weights).
|
| 33 |
"""
|
| 34 |
folders = [
|
| 35 |
+
"/root/.cache/huggingface",
|
| 36 |
+
"/root/.cache/transformers",
|
| 37 |
+
"/root/.cache/torch",
|
| 38 |
+
]
|
|
|
|
| 39 |
total_deleted = 0.0
|
| 40 |
|
| 41 |
for folder in folders:
|
| 42 |
if os.path.exists(folder):
|
|
|
|
| 43 |
size_gb = sum(
|
| 44 |
os.path.getsize(os.path.join(dp, f))
|
| 45 |
for dp, _, files in os.walk(folder)
|
| 46 |
for f in files
|
| 47 |
) / (1024**3)
|
| 48 |
|
|
|
|
| 49 |
if size_gb > max_size_gb or "torch" in folder:
|
| 50 |
shutil.rmtree(folder, ignore_errors=True)
|
| 51 |
total_deleted += size_gb
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
def check_disk_usage():
|
| 61 |
+
"""Display disk usage info in sidebar."""
|
| 62 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 63 |
try:
|
| 64 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
|
|
| 67 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 68 |
|
| 69 |
|
| 70 |
+
# Run the cache cleanup once at import/startup time, then surface
# current disk usage in the sidebar so space problems are visible
# while debugging the deployment.
clean_cache()
check_disk_usage()
|
| 73 |
|
| 74 |
# ==========================================================
# Hugging Face Cache Configuration
# ==========================================================
# Model downloads are redirected here; clean_cache() above deliberately
# preserves /tmp/hf_cache so cached model weights survive the cleanup
# pass (see its docstring).
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)  # idempotent on warm restarts
|
|
|
|
| 84 |
})
|
| 85 |
|
| 86 |
# ==========================================================
|
| 87 |
+
# π¦ Imports AFTER Environment Setup
|
| 88 |
# ==========================================================
|
| 89 |
from ingestion import extract_text_from_pdf, chunk_text
|
|
|
|
| 90 |
from vectorstore import build_faiss_index
|
| 91 |
+
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
|
|
|
|
| 92 |
|
| 93 |
# ==========================================================
# Paths
# ==========================================================
# All bundled assets are expected to live alongside this script.
BASE_DIR = os.path.dirname(__file__)
SAMPLE_PATH = os.path.join(BASE_DIR, "sample.pdf")  # built-in demo document
LOGO_PATH = os.path.join(BASE_DIR, "logo.png")      # sidebar branding image
|
| 99 |
|
|
|
|
| 101 |
# π₯οΈ UI Header
|
| 102 |
# ==========================================================
|
| 103 |
st.title("π Enterprise Knowledge Assistant")
|
| 104 |
+
st.caption("Query SAP documentation and enterprise PDFs using natural language and reasoning.")
|
| 105 |
|
| 106 |
# ==========================================================
|
| 107 |
+
# π§ Sidebar β Library, Settings, Diagnostics
|
| 108 |
# ==========================================================
|
| 109 |
with st.sidebar:
|
| 110 |
+
# πΌοΈ App Logo
|
| 111 |
if os.path.exists(LOGO_PATH):
|
| 112 |
st.image(LOGO_PATH, width=150)
|
| 113 |
|
| 114 |
+
# π§ Reasoning Mode Toggle
|
| 115 |
if "reasoning_mode" not in st.session_state:
|
| 116 |
+
st.session_state.reasoning_mode = False
|
| 117 |
|
| 118 |
st.session_state.reasoning_mode = st.toggle(
|
| 119 |
"π§ Enable Reasoning Mode",
|
| 120 |
value=st.session_state.reasoning_mode,
|
| 121 |
+
help="When ON: GPT-4o uses reasoning + web-like synthesis.\nWhen OFF: Strictly factual from PDF."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
| 123 |
|
| 124 |
st.markdown("---")
|
|
|
|
| 137 |
st.header("βοΈ Settings")
|
| 138 |
chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
|
| 139 |
overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
|
| 140 |
+
top_k = st.slider("Top K Results", 1, 10, 5)
|
| 141 |
|
| 142 |
st.markdown("---")
|
|
|
|
|
|
|
| 143 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
|
|
|
| 144 |
|
| 145 |
# ==========================================================
# Document Handling
# ==========================================================
# Reset the per-rerun document state; these are populated below once
# a document has been selected and processed.
text = None
chunks = None
index = None
embeddings = None
|
| 149 |
|
| 150 |
if doc_choice == "-- Select --":
|
| 151 |
+
st.info("β¬
οΈ Please choose a document from the sidebar.")
|
| 152 |
|
| 153 |
elif doc_choice == "Sample PDF":
|
| 154 |
temp_path = SAMPLE_PATH
|
| 155 |
st.success("π Using built-in Sample PDF")
|
| 156 |
+
|
| 157 |
with st.spinner("π Extracting and processing document..."):
|
| 158 |
text = extract_text_from_pdf(temp_path)
|
| 159 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 160 |
+
st.write(f"π Extracted {len(chunks)} chunks.")
|
| 161 |
+
|
| 162 |
+
# β
Cached Embeddings
|
| 163 |
+
with st.spinner("βοΈ Loading cached embeddings or generating new ones..."):
|
| 164 |
+
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 165 |
+
hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
|
| 166 |
+
cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
|
| 167 |
+
if os.path.exists(cache_file):
|
| 168 |
+
st.info(f"π§ Using cached embeddings for {os.path.basename(temp_path)}")
|
| 169 |
+
else:
|
| 170 |
+
st.warning(f"π‘ Generated new embeddings for {os.path.basename(temp_path)}")
|
| 171 |
+
|
| 172 |
+
index = build_faiss_index(embeddings)
|
| 173 |
|
| 174 |
elif doc_choice == "Upload Custom PDF":
|
| 175 |
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
|
|
|
| 182 |
with st.spinner("βοΈ Extracting and processing your document..."):
|
| 183 |
text = extract_text_from_pdf(temp_path)
|
| 184 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 185 |
+
st.write(f"π Extracted {len(chunks)} chunks.")
|
| 186 |
+
|
| 187 |
+
with st.spinner("βοΈ Loading cached embeddings or generating new ones..."):
|
| 188 |
+
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 189 |
+
hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
|
| 190 |
+
cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
|
| 191 |
+
if os.path.exists(cache_file):
|
| 192 |
+
st.info(f"π§ Using cached embeddings for {os.path.basename(temp_path)}")
|
| 193 |
+
else:
|
| 194 |
+
st.warning(f"π‘ Generated new embeddings for {os.path.basename(temp_path)}")
|
| 195 |
+
|
| 196 |
+
index = build_faiss_index(embeddings)
|
| 197 |
st.success("π Document processed successfully!")
|
| 198 |
|
| 199 |
# ==========================================================
|
|
|
|
| 203 |
st.subheader("π Document Preview")
|
| 204 |
st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
|
| 205 |
avg_len = int(sum(len(c) for c in chunks) / len(chunks))
|
| 206 |
+
st.caption(f"π¦ {len(chunks)} chunks | Avg length: {avg_len} chars")
|
| 207 |
|
| 208 |
+
# ==========================================================
|
| 209 |
+
# π¬ Query Section
|
| 210 |
+
# ==========================================================
|
| 211 |
if index and chunks:
|
| 212 |
st.markdown("---")
|
| 213 |
st.subheader("π€ Ask a Question")
|
|
|
|
| 215 |
user_query = st.text_input("π Your question about the document:")
|
| 216 |
|
| 217 |
if user_query:
|
|
|
|
| 218 |
mode_label = (
|
| 219 |
"π§ Reasoning Mode (expanded thinking)"
|
| 220 |
if st.session_state.reasoning_mode
|
|
|
|
| 222 |
)
|
| 223 |
st.caption(f"Mode: {mode_label}")
|
| 224 |
|
|
|
|
| 225 |
with st.spinner("π§ Thinking... retrieving context and generating answer..."):
|
| 226 |
retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
|
| 227 |
answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
|
| 228 |
|
|
|
|
|
|
|
| 229 |
# β
Display Answer
|
| 230 |
st.markdown("### β
Assistantβs Answer")
|
| 231 |
st.markdown(
|
|
|
|
| 247 |
|
| 248 |
else:
|
| 249 |
st.info("π₯ Upload or select a document to start exploring.")
|
|
|