Update src/streamlit_app.py
src/streamlit_app.py  +59 -59
CHANGED
@@ -27,10 +27,7 @@ st.set_page_config(
 # 🧹 Cache Management (prevent HF overflow)
 # ==========================================================
 def clean_cache(max_size_gb: float = 2.0):
-    """
-    Cleans large cache folders (> max_size_gb),
-    preserving /tmp/hf_cache (used for model weights).
-    """
+    """Cleans large cache folders (> max_size_gb)."""
     folders = [
         "/root/.cache/huggingface",
         "/root/.cache/transformers",
@@ -45,14 +42,10 @@ def clean_cache(max_size_gb: float = 2.0):
         for dp, _, files in os.walk(folder)
         for f in files
     ) / (1024**3)
-
     if size_gb > max_size_gb or "torch" in folder:
         shutil.rmtree(folder, ignore_errors=True)
         total_deleted += size_gb
         print(f"🗑️ Deleted {folder} ({size_gb:.2f} GB)")
-    else:
-        print(f"✅ Preserved {folder} ({size_gb:.2f} GB)")
-
     os.makedirs("/tmp/hf_cache", exist_ok=True)
     print(f"🧹 Cache cleanup done. ~{total_deleted:.2f} GB removed.")
 
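The size check above walks each cache folder and converts the byte total to GiB. A minimal standalone sketch of that computation (the enclosing loop over `folders` is not visible in this hunk, so the exact indentation is assumed):

import os

def folder_size_gb(folder: str) -> float:
    # os.walk on a missing path yields nothing, so the sum is simply 0.0
    return sum(
        os.path.getsize(os.path.join(dp, f))
        for dp, _, files in os.walk(folder)
        for f in files
    ) / (1024**3)  # bytes -> GiB

print(f"{folder_size_gb('/root/.cache/huggingface'):.2f} GB")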
@@ -91,13 +84,13 @@ from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
 
 # ==========================================================
-# 🧠 TOC-Based Smart Question Generator
+# 🧠 TOC-Based Smart Question Generator + AI Fallback
 # ==========================================================
 def clean_toc_titles(toc):
     """Removes section numbers and keeps only meaningful text."""
     clean_titles = []
     for _, title in toc:
-        title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
+        title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
         title = title.strip()
         if len(title) > 3:
             clean_titles.append(title)
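The `re.sub` pattern strips a leading section number such as `2.3` or `4.1.2` (digits separated by dots, plus trailing whitespace), so only the title text is kept. For example:

import re

for raw in ["2.3 Posting Invoices", "4.1.2 Fiori Apps", "Appendix"]:
    print(re.sub(r"^\d+(\.\d+)*\s*", "", raw))
# Posting Invoices
# Fiori Apps
# Appendix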
@@ -135,7 +128,41 @@ def generate_query_suggestions(toc_titles):
         if s not in seen:
             seen.add(s)
             final.append(s)
-    return final[:6]
+    return final[:6]
+
+
+def generate_ai_fallback_suggestions(chunks):
+    """When no TOC is detected, use document content to guess interactive suggestions."""
+    if not chunks:
+        return []
+
+    # Take the first few chunks (intro + overview usually)
+    head_text = " ".join(chunks[:3]).lower()
+
+    suggestions = []
+    if "overview" in head_text or "introduction" in head_text:
+        suggestions.append("Can you summarize the overview of this document?")
+    if "setup" in head_text or "configuration" in head_text:
+        suggestions.append("How do I configure or set this up?")
+    if "prerequisite" in head_text:
+        suggestions.append("What are the prerequisites before using this process?")
+    if "troubleshoot" in head_text or "error" in head_text:
+        suggestions.append("How do I troubleshoot common errors?")
+    if "step" in head_text or "procedure" in head_text:
+        suggestions.append("Can you list the steps involved in this process?")
+    if "benefit" in head_text or "objective" in head_text:
+        suggestions.append("What is the objective or benefit of this guide?")
+
+    # Fallback generic questions if no keywords found
+    if not suggestions:
+        suggestions = [
+            "Can you summarize the main topic of this document?",
+            "What process does this guide explain?",
+            "How can I get started with the described setup?",
+            "What are the important details to remember?",
+        ]
+
+    return suggestions[:6]
 
 
 # ==========================================================
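The new fallback scans only the first three chunks for keywords, on the assumption that introductory material appears early in the document. A quick usage sketch with made-up chunks:

# Hypothetical chunks from a document that has no TOC
chunks = [
    "This guide gives an overview of the setup procedure for the connector.",
    "Prerequisite: an S-user with administration rights.",
]
for q in generate_ai_fallback_suggestions(chunks):
    print(q)
# Matches "overview", "setup", "prerequisite", and "procedure",
# so four keyword-based questions are printed.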
@@ -152,14 +179,12 @@ st.title("📘 Enterprise Knowledge Assistant")
 st.caption("Query SAP documentation and enterprise PDFs using natural language and reasoning.")
 
 # ==========================================================
-# 🧭 Sidebar
+# 🧭 Sidebar
 # ==========================================================
 with st.sidebar:
-    # 🖼️ App Logo
     if os.path.exists(LOGO_PATH):
         st.image(LOGO_PATH, width=150)
 
-    # 🧠 Reasoning Mode Toggle
     if "reasoning_mode" not in st.session_state:
         st.session_state.reasoning_mode = False
 
@@ -170,8 +195,6 @@ with st.sidebar:
     )
 
     st.markdown("---")
-
-    # 📚 Document Library
     st.header("📚 Document Library")
     doc_choice = st.radio(
         "Choose a document:",
@@ -180,13 +203,10 @@ with st.sidebar:
     )
 
     st.markdown("---")
-
-    # ⚙️ Settings
     st.header("⚙️ Settings")
     chunk_size = st.slider("Chunk Size (characters)", 200, 1500, 800, step=50)
     overlap = st.slider("Chunk Overlap (characters)", 50, 200, 120, step=10)
     top_k = st.slider("Top K Results", 1, 10, 5)
-
    st.markdown("---")
    st.caption("👨‍💻 Built by Shubham Sharma")
 
@@ -212,26 +232,21 @@ elif doc_choice == "Sample PDF":
         toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
         st.text_area("TOC Preview", toc_text, height=200)
 
-        # 💡 Generate and display smart suggestions
         clean_titles = clean_toc_titles(toc)
         query_suggestions = generate_query_suggestions(clean_titles)
-
-
-
-
-
-
+    else:
+        st.warning("⚠️ No TOC detected – generating smart suggestions using content...")
+        query_suggestions = generate_ai_fallback_suggestions(chunks)
+
+    if query_suggestions:
+        st.markdown("#### 💡 Suggested Questions")
+        cols = st.columns(2)
+        for i, q in enumerate(query_suggestions):
+            if cols[i % 2].button(f"👉 {q}"):
+                st.session_state["user_query"] = q
 
-    # ✅ Cached Embeddings
     with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
         embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
-        hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
-        cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
-        if os.path.exists(cache_file):
-            st.info(f"🧠 Using cached embeddings for {os.path.basename(temp_path)}")
-        else:
-            st.warning(f"💡 Generated new embeddings for {os.path.basename(temp_path)}")
-
     index = build_faiss_index(embeddings)
 
 elif doc_choice == "Upload Custom PDF":
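Clicking a suggestion button triggers a Streamlit rerun, and writing to `st.session_state["user_query"]` before the query widget is created pre-fills it on that rerun. A minimal sketch of the pattern, assuming the query box is declared later with `key="user_query"` (the widget itself is outside this hunk):

import streamlit as st

if st.button("👉 What are the prerequisites?"):
    # Runs on the rerun caused by the click, before the widget below exists
    st.session_state["user_query"] = "What are the prerequisites?"

user_query = st.text_input("Ask a question:", key="user_query")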
@@ -252,37 +267,24 @@ elif doc_choice == "Upload Custom PDF":
         toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
         st.text_area("TOC Preview", toc_text, height=200)
 
-        # 💡 Generate and display smart suggestions
         clean_titles = clean_toc_titles(toc)
         query_suggestions = generate_query_suggestions(clean_titles)
-
-
-
-
-
-
+    else:
+        st.warning("⚠️ No TOC detected – generating smart suggestions using content...")
+        query_suggestions = generate_ai_fallback_suggestions(chunks)
+
+    if query_suggestions:
+        st.markdown("#### 💡 Suggested Questions")
+        cols = st.columns(2)
+        for i, q in enumerate(query_suggestions):
+            if cols[i % 2].button(f"👉 {q}"):
+                st.session_state["user_query"] = q
 
     with st.spinner("⚙️ Loading cached embeddings or generating new ones..."):
         embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
-        hash_name = hashlib.md5(os.path.basename(temp_path).encode()).hexdigest()
-        cache_file = f"/tmp/embed_cache/{hash_name}.pkl"
-        if os.path.exists(cache_file):
-            st.info(f"🧠 Using cached embeddings for {os.path.basename(temp_path)}")
-        else:
-            st.warning(f"💡 Generated new embeddings for {os.path.basename(temp_path)}")
-
     index = build_faiss_index(embeddings)
     st.success("🎉 Document processed successfully!")
 
-# ==========================================================
-# 📄 Document Preview
-# ==========================================================
-if chunks:
-    st.subheader("📄 Document Preview")
-    st.text_area("Extracted text (first 1000 chars)", text[:1000], height=200)
-    avg_len = int(sum(len(c) for c in chunks) / len(chunks))
-    st.caption(f"📦 {len(chunks)} chunks | Avg length: {avg_len} chars")
-
 # ==========================================================
 # 💬 Query Section
 # ==========================================================
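Both branches drop the inline md5/pickle bookkeeping because `cache_embeddings` (imported from qa) now owns it. Based on the removed lines, a plausible shape for that helper is the following sketch; the real implementation in qa.py may differ:

import hashlib
import os
import pickle

def cache_embeddings(doc_name, chunks, embed_fn, cache_dir="/tmp/embed_cache"):
    # Key the cache on the document's file name, as the removed code did
    os.makedirs(cache_dir, exist_ok=True)
    hash_name = hashlib.md5(doc_name.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, f"{hash_name}.pkl")
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as fh:
            return pickle.load(fh)
    embeddings = embed_fn(chunks)
    with open(cache_file, "wb") as fh:
        pickle.dump(embeddings, fh)
    return embeddings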
@@ -307,14 +309,12 @@ if index and chunks:
     retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
     answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
 
-    # ✅ Display Answer
     st.markdown("### ✅ Assistant’s Answer")
     st.markdown(
         f"<div style='background-color:#0E1117;padding:12px;border-radius:10px;color:white;'>{answer}</div>",
         unsafe_allow_html=True
     )
 
-    # 📚 Supporting Chunks
     with st.expander("📚 Supporting Chunks (Context Used)"):
         for i, r in enumerate(retrieved, start=1):
             st.markdown(