Update src/streamlit_app.py
Browse files- src/streamlit_app.py +61 -98
src/streamlit_app.py
CHANGED
|
@@ -24,17 +24,15 @@ st.set_page_config(
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ==========================================================
|
| 27 |
-
# π§Ή Cache Management
|
| 28 |
# ==========================================================
|
| 29 |
def clean_cache(max_size_gb: float = 2.0):
|
| 30 |
-
"""Cleans large cache folders (> max_size_gb)."""
|
| 31 |
folders = [
|
| 32 |
"/root/.cache/huggingface",
|
| 33 |
"/root/.cache/transformers",
|
| 34 |
"/root/.cache/torch",
|
| 35 |
]
|
| 36 |
total_deleted = 0.0
|
| 37 |
-
|
| 38 |
for folder in folders:
|
| 39 |
if os.path.exists(folder):
|
| 40 |
size_gb = sum(
|
|
@@ -45,13 +43,10 @@ def clean_cache(max_size_gb: float = 2.0):
|
|
| 45 |
if size_gb > max_size_gb or "torch" in folder:
|
| 46 |
shutil.rmtree(folder, ignore_errors=True)
|
| 47 |
total_deleted += size_gb
|
| 48 |
-
print(f"ποΈ Deleted {folder} ({size_gb:.2f} GB)")
|
| 49 |
os.makedirs("/tmp/hf_cache", exist_ok=True)
|
| 50 |
print(f"π§Ή Cache cleanup done. ~{total_deleted:.2f} GB removed.")
|
| 51 |
|
| 52 |
-
|
| 53 |
def check_disk_usage():
|
| 54 |
-
"""Display disk usage info in sidebar."""
|
| 55 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 56 |
try:
|
| 57 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
@@ -59,13 +54,11 @@ def check_disk_usage():
|
|
| 59 |
except Exception as e:
|
| 60 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 61 |
|
| 62 |
-
|
| 63 |
-
# Run cache cleanup once at startup
|
| 64 |
clean_cache()
|
| 65 |
check_disk_usage()
|
| 66 |
|
| 67 |
# ==========================================================
|
| 68 |
-
# βοΈ
|
| 69 |
# ==========================================================
|
| 70 |
CACHE_DIR = "/tmp/hf_cache"
|
| 71 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
@@ -81,13 +74,12 @@ os.environ.update({
|
|
| 81 |
# ==========================================================
|
| 82 |
from ingestion import extract_text_from_pdf, chunk_text
|
| 83 |
from vectorstore import build_faiss_index
|
| 84 |
-
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks
|
| 85 |
|
| 86 |
# ==========================================================
|
| 87 |
-
# π§ TOC
|
| 88 |
# ==========================================================
|
| 89 |
def clean_toc_titles(toc):
|
| 90 |
-
"""Removes section numbers and keeps only meaningful text."""
|
| 91 |
clean_titles = []
|
| 92 |
for _, title in toc:
|
| 93 |
title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
|
|
@@ -98,14 +90,12 @@ def clean_toc_titles(toc):
|
|
| 98 |
|
| 99 |
|
| 100 |
def generate_query_suggestions(toc_titles):
|
| 101 |
-
"""Converts section titles into conversational question suggestions."""
|
| 102 |
suggestions = []
|
| 103 |
for t in toc_titles:
|
| 104 |
lower = t.lower()
|
| 105 |
-
|
| 106 |
if "prerequisite" in lower:
|
| 107 |
suggestions.append("What are the prerequisites for setting this up?")
|
| 108 |
-
elif "restriction" in lower
|
| 109 |
suggestions.append("What are the key restrictions or limitations?")
|
| 110 |
elif "configuration" in lower or "setup" in lower:
|
| 111 |
suggestions.append(f"How do I {t.lower()}?")
|
|
@@ -117,12 +107,8 @@ def generate_query_suggestions(toc_titles):
|
|
| 117 |
suggestions.append("Can you show an example from this document?")
|
| 118 |
elif "process" in lower:
|
| 119 |
suggestions.append(f"Can you explain the {t.lower()} process?")
|
| 120 |
-
elif "use" in lower:
|
| 121 |
-
suggestions.append(f"How do I {t.lower()}?")
|
| 122 |
else:
|
| 123 |
suggestions.append(f"Explain the section about {t.lower()}.")
|
| 124 |
-
|
| 125 |
-
# Deduplicate & limit
|
| 126 |
seen, final = set(), []
|
| 127 |
for s in suggestions:
|
| 128 |
if s not in seen:
|
|
@@ -131,38 +117,48 @@ def generate_query_suggestions(toc_titles):
|
|
| 131 |
return final[:6]
|
| 132 |
|
| 133 |
|
| 134 |
-
def
|
| 135 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 136 |
if not chunks:
|
| 137 |
return []
|
| 138 |
|
| 139 |
-
# Take
|
| 140 |
-
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
if "setup" in head_text or "configuration" in head_text:
|
| 146 |
-
suggestions.append("How do I configure or set this up?")
|
| 147 |
-
if "prerequisite" in head_text:
|
| 148 |
-
suggestions.append("What are the prerequisites before using this process?")
|
| 149 |
-
if "troubleshoot" in head_text or "error" in head_text:
|
| 150 |
-
suggestions.append("How do I troubleshoot common errors?")
|
| 151 |
-
if "step" in head_text or "procedure" in head_text:
|
| 152 |
-
suggestions.append("Can you list the steps involved in this process?")
|
| 153 |
-
if "benefit" in head_text or "objective" in head_text:
|
| 154 |
-
suggestions.append("What is the objective or benefit of this guide?")
|
| 155 |
-
|
| 156 |
-
# Fallback generic questions if no keywords found
|
| 157 |
-
if not suggestions:
|
| 158 |
-
suggestions = [
|
| 159 |
-
"Can you summarize the main topic of this document?",
|
| 160 |
-
"What process does this guide explain?",
|
| 161 |
-
"How can I get started with the described setup?",
|
| 162 |
-
"What are the important details to remember?",
|
| 163 |
-
]
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
|
| 168 |
# ==========================================================
|
|
@@ -191,21 +187,17 @@ with st.sidebar:
|
|
| 191 |
st.session_state.reasoning_mode = st.toggle(
|
| 192 |
"π§ Enable Reasoning Mode",
|
| 193 |
value=st.session_state.reasoning_mode,
|
| 194 |
-
help="When ON: GPT-4o uses reasoning +
|
| 195 |
)
|
| 196 |
|
| 197 |
st.markdown("---")
|
| 198 |
st.header("π Document Library")
|
| 199 |
-
doc_choice = st.radio(
|
| 200 |
-
"Choose a document:",
|
| 201 |
-
["-- Select --", "Sample PDF", "Upload Custom PDF"],
|
| 202 |
-
index=0
|
| 203 |
-
)
|
| 204 |
|
| 205 |
st.markdown("---")
|
| 206 |
st.header("βοΈ Settings")
|
| 207 |
-
chunk_size = st.slider("Chunk Size
|
| 208 |
-
overlap = st.slider("Chunk Overlap
|
| 209 |
top_k = st.slider("Top K Results", 1, 10, 5)
|
| 210 |
st.markdown("---")
|
| 211 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
|
@@ -218,49 +210,21 @@ text, chunks, index, embeddings, toc = None, None, None, None, None
|
|
| 218 |
if doc_choice == "-- Select --":
|
| 219 |
st.info("β¬
οΈ Please choose a document from the sidebar.")
|
| 220 |
|
| 221 |
-
elif doc_choice
|
| 222 |
-
temp_path = SAMPLE_PATH
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
st.text_area("TOC Preview", toc_text, height=200)
|
| 234 |
-
|
| 235 |
-
clean_titles = clean_toc_titles(toc)
|
| 236 |
-
query_suggestions = generate_query_suggestions(clean_titles)
|
| 237 |
-
else:
|
| 238 |
-
st.warning("β οΈ No TOC detected β generating smart suggestions using content...")
|
| 239 |
-
query_suggestions = generate_ai_fallback_suggestions(chunks)
|
| 240 |
-
|
| 241 |
-
if query_suggestions:
|
| 242 |
-
st.markdown("#### π‘ Suggested Questions")
|
| 243 |
-
cols = st.columns(2)
|
| 244 |
-
for i, q in enumerate(query_suggestions):
|
| 245 |
-
if cols[i % 2].button(f"π {q}"):
|
| 246 |
-
st.session_state["user_query"] = q
|
| 247 |
-
|
| 248 |
-
with st.spinner("βοΈ Loading cached embeddings or generating new ones..."):
|
| 249 |
-
embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
|
| 250 |
-
index = build_faiss_index(embeddings)
|
| 251 |
-
|
| 252 |
-
elif doc_choice == "Upload Custom PDF":
|
| 253 |
-
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
| 254 |
-
if uploaded_file:
|
| 255 |
-
temp_path = os.path.join("/tmp", uploaded_file.name)
|
| 256 |
-
with open(temp_path, "wb") as f:
|
| 257 |
-
f.write(uploaded_file.getbuffer())
|
| 258 |
-
st.success(f"β
File '{uploaded_file.name}' uploaded successfully")
|
| 259 |
-
|
| 260 |
-
with st.spinner("βοΈ Extracting and processing your document..."):
|
| 261 |
text, toc = extract_text_from_pdf(temp_path)
|
| 262 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 263 |
-
st.write(f"
|
| 264 |
|
| 265 |
if toc:
|
| 266 |
st.markdown("### π§ Detected Table of Contents")
|
|
@@ -270,8 +234,8 @@ elif doc_choice == "Upload Custom PDF":
|
|
| 270 |
clean_titles = clean_toc_titles(toc)
|
| 271 |
query_suggestions = generate_query_suggestions(clean_titles)
|
| 272 |
else:
|
| 273 |
-
st.warning("β οΈ No TOC detected β generating
|
| 274 |
-
query_suggestions =
|
| 275 |
|
| 276 |
if query_suggestions:
|
| 277 |
st.markdown("#### π‘ Suggested Questions")
|
|
@@ -325,6 +289,5 @@ if index and chunks:
|
|
| 325 |
""",
|
| 326 |
unsafe_allow_html=True,
|
| 327 |
)
|
| 328 |
-
|
| 329 |
else:
|
| 330 |
st.info("π₯ Upload or select a document to start exploring.")
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
# ==========================================================
|
| 27 |
+
# π§Ή Cache Management
|
| 28 |
# ==========================================================
|
| 29 |
def clean_cache(max_size_gb: float = 2.0):
|
|
|
|
| 30 |
folders = [
|
| 31 |
"/root/.cache/huggingface",
|
| 32 |
"/root/.cache/transformers",
|
| 33 |
"/root/.cache/torch",
|
| 34 |
]
|
| 35 |
total_deleted = 0.0
|
|
|
|
| 36 |
for folder in folders:
|
| 37 |
if os.path.exists(folder):
|
| 38 |
size_gb = sum(
|
|
|
|
| 43 |
if size_gb > max_size_gb or "torch" in folder:
|
| 44 |
shutil.rmtree(folder, ignore_errors=True)
|
| 45 |
total_deleted += size_gb
|
|
|
|
| 46 |
os.makedirs("/tmp/hf_cache", exist_ok=True)
|
| 47 |
print(f"π§Ή Cache cleanup done. ~{total_deleted:.2f} GB removed.")
|
| 48 |
|
|
|
|
| 49 |
def check_disk_usage():
|
|
|
|
| 50 |
st.sidebar.markdown("### πΎ Disk Usage (Debug)")
|
| 51 |
try:
|
| 52 |
usage = os.popen("du -sh /root/.cache /tmp 2>/dev/null").read()
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
st.sidebar.text(f"β οΈ Disk usage check failed: {e}")
|
| 56 |
|
|
|
|
|
|
|
| 57 |
clean_cache()
|
| 58 |
check_disk_usage()
|
| 59 |
|
| 60 |
# ==========================================================
|
| 61 |
+
# βοΈ HF Cache Configuration
|
| 62 |
# ==========================================================
|
| 63 |
CACHE_DIR = "/tmp/hf_cache"
|
| 64 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
| 74 |
# ==========================================================
|
| 75 |
from ingestion import extract_text_from_pdf, chunk_text
|
| 76 |
from vectorstore import build_faiss_index
|
| 77 |
+
from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate # add genai_generate!
|
| 78 |
|
| 79 |
# ==========================================================
|
| 80 |
+
# π§ TOC & Dynamic AI Suggestion System
|
| 81 |
# ==========================================================
|
| 82 |
def clean_toc_titles(toc):
|
|
|
|
| 83 |
clean_titles = []
|
| 84 |
for _, title in toc:
|
| 85 |
title = re.sub(r"^\d+(\.\d+)*\s*", "", title)
|
|
|
|
| 90 |
|
| 91 |
|
| 92 |
def generate_query_suggestions(toc_titles):
|
|
|
|
| 93 |
suggestions = []
|
| 94 |
for t in toc_titles:
|
| 95 |
lower = t.lower()
|
|
|
|
| 96 |
if "prerequisite" in lower:
|
| 97 |
suggestions.append("What are the prerequisites for setting this up?")
|
| 98 |
+
elif "restriction" in lower:
|
| 99 |
suggestions.append("What are the key restrictions or limitations?")
|
| 100 |
elif "configuration" in lower or "setup" in lower:
|
| 101 |
suggestions.append(f"How do I {t.lower()}?")
|
|
|
|
| 107 |
suggestions.append("Can you show an example from this document?")
|
| 108 |
elif "process" in lower:
|
| 109 |
suggestions.append(f"Can you explain the {t.lower()} process?")
|
|
|
|
|
|
|
| 110 |
else:
|
| 111 |
suggestions.append(f"Explain the section about {t.lower()}.")
|
|
|
|
|
|
|
| 112 |
seen, final = set(), []
|
| 113 |
for s in suggestions:
|
| 114 |
if s not in seen:
|
|
|
|
| 117 |
return final[:6]
|
| 118 |
|
| 119 |
|
| 120 |
+
def generate_ai_dynamic_suggestions(chunks, doc_name="Document"):
    """
    🤖 Uses GPT-4o via SAP GenAI Hub to analyze first few chunks
    and generate dynamic, context-aware question suggestions.

    Args:
        chunks: List of text chunks extracted from the document.
        doc_name: Display name of the document, interpolated into the prompt.

    Returns:
        A list of up to 6 suggested question strings. Returns [] for empty
        input; falls back to generic questions when the model call fails
        or yields nothing usable.
    """
    if not chunks:
        return []

    # Take top 3 chunks as context, capped at 3000 chars to keep the prompt small
    sample_text = " ".join(chunks[:3])[:3000]
    prompt = f"""
    You are an intelligent assistant helping users explore enterprise documentation titled '{doc_name}'.

    Based on the content below, generate 5 short, interactive, human-like questions
    that a curious user might ask to understand this document better.
    Avoid section numbers, and sound conversational.

    ---
    Content Sample:
    {sample_text}
    ---
    Questions:
    """

    try:
        ai_response = genai_generate(prompt)  # Uses your existing GPT-4o connector
        clean_q = _parse_suggested_questions(ai_response)
        return clean_q[:6] if clean_q else [
            "What is this document about?",
            "How do I start using the process described here?",
            "What key setup steps are involved?",
            "What benefits or objectives are explained?",
        ]
    except Exception as e:
        # Best-effort: suggestions are a UX nicety — never crash the app here.
        print(f"⚠️ AI suggestion generation failed: {e}")
        return [
            "Can you summarize the document?",
            "What is the main idea here?",
            "How does this guide help me?",
        ]


def _parse_suggested_questions(ai_response):
    """Extract clean question strings from a raw model response.

    Strips bullet markers AND numbered-list prefixes ("1.", "2)") — the
    previous per-character strip("•- ") let model-emitted enumeration leak
    into the UI. Keeps only plausible-length lines ending with "?".
    """
    questions = []
    for line in ai_response.splitlines():
        # Drop leading bullets/enumeration (e.g. "- ", "• ", "* ", "3. ", "4) ").
        q = re.sub(r"^\s*(?:[-•*]+|\d+[.)])\s*", "", line).strip()
        if 8 < len(q) < 120 and q.endswith("?"):
            questions.append(q)
    return questions
|
| 162 |
|
| 163 |
|
| 164 |
# ==========================================================
|
|
|
|
| 187 |
st.session_state.reasoning_mode = st.toggle(
|
| 188 |
"π§ Enable Reasoning Mode",
|
| 189 |
value=st.session_state.reasoning_mode,
|
| 190 |
+
help="When ON: GPT-4o uses reasoning + synthesis.\nWhen OFF: strictly factual."
|
| 191 |
)
|
| 192 |
|
| 193 |
st.markdown("---")
|
| 194 |
st.header("π Document Library")
|
| 195 |
+
doc_choice = st.radio("Choose a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
st.markdown("---")
|
| 198 |
st.header("βοΈ Settings")
|
| 199 |
+
chunk_size = st.slider("Chunk Size", 200, 1500, 800, step=50)
|
| 200 |
+
overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
|
| 201 |
top_k = st.slider("Top K Results", 1, 10, 5)
|
| 202 |
st.markdown("---")
|
| 203 |
st.caption("π¨βπ» Built by Shubham Sharma")
|
|
|
|
| 210 |
if doc_choice == "-- Select --":
|
| 211 |
st.info("β¬
οΈ Please choose a document from the sidebar.")
|
| 212 |
|
| 213 |
+
elif doc_choice in ["Sample PDF", "Upload Custom PDF"]:
|
| 214 |
+
temp_path = SAMPLE_PATH if doc_choice == "Sample PDF" else None
|
| 215 |
+
if doc_choice == "Upload Custom PDF":
|
| 216 |
+
uploaded_file = st.file_uploader("π Upload your PDF", type="pdf")
|
| 217 |
+
if uploaded_file:
|
| 218 |
+
temp_path = os.path.join("/tmp", uploaded_file.name)
|
| 219 |
+
with open(temp_path, "wb") as f:
|
| 220 |
+
f.write(uploaded_file.getbuffer())
|
| 221 |
+
st.success(f"β
File '{uploaded_file.name}' uploaded successfully")
|
| 222 |
+
|
| 223 |
+
if temp_path:
|
| 224 |
+
with st.spinner("π Extracting and processing document..."):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
text, toc = extract_text_from_pdf(temp_path)
|
| 226 |
chunks = chunk_text(text, chunk_size=chunk_size)
|
| 227 |
+
st.write(f"π Extracted {len(chunks)} chunks.")
|
| 228 |
|
| 229 |
if toc:
|
| 230 |
st.markdown("### π§ Detected Table of Contents")
|
|
|
|
| 234 |
clean_titles = clean_toc_titles(toc)
|
| 235 |
query_suggestions = generate_query_suggestions(clean_titles)
|
| 236 |
else:
|
| 237 |
+
st.warning("β οΈ No TOC detected β generating dynamic suggestions using AI...")
|
| 238 |
+
query_suggestions = generate_ai_dynamic_suggestions(chunks, doc_name=os.path.basename(temp_path))
|
| 239 |
|
| 240 |
if query_suggestions:
|
| 241 |
st.markdown("#### π‘ Suggested Questions")
|
|
|
|
| 289 |
""",
|
| 290 |
unsafe_allow_html=True,
|
| 291 |
)
|
|
|
|
| 292 |
else:
|
| 293 |
st.info("π₯ Upload or select a document to start exploring.")
|