Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 18

Commit

f571fb6

verified ·

1 Parent(s): a1fa58d

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +238 -184

src/streamlit_app.py CHANGED Viewed

@@ -1,16 +1,49 @@
 import os
 import re
 import streamlit as st
 import torch
 # ==========================================================
-# ✅ Environment Setup
 # ==========================================================
 st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 print("CUDA available:", torch.cuda.is_available())
 # ==========================================================
-# ⚙️ Hugging Face Cache Setup
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
@@ -22,221 +55,242 @@ os.environ.update({
 })
 # ==========================================================
-# 📦 Imports
 # ==========================================================
 from ingestion import extract_text_from_pdf, chunk_text
 from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
 # ==========================================================
-# 🧠 Smart Suggestion Generator
 # ==========================================================
-def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
-    """Generate contextual, short, and relevant questions dynamically from the document."""
     if not toc or not chunks:
         return []
     titles = []
     for sec, raw_title in toc:
-        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
-        title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
-        if 4 < len(title) < 120:
-            titles.append(title)
-    context_sample = " ".join(chunks[:3])[:4000]
     prompt = f"""
-    You are generating user-friendly, context-aware questions based on the document "{doc_name}".
-    Use the Table of Contents and text sample below.
-    TABLE OF CONTENTS:
-    {chr(10).join(['- ' + t for t in titles[:8]])}
-    TEXT SAMPLE:
-    {context_sample}
-    Generate 5–7 concise, practical questions (max 18 words each) that help a user understand or use this document.
-    """
     try:
-        ai_response = genai_generate(prompt)
-        questions = re.findall(r"[-•]?\s*(.+?)\?", ai_response)
-        clean_qs = [q.strip("•-— ").strip() + "?" for q in questions if 8 < len(q) < 120]
-        seen, final = set(), []
-        for q in clean_qs:
-            if q.lower() not in seen:
-                seen.add(q.lower())
                 final.append(q)
-        return final[:7]
     except Exception:
-        return ["What is this document about?", "How can I start using this guide?"]
-# ==========================================================
-# 🎨 Styling — Customer-Ready Theme
-# ==========================================================
-st.markdown("""
-<style>
-div.block-container {padding-top: 1.5rem; max-width: 1000px;}
-h1, h2, h3, h4 {font-weight: 600; color: #f3f4f6;}
-hr {border: none; border-top: 1px solid #2c2c2c; margin: 1rem 0;}
-.suggest-chip {
-    background-color: #1f2937;
-    border: 1px solid #374151;
-    border-radius: 16px;
-    color: #e5e7eb;
-    padding: 6px 12px;
-    cursor: pointer;
-    font-size: 13px;
-    transition: all 0.2s ease-in-out;
-}
-.suggest-chip:hover {
-    background-color: #2563eb;
-    border-color: #3b82f6;
-    color: white;
-    box-shadow: 0 0 8px rgba(59,130,246,0.4);
-}
-.answer-box {
-    background: linear-gradient(135deg, #0f172a, #1e293b);
-    border-left: 4px solid #3b82f6;
-    border-radius: 8px;
-    padding: 14px 16px;
-    color: #f1f5f9;
-    margin-top: 1rem;
-    box-shadow: 0 0 10px rgba(59,130,246,0.1);
-}
-.stTextInput > div > div > input {
-    background-color: #0f172a;
-    color: #f1f5f9;
-    border-radius: 6px;
-    border: 1px solid #334155;
-    padding: 6px 10px;
-}
-.stTextArea > div > div > textarea {
-    background-color: #0f172a;
-    color: #f1f5f9;
-    border-radius: 6px;
-    border: 1px solid #334155;
-}
-</style>
-""", unsafe_allow_html=True)
 # ==========================================================
-# 🧭 Sidebar
 # ==========================================================
 with st.sidebar:
-    st.markdown("### 💬 Answer Style")
-    if "reasoning_mode" not in st.session_state:
-        st.session_state.reasoning_mode = False
-    style = st.radio(
-        "Choose how detailed answers should be:",
-        ["Concise", "Detailed"],
-        index=0 if not st.session_state.reasoning_mode else 1,
-    )
-    st.session_state.reasoning_mode = style == "Detailed"
     st.markdown("---")
-    st.markdown("### ⚙️ Advanced Settings")
-    with st.expander("Show Advanced Options"):
-        chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
-        overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
-        top_k = st.slider("Top K Results", 1, 10, 5)
     st.markdown("---")
-    st.caption("✨ Built by Shubham Sharma")
 # ==========================================================
-# 📄 Main Flow
 # ==========================================================
 st.title("Enterprise Knowledge Assistant")
-st.caption("Upload or select a document to ask intelligent, document-based questions.")
 text, chunks, index, embeddings, toc = None, None, None, None, None
-# Initialize session variables safely
-if "user_query_input" not in st.session_state:
-    st.session_state["user_query_input"] = ""
-if "show_more" not in st.session_state:
-    st.session_state["show_more"] = False
-def set_user_query(q):
-    st.session_state["user_query_input"] = q
-# ----------------------------------------------------------
-# 📂 Document Selection
-# ----------------------------------------------------------
-doc_choice = st.radio("Select a document source:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
-if doc_choice == "-- Select --":
-    st.info("📄 Please choose or upload a document to get started.")
-else:
-    if doc_choice == "Sample PDF":
-        temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
-        st.success("📘 Using built-in Sample PDF.")
-    else:
-        uploaded_file = st.file_uploader("📂 Upload your PDF", type="pdf")
-        if uploaded_file:
-            temp_path = os.path.join("/tmp", uploaded_file.name)
-            with open(temp_path, "wb") as f:
-                f.write(uploaded_file.getbuffer())
-            st.success(f"✅ '{uploaded_file.name}' uploaded successfully.")
-        else:
-            temp_path = None
-    # ----------------------------------------------------------
-    # 🧠 Process the Document
-    # ----------------------------------------------------------
-    if temp_path:
-        with st.spinner("🔍 Analyzing your document..."):
-            text, toc = extract_text_from_pdf(temp_path)
-            chunks = chunk_text(text, chunk_size=1000)
-            query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, os.path.basename(temp_path))
-        with st.spinner("⚙️ Preparing intelligent search..."):
-            embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
-            index = build_faiss_index(embeddings)
-        st.success("✅ Your document is ready! Ask the assistant below.")
-        # ----------------------------------------------------------
-        # 💬 Ask the Assistant
-        # ----------------------------------------------------------
-        st.markdown("### 💬 Ask the Assistant")
-        if query_suggestions:
-            visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
-            cols = st.columns(min(3, len(visible)))
-            for i, q in enumerate(visible):
-                cols[i % 3].button(f"🔍 {q}", key=f"suggest_{i}", on_click=set_user_query, args=(q,))
-            toggle_text = "More Suggestions ▼" if not st.session_state.show_more else "Fewer Suggestions ▲"
-            if st.button(toggle_text):
-                st.session_state.show_more = not st.session_state.show_more
-                st.experimental_rerun()
-        user_query = st.text_input("Type your question or click a suggestion:", key="user_query_input")
-        if user_query.strip():
-            with st.spinner("💭 Thinking..."):
-                retrieved = retrieve_chunks(user_query, index, chunks, top_k=5, embeddings=embeddings)
-                answer = generate_answer(user_query, retrieved, reasoning_mode=st.session_state.reasoning_mode)
-            st.markdown("### ✅ Assistant’s Answer")
-            st.markdown(f"<div class='answer-box'>💡 {answer}</div>", unsafe_allow_html=True)
-            with st.expander("See how this was answered (Advanced)"):
-                for i, r in enumerate(retrieved, start=1):
-                    st.markdown(f"**Chunk {i}:** {r}")
-        # ----------------------------------------------------------
-        # 📚 Table of Contents
-        # ----------------------------------------------------------
-        if toc:
-            with st.expander("📚 View Table of Contents", expanded=True):
-                toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
-                st.text_area("", toc_text, height=150)
-        # ----------------------------------------------------------
-        # 📄 Document Preview
-        # ----------------------------------------------------------
         if chunks:
-            with st.expander("📄 View Extracted Text"):
-                st.text_area("", text[:1000], height=150)
-                st.caption(f"{len(chunks)} sections processed.")

+# streamlit_app.py
 import os
 import re
+import shutil
 import streamlit as st
 import torch
 # ==========================================================
+# ✅ Environment & Page
 # ==========================================================
 st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 print("CUDA available:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    try:
+        print("GPU:", torch.cuda.get_device_name(0))
+    except Exception:
+        pass
+# minimal cache cleanup (safe)
+def clean_cache(max_size_gb: float = 2.0):
+    folders = [
+        "/root/.cache/huggingface",
+        "/root/.cache/transformers",
+        "/root/.cache/torch",
+    ]
+    total_deleted = 0.0
+    for folder in folders:
+        if os.path.exists(folder):
+            try:
+                size_gb = sum(
+                    os.path.getsize(os.path.join(dp, f))
+                    for dp, _, files in os.walk(folder)
+                    for f in files
+                ) / (1024**3)
+            except Exception:
+                size_gb = 0.0
+            if size_gb > max_size_gb or "torch" in folder:
+                shutil.rmtree(folder, ignore_errors=True)
+                total_deleted += size_gb
+    os.makedirs("/tmp/hf_cache", exist_ok=True)
+    return total_deleted
+clean_cache()
 # ==========================================================
+# ⚙️ HF cache env (keeps HuggingFace caches local)
 # ==========================================================
 CACHE_DIR = "/tmp/hf_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 })
 # ==========================================================
+# 📦 Project imports (assumed to exist in your repo)
+# - ingestion.extract_text_from_pdf, chunk_text
+# - vectorstore.build_faiss_index
+# - qa.retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
 # ==========================================================
 from ingestion import extract_text_from_pdf, chunk_text
 from vectorstore import build_faiss_index
 from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
 # ==========================================================
+# 🎨 UI styles (concise, production friendly)
+# ==========================================================
+st.markdown(
+    """
+    <style>
+    div.block-container { padding-top: 1.2rem; max-width: 1050px; }
+    .status-line { background:#0f172a; border-left:4px solid #10b981; padding:10px 14px; border-radius:8px; color:#d1fae5; margin-bottom:10px; }
+    .suggest-chip { background:#111827; border:1px solid #2b3440; border-radius:16px; padding:8px 14px; color:#e5e7eb; margin:6px 6px 6px 0; cursor:pointer; display:inline-block; font-size:13px; }
+    .suggest-chip:hover { background:#2563eb; border-color:#3b82f6; color:#fff; box-shadow:0 0 8px rgba(59,130,246,0.25); }
+    .answer-box { background: linear-gradient(135deg,#0b1220,#0f1b2b); border-left:4px solid #3b82f6; padding:14px; border-radius:8px; color:#f1f5f9; box-shadow:0 6px 18px rgba(2,6,23,0.5); }
+    .small-muted { color:#9ca3af; font-size:13px; margin-top:6px; }
+    .sidebar-small { font-size:14px; color:#d1d5db; }
+    .section-title { font-weight:700; font-size:20px; margin-top:8px; margin-bottom:10px; color:#f3f4f6; }
+    .compact-expander > div[role="button"] { padding:10px 12px; border-radius:8px; background:#0f172a; border:1px solid #1f2937; color:#e5e7eb;}
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+# ==========================================================
+# 🔧 Helper: safe session-state initialization
 # ==========================================================
+default_state = {
+    "user_query_input": "",
+    "show_more": False,
+    "selected_suggestion": None,
+    "response_mode": "strict",  # 'strict' or 'extended'
+    "last_doc_path": None,
+}
+for k, v in default_state.items():
+    if k not in st.session_state:
+        st.session_state[k] = v
+# ==========================================================
+# 🧠 Suggestion generator (uses TOC + text sample; robust fallback)
+# ==========================================================
+def generate_suggestions_from_toc(toc, chunks, doc_name="Document"):
+    """Try AI first (genai_generate), otherwise deterministic fallback based on TOC."""
     if not toc or not chunks:
         return []
+    # clean titles
     titles = []
     for sec, raw_title in toc:
+        t = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
+        t = re.sub(r"\.{2,}\s*\d+$", "", t).strip()
+        if 4 < len(t) < 120:
+            titles.append(t)
+    # prompt
+    sample = " ".join(chunks[:3])[:3000]
     prompt = f"""
+You are generating concise, document-based suggestion questions for a user exploring the document named "{doc_name}".
+TABLE OF CONTENTS:
+{chr(10).join(['- '+t for t in titles[:8]])}
+SAMPLE:
+{sample}
+Generate 5 short, professional questions (each under 18 words) that a user could ask about this document. Focus strictly on the document content.
+"""
     try:
+        ai_resp = genai_generate(prompt)
+        # extract lines that look like questions
+        found = re.findall(r"[-•]?\s*(.+?)\?", ai_resp)
+        qs = []
+        for s in found:
+            s = s.strip("•-— ").strip()
+            if 8 < len(s) < 120:
+                if not s.endswith("?"):
+                    s = s + "?"
+                qs.append(s)
+        # dedupe while preserving order
+        seen = set()
+        final = []
+        for q in qs:
+            low = q.lower()
+            if low not in seen:
+                seen.add(low)
                 final.append(q)
+        if final:
+            return final[:7]
     except Exception:
+        pass
+    # deterministic fallback: form 'What is ...' or 'How do I ...' from TOC lines
+    fallback = []
+    for t in titles[:10]:
+        lt = t.lower()
+        if re.search(r"\b(setup|configure|installation|setup|enable|configure|install)\b", lt):
+            fallback.append(f"How do I {lt.strip()}?")
+        elif re.search(r"\b(overview|purpose|objective|introduction|summary)\b", lt):
+            fallback.append(f"What is the {lt.strip()}?")
+        else:
+            fallback.append(f"What does '{t}' cover?")
+    # cleanup & unique
+    out = []
+    seen = set()
+    for q in fallback:
+        q = q.strip()
+        if q.lower() not in seen and 10 < len(q) < 120:
+            seen.add(q.lower())
+            out.append(q)
+    return out[:7]
 # ==========================================================
+# 🎛 Sidebar (controls)
 # ==========================================================
 with st.sidebar:
+    st.markdown("### Response Mode")
+    # user-facing naming: Strict vs Extended
+    mode = st.radio("", ["Strict (Document-only)", "Extended (Document + general)"], index=0, key="ui_response_mode")
+    # map to internal key
+    st.session_state.response_mode = "strict" if "Strict" in mode else "extended"
     st.markdown("---")
+    with st.expander("Advanced Settings (for power users)", expanded=False):
+        st.markdown("**Indexing & retrieval**")
+        chunk_size = st.slider("Chunk size (chars)", 200, 1500, 1000, step=50)
+        overlap = st.slider("Chunk overlap (chars)", 50, 300, 120, step=10)
+        top_k = st.slider("Top K results", 1, 10, 5)
+        st.session_state["adv_chunk_size"] = chunk_size
+        st.session_state["adv_overlap"] = overlap
+        st.session_state["adv_top_k"] = top_k
     st.markdown("---")
+    st.caption("✨ Built by Shubham Sharma", unsafe_allow_html=True)
 # ==========================================================
+# 📄 Main content flow
 # ==========================================================
 st.title("Enterprise Knowledge Assistant")
+st.caption("Query SAP documentation and enterprise PDFs — powered by retrieval and reasoning.", unsafe_allow_html=True)
+# Document selection
+st.markdown("#### Select a document")
+doc_choice = st.radio("", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0, key="doc_choice_radio")
+temp_path = None
+if doc_choice == "Sample PDF":
+    sample_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
+    temp_path = sample_path
+elif doc_choice == "Upload Custom PDF":
+    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
+    if uploaded_file:
+        temp_path = os.path.join("/tmp", uploaded_file.name)
+        with open(temp_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+# If user selects Sample PDF by mistake (user wanted default select), keep default as Select.
+# (We set index=0 above, so default is Select.)
+# If temp_path is set, process document:
 text, chunks, index, embeddings, toc = None, None, None, None, None
+if temp_path:
+    # avoid re-processing same file repeatedly in the same session unless path changes
+    if st.session_state.get("last_doc_path") != temp_path:
+        st.session_state.last_doc_path = temp_path
+    with st.spinner("Processing document..."):
+        text, toc = extract_text_from_pdf(temp_path)
+        # chunk size from advanced settings if present else default
+        chunk_size = st.session_state.get("adv_chunk_size", 1000)
+        chunks = chunk_text(text, chunk_size=chunk_size)
+        query_suggestions = generate_suggestions_from_toc(toc, chunks, os.path.basename(temp_path))
+    with st.spinner("Preparing embeddings and index..."):
+        embeddings = cache_embeddings(os.path.basename(temp_path), chunks, embed_chunks)
+        index = build_faiss_index(embeddings)
+    # single, subtle status line
+    st.markdown(f"<div class='status-line'>✅ Your document is ready. Ask the assistant below.</div>", unsafe_allow_html=True)
+    # ------------------------
+    # Suggested questions (compact chips)
+    # ------------------------
+    st.markdown("<div class='section-title'>Ask the Assistant</div>", unsafe_allow_html=True)
+    if query_suggestions:
+        visible = query_suggestions if st.session_state.show_more else query_suggestions[:3]
+        for i, q in enumerate(visible):
+            # show suggestion chips; clicking sets the input and clears selection for re-query
+            if st.button(q, key=f"sugg_btn_{i}"):
+                st.session_state.user_query_input = q
+                st.session_state.selected_suggestion = i
+        # show toggle
+        toggle_text = "Show less ▲" if st.session_state.show_more else "More suggestions ▼"
+        if st.button(toggle_text, key="toggle_more"):
+            st.session_state.show_more = not st.session_state.show_more
+            st.experimental_rerun()
+    # input
+    user_query = st.text_input("Type your question or pick one above:", key="user_query_input", value=st.session_state.user_query_input)
+    # Answer generation
+    if user_query and user_query.strip():
+        # small caption about mode
+        mode_label = "Strict (document-only)" if st.session_state.response_mode == "strict" else "Extended (document + general)"
+        st.markdown(f"<div class='small-muted'>Mode: {mode_label}</div>", unsafe_allow_html=True)
+        with st.spinner("Retrieving context and generating answer..."):
+            # use top_k from adv settings if available
+            top_k = st.session_state.get("adv_top_k", 5)
+            retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
+            # generate_answer should accept a reasoning_mode flag or similar; map our response_mode
+            reasoning_mode_flag = True if st.session_state.response_mode == "extended" else False
+            answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode_flag)
+        # present answer in a card
+        st.markdown("<div class='section-title'>Assistant</div>", unsafe_allow_html=True)
+        st.markdown(f"<div class='answer-box'>💡 {answer}</div>", unsafe_allow_html=True)
+        st.caption("Answer is based on the uploaded document; Extended mode may include general insights.", unsafe_allow_html=True)
+        # supporting context (collapsed)
+        with st.expander("Supporting context (document chunks)"):
+            for i, c in enumerate(retrieved, start=1):
+                st.markdown(f"**Chunk {i}:** {c}")
+    # ------------------------
+    # Optional: Document explorer (single expander containing TOC + preview)
+    # ------------------------
+    with st.expander("Explore document (TOC & preview)", expanded=False):
+        if toc:
+            st.markdown("**Table of Contents**")
+            toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc])
+            st.text_area("", toc_text, height=140)
         if chunks:
+            st.markdown("**Extracted text preview**")
+            st.text_area("", text[:1600], height=180)
+            st.caption(f"{len(chunks)} chunks processed.", unsafe_allow_html=True)
+# If no document selected, show gentle onboarding hint
+else:
+    st.info("Select 'Sample PDF' or upload a PDF to begin. Suggested questions are extracted from the document itself (no pre-seeded suggestions).")