Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 25, 2025

Commit

3fbd2b9

verified ·

1 Parent(s): c9a83aa

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +76 -234

src/streamlit_app.py CHANGED Viewed

@@ -1,51 +1,17 @@
 # ==========================================================
-# streamlit_app.py — Stable Layout (English Only)
 # ==========================================================
-import os
-import re
-import streamlit as st
-import torch
-from document_registry import DocumentRegistry
-# ==========================================================
-# ✅ PAGE CONFIGS
-# ==========================================================
-st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
-print("CUDA available:", torch.cuda.is_available())
-# ==========================================================
-# ⚙️ SAFE RERUN HANDLER
-# ==========================================================
-def trigger_safe_rerun():
-    """Mark rerun flag for next render instead of rerunning immediately."""
-    st.session_state["_safe_rerun"] = True
-if st.session_state.get("_safe_rerun"):
-    st.session_state["_safe_rerun"] = False
-    st.rerun()
-# ==========================================================
-# ⚙️ CACHE SETUP
-# ==========================================================
-CACHE_DIR = "/tmp/hf_cache"
-os.makedirs(CACHE_DIR, exist_ok=True)
-os.environ.update({
-    "HF_HOME": CACHE_DIR,
-    "TRANSFORMERS_CACHE": CACHE_DIR,
-    "HF_DATASETS_CACHE": CACHE_DIR,
-    "HF_MODULES_CACHE": CACHE_DIR,
-})
-# ==========================================================
-# 📦 IMPORTS
-# ==========================================================
-from ingestion import extract_text_from_pdf, chunk_text
-from vectorstore import build_faiss_index
-from qa import retrieve_chunks, generate_answer, cache_embeddings, embed_chunks, genai_generate
-# ==========================================================
-# 🧠 HELPER: Suggestion Refresher
-# ==========================================================
 def refresh_suggestions(doc_name, toc, chunks):
     """Refresh dynamic suggestions and reset related states."""
     st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
@@ -55,176 +21,23 @@ def refresh_suggestions(doc_name, toc, chunks):
     st.session_state["selected_suggestion"] = None
     st.session_state["show_more"] = False
-# ==========================================================
-# 🧠 SMART SUGGESTION GENERATOR (English Only)
-# ==========================================================
-def generate_dynamic_suggestions_from_toc(toc, chunks, doc_name="Document"):
-    """
-    Generates 5–7 short, natural English questions based on TOC and document text.
-    """
-    if not toc or not chunks:
-        return ["How do I start using this guide?", "What does this document cover?"]
-    titles = []
-    for sec, raw_title in toc:
-        title = re.sub(r"^\s*[\dA-Za-z.\-]+\s*", "", raw_title)
-        title = re.sub(r"\.{2,}\s*\d+$", "", title).strip()
-        if 4 < len(title) < 120:
-            titles.append(title)
-    context_sample = " ".join(chunks[:3])[:4000]
-    prompt = f"""
-You are a content assistant. Based on the Table of Contents and the sample document text below,
-generate 5–7 short, natural user-facing questions.
-Each question should be under 18 words, end with a question mark, and sound human.
-Document: "{doc_name}"
-TABLE OF CONTENTS:
-{chr(10).join(['- ' + t for t in titles[:8]])}
-SAMPLE TEXT:
-{context_sample}
-Output: Write each question on a new line. Do not invent facts — base questions only on the document.
-"""
-    try:
-        ai_response = genai_generate(prompt)
-        lines = [ln.strip() for ln in ai_response.splitlines() if ln.strip()]
-        questions = []
-        for ln in lines:
-            q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
-            if not q.endswith("?") and len(q.split()) < 18 and re.match(
-                r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q
-            ):
-                q += "?"
-            if 8 <= len(q) <= 140:
-                questions.append(q)
-        final = []
-        seen = set()
-        for q in questions:
-            if q.lower() not in seen:
-                seen.add(q.lower())
-                final.append(q)
-        if not final:
-            final = [f"What should I know about {t.rstrip('.')}?" for t in titles[:7]]
-        return final[:7]
-    except Exception:
-        return ["How do I start using this guide?", "What does this document cover?"]
-# ==========================================================
-# 🎨 STYLING
-# ==========================================================
-st.markdown("""
-<style>
-div.block-container {padding-top: 1.2rem; max-width: 1080px;}
-h1, h2, h3 {color: #f3f4f6; font-weight: 600;}
-.suggest-chip {
-    background: #0f1724;
-    border: 1px solid #374151;
-    border-radius: 14px;
-    color: #e6eef8;
-    padding: 8px 12px;
-    cursor: pointer;
-    font-size: 13px;
-    margin: 6px 6px 6px 0;
-    display: inline-block;
-    transition: background 0.2s, transform 0.1s;
-}
-.suggest-chip:hover {background: #1e3a8a; transform: translateY(-2px);}
-.answer-box {
-    background: linear-gradient(180deg,#0b1220,#071027);
-    border-left: 4px solid #3b82f6;
-    border-radius: 8px;
-    padding: 16px 18px;
-    color: #e6eef8;
-    margin-top: 12px;
-    box-shadow: 0 4px 14px rgba(0,0,0,0.35);
-}
-.stTextInput > div > div > input {
-    background-color: #0f172a !important;
-    color: #f1f5f9 !important;
-    border-radius: 6px !important;
-    border: 1px solid #334155 !important;
-    padding: 8px 10px !important;
-    font-size: 15px !important;
-}
-.stTextInput > label {font-weight: 500;}
-.small-link {font-size: 13px; color: #60a5fa; cursor: pointer;}
-</style>
-""", unsafe_allow_html=True)
-# ==========================================================
-# 🧭 SIDEBAR
-# ==========================================================
-with st.sidebar:
-    st.markdown("### 🧭 Response Style")
-    mode = st.radio(
-        "",
-        ("Strict (Document-only)", "Extended (Document + General)"),
-        index=0,
-    )
-    st.markdown("---")
-    if "registry" in st.session_state:
-        registry = st.session_state["registry"]
-        registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
-        if registered_docs:
-            with st.expander("📚 Registered Documents", expanded=False):
-                for i, doc in enumerate(registered_docs, start=1):
-                    doc_name = doc.get("name", "Unknown")
-                    chunks = doc.get("num_chunks", "?")
-                    toc_source = doc.get("toc_source", "—")
-                    st.markdown(f"**{i}. {doc_name}** — {chunks} chunks *(TOC: {toc_source})*")
-            st.markdown("---")
-            active_doc_name = st.selectbox(
-                "📄 Select Active Document",
-                [doc["name"] for doc in registered_docs],
-                index=0,
-                key="active_doc_selector"
-            )
-            selected_doc = registry.get_doc(active_doc_name)
-            if selected_doc:
-                st.session_state.update({
-                    "active_doc": active_doc_name,
-                    "chunks": selected_doc["chunks"],
-                    "embeddings": selected_doc["embeddings"],
-                    "index": selected_doc["index"],
-                    "doc_ready": True,
-                    "status_text": f"📄 {active_doc_name} loaded from registry — ready for queries."
-                })
-    st.caption("✨ Built by Shubham Sharma")
-# ==========================================================
-# 📄 MAIN SECTION
-# ==========================================================
-st.title("📄 Enterprise Knowledge Assistant")
-st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning and retrieval.")
-doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
-# ==========================================================
-# 📂 DOCUMENT HANDLING — SAFE VERSION
-# ==========================================================
-import hashlib
-def _hash_content(file_path):
-    hasher = hashlib.sha256()
-    with open(file_path, "rb") as f:
-        while chunk := f.read(8192):
-            hasher.update(chunk)
-    return hasher.hexdigest()[:12]
 if doc_choice == "-- Select --":
     st.info("⬅️ Select or upload a document to begin.")
 else:
     temp_path = None
     if doc_choice == "Sample PDF":
         temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
         st.markdown("✅ **Sample PDF selected.** Preparing document...")
     else:
-        uploaded_file = st.file_uploader("Upload a PDF document (max 200MB):", type="pdf", label_visibility="collapsed")
         if uploaded_file:
             temp_path = os.path.join("/tmp", uploaded_file.name)
             with open(temp_path, "wb") as f:
@@ -232,15 +45,18 @@ else:
         else:
             st.stop()
     if temp_path:
         doc_name = os.path.basename(temp_path)
         file_hash = _hash_content(temp_path)
         doc_identifier = f"{doc_name}_{file_hash}"
         if "registry" not in st.session_state:
             st.session_state["registry"] = DocumentRegistry()
         registry = st.session_state["registry"]
         existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
         if existing_doc:
             doc_data = registry.get_doc(existing_doc["name"])
@@ -254,21 +70,29 @@ else:
                 "active_doc": existing_doc["name"],
                 "status_text": f"✅ {doc_name} already processed — loaded from registry."
             })
             refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
-            trigger_safe_rerun()
         status = st.empty()
         status.info("📤 Upload complete — reading document...")
         text, toc, toc_source = extract_text_from_pdf(temp_path)
         status.info("📑 Parsing and chunking document...")
-        chunks = chunk_text(text, chunk_size=1000, overlap=120)
         status.info("🧠 Building embeddings and search index...")
         embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
         index = build_faiss_index(embeddings)
         doc_id = registry.register(temp_path, chunks, embeddings, index)
         st.session_state["active_doc"] = doc_id
-        status.success("✅ Document processed successfully — ready to query!")
         refresh_suggestions(doc_name, toc, chunks)
         st.session_state.update({
             "text": text,
             "toc": toc,
@@ -277,32 +101,50 @@ else:
             "index": index,
             "doc_ready": True,
             "last_doc": doc_identifier,
-            "status_text": "✅ Document processed successfully — ready to query!"
         })
-        trigger_safe_rerun()
-if st.session_state.get("doc_ready"):
-    st.info(st.session_state.get("status_text"))
-    st.markdown("### 💬 Ask the Assistant")
-    query_suggestions = st.session_state.get("query_suggestions_fixed", [])
-    if query_suggestions:
-        visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
-        cols = st.columns(min(3, len(visible)))
-        for i, q in enumerate(visible):
-            if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
-                st.session_state["user_query_input"] = q
-                st.session_state["selected_suggestion"] = i
-                trigger_safe_rerun()
-        toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
-        if st.button(toggle_text):
-            st.session_state["show_more"] = not st.session_state["show_more"]
-            trigger_safe_rerun()
-    user_query = st.text_input("Your Question:", key="user_query_input", label_visibility="visible")
-    if user_query.strip():
-        reasoning_mode = mode == "Extended (Document + General)"
-        with st.spinner("💭 Generating your answer..."):
-            retrieved = retrieve_chunks(user_query, st.session_state["index"], st.session_state["chunks"], top_k=5, embeddings=st.session_state["embeddings"])
-            answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
-        st.markdown("### 🤖 Assistant’s Answer")
-        st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)

 # ==========================================================
+# 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
 # ==========================================================
+import hashlib
+def _hash_content(file_path):
+    """Generate a short SHA256 hash of the file's actual binary content."""
+    hasher = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        while chunk := f.read(8192):
+            hasher.update(chunk)
+    return hasher.hexdigest()[:12]  # short unique hash for same-name files
 def refresh_suggestions(doc_name, toc, chunks):
     """Refresh dynamic suggestions and reset related states."""
     st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
     st.session_state["selected_suggestion"] = None
     st.session_state["show_more"] = False
+# --- Document selection ---
 if doc_choice == "-- Select --":
     st.info("⬅️ Select or upload a document to begin.")
 else:
     temp_path = None
+    # --- File selection ---
     if doc_choice == "Sample PDF":
         temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
         st.markdown("✅ **Sample PDF selected.** Preparing document...")
     else:
+        uploaded_file = st.file_uploader(
+            "Upload a PDF document (max 200MB):",
+            type="pdf",
+            label_visibility="collapsed"
+        )
         if uploaded_file:
             temp_path = os.path.join("/tmp", uploaded_file.name)
             with open(temp_path, "wb") as f:
         else:
             st.stop()
+    # --- Start processing if file exists ---
     if temp_path:
         doc_name = os.path.basename(temp_path)
         file_hash = _hash_content(temp_path)
         doc_identifier = f"{doc_name}_{file_hash}"
+        # ✅ Step 0: Ensure registry exists
         if "registry" not in st.session_state:
             st.session_state["registry"] = DocumentRegistry()
         registry = st.session_state["registry"]
+        # ✅ Step 1: Check if already registered
         existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
         if existing_doc:
             doc_data = registry.get_doc(existing_doc["name"])
                 "active_doc": existing_doc["name"],
                 "status_text": f"✅ {doc_name} already processed — loaded from registry."
             })
             refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
+            st.experimental_rerun()
+        # ✅ Step 2: New document → process
         status = st.empty()
         status.info("📤 Upload complete — reading document...")
         text, toc, toc_source = extract_text_from_pdf(temp_path)
         status.info("📑 Parsing and chunking document...")
+        chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
         status.info("🧠 Building embeddings and search index...")
         embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
         index = build_faiss_index(embeddings)
         doc_id = registry.register(temp_path, chunks, embeddings, index)
         st.session_state["active_doc"] = doc_id
+        status.success("✅ Document processed successfully — all set to query your assistant!")
         refresh_suggestions(doc_name, toc, chunks)
         st.session_state.update({
             "text": text,
             "toc": toc,
             "index": index,
             "doc_ready": True,
             "last_doc": doc_identifier,
+            "status_text": "✅ Document processed successfully — all set to query your assistant!"
         })
+        st.experimental_rerun()
+    # --- Display Ready Message + Ask Section ---
+    if st.session_state.get("doc_ready"):
+        active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
+        st.info(st.session_state.get("status_text", f"📄 {active_name or 'Document'} is ready for queries."))
+        st.markdown("### 💬 Ask the Assistant")
+        query_suggestions = st.session_state.get("query_suggestions_fixed", [])
+        if query_suggestions:
+            visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
+            cols = st.columns(min(3, len(visible)))
+            for i, q in enumerate(visible):
+                if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
+                    st.session_state["user_query_input"] = q
+                    st.session_state["selected_suggestion"] = i
+                    st.experimental_rerun()
+            toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
+            if st.button(toggle_text, help="Show or hide more suggestions"):
+                st.session_state["show_more"] = not st.session_state["show_more"]
+                st.experimental_rerun()
+        user_query = st.text_input(
+            "Type your question or click one above:",
+            key="user_query_input",
+            label_visibility="visible"
+        )
+        if user_query.strip():
+            reasoning_mode = mode == "Extended (Document + General)"
+            with st.spinner("💭 Generating your answer..."):
+                retrieved = retrieve_chunks(
+                    user_query,
+                    st.session_state["index"],
+                    st.session_state["chunks"],
+                    top_k=top_k,
+                    embeddings=st.session_state["embeddings"]
+                )
+                answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
+                st.session_state["retrieved"] = retrieved
+            st.markdown("### 🤖 Assistant’s Answer")
+            st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)