Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 25

Commit

c9a83aa

verified ·

1 Parent(s): 2239986

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +53 -187

src/streamlit_app.py CHANGED Viewed

@@ -13,6 +13,17 @@ from document_registry import DocumentRegistry
 st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 print("CUDA available:", torch.cuda.is_available())
 # ==========================================================
 # ⚙️ CACHE SETUP
 # ==========================================================
@@ -83,11 +94,12 @@ Output: Write each question on a new line. Do not invent facts — base question
         questions = []
         for ln in lines:
             q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
-            if not q.endswith("?") and len(q.split()) < 18 and re.match(r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q):
                 q += "?"
             if 8 <= len(q) <= 140:
                 questions.append(q)
-        # dedupe
         final = []
         seen = set()
         for q in questions:
@@ -101,7 +113,7 @@ Output: Write each question on a new line. Do not invent facts — base question
         return ["How do I start using this guide?", "What does this document cover?"]
 # ==========================================================
-# 🎨 STYLING — REVERT TO ORIGINAL
 # ==========================================================
 st.markdown("""
 <style>
@@ -138,11 +150,7 @@ h1, h2, h3 {color: #f3f4f6; font-weight: 600;}
     font-size: 15px !important;
 }
 .stTextInput > label {font-weight: 500;}
-.small-link {
-    font-size: 13px;
-    color: #60a5fa;
-    cursor: pointer;
-}
 </style>
 """, unsafe_allow_html=True)
@@ -155,27 +163,20 @@ with st.sidebar:
         "",
         ("Strict (Document-only)", "Extended (Document + General)"),
         index=0,
-        help="Strict = answers only from the uploaded document. Extended = may include related general info.",
     )
     st.markdown("---")
-    # 🧩 Document Registry Viewer
     if "registry" in st.session_state:
         registry = st.session_state["registry"]
         registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
         if registered_docs:
             with st.expander("📚 Registered Documents", expanded=False):
                 for i, doc in enumerate(registered_docs, start=1):
                     doc_name = doc.get("name", "Unknown")
                     chunks = doc.get("num_chunks", "?")
                     toc_source = doc.get("toc_source", "—")
-                    st.markdown(
-                        f"**{i}. {doc_name}** — {chunks} chunks *(TOC: {toc_source})*"
-                    )
-            # 🧭 Active Document Selector (Commit #3)
             st.markdown("---")
             active_doc_name = st.selectbox(
                 "📄 Select Active Document",
@@ -183,7 +184,6 @@ with st.sidebar:
                 index=0,
                 key="active_doc_selector"
             )
             selected_doc = registry.get_doc(active_doc_name)
             if selected_doc:
                 st.session_state.update({
@@ -194,70 +194,8 @@ with st.sidebar:
                     "doc_ready": True,
                     "status_text": f"📄 {active_doc_name} loaded from registry — ready for queries."
                 })
-        else:
-            st.caption("📭 No documents registered yet.")
-    else:
-        st.caption("📁 Upload or process a document to see registered files here.")
-    st.markdown("---")
-    show_dev = st.checkbox("Show advanced settings (for developers)", value=False)
-    if show_dev:
-        st.markdown("### ⚙️ Developer Options")
-        chunk_size = st.slider("Chunk Size", 200, 1500, 1000, step=50)
-        overlap = st.slider("Chunk Overlap", 50, 200, 120, step=10)
-        top_k = st.slider("Top K Results", 1, 10, 7)
-    else:
-        chunk_size, overlap, top_k = 1000, 120, 5
-    st.markdown("---")
     st.caption("✨ Built by Shubham Sharma")
-    # 🧩 Developer Insights (Moved up here from main block)
-    if show_dev:
-        st.markdown("---")
-        with st.expander("🧩 Developer Insights", expanded=False):
-            st.markdown("**Retrieved Chunks (Context):**")
-            retrieved_chunks = st.session_state.get("retrieved", [])
-            for i, r in enumerate(retrieved_chunks, start=1):
-                st.markdown(f"- **Chunk {i}:** {r}")
-            toc_data = st.session_state.get("toc", [])
-            if toc_data:
-                st.markdown("---")
-                st.markdown("**Document Sections (TOC):**")
-                toc_text = "\n".join([f"{sec}. {title}" for sec, title in toc_data])
-                st.text_area("", toc_text, height=120)
-            doc_text = st.session_state.get("text", "")
-            if doc_text:
-                st.markdown("---")
-                st.markdown("**Document Preview:**")
-                st.text_area("", doc_text[:1000], height=120)
-                st.caption(f"{len(st.session_state.get('chunks', []))} chunks processed.")
-# ==========================================================
-# 🧠 SESSION STATE
-# ==========================================================
-for key, val in {
-    "user_query_input": "",
-    "show_more": False,
-    "selected_suggestion": None,
-    "query_suggestions_fixed": None,
-    "last_doc": None,
-    "doc_lang": "en",  # 🆕 optional: store document language
-}.items():
-    if key not in st.session_state:
-        st.session_state[key] = val
-def set_user_query(q, idx):
-    st.session_state["user_query_input"] = q
-    st.session_state["selected_suggestion"] = idx
-    st.experimental_rerun()
 # ==========================================================
 # 📄 MAIN SECTION
 # ==========================================================
@@ -267,45 +205,26 @@ st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning
 doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
 # ==========================================================
-# 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
 # ==========================================================
 import hashlib
 def _hash_content(file_path):
-    """Generate a short SHA256 hash of the file's actual binary content."""
     hasher = hashlib.sha256()
     with open(file_path, "rb") as f:
         while chunk := f.read(8192):
             hasher.update(chunk)
-    return hasher.hexdigest()[:12]  # short unique hash for same-name files
-def refresh_suggestions(doc_name, toc, chunks):
-    """Refresh dynamic suggestions and reset related states."""
-    st.session_state["query_suggestions_fixed"] = generate_dynamic_suggestions_from_toc(
-        toc, chunks, doc_name
-    )
-    st.session_state["user_query_input"] = ""
-    st.session_state["selected_suggestion"] = None
-    st.session_state["show_more"] = False
-# --- Document selection ---
 if doc_choice == "-- Select --":
     st.info("⬅️ Select or upload a document to begin.")
 else:
     temp_path = None
-    # --- File selection ---
     if doc_choice == "Sample PDF":
         temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
         st.markdown("✅ **Sample PDF selected.** Preparing document...")
     else:
-        uploaded_file = st.file_uploader(
-            "Upload a PDF document (max 200MB):",
-            type="pdf",
-            label_visibility="collapsed"
-        )
         if uploaded_file:
             temp_path = os.path.join("/tmp", uploaded_file.name)
             with open(temp_path, "wb") as f:
@@ -313,18 +232,15 @@ else:
         else:
             st.stop()
-    # --- Start processing if file exists ---
     if temp_path:
         doc_name = os.path.basename(temp_path)
         file_hash = _hash_content(temp_path)
-        doc_identifier = f"{doc_name}_{file_hash}"  # unique per content
-        # ✅ Step 0: Initialize registry
         if "registry" not in st.session_state:
             st.session_state["registry"] = DocumentRegistry()
         registry = st.session_state["registry"]
-        # ✅ Step 1: Check if document already in registry
         existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
         if existing_doc:
             doc_data = registry.get_doc(existing_doc["name"])
@@ -338,43 +254,21 @@ else:
                 "active_doc": existing_doc["name"],
                 "status_text": f"✅ {doc_name} already processed — loaded from registry."
             })
-            # ✅ Refresh suggestions when switching
-            refresh_suggestions(
-                existing_doc["name"],
-                st.session_state["toc"],
-                st.session_state["chunks"]
-            )
-            if show_dev:
-                st.info(f"🧠 Loaded from registry: {doc_name}")
-            st.rerun()
-        # ✅ Step 2: If new document → process normally
         status = st.empty()
         status.info("📤 Upload complete — reading document...")
-        # 🧩 Step 2.1: Extract text and TOC
         text, toc, toc_source = extract_text_from_pdf(temp_path)
-        # 🧩 Step 2.2: Chunk the text
         status.info("📑 Parsing and chunking document...")
-        chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
-        # 🧩 Step 2.3: Embed and index
         status.info("🧠 Building embeddings and search index...")
         embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
         index = build_faiss_index(embeddings)
-        # 🧩 Step 2.4: Register document
         doc_id = registry.register(temp_path, chunks, embeddings, index)
         st.session_state["active_doc"] = doc_id
-        # 🧩 Step 2.5: Success message + suggestions
-        status.success("✅ Document processed successfully — all set to query your assistant!")
         refresh_suggestions(doc_name, toc, chunks)
-        # 🧠 Update session
         st.session_state.update({
             "text": text,
             "toc": toc,
@@ -383,60 +277,32 @@ else:
             "index": index,
             "doc_ready": True,
             "last_doc": doc_identifier,
-            "status_text": "✅ Document processed successfully — all set to query your assistant!"
         })
-        st.rerun()
-    # --- Display Ready Message + Ask Section ---
-    if st.session_state.get("doc_ready"):
-        active_name = st.session_state.get("active_doc") or st.session_state.get("last_doc")
-        st.info(st.session_state.get("status_text", f"📄 {active_name or 'Document'} is ready for queries."))
-        st.markdown("### 💬 Ask the Assistant")
-        query_suggestions = st.session_state.get("query_suggestions_fixed", [])
-        if query_suggestions:
-            visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
-            cols = st.columns(min(3, len(visible)))
-            for i, q in enumerate(visible):
-                if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
-                    set_user_query(q, i)
-            toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
-            if st.button(toggle_text, help="Show or hide more suggestions"):
-                st.session_state["show_more"] = not st.session_state["show_more"]
-                st.rerun()
-        user_query = st.text_input(
-            "Type your question or click one above:",
-            key="user_query_input",
-            label_visibility="visible"
-        )
-        if user_query.strip():
-            reasoning_mode = mode == "Extended (Document + General)"
-            with st.spinner("💭 Generating your answer..."):
-                retrieved = retrieve_chunks(user_query, index, chunks, top_k=top_k, embeddings=embeddings)
-                answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
-                st.session_state["retrieved"] = retrieved
-            st.markdown("### 🤖 Assistant’s Answer")
-            if not reasoning_mode and not answer.startswith("⚠️"):
-                answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
-                answer = re.sub(r"(^|\n)-\s*", r"\1<br>• ", answer)
-            st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
-# ==========================================================
-# 🎨 Optional Sidebar Scroll Styling (keeps it clean)
-# ==========================================================
-st.markdown("""
-<style>
-section[data-testid="stSidebar"] div.stExpander {
-    max-height: 480px;
-    overflow-y: auto;
-}
-</style>
-""", unsafe_allow_html=True)

 st.set_page_config(page_title="Enterprise Knowledge Assistant", layout="wide")
 print("CUDA available:", torch.cuda.is_available())
+# ==========================================================
+# ⚙️ SAFE RERUN HANDLER
+# ==========================================================
+def trigger_safe_rerun():
+    """Mark rerun flag for next render instead of rerunning immediately."""
+    st.session_state["_safe_rerun"] = True
+if st.session_state.get("_safe_rerun"):
+    st.session_state["_safe_rerun"] = False
+    st.rerun()
 # ==========================================================
 # ⚙️ CACHE SETUP
 # ==========================================================
         questions = []
         for ln in lines:
             q = re.sub(r"^[\-\u2022\*\d\.\)\s]+", "", ln).strip()
+            if not q.endswith("?") and len(q.split()) < 18 and re.match(
+                r"(?i)^(what|how|why|where|who|when|which|can|does|is|are)\b", q
+            ):
                 q += "?"
             if 8 <= len(q) <= 140:
                 questions.append(q)
         final = []
         seen = set()
         for q in questions:
         return ["How do I start using this guide?", "What does this document cover?"]
 # ==========================================================
+# 🎨 STYLING
 # ==========================================================
 st.markdown("""
 <style>
     font-size: 15px !important;
 }
 .stTextInput > label {font-weight: 500;}
+.small-link {font-size: 13px; color: #60a5fa; cursor: pointer;}
 </style>
 """, unsafe_allow_html=True)
         "",
         ("Strict (Document-only)", "Extended (Document + General)"),
         index=0,
     )
     st.markdown("---")
     if "registry" in st.session_state:
         registry = st.session_state["registry"]
         registered_docs = registry.list_docs() if hasattr(registry, "list_docs") else []
         if registered_docs:
             with st.expander("📚 Registered Documents", expanded=False):
                 for i, doc in enumerate(registered_docs, start=1):
                     doc_name = doc.get("name", "Unknown")
                     chunks = doc.get("num_chunks", "?")
                     toc_source = doc.get("toc_source", "—")
+                    st.markdown(f"**{i}. {doc_name}** — {chunks} chunks *(TOC: {toc_source})*")
             st.markdown("---")
             active_doc_name = st.selectbox(
                 "📄 Select Active Document",
                 index=0,
                 key="active_doc_selector"
             )
             selected_doc = registry.get_doc(active_doc_name)
             if selected_doc:
                 st.session_state.update({
                     "doc_ready": True,
                     "status_text": f"📄 {active_doc_name} loaded from registry — ready for queries."
                 })
     st.caption("✨ Built by Shubham Sharma")
 # ==========================================================
 # 📄 MAIN SECTION
 # ==========================================================
 doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
 # ==========================================================
+# 📂 DOCUMENT HANDLING — SAFE VERSION
 # ==========================================================
 import hashlib
 def _hash_content(file_path):
     hasher = hashlib.sha256()
     with open(file_path, "rb") as f:
         while chunk := f.read(8192):
             hasher.update(chunk)
+    return hasher.hexdigest()[:12]
 if doc_choice == "-- Select --":
     st.info("⬅️ Select or upload a document to begin.")
 else:
     temp_path = None
     if doc_choice == "Sample PDF":
         temp_path = os.path.join(os.path.dirname(__file__), "sample.pdf")
         st.markdown("✅ **Sample PDF selected.** Preparing document...")
     else:
+        uploaded_file = st.file_uploader("Upload a PDF document (max 200MB):", type="pdf", label_visibility="collapsed")
         if uploaded_file:
             temp_path = os.path.join("/tmp", uploaded_file.name)
             with open(temp_path, "wb") as f:
         else:
             st.stop()
     if temp_path:
         doc_name = os.path.basename(temp_path)
         file_hash = _hash_content(temp_path)
+        doc_identifier = f"{doc_name}_{file_hash}"
         if "registry" not in st.session_state:
             st.session_state["registry"] = DocumentRegistry()
         registry = st.session_state["registry"]
         existing_doc = next((d for d in registry.list_docs() if d["name"] == doc_name), None)
         if existing_doc:
             doc_data = registry.get_doc(existing_doc["name"])
                 "active_doc": existing_doc["name"],
                 "status_text": f"✅ {doc_name} already processed — loaded from registry."
             })
+            refresh_suggestions(existing_doc["name"], st.session_state["toc"], st.session_state["chunks"])
+            trigger_safe_rerun()
         status = st.empty()
         status.info("📤 Upload complete — reading document...")
         text, toc, toc_source = extract_text_from_pdf(temp_path)
         status.info("📑 Parsing and chunking document...")
+        chunks = chunk_text(text, chunk_size=1000, overlap=120)
         status.info("🧠 Building embeddings and search index...")
         embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
         index = build_faiss_index(embeddings)
         doc_id = registry.register(temp_path, chunks, embeddings, index)
         st.session_state["active_doc"] = doc_id
+        status.success("✅ Document processed successfully — ready to query!")
         refresh_suggestions(doc_name, toc, chunks)
         st.session_state.update({
             "text": text,
             "toc": toc,
             "index": index,
             "doc_ready": True,
             "last_doc": doc_identifier,
+            "status_text": "✅ Document processed successfully — ready to query!"
         })
+        trigger_safe_rerun()
+if st.session_state.get("doc_ready"):
+    st.info(st.session_state.get("status_text"))
+    st.markdown("### 💬 Ask the Assistant")
+    query_suggestions = st.session_state.get("query_suggestions_fixed", [])
+    if query_suggestions:
+        visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
+        cols = st.columns(min(3, len(visible)))
+        for i, q in enumerate(visible):
+            if cols[i % 3].button(f"💬 {q}", key=f"sugg_{i}"):
+                st.session_state["user_query_input"] = q
+                st.session_state["selected_suggestion"] = i
+                trigger_safe_rerun()
+        toggle_text = "Show less ▲" if st.session_state["show_more"] else "Show more ▼"
+        if st.button(toggle_text):
+            st.session_state["show_more"] = not st.session_state["show_more"]
+            trigger_safe_rerun()
+    user_query = st.text_input("Your Question:", key="user_query_input", label_visibility="visible")
+    if user_query.strip():
+        reasoning_mode = mode == "Extended (Document + General)"
+        with st.spinner("💭 Generating your answer..."):
+            retrieved = retrieve_chunks(user_query, st.session_state["index"], st.session_state["chunks"], top_k=5, embeddings=st.session_state["embeddings"])
+            answer = generate_answer(user_query, retrieved, reasoning_mode=reasoning_mode)
+        st.markdown("### 🤖 Assistant’s Answer")
+        st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)