Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 committed on Oct 25

Commit

dc571c1

verified ·

1 Parent(s): 59b2329

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +32 -21

src/streamlit_app.py CHANGED Viewed

@@ -212,8 +212,19 @@ st.caption("Query SAP documentation and enterprise PDFs — powered by reasoning
 doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
 # ==========================================================
-# 📂 DOCUMENT HANDLING — POLISHED UI FLOW
 # ==========================================================
 if doc_choice == "-- Select --":
     st.info("⬅️ Select or upload a document to begin.")
 else:
@@ -234,31 +245,35 @@ else:
             with open(temp_path, "wb") as f:
                 f.write(uploaded_file.getbuffer())
         else:
-            st.stop()  # Wait until file is uploaded
-    # --- Real processing begins here ---
     if temp_path:
         doc_name = os.path.basename(temp_path)
-        # Process only once per document
-        if "doc_ready" not in st.session_state or st.session_state.get("last_doc") != doc_name:
             status = st.empty()
-            status.info("📤 Upload complete — hang tight while we process your document...")
-            # Step 1: Extract text
             text, toc, toc_source = extract_text_from_pdf(temp_path)
             status.info("📑 Parsing and chunking document...")
             chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
-            # Step 2: Build embeddings + FAISS index
             status.info("🧠 Building embeddings and search index...")
             embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
             index = build_faiss_index(embeddings)
-            # Step 3: Final ready state
-            status.success("✅ All set — your AI assistant is ready to help.")
-            # Persist session data for reruns
             st.session_state.update({
                 "text": text,
                 "toc": toc,
@@ -266,32 +281,29 @@ else:
                 "embeddings": embeddings,
                 "index": index,
                 "doc_ready": True,
-                "last_doc": doc_name,
-                "status_text": "📄 Document is ready for queries."
             })
-            # Build question suggestions (once per doc)
             query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
             st.session_state["query_suggestions_fixed"] = query_suggestions
             st.session_state["user_query_input"] = ""
             st.session_state["selected_suggestion"] = None
             st.session_state["show_more"] = False
-            # Refresh to cleanly show "ready" state
             st.rerun()
         else:
-            # --- Reuse existing state (rerun-safe) ---
             text = st.session_state["text"]
             toc = st.session_state["toc"]
             chunks = st.session_state["chunks"]
             embeddings = st.session_state["embeddings"]
             index = st.session_state["index"]
             query_suggestions = st.session_state.get("query_suggestions_fixed", [])
             st.info(st.session_state.get("status_text", f"📄 {doc_name} is ready for queries."))
-        # --- Ask the Assistant section ---
         st.markdown("### 💬 Ask the Assistant")
         if query_suggestions:
             visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
@@ -305,7 +317,6 @@ else:
                 st.session_state["show_more"] = not st.session_state["show_more"]
                 st.rerun()
-        # --- Query input box ---
         user_query = st.text_input("Type your question or click one above:", key="user_query_input")
         if user_query.strip():
@@ -317,13 +328,13 @@ else:
             st.markdown("### 🤖 Assistant’s Answer")
-            # Clean up answer format (bullets, bold)
             if not reasoning_mode and not answer.startswith("⚠️"):
                 answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
                 answer = re.sub(r"(^|\n)-\s*", r"\1<br>• ", answer)
             st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
 # ==========================================================
 # 🎨 Optional Sidebar Scroll Styling (keeps it clean)
 # ==========================================================

 doc_choice = st.radio("Select a document:", ["-- Select --", "Sample PDF", "Upload Custom PDF"], index=0)
 # ==========================================================
+# 📂 DOCUMENT HANDLING — CLEAN, ACCURATE, AND BYTE-AWARE
 # ==========================================================
+import hashlib
+def _hash_content(file_path):
+    """Return a short SHA-256 fingerprint of a file's binary content.
+
+    The file is opened in binary mode and fed to the hasher in 8192-byte
+    reads, so the whole file is never loaded into memory at once.
+
+    Args:
+        file_path: Path of the file to hash.
+
+    Returns:
+        The first 12 hexadecimal characters of the SHA-256 digest.
+    """
+    hasher = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        # f.read() returns b"" at end of file, which is falsy and ends the loop.
+        while chunk := f.read(8192):
+            hasher.update(chunk)
+    # Truncate the 64-character hex digest to its first 12 characters.
+    return hasher.hexdigest()[:12]  # short form, appended to the document name by the caller
+# --- Document selection ---
 if doc_choice == "-- Select --":
     st.info("⬅️ Select or upload a document to begin.")
 else:
             with open(temp_path, "wb") as f:
                 f.write(uploaded_file.getbuffer())
         else:
+            st.stop()
+    # --- Start processing if file exists ---
     if temp_path:
         doc_name = os.path.basename(temp_path)
+        file_hash = _hash_content(temp_path)
+        doc_identifier = f"{doc_name}_{file_hash}"  # unique per content
+        # 🔍 Reprocess only if new or changed document
+        if "doc_ready" not in st.session_state or st.session_state.get("last_doc") != doc_identifier:
             status = st.empty()
+            status.info("📤 Upload complete — reading document...")
+            # 🧩 Step 1: Extract text and TOC
             text, toc, toc_source = extract_text_from_pdf(temp_path)
+            # 🧩 Step 2: Chunk the text
             status.info("📑 Parsing and chunking document...")
             chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
+            # 🧩 Step 3: Embed and index
             status.info("🧠 Building embeddings and search index...")
             embeddings = cache_embeddings(doc_name, chunks, embed_chunks)
             index = build_faiss_index(embeddings)
+            # 🧩 Step 4: Final success message
+            status.success("✅ Document processed successfully — all set to query your assistant!")
+            # 🧠 Store everything in session state
             st.session_state.update({
                 "text": text,
                 "toc": toc,
                 "embeddings": embeddings,
                 "index": index,
                 "doc_ready": True,
+                "last_doc": doc_identifier,
+                "status_text": "✅ Document processed successfully — all set to query your assistant!"
             })
+            # Build fresh suggestions and rerun
             query_suggestions = generate_dynamic_suggestions_from_toc(toc, chunks, doc_name)
             st.session_state["query_suggestions_fixed"] = query_suggestions
             st.session_state["user_query_input"] = ""
             st.session_state["selected_suggestion"] = None
             st.session_state["show_more"] = False
             st.rerun()
         else:
+            # ♻️ Reuse cached session state (same file)
             text = st.session_state["text"]
             toc = st.session_state["toc"]
             chunks = st.session_state["chunks"]
             embeddings = st.session_state["embeddings"]
             index = st.session_state["index"]
             query_suggestions = st.session_state.get("query_suggestions_fixed", [])
             st.info(st.session_state.get("status_text", f"📄 {doc_name} is ready for queries."))
+        # --- Ask section ---
         st.markdown("### 💬 Ask the Assistant")
         if query_suggestions:
             visible = query_suggestions if st.session_state["show_more"] else query_suggestions[:3]
                 st.session_state["show_more"] = not st.session_state["show_more"]
                 st.rerun()
         user_query = st.text_input("Type your question or click one above:", key="user_query_input")
         if user_query.strip():
             st.markdown("### 🤖 Assistant’s Answer")
             if not reasoning_mode and not answer.startswith("⚠️"):
                 answer = re.sub(r"\*\*(.*?)\*\*", r"\1", answer)
                 answer = re.sub(r"(^|\n)-\s*", r"\1<br>• ", answer)
             st.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
 # ==========================================================
 # 🎨 Optional Sidebar Scroll Styling (keeps it clean)
 # ==========================================================