Update streamlit_app.py
streamlit_app.py  (+133 −108)  CHANGED
@@ -2,8 +2,11 @@ import streamlit as st
 from pathlib import Path
 import sys
 import os
 
-#
 sys.path.append(str(Path(__file__).parent))
 
 from src.config.config import Config
@@ -11,164 +14,186 @@ from src.document_ingestion.document_processor import DocumentProcessor
 from src.vectorstore.vectorstore import VectorStore
 from src.graph_builder.graph_builder import GraphBuilder
 
-#
 st.set_page_config(
     page_title="Agentic PDF RAG",
     page_icon="🧠",
     layout="wide"
 )
 
-#
 st.markdown("""
-
-
-
-
 """, unsafe_allow_html=True)
 
 def init_session_state():
-
-
-
-
-
-
-
-
-
 
 def process_documents(uploaded_files):
     """
-
-    1. Loops through all uploaded files
-    2. Saves each to a temp path
-    3. Aggregates all document chunks
-    4. Initializes VectorStore and Graph once
     """
     try:
-
             chunk_size=Config.CHUNK_SIZE,
             chunk_overlap=Config.CHUNK_OVERLAP
         )
-
-        all_docs = []
-
-        # Ensure a temporary directory exists
         temp_dir = Path("temp_uploads")
         temp_dir.mkdir(exist_ok=True)
-
-
-
-
            with open(temp_path, "wb") as f:
-                f.write(
-
-
-
-
            all_docs.extend(docs)
-
-            # 3. Clean up the temporary file immediately after processing
-            if temp_path.exists():
-                os.remove(temp_path)
 
         if not all_docs:
-            st.error("No text could be extracted from the uploaded files.")
             return None, 0
 
-        # 4. Create Vector Store with the combined list of all chunks
         vector_store = VectorStore()
         vector_store.create_vectorstore(all_docs)
-
-
-        graph_builder = GraphBuilder(
             retriever=vector_store.get_retriever(),
             llm=Config.get_llm()
         )
-
-
-        return
 
     except Exception as e:
-        st.error(f"
         return None, 0
 
 def main():
     init_session_state()
-
-    #
     with st.sidebar:
-        st.header("Document Ingestion")
         uploaded_files = st.file_uploader(
-            "Upload PDF files",
-            type="pdf",
-            accept_multiple_files=True,
-            help="You can select multiple files at once."
         )
-
         if st.button("🛠️ Build Knowledge Base", type="primary"):
-            if uploaded_files:
-
-                rag_system, num_chunks = process_documents(uploaded_files)
-                if rag_system:
-                    st.session_state.rag_system = rag_system
-                    st.session_state.processed_files = [f.name for f in uploaded_files]
-
-                    # Add success notification to chat
-                    confirm_msg = f"I have successfully indexed {num_chunks} chunks from: {', '.join(st.session_state.processed_files)}."
-                    st.session_state.messages.append({"role": "assistant", "content": confirm_msg})
-                    st.rerun()  # Refresh to show the message immediately
            else:
-
 
        if st.session_state.processed_files:
            st.markdown("---")
-            st.subheader("Loaded
            for f in st.session_state.processed_files:
-                st.caption(f"
-
-            if st.button("Clear Chat
-                st.session_state.messages = [
                st.rerun()
 
-    #
-    st.title("📄 Agentic
-    st.caption("
 
-
-
-
-        st.markdown(message["content"])
 
-
-    if prompt := st.chat_input("Ask a question about your documents..."):
-        st.chat_message("user").markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})
 
-
-        with st.chat_message("assistant"):
-            with st.spinner("Agent searching knowledge base..."):
-                try:
-                    # Call your GraphBuilder's run method
-                    result = st.session_state.rag_system.run(prompt)
-                    answer = result.get('answer', "I couldn't find a definitive answer.")
-                    st.markdown(answer)
-
-                    # Show Source Citations in an Expander
-                    if result.get('retrieved_docs'):
-                        with st.expander("📄 View Referenced Context"):
-                            for i, doc in enumerate(result['retrieved_docs'], 1):
-                                source_name = Path(doc.metadata.get('source', 'Unknown')).name
-                                page_num = doc.metadata.get('page', 'N/A')
-                                st.markdown(f"**Source {i}:** {source_name} (Page {page_num})")
-                                st.info(doc.page_content[:400] + "...")
-
-                    st.session_state.messages.append({"role": "assistant", "content": answer})
-
-                except Exception as e:
-                    st.error(f"Search Error: {str(e)}")
-        else:
-            st.warning("Please upload and build the knowledge base first!")
 
 if __name__ == "__main__":
-    main()
 from pathlib import Path
 import sys
 import os
+import hashlib
 
+# -------------------------------------------------
+# Path setup
+# -------------------------------------------------
 sys.path.append(str(Path(__file__).parent))
 
 from src.config.config import Config
 from src.document_ingestion.document_processor import DocumentProcessor
 from src.vectorstore.vectorstore import VectorStore
 from src.graph_builder.graph_builder import GraphBuilder
 
+# -------------------------------------------------
+# Page config
+# -------------------------------------------------
 st.set_page_config(
     page_title="Agentic PDF RAG",
     page_icon="🧠",
     layout="wide"
 )
 
+# -------------------------------------------------
+# Styles
+# -------------------------------------------------
 st.markdown("""
+<style>
+.stChatMessage { border-radius: 10px; margin-bottom: 10px; }
+.stSidebar { background-color: #f8f9fa; }
+</style>
 """, unsafe_allow_html=True)
 
+# -------------------------------------------------
+# Session state
+# -------------------------------------------------
 def init_session_state():
+    defaults = {
+        "rag_system": None,
+        "messages": [
+            {"role": "assistant", "content": "👋 Upload one or more PDFs from the sidebar and start chatting across them."}
+        ],
+        "processed_files": [],
+        "kb_hash": None
+    }
+    for k, v in defaults.items():
+        if k not in st.session_state:
+            st.session_state[k] = v
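For readers new to Streamlit: every widget interaction reruns the whole script, and `st.session_state` is the only thing that survives a rerun, so defaults must be seeded exactly once. The `if k not in st.session_state` guard is what keeps live chat history from being reset. A minimal illustration of the pattern (the `counter` key is just an example):

    import streamlit as st

    # Seeded once; later reruns skip this branch because the key exists.
    if "counter" not in st.session_state:
        st.session_state.counter = 0

    if st.button("Increment"):
        st.session_state.counter += 1  # survives the rerun the click triggers

    st.write(st.session_state.counter)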
 
+# -------------------------------------------------
+# Helpers
+# -------------------------------------------------
+def compute_files_hash(files):
+    """Prevents rebuilding the KB for the same set of PDFs."""
+    hasher = hashlib.md5()
+    for f in files:
+        hasher.update(f.name.encode())
+        hasher.update(f.getvalue())
+    return hasher.hexdigest()
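A quick sanity check of the fingerprint logic, with the Streamlit upload objects swapped for plain `(name, bytes)` pairs (`files_fingerprint` is an illustrative stand-in, not part of the diff). The digest is order-sensitive and changes when any file's name or content changes:

    import hashlib

    def files_fingerprint(named_blobs):
        # Mirrors compute_files_hash: MD5 over each file's name, then its bytes.
        h = hashlib.md5()
        for name, data in named_blobs:
            h.update(name.encode())
            h.update(data)
        return h.hexdigest()

    assert files_fingerprint([("a.pdf", b"x")]) == files_fingerprint([("a.pdf", b"x")])
    assert files_fingerprint([("a.pdf", b"x")]) != files_fingerprint([("a.pdf", b"y")])
    # Same files, different order -> different digest.
    assert (files_fingerprint([("a.pdf", b"x"), ("b.pdf", b"y")])
            != files_fingerprint([("b.pdf", b"y"), ("a.pdf", b"x")]))

One practical caveat: `f.getvalue()` loads each upload fully into memory, which is fine for typical PDFs but worth remembering for very large files.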
 
 def process_documents(uploaded_files):
     """
+    Ingests multiple PDFs into ONE knowledge base.
     """
     try:
+        processor = DocumentProcessor(
             chunk_size=Config.CHUNK_SIZE,
             chunk_overlap=Config.CHUNK_OVERLAP
         )
+
         temp_dir = Path("temp_uploads")
         temp_dir.mkdir(exist_ok=True)
+
+        all_docs = []
+
+        for file in uploaded_files:
+            temp_path = temp_dir / file.name
             with open(temp_path, "wb") as f:
+                f.write(file.getvalue())
+
+            docs = processor.process_pdf(str(temp_path))
+
+            # Ensure filename metadata exists
+            for d in docs:
+                d.metadata["source"] = file.name
+
             all_docs.extend(docs)
+            os.remove(temp_path)
 
         if not all_docs:
             return None, 0
 
         vector_store = VectorStore()
         vector_store.create_vectorstore(all_docs)
+
+        graph = GraphBuilder(
             retriever=vector_store.get_retriever(),
             llm=Config.get_llm()
         )
+        graph.build()
+
+        return graph, len(all_docs)
 
     except Exception as e:
+        st.error(f"Ingestion failed: {e}")
         return None, 0
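Two caveats on this function. `GraphBuilder.build()` here, and the `run()` call used later in the chat handler, are assumed APIs from `src/graph_builder/graph_builder.py`; the diff never shows their definitions. Also, `os.remove(temp_path)` is skipped whenever `process_pdf()` raises (the exception jumps straight to the `except` block), so failed uploads linger in `temp_uploads/`. A context-manager sketch that would guarantee cleanup (`temp_upload` is hypothetical, not in the diff):

    from contextlib import contextmanager
    from pathlib import Path

    @contextmanager
    def temp_upload(upload, temp_dir: Path):
        # `upload` is anything with .name and .getvalue(), e.g. Streamlit's UploadedFile.
        path = temp_dir / upload.name
        path.write_bytes(upload.getvalue())
        try:
            yield path
        finally:
            path.unlink(missing_ok=True)  # runs even if processing raised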
 
+# -------------------------------------------------
+# Main app
+# -------------------------------------------------
 def main():
     init_session_state()
+
+    # ---------------- Sidebar ----------------
     with st.sidebar:
+        st.header("📂 Document Ingestion")
+
         uploaded_files = st.file_uploader(
+            "Upload PDF files",
+            type="pdf",
+            accept_multiple_files=True
         )
+
         if st.button("🛠️ Build Knowledge Base", type="primary"):
+            if not uploaded_files:
+                st.warning("Upload at least one PDF.")
             else:
+                new_hash = compute_files_hash(uploaded_files)
+
+                if new_hash == st.session_state.kb_hash:
+                    st.info("Knowledge base already built for these PDFs.")
+                else:
+                    with st.spinner("Indexing PDFs and building agent graph..."):
+                        rag, chunks = process_documents(uploaded_files)
+
+                    if rag:
+                        st.session_state.rag_system = rag
+                        st.session_state.processed_files = [f.name for f in uploaded_files]
+                        st.session_state.kb_hash = new_hash
+
+                        msg = (
+                            f"✅ Knowledge base ready!\n\n"
+                            f"Indexed **{chunks} chunks** from:\n"
+                            + "\n".join(f"- {f}" for f in st.session_state.processed_files)
+                        )
+                        st.session_state.messages.append({"role": "assistant", "content": msg})
+                        st.rerun()
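The `kb_hash` comparison makes the build button idempotent: clicking it again with the same uploads hits the `st.info` branch instead of re-chunking and re-embedding everything. The closing `st.rerun()` also matters, since the confirmation is only appended to `st.session_state.messages`; the rerun is what lets the chat-history loop further down actually render it.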
 
         if st.session_state.processed_files:
             st.markdown("---")
+            st.subheader("📑 Loaded PDFs")
             for f in st.session_state.processed_files:
+                st.caption(f"✔ {f}")
+
+            if st.button("🧹 Clear Chat"):
+                st.session_state.messages = [
+                    {"role": "assistant", "content": "Chat cleared. Ask anything about the loaded PDFs!"}
+                ]
                 st.rerun()
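Note that Clear Chat resets only the message history; `rag_system` and `kb_hash` are left untouched, so the knowledge base stays loaded and the same PDFs will not be re-indexed afterwards.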
 
+    # ---------------- Main Chat ----------------
+    st.title("📄 Agentic Multi-PDF Chat")
+    st.caption("Ask questions across all uploaded documents")
 
+    for msg in st.session_state.messages:
+        with st.chat_message(msg["role"]):
+            st.markdown(msg["content"])
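This replay loop is the standard Streamlit chat pattern: since each interaction reruns the script and widgets do not persist, the full conversation is re-rendered from `st.session_state.messages` on every pass.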
 
+    if prompt := st.chat_input("Ask a question across all PDFs..."):
         st.session_state.messages.append({"role": "user", "content": prompt})
+        st.chat_message("user").markdown(prompt)
+
+        if not st.session_state.rag_system:
+            st.warning("Build the knowledge base first.")
+            return
+
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking across documents..."):
+                result = st.session_state.rag_system.run(prompt)
+
+                answer = result.get("answer", "No clear answer found.")
+                st.markdown(answer)
+
+                if result.get("retrieved_docs"):
+                    with st.expander("📄 Sources"):
+                        for i, doc in enumerate(result["retrieved_docs"], 1):
+                            st.markdown(
+                                f"**{i}. {doc.metadata.get('source', 'Unknown')} "
+                                f"(Page {doc.metadata.get('page', 'N/A')})**"
+                            )
+                            st.info(doc.page_content[:400] + "...")
 
+        st.session_state.messages.append({"role": "assistant", "content": answer})
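These accesses pin down the shape `run()` is expected to return, even though the diff never shows it: a dict with an `answer` string and a `retrieved_docs` list of LangChain-style documents. A minimal stand-in matching that contract (the `Doc` class is illustrative, not from the repo):

    from dataclasses import dataclass, field

    @dataclass
    class Doc:
        # The two attributes the UI reads: chunk text plus source metadata.
        page_content: str
        metadata: dict = field(default_factory=dict)

    # Shape assumed by the chat handler above:
    result = {
        "answer": "The warranty period is 24 months.",
        "retrieved_docs": [Doc("…", {"source": "manual.pdf", "page": 3})],
    }

Worth flagging: the old version wrapped `run()` in a `try`/`except` that surfaced failures via `st.error("Search Error: ...")`; the new version drops it, so an LLM or retriever exception now bubbles up as a raw Streamlit traceback.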
 
+# -------------------------------------------------
 if __name__ == "__main__":
+    main()
|