Spaces:

dnj0
/

project2

Sleeping

App Files Files Community

dnj0 committed on Nov 16, 2025

Commit

21f3961

verified ·

1 Parent(s): 483d0df

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +306 -267

src/app.py CHANGED Viewed

@@ -1,267 +1,306 @@
-import streamlit as st
-import os
-from pathlib import Path
-from rag_pipeline import RAGPipeline
-import shutil
-# Page configuration
-st.set_page_config(
-    page_title="Local Multimodal RAG",
-    page_icon="📚",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-st.title("📚 Local Multimodal RAG System")
-st.markdown("**Analyze PDF documents locally with Mistral + CLIP embeddings**")
-# Initialize session state
-if "uploaded_files" not in st.session_state:
-    st.session_state.uploaded_files = []
-if "rag_pipeline" not in st.session_state:
-    st.session_state.rag_pipeline = None
-if "needs_reindex" not in st.session_state:
-    st.session_state.needs_reindex = False
-# Sidebar configuration
-with st.sidebar:
-    st.header("⚙️ Configuration")
-    pdf_dir = st.text_input(
-        "📁 PDF Directory",
-        value="./pdfs",
-        help="Path to directory containing PDF files"
-    )
-    device = st.selectbox(
-        "🖥️ Device",
-        ["cpu", "cuda"],
-        help="Device for model inference"
-    )
-    n_context_docs = st.slider(
-        "📄 Context Documents",
-        min_value=1,
-        max_value=10,
-        value=3,
-        help="Number of documents to retrieve for context"
-    )
-    st.divider()
-    # PDF Upload Section
-    st.subheader("📤 Upload PDF Files")
-    uploaded_pdfs = st.file_uploader(
-        "Choose PDF files to upload",
-        type="pdf",
-        accept_multiple_files=True,
-        help="Select one or more PDF files to add to the system"
-    )
-    if uploaded_pdfs:
-        # Create PDF directory if not exists
-        os.makedirs(pdf_dir, exist_ok=True)
-        upload_button = st.button("⬆️ Upload PDFs", use_container_width=True)
-        if upload_button:
-            uploaded_count = 0
-            for uploaded_file in uploaded_pdfs:
-                file_path = os.path.join(pdf_dir, uploaded_file.name)
-                # Save file
-                with open(file_path, "wb") as f:
-                    f.write(uploaded_file.getbuffer())
-                st.session_state.uploaded_files.append(uploaded_file.name)
-                uploaded_count += 1
-            st.success(f"✅ Uploaded {uploaded_count} PDF(s) successfully!")
-            st.session_state.needs_reindex = True
-    st.divider()
-    # Display uploaded files
-    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
-    if pdf_files:
-        st.subheader(f"📚 Documents ({len(pdf_files)})")
-        for pdf_file in pdf_files:
-            col1, col2 = st.columns([4, 1])
-            with col1:
-                st.write(f"• {pdf_file.name}")
-            with col2:
-                if st.button("🗑️", key=f"delete_{pdf_file.name}", help="Delete this file"):
-                    os.remove(pdf_file)
-                    st.session_state.needs_reindex = True
-                    st.rerun()
-    st.divider()
-    # Reindex button
-    if st.button("🔄 Reload & Index PDFs", use_container_width=True):
-        st.session_state.rag_pipeline = None
-        st.session_state.needs_reindex = True
-        st.rerun()
-# Initialize pipeline in session state
-@st.cache_resource
-def init_rag_pipeline(_device, _pdf_dir):
-    """Initialize RAG pipeline (cached)"""
-    # Create PDF directory if not exists
-    os.makedirs(_pdf_dir, exist_ok=True)
-    # Check if PDFs exist
-    pdf_files = list(Path(_pdf_dir).glob("*.pdf"))
-    if not pdf_files:
-        return None, f"No PDF files found in {_pdf_dir}. Upload PDFs using the sidebar."
-    try:
-        with st.spinner("⏳ Initializing RAG pipeline..."):
-            pipeline = RAGPipeline(pdf_dir=_pdf_dir, device=_device)
-            with st.spinner("⏳ Indexing PDFs..."):
-                pipeline.index_pdfs()
-        return pipeline, None
-    except Exception as e:
-        return None, str(e)
-# Get or initialize pipeline
-if st.session_state.rag_pipeline is None or st.session_state.needs_reindex:
-    pipeline, error = init_rag_pipeline(device, pdf_dir)
-    if error:
-        st.error(f"❌ Error: {error}")
-        st.info("💡 **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
-        st.stop()
-    st.session_state.rag_pipeline = pipeline
-    st.session_state.needs_reindex = False
-else:
-    pipeline = st.session_state.rag_pipeline
-# Main content
-if pipeline:
-    # Tabs
-    tab1, tab2, tab3 = st.tabs(["❓ Q&A", "📊 Summary", "📖 Retrieval"])
-    # Tab 1: Question Answering
-    with tab1:
-        st.subheader("Ask Questions about Your Documents")
-        question = st.text_area(
-            "Your question (in Russian or English):",
-            height=100,
-            placeholder="What is this document about? What are the main points? Etc.",
-            key="qa_question"
-        )
-        col1, col2 = st.columns(2)
-        with col1:
-            get_answer_btn = st.button("🔍 Get Answer", use_container_width=True)
-        with col2:
-            clear_btn = st.button("🗑️ Clear", use_container_width=True)
-        if clear_btn:
-            st.rerun()
-        if get_answer_btn:
-            if question.strip():
-                with st.spinner("⏳ Retrieving documents and generating answer..."):
-                    try:
-                        result = pipeline.answer_question(question, n_context_docs=n_context_docs)
-                    except Exception as e:
-                        st.error(f"Error generating answer: {str(e)}")
-                        result = None
-                if result and result.get("answer"):
-                    st.success("✓ Answer generated!")
-                    # Display answer
-                    st.subheader("📝 Answer")
-                    st.write(result["answer"])
-                    # Display sources
-                    with st.expander("📚 Sources Used"):
-                        for i, source in enumerate(result["sources"], 1):
-                            st.write(f"{i}. {source}")
-                    # Display stats
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        st.metric("Documents Used", result.get("context_used", 0))
-                    with col2:
-                        st.metric("Answer Length", len(result["answer"]))
-            else:
-                st.warning("Please enter a question")
-    # Tab 2: Document Summary
-    with tab2:
-        st.subheader("Summary of Indexed Documents")
-        if st.button("📊 Generate Summary", use_container_width=True):
-            with st.spinner("⏳ Generating summary..."):
-                try:
-                    summary = pipeline.summarize_documents()
-                    st.success("✓ Summary generated!")
-                    st.subheader("📄 Document Summary")
-                    st.write(summary)
-                except Exception as e:
-                    st.error(f"Error generating summary: {str(e)}")
-    # Tab 3: Document Retrieval
-    with tab3:
-        st.subheader("Search and Retrieve Documents")
-        search_query = st.text_input(
-            "Search query:",
-            placeholder="Enter search terms...",
-            key="retrieval_search"
-        )
-        col1, col2 = st.columns(2)
-        with col1:
-            search_btn = st.button("🔎 Search", use_container_width=True)
-        with col2:
-            clear_search_btn = st.button("Clear Search", use_container_width=True)
-        if clear_search_btn:
-            st.rerun()
-        if search_btn:
-            if search_query.strip():
-                with st.spinner("⏳ Searching..."):
-                    try:
-                        results = pipeline.retrieve_documents(search_query, n_results=n_context_docs)
-                    except Exception as e:
-                        st.error(f"Search error: {str(e)}")
-                        results = []
-                if results:
-                    st.success(f"✓ Found {len(results)} documents")
-                    for i, doc in enumerate(results, 1):
-                        with st.expander(f"📄 Document {i} - {doc['source']}", expanded=(i==1)):
-                            st.write(doc["content"])
-                else:
-                    st.warning("No documents found matching your query")
-            else:
-                st.warning("Please enter a search query")
-    # Footer
-    st.divider()
-    with st.expander("ℹ️ System Information"):
-        info = pipeline.vector_store.get_collection_info()
-        col1, col2, col3, col4 = st.columns(4)
-        with col1:
-            st.metric("📚 Documents", info.get("document_count", 0))
-        with col2:
-            st.metric("🖥️ Device", device.upper())
-        with col3:
-            st.metric("🔍 Context Docs", n_context_docs)
-        with col4:
-            pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
-            st.metric("📁 PDF Files", pdf_count)
-else:
-    st.error("❌ Failed to initialize RAG pipeline")
-    st.info("💡 **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")

+import streamlit as st
+import os
+from pathlib import Path
+from rag_pipeline import RAGPipeline
+import time
+# Configure the browser tab and page layout, then render the page header.
+st.set_page_config(
+    page_title="Local Multimodal RAG",
+    page_icon="📚",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+st.title("📚 Local Multimodal RAG System")
+st.markdown("**Analyze PDF documents locally with Mistral + CLIP embeddings**")
+# Seed session state: a key is only written when it is not already present,
+# so values stored by an earlier run of this script are left untouched.
+for _state_key, _initial_value in (
+    ("uploaded_files", []),
+    ("rag_pipeline", None),
+    ("last_upload_time", 0),
+):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _initial_value
+# Sidebar configuration: PDF directory, inference device and retrieval depth.
+with st.sidebar:
+    st.header("⚙️ Configuration")
+    # Clearing the text box yields "", and os.makedirs("") raises
+    # FileNotFoundError, so fall back to the default directory when blank.
+    pdf_dir = st.text_input(
+        "📁 PDF Directory",
+        value="./pdfs",
+        help="Path to directory containing PDF files"
+    ).strip() or "./pdfs"
+    # Ensure directory exists; report an unusable path (e.g. it names a file
+    # or is not writable) instead of crashing the whole script.
+    try:
+        os.makedirs(pdf_dir, exist_ok=True)
+    except OSError as e:
+        st.error(f"❌ Cannot use PDF directory '{pdf_dir}': {e}")
+        st.stop()
+    device = st.selectbox(
+        "🖥️ Device",
+        ["cpu", "cuda"],
+        help="Device for model inference"
+    )
+    # Passed to answer_question() and retrieve_documents() further down.
+    n_context_docs = st.slider(
+        "📄 Context Documents",
+        min_value=1,
+        max_value=10,
+        value=3,
+        help="Number of documents to retrieve for context"
+    )
+    st.divider()
+    # PDF Upload Section with Form
+    st.subheader("📤 Upload PDF Files")
+    # Use a form to separate file selection from submission; the files are
+    # only written to disk when the submit button is pressed.
+    with st.form("pdf_upload_form", clear_on_submit=True):
+        uploaded_pdfs = st.file_uploader(
+            "Choose PDF files to upload",
+            type="pdf",
+            accept_multiple_files=True,
+            help="Select one or more PDF files to add to the system"
+        )
+        submit_button = st.form_submit_button("⬆️ Upload PDFs", use_container_width=True)
+        if submit_button and not uploaded_pdfs:
+            # Submitting an empty form used to do nothing at all.
+            st.warning("Please choose at least one PDF file before uploading")
+        elif submit_button:
+            uploaded_count = 0
+            for uploaded_file in uploaded_pdfs:
+                # The file name is supplied by the client: keep only its final
+                # path component so a name like "../../x.pdf" cannot be
+                # written outside pdf_dir.
+                safe_name = Path(uploaded_file.name).name
+                try:
+                    # Save file to disk
+                    with open(os.path.join(pdf_dir, safe_name), "wb") as f:
+                        f.write(uploaded_file.getbuffer())
+                except OSError as e:
+                    st.error(f"Failed to upload {uploaded_file.name}: {str(e)}")
+                    continue
+                st.session_state.uploaded_files.append(safe_name)
+                uploaded_count += 1
+            # Report the files that were saved even when others failed; the
+            # failures have already been shown individually above.
+            if uploaded_count > 0:
+                st.session_state.last_upload_time = time.time()
+                st.success(f"✅ Uploaded {uploaded_count} PDF(s) successfully!")
+                st.info("📌 Click 'Reload & Index PDFs' below to process them.")
+                # Don't call st.rerun() here - let form handle clear_on_submit
+    # Display uploaded files
+    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
+    if pdf_files:
+        st.subheader(f"📚 Documents ({len(pdf_files)})")
+        for pdf_file in pdf_files:
+            col1, col2 = st.columns([4, 1])
+            with col1:
+                st.write(f"• {pdf_file.name}")
+            with col2:
+                if st.button("🗑️", key=f"delete_{pdf_file.name}", help="Delete this file"):
+                    try:
+                        os.remove(pdf_file)
+                        st.session_state.rag_pipeline = None  # Clear pipeline
+                        st.success(f"Deleted {pdf_file.name}")
+                        time.sleep(0.5)
+                        st.rerun()
+                    except Exception as e:
+                        st.error(f"Failed to delete: {str(e)}")
+    else:
+        st.info("📭 No PDF files in directory yet")
+    st.divider()
+    # Reload/Index and Clear All buttons, side by side.
+    col1, col2 = st.columns(2)
+    with col1:
+        if st.button("🔄 Reload & Index", use_container_width=True):
+            # Drop the session's pipeline so the script re-initializes it.
+            st.session_state.rag_pipeline = None
+            st.rerun()
+    with col2:
+        if st.button("🗑️ Clear All", use_container_width=True):
+            # Delete all PDFs, remembering any that could not be removed
+            # instead of silently swallowing the failure.
+            failed_deletes = []
+            for pdf_file in Path(pdf_dir).glob("*.pdf"):
+                try:
+                    os.remove(pdf_file)
+                except OSError as e:
+                    failed_deletes.append(f"{pdf_file.name}: {e}")
+            st.session_state.rag_pipeline = None
+            st.session_state.uploaded_files = []
+            if failed_deletes:
+                # No rerun here: it would wipe the error before it can be
+                # read. The file list above refreshes on the next interaction.
+                st.error("Could not delete: " + "; ".join(failed_deletes))
+            else:
+                st.success("All PDFs cleared")
+                time.sleep(0.5)
+                st.rerun()
+# Initialize pipeline
+def _pdf_fingerprint(pdf_dir):
+    """Return a hashable snapshot (name, size, mtime) of the PDFs in pdf_dir."""
+    snapshot = []
+    for pdf_path in sorted(Path(pdf_dir).glob("*.pdf")):
+        file_stat = pdf_path.stat()
+        snapshot.append((pdf_path.name, file_stat.st_size, file_stat.st_mtime_ns))
+    return tuple(snapshot)
+# max_entries=1 keeps only the most recent pipeline in the cache, so stale
+# pipelines built for an older set of PDFs do not pile up in memory.
+@st.cache_resource(max_entries=1)
+def _build_rag_pipeline(device, pdf_dir, pdf_fingerprint):
+    """Build and index a pipeline, cached per (device, pdf_dir, pdf_fingerprint).
+
+    pdf_fingerprint is not used in the body; it exists only so that adding,
+    removing or replacing a PDF produces a different cache key. Failures
+    propagate as exceptions, so a failed build is not stored in the cache.
+    """
+    with st.spinner("⏳ Initializing models..."):
+        pipeline = RAGPipeline(pdf_dir=pdf_dir, device=device)
+    with st.spinner("⏳ Indexing PDFs..."):
+        pipeline.index_pdfs()
+    return pipeline
+def init_rag_pipeline(_device, _pdf_dir):
+    """Initialize the RAG pipeline for the given device and PDF directory.
+
+    Previously this function itself carried @st.cache_resource. Because both
+    parameter names start with an underscore, Streamlit left them out of the
+    cache key, so the first result (including an error tuple) was returned
+    for every later call: reloading, deleting or uploading PDFs, or switching
+    device, never rebuilt anything. Caching now happens in
+    _build_rag_pipeline, keyed on the device, the directory and a snapshot of
+    its PDFs.
+
+    Args:
+        _device: Device name passed to RAGPipeline (e.g. "cpu" or "cuda").
+        _pdf_dir: Directory holding the PDFs; created if it does not exist.
+
+    Returns:
+        (pipeline, None) on success, or (None, error_message) when the
+        directory holds no PDFs or building/indexing raised an exception.
+    """
+    os.makedirs(_pdf_dir, exist_ok=True)
+    try:
+        fingerprint = _pdf_fingerprint(_pdf_dir)
+        if not fingerprint:
+            return None, f"No PDF files found in {_pdf_dir}"
+        return _build_rag_pipeline(_device, _pdf_dir, fingerprint), None
+    except Exception as e:
+        return None, str(e)
+# Reuse the pipeline stored in this session, or build one if there is none.
+pipeline = st.session_state.rag_pipeline
+if pipeline is None:
+    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
+    if not pdf_files:
+        # Nothing to index yet: show the getting-started help and halt the run.
+        st.warning("📭 No PDF files found")
+        st.info("""
+        **How to get started:**
+        1. 📤 Upload PDF files using the sidebar file uploader
+        2. ✅ Click 'Upload PDFs' to save them
+        3. 🔄 Click 'Reload & Index PDFs' to process
+        4. ❓ Ask questions in the Q&A tab
+        """)
+        st.stop()
+    pipeline, error = init_rag_pipeline(device, pdf_dir)
+    if error:
+        st.error(f"❌ Error: {error}")
+        st.stop()
+    # Keep the pipeline in session state so later runs skip initialization.
+    st.session_state.rag_pipeline = pipeline
+# Main content
+if pipeline:
+    # Tabs
+    tab1, tab2, tab3 = st.tabs(["❓ Q&A", "📊 Summary", "📖 Retrieval"])
+    # Tab 1: Question Answering
+    with tab1:
+        st.subheader("Ask Questions about Your Documents")
+        def _clear_question():
+            """Button callback: empty the question box via its widget key."""
+            st.session_state["qa_question"] = ""
+        question = st.text_area(
+            "Your question (in Russian or English):",
+            height=100,
+            placeholder="What is this document about? What are the main points? Etc.",
+            key="qa_question"
+        )
+        col1, col2 = st.columns(2)
+        with col1:
+            get_answer_btn = st.button("🔍 Get Answer", use_container_width=True)
+        with col2:
+            # The old handler only called st.rerun(), which left the keyed
+            # text area's content in place; resetting the key in an on_click
+            # callback actually clears it.
+            st.button("🗑️ Clear", use_container_width=True, on_click=_clear_question)
+        if get_answer_btn:
+            if question.strip():
+                with st.spinner("⏳ Retrieving documents and generating answer..."):
+                    try:
+                        result = pipeline.answer_question(question, n_context_docs=n_context_docs)
+                    except Exception as e:
+                        st.error(f"Error: {str(e)}")
+                        result = None
+                if result and result.get("answer"):
+                    st.success("✓ Answer generated!")
+                    st.subheader("📝 Answer")
+                    st.write(result["answer"])
+                    with st.expander("📚 Sources Used"):
+                        # .get() so a result without "sources" does not raise.
+                        for i, source in enumerate(result.get("sources", []), 1):
+                            st.write(f"{i}. {source}")
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        st.metric("Documents Used", result.get("context_used", 0))
+                    with col2:
+                        st.metric("Answer Length", len(result["answer"]))
+                elif result is not None:
+                    # The call succeeded but produced no answer text; say so
+                    # instead of showing nothing.
+                    st.warning("No answer was generated for this question")
+            else:
+                st.warning("Please enter a question")
+    # Tab 2: on request, summarize the indexed documents and show the result.
+    with tab2:
+        st.subheader("Summary of Indexed Documents")
+        summary_requested = st.button("📊 Generate Summary", use_container_width=True)
+        if summary_requested:
+            with st.spinner("⏳ Generating summary..."):
+                try:
+                    summary_text = pipeline.summarize_documents()
+                    st.success("✓ Summary generated!")
+                    st.subheader("📄 Document Summary")
+                    st.write(summary_text)
+                except Exception as exc:
+                    # Any failure in the block above is shown in the tab.
+                    st.error(f"Error: {str(exc)}")
+    # Tab 3: Document Retrieval
+    with tab3:
+        st.subheader("Search and Retrieve Documents")
+        def _clear_search():
+            """Button callback: empty the search box via its widget key."""
+            st.session_state["retrieval_search"] = ""
+        search_query = st.text_input(
+            "Search query:",
+            placeholder="Enter search terms...",
+            key="retrieval_search"
+        )
+        col1, col2 = st.columns(2)
+        with col1:
+            search_btn = st.button("🔎 Search", use_container_width=True)
+        with col2:
+            # The old handler only called st.rerun(), which left the keyed
+            # input's content in place; resetting the key in an on_click
+            # callback actually clears it.
+            st.button("Clear Search", use_container_width=True, on_click=_clear_search)
+        if search_btn:
+            if search_query.strip():
+                with st.spinner("⏳ Searching..."):
+                    try:
+                        results = pipeline.retrieve_documents(search_query, n_results=n_context_docs)
+                    except Exception as e:
+                        st.error(f"Search error: {str(e)}")
+                        # None (not []) marks a failed search, so the
+                        # "no documents" warning below is not shown as well.
+                        results = None
+                if results:
+                    st.success(f"✓ Found {len(results)} documents")
+                    for i, doc in enumerate(results, 1):
+                        # Only the first hit starts expanded.
+                        with st.expander(f"📄 Document {i} - {doc['source']}", expanded=(i==1)):
+                            st.write(doc["content"])
+                elif results is not None:
+                    st.warning("No documents found matching your query")
+            else:
+                st.warning("Please enter a search query")
+    # Footer: collapsible panel with four summary metrics.
+    st.divider()
+    with st.expander("ℹ️ System Information"):
+        collection_info = pipeline.vector_store.get_collection_info()
+        metric_columns = st.columns(4)
+        # Count the PDFs on disk now rather than reusing an earlier listing.
+        pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
+        metric_values = (
+            ("📚 Chunks", collection_info.get("document_count", 0)),
+            ("🖥️ Device", device.upper()),
+            ("🔍 Context", n_context_docs),
+            ("📁 PDFs", pdf_count),
+        )
+        for metric_column, (label, value) in zip(metric_columns, metric_values):
+            with metric_column:
+                st.metric(label, value)