dnj0 commited on
Commit
893bbbd
Β·
verified Β·
1 Parent(s): d657efd

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +466 -223
src/app.py CHANGED
@@ -1,223 +1,466 @@
1
- import streamlit as st
2
- import os
3
- from pathlib import Path
4
- from dotenv import load_dotenv
5
-
6
- # Load environment variables
7
- load_dotenv()
8
-
9
- # Import custom modules
10
- from pdf_processor import PDFProcessor, prepare_documents_for_embedding
11
- from embeddings_handler import CLIPLangChainEmbeddings
12
- from vectorstore_manager import VectorStoreManager
13
- from image_summarizer import ImageSummarizer, process_images_in_documents
14
- from rag_chain import RAGChain
15
- from langchain_core.documents import Document
16
-
17
- # Page configuration
18
- st.set_page_config(
19
- page_title="Multimodal RAG Assistant",
20
- page_icon="πŸ“„",
21
- layout="wide",
22
- initial_sidebar_state="expanded"
23
- )
24
-
25
- st.markdown("""
26
- <style>
27
- .main {
28
- padding: 2rem;
29
- }
30
- .stChatMessage {
31
- padding: 1rem;
32
- border-radius: 0.5rem;
33
- margin-bottom: 1rem;
34
- }
35
- </style>
36
- """, unsafe_allow_html=True)
37
-
38
- # Initialize session state
39
- if "vector_store" not in st.session_state:
40
- st.session_state.vector_store = None
41
- if "rag_chain" not in st.session_state:
42
- st.session_state.rag_chain = None
43
- if "document_count" not in st.session_state:
44
- st.session_state.document_count = 0
45
-
46
- # Sidebar configuration
47
- st.sidebar.title("βš™οΈ Configuration")
48
- st.sidebar.markdown("---")
49
-
50
- # OpenAI API Key
51
- api_key = st.sidebar.text_input(
52
- "OpenAI API Key",
53
- type="password",
54
- value=os.getenv("OPENAI_API_KEY", ""),
55
- help="Enter your OpenAI API key"
56
- )
57
-
58
- if api_key:
59
- os.environ["OPENAI_API_KEY"] = api_key
60
-
61
- # PDF directory setup
62
- pdf_dir = st.sidebar.text_input(
63
- "PDF Directory",
64
- value="./pdfs",
65
- help="Directory containing PDF files"
66
- )
67
-
68
- # Vector store settings
69
- st.sidebar.markdown("### Vector Store")
70
- collection_name = st.sidebar.text_input(
71
- "Collection Name",
72
- value="pdf_documents",
73
- help="ChromaDB collection name"
74
- )
75
-
76
- persist_dir = st.sidebar.text_input(
77
- "Persist Directory",
78
- value="./chroma_db",
79
- help="Directory for ChromaDB storage"
80
- )
81
-
82
- # Initialize vector store button
83
- if st.sidebar.button("πŸ”„ Initialize Vector Store", use_container_width=True):
84
- with st.spinner("Initializing vector store..."):
85
- try:
86
- # Initialize embeddings
87
- embeddings = CLIPLangChainEmbeddings(
88
- model_name="ViT-B-32",
89
- pretrained="openai"
90
- )
91
-
92
- # Initialize vector store
93
- st.session_state.vector_store = VectorStoreManager(
94
- persist_dir=persist_dir,
95
- collection_name=collection_name,
96
- embeddings=embeddings
97
- )
98
-
99
- # Initialize RAG chain
100
- retriever = st.session_state.vector_store.get_retriever()
101
- st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)
102
-
103
- st.session_state.document_count = st.session_state.vector_store.collection_count()
104
- st.success("βœ… Vector store initialized!")
105
-
106
- except Exception as e:
107
- st.error(f"❌ Error initializing vector store: {str(e)}")
108
-
109
- # Load and process PDFs button
110
- if st.sidebar.button("πŸ“₯ Load & Process PDFs", use_container_width=True):
111
- if not api_key:
112
- st.error("Please enter OpenAI API Key first")
113
- elif st.session_state.vector_store is None:
114
- st.error("Please initialize vector store first")
115
- else:
116
- with st.spinner("Processing PDFs..."):
117
- try:
118
- # Process PDFs
119
- pdf_processor = PDFProcessor(pdf_dir=pdf_dir)
120
- documents_data = pdf_processor.process_all_pdfs()
121
-
122
- if not documents_data:
123
- st.warning(f"No PDFs found in {pdf_dir}")
124
- else:
125
- # Summarize images
126
- image_summarizer = ImageSummarizer(api_key=api_key)
127
- documents_data = process_images_in_documents(
128
- documents_data,
129
- image_summarizer
130
- )
131
-
132
- # Prepare documents for embedding
133
- all_documents = []
134
- for doc_data in documents_data:
135
- doc_tuples = prepare_documents_for_embedding(doc_data)
136
- for text, metadata in doc_tuples:
137
- all_documents.append(
138
- Document(page_content=text, metadata=metadata)
139
- )
140
-
141
- # Add to vector store
142
- st.session_state.vector_store.add_documents(all_documents)
143
- st.session_state.document_count = st.session_state.vector_store.collection_count()
144
-
145
- # Reinitialize RAG chain
146
- retriever = st.session_state.vector_store.get_retriever()
147
- st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)
148
-
149
- st.success(f"βœ… Processed {len(documents_data)} PDFs with {len(all_documents)} chunks")
150
- st.info(f"Total documents in store: {st.session_state.document_count}")
151
-
152
- except Exception as e:
153
- st.error(f"❌ Error processing PDFs: {str(e)}")
154
-
155
- # Display vector store status
156
- st.sidebar.markdown("### Status")
157
- if st.session_state.vector_store:
158
- doc_count = st.session_state.vector_store.collection_count()
159
- st.sidebar.success(f"βœ… Vector Store Ready")
160
- st.sidebar.metric("Documents in Store", doc_count)
161
- else:
162
- st.sidebar.warning("⚠️ Vector Store Not Initialized")
163
-
164
- # Main content area
165
- st.title("πŸ“„ Multimodal PDF RAG Assistant")
166
- st.markdown("Ask questions about your PDF documents. Responses will be provided in Russian.")
167
-
168
- # Check if system is ready
169
- if st.session_state.rag_chain is None:
170
- st.info("""
171
- ### Getting Started:
172
- 1. Enter your OpenAI API Key in the sidebar
173
- 2. Click "Initialize Vector Store"
174
- 3. Place PDF files in the configured directory
175
- 4. Click "Load & Process PDFs"
176
- 5. Ask questions in the chat below
177
- """)
178
- else:
179
- # Chat interface
180
- st.markdown("---")
181
- st.markdown("### Ask a Question")
182
-
183
- col1, col2 = st.columns([1, 0.15])
184
-
185
- with col1:
186
- user_question = st.text_input(
187
- "Your question:",
188
- placeholder="Ask about your documents...",
189
- label_visibility="collapsed"
190
- )
191
-
192
- with col2:
193
- search_button = st.button("πŸ” Search", use_container_width=True)
194
-
195
- # Process question
196
- if search_button and user_question:
197
- with st.spinner("πŸ€– Searching documents and generating response..."):
198
- try:
199
- result = st.session_state.rag_chain.query(user_question)
200
-
201
- # Display answer
202
- st.markdown("### Answer")
203
- st.markdown(result["answer"])
204
-
205
- # Display sources
206
- if result["sources"]:
207
- st.markdown("### Sources")
208
- for i, source in enumerate(result["sources"], 1):
209
- with st.expander(f"Source {i} - {source['metadata'].get('filename', 'Unknown')}"):
210
- st.markdown(f"**Type:** {source['metadata'].get('type', 'Unknown')}")
211
- st.markdown(f"**Page:** {source['metadata'].get('page', 'Unknown')}")
212
- st.markdown(f"**Content:** {source['content']}")
213
-
214
- except Exception as e:
215
- st.error(f"Error processing question: {str(e)}")
216
-
217
- # Footer
218
- st.markdown("---")
219
- st.markdown("""
220
- <div style="text-align: center; color: gray; font-size: 0.8rem;">
221
- Powered by LangChain, ChromaDB, CLIP, and OpenAI
222
- </div>
223
- """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app_with_upload_simple.py
2
+
3
+ import streamlit as st
4
+ import logging
5
+ import os
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+ import base64
9
+
10
+ # Setup logging
11
+ logging.getLogger("pdfminer").setLevel(logging.ERROR)
12
+
13
+ from pdf_processor import PDFProcessor, prepare_documents_for_embedding
14
+ from embeddings_handler import CLIPLangChainEmbeddings
15
+ from vectorstore_manager import VectorStoreManager
16
+ from rag_chain import RAGChain
17
+ from langchain_core.documents import Document
18
+
19
+ # ============================================================================
20
+ # PAGE CONFIGURATION
21
+ # ============================================================================
22
+
23
+ st.set_page_config(
24
+ page_title="Multimodal RAG Assistant",
25
+ page_icon="πŸ“„",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded"
28
+ )
29
+
30
+ # Custom CSS
31
+ st.markdown("""
32
+ <style>
33
+ .main { padding: 2rem; }
34
+ .stTabs [data-baseweb="tab-list"] { gap: 2rem; }
35
+ .metric-card { background-color: #f8f9fa; padding: 15px; border-radius: 5px; }
36
+ </style>
37
+ """, unsafe_allow_html=True)
38
+
39
+ # ============================================================================
40
+ # SESSION STATE INITIALIZATION
41
+ # ============================================================================
42
+
43
+ if "processor" not in st.session_state:
44
+ st.session_state.processor = None
45
+
46
+ if "vector_store" not in st.session_state:
47
+ st.session_state.vector_store = None
48
+
49
+ if "rag_chain" not in st.session_state:
50
+ st.session_state.rag_chain = None
51
+
52
+ if "embeddings" not in st.session_state:
53
+ st.session_state.embeddings = None
54
+
55
+ if "documents_processed" not in st.session_state:
56
+ st.session_state.documents_processed = 0
57
+
58
+ if "extracted_content" not in st.session_state:
59
+ st.session_state.extracted_content = []
60
+
61
+ # ============================================================================
62
+ # HELPER FUNCTIONS
63
+ # ============================================================================
64
+
65
@st.cache_resource
def init_processor(pdf_dir="./pdfs"):
    """Build and cache the PDF processor for *pdf_dir* (one per directory)."""
    processor = PDFProcessor(pdf_dir=pdf_dir)
    return processor
69
+
70
@st.cache_resource
def init_embeddings():
    """Build and cache the CLIP embedding model (ViT-B-32, OpenAI weights)."""
    model = CLIPLangChainEmbeddings(model_name="ViT-B-32", pretrained="openai")
    return model
74
+
75
@st.cache_resource
def init_vector_store(embeddings):
    """Build and cache the ChromaDB-backed vector store.

    Uses the fixed on-disk location ``./chroma_db`` and the
    ``pdf_documents`` collection.
    """
    store = VectorStoreManager(
        persist_dir="./chroma_db",
        collection_name="pdf_documents",
        embeddings=embeddings,
    )
    return store
83
+
84
def save_uploaded_files(uploaded_files, target_dir="./pdfs"):
    """Persist Streamlit-uploaded files to *target_dir*.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing
            ``.name`` and ``.getbuffer()`` (Streamlit UploadedFile).
        target_dir: Destination directory; created (with parents) if missing.

    Returns:
        list[str]: Names of the files actually written, in input order.
    """
    target = Path(target_dir)
    target.mkdir(parents=True, exist_ok=True)

    saved_files = []
    for uploaded_file in uploaded_files:
        # Strip any directory components so a crafted upload name
        # (e.g. "../x.pdf") cannot escape target_dir.
        safe_name = Path(uploaded_file.name).name
        (target / safe_name).write_bytes(uploaded_file.getbuffer())
        saved_files.append(safe_name)

    return saved_files
96
+
97
+ def get_document_stats(content):
98
+ """Get statistics from extracted content."""
99
+ stats = {
100
+ "pages": len(content.get("pages", [])),
101
+ "total_text": sum(len(p.get("text", "")) for p in content.get("pages", [])),
102
+ "tables": sum(len(p.get("tables", [])) for p in content.get("pages", [])),
103
+ "images": sum(len(p.get("images", [])) for p in content.get("pages", []))
104
+ }
105
+ return stats
106
+
107
+ # ============================================================================
108
+ # MAIN APP
109
+ # ============================================================================
110
+
111
+ st.title("πŸ“„ Multimodal PDF RAG Assistant")
112
+ st.markdown("Upload PDFs, extract content, and query with multimodal embeddings.")
113
+
114
+ # ============================================================================
115
+ # SIDEBAR - CONFIGURATION & UPLOAD
116
+ # ============================================================================
117
+
118
+ with st.sidebar:
119
+ st.header("βš™οΈ Configuration & Upload")
120
+
121
+ # API Key
122
+ api_key = st.text_input(
123
+ "OpenAI API Key",
124
+ type="password",
125
+ value=os.getenv("OPENAI_API_KEY", ""),
126
+ help="Your OpenAI API key"
127
+ )
128
+
129
+ if api_key:
130
+ os.environ["OPENAI_API_KEY"] = api_key
131
+
132
+ st.markdown("---")
133
+
134
+ # PDF Upload Section
135
+ st.markdown("### πŸ“€ Upload PDFs")
136
+
137
+ uploaded_pdfs = st.file_uploader(
138
+ "Choose PDF files",
139
+ type="pdf",
140
+ accept_multiple_files=True,
141
+ key="pdf_uploader",
142
+ help="Upload one or more PDF files"
143
+ )
144
+
145
+ if uploaded_pdfs:
146
+ st.info(f"πŸ“¦ {len(uploaded_pdfs)} file(s) selected")
147
+
148
+ if st.button("πŸ’Ύ Save & Process PDFs", use_container_width=True):
149
+ # Save files
150
+ with st.spinner("πŸ“₯ Saving files..."):
151
+ saved_files = save_uploaded_files(uploaded_pdfs)
152
+ st.success(f"βœ… Saved {len(saved_files)} file(s)")
153
+
154
+ # Initialize processor
155
+ with st.spinner("πŸ”„ Initializing processor..."):
156
+ processor = init_processor()
157
+ st.session_state.processor = processor
158
+
159
+ # Process PDFs
160
+ with st.spinner("πŸ“– Processing PDFs..."):
161
+ documents = processor.process_all_pdfs()
162
+ st.session_state.extracted_content = documents
163
+ st.session_state.documents_processed = len(documents)
164
+
165
+ # Prepare chunks for embedding
166
+ all_chunks = []
167
+ for doc_content in documents:
168
+ chunks = prepare_documents_for_embedding(doc_content)
169
+ all_chunks.extend(chunks)
170
+
171
+ st.success(f"βœ… Processed {len(documents)} PDF(s), {len(all_chunks)} chunks")
172
+
173
+ # Initialize embeddings and vector store
174
+ with st.spinner("πŸ”— Creating vector store..."):
175
+ embeddings = init_embeddings()
176
+ st.session_state.embeddings = embeddings
177
+
178
+ vector_store = init_vector_store(embeddings)
179
+ st.session_state.vector_store = vector_store
180
+
181
+ # Add documents to vector store
182
+ docs_for_store = [
183
+ Document(page_content=text, metadata=meta)
184
+ for text, meta in all_chunks
185
+ ]
186
+ vector_store.add_documents(docs_for_store)
187
+
188
+ # Initialize RAG chain
189
+ retriever = vector_store.get_retriever()
190
+ rag_chain = RAGChain(retriever, api_key=api_key)
191
+ st.session_state.rag_chain = rag_chain
192
+
193
+ st.success("βœ… Ready to query!")
194
+
195
+ st.markdown("---")
196
+
197
+ # Status
198
+ st.markdown("### πŸ“Š Status")
199
+
200
+ if st.session_state.documents_processed > 0:
201
+ st.metric("Documents Processed", st.session_state.documents_processed)
202
+
203
+ total_pages = sum(
204
+ len(doc.get("pages", []))
205
+ for doc in st.session_state.extracted_content
206
+ )
207
+ st.metric("Total Pages", total_pages)
208
+
209
+ total_images = sum(
210
+ sum(len(p.get("images", [])) for p in doc.get("pages", []))
211
+ for doc in st.session_state.extracted_content
212
+ )
213
+ st.metric("Total Images", total_images)
214
+ else:
215
+ st.info("Upload and process PDFs to get started")
216
+
217
+ # ============================================================================
218
+ # MAIN CONTENT AREA - TABS
219
+ # ============================================================================
220
+
221
+ if st.session_state.documents_processed == 0:
222
+ st.warning("πŸ‘ˆ Upload PDFs in the sidebar to get started")
223
+ else:
224
+ tab1, tab2, tab3, tab4 = st.tabs(["πŸ” Query", "πŸ“Š Documents", "πŸ–ΌοΈ Images", "ℹ️ Info"])
225
+
226
+ # ====================================================================
227
+ # TAB 1: QUERY
228
+ # ====================================================================
229
+
230
+ with tab1:
231
+ st.header("πŸ” Ask Questions")
232
+ st.markdown("Ask questions about your PDF documents.")
233
+
234
+ if st.session_state.rag_chain is None:
235
+ st.warning("⚠️ Please process PDFs first using the sidebar.")
236
+ else:
237
+ col1, col2 = st.columns([5, 1])
238
+
239
+ with col1:
240
+ user_query = st.text_input(
241
+ "Your question:",
242
+ placeholder="What is this document about?",
243
+ label_visibility="collapsed"
244
+ )
245
+
246
+ with col2:
247
+ search_button = st.button("πŸ” Search", use_container_width=True)
248
+
249
+ if search_button and user_query:
250
+ with st.spinner("πŸ€– Searching and generating response..."):
251
+ try:
252
+ result = st.session_state.rag_chain.query(user_query)
253
+
254
+ # Display answer
255
+ st.markdown("### πŸ“ Answer")
256
+ st.markdown(result["answer"])
257
+
258
+ # Display sources
259
+ if result["sources"]:
260
+ st.markdown("### πŸ“š Sources")
261
+ for i, source in enumerate(result["sources"], 1):
262
+ with st.expander(f"Source {i} - {source['metadata'].get('filename', 'Unknown')}"):
263
+ st.markdown(f"**Type:** {source['metadata'].get('type', 'Unknown')}")
264
+ st.markdown(f"**Page:** {source['metadata'].get('page', 'Unknown')}")
265
+ st.markdown(f"**Content:** {source['content'][:500]}...")
266
+
267
+ except Exception as e:
268
+ st.error(f"❌ Error: {str(e)}")
269
+
270
+ # ====================================================================
271
+ # TAB 2: DOCUMENTS
272
+ # ====================================================================
273
+
274
+ with tab2:
275
+ st.header("πŸ“Š Processed Documents")
276
+
277
+ if not st.session_state.extracted_content:
278
+ st.info("No documents processed yet.")
279
+ else:
280
+ # Overall statistics
281
+ col1, col2, col3, col4 = st.columns(4)
282
+
283
+ with col1:
284
+ st.metric("Documents", len(st.session_state.extracted_content))
285
+
286
+ with col2:
287
+ total_pages = sum(
288
+ len(doc.get("pages", []))
289
+ for doc in st.session_state.extracted_content
290
+ )
291
+ st.metric("Pages", total_pages)
292
+
293
+ with col3:
294
+ total_images = sum(
295
+ sum(len(p.get("images", [])) for p in doc.get("pages", []))
296
+ for doc in st.session_state.extracted_content
297
+ )
298
+ st.metric("Images", total_images)
299
+
300
+ with col4:
301
+ total_tables = sum(
302
+ sum(len(p.get("tables", [])) for p in doc.get("pages", []))
303
+ for doc in st.session_state.extracted_content
304
+ )
305
+ st.metric("Tables", total_tables)
306
+
307
+ st.markdown("---")
308
+
309
+ # Document details
310
+ st.markdown("### πŸ“„ Document Details")
311
+
312
+ for idx, doc in enumerate(st.session_state.extracted_content, 1):
313
+ filename = doc.get("filename", f"Document {idx}")
314
+ stats = get_document_stats(doc)
315
+
316
+ with st.expander(f"πŸ“‘ {filename}"):
317
+ col1, col2, col3, col4 = st.columns(4)
318
+
319
+ with col1:
320
+ st.metric("Pages", stats["pages"])
321
+ with col2:
322
+ st.metric("Images", stats["images"])
323
+ with col3:
324
+ st.metric("Tables", stats["tables"])
325
+ with col4:
326
+ st.metric("Text (KB)", round(stats["total_text"] / 1024, 1))
327
+
328
+ # Preview pages
329
+ st.markdown("#### First 3 Pages Preview:")
330
+ for page in doc.get("pages", [])[:3]:
331
+ page_num = page.get("page_number")
332
+ text = page.get("text", "")[:200]
333
+ st.write(f"**Page {page_num}:** {text}...")
334
+
335
+ # ====================================================================
336
+ # TAB 3: IMAGES
337
+ # ====================================================================
338
+
339
+ with tab3:
340
+ st.header("πŸ–ΌοΈ Extracted Images")
341
+
342
+ if not st.session_state.extracted_content:
343
+ st.info("No images extracted yet.")
344
+ else:
345
+ image_count = 0
346
+
347
+ for doc_idx, doc in enumerate(st.session_state.extracted_content, 1):
348
+ filename = doc.get("filename", f"Document {doc_idx}")
349
+
350
+ for page in doc.get("pages", []):
351
+ page_num = page.get("page_number")
352
+ images = page.get("images", [])
353
+
354
+ if images:
355
+ st.markdown(f"### πŸ“„ {filename} - Page {page_num}")
356
+
357
+ img_cols = st.columns(min(len(images), 2))
358
+
359
+ for idx, image in enumerate(images):
360
+ with img_cols[idx % 2]:
361
+ # Try to display image
362
+ if image.get("base64"):
363
+ try:
364
+ st.image(
365
+ f"data:image/{image.get('format', 'png')};base64,{image.get('base64')}",
366
+ caption=f"Image {image.get('index')}",
367
+ use_column_width=True
368
+ )
369
+ image_count += 1
370
+ except Exception as e:
371
+ st.warning(f"Could not display image: {e}")
372
+ else:
373
+ st.warning("No image data available")
374
+
375
+ if image_count == 0:
376
+ st.info("No images were successfully extracted from the PDFs.")
377
+
378
+ # ====================================================================
379
+ # TAB 4: INFO
380
+ # ====================================================================
381
+
382
+ with tab4:
383
+ st.header("ℹ️ System Information")
384
+
385
+ st.markdown("### 🎯 Features")
386
+
387
+ features = {
388
+ "βœ… PDF Upload": "Upload multiple PDFs via UI",
389
+ "βœ… Text Extraction": "Extract text from documents",
390
+ "βœ… Table Detection": "Identify and extract tables",
391
+ "βœ… Image Extraction": "Extract and display images",
392
+ "βœ… CLIP Embeddings": "Multimodal embeddings",
393
+ "βœ… Vector Store": "ChromaDB for similarity search",
394
+ "βœ… RAG Chain": "LangChain with OpenAI",
395
+ "βœ… Russian Support": "Queries answered in Russian",
396
+ }
397
+
398
+ for feature, description in features.items():
399
+ st.markdown(f"**{feature}** - {description}")
400
+
401
+ st.markdown("---")
402
+
403
+ st.markdown("### πŸ“¦ System Status")
404
+
405
+ col1, col2, col3 = st.columns(3)
406
+
407
+ with col1:
408
+ if st.session_state.processor:
409
+ st.success("βœ… Processor Ready")
410
+ else:
411
+ st.warning("⚠️ Processor Not Initialized")
412
+
413
+ with col2:
414
+ if st.session_state.embeddings:
415
+ st.success("βœ… Embeddings Ready")
416
+ else:
417
+ st.warning("⚠️ Embeddings Not Initialized")
418
+
419
+ with col3:
420
+ if st.session_state.rag_chain:
421
+ st.success("βœ… RAG Chain Ready")
422
+ else:
423
+ st.warning("⚠️ RAG Chain Not Initialized")
424
+
425
+ st.markdown("---")
426
+
427
+ st.markdown("### πŸš€ How It Works")
428
+
429
+ st.markdown("""
430
+ 1. **Upload**: Select one or more PDF files
431
+ 2. **Process**: System extracts text, tables, and images
432
+ 3. **Embed**: Content converted to multimodal embeddings
433
+ 4. **Store**: Vectors stored in ChromaDB
434
+ 5. **Query**: Ask questions about documents
435
+ 6. **Retrieve**: Relevant content fetched from store
436
+ 7. **Generate**: OpenAI creates response
437
+ 8. **Display**: Answer and sources shown in UI
438
+ """)
439
+
440
+ st.markdown("---")
441
+
442
+ st.markdown("### πŸ”— Technology Stack")
443
+
444
+ tech_info = {
445
+ "PDF Processing": "PyMuPDF, pdfplumber",
446
+ "Embeddings": "CLIP ViT-B-32 (open-clip-torch)",
447
+ "Vector Store": "ChromaDB",
448
+ "LLM Framework": "LangChain",
449
+ "Language Model": "OpenAI GPT-4o-mini",
450
+ "Web UI": "Streamlit",
451
+ }
452
+
453
+ for tech, details in tech_info.items():
454
+ st.write(f"**{tech}:** {details}")
455
+
456
+ # ============================================================================
457
+ # FOOTER
458
+ # ============================================================================
459
+
460
+ st.markdown("---")
461
+ st.markdown(
462
+ "<div style='text-align: center; color: gray; font-size: 0.8rem;'>"
463
+ "Multimodal RAG LLM System | Powered by LangChain, ChromaDB, CLIP, and OpenAI"
464
+ "</div>",
465
+ unsafe_allow_html=True
466
+ )