# Spaces: dnj0 / final_project (Sleeping)
# File size: 17,786 Bytes
# 893bbbd

# app_with_upload_simple.py

import streamlit as st
import logging
import os
from pathlib import Path
from datetime import datetime
import base64

# Setup logging
# Raise the "pdfminer" logger's threshold to ERROR. This runs before the
# project modules below are imported.
# NOTE(review): presumably meant to silence noisy pdfminer warnings emitted
# while PDFs are parsed — confirm against pdf_processor.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

from pdf_processor import PDFProcessor, prepare_documents_for_embedding
from embeddings_handler import CLIPLangChainEmbeddings
from vectorstore_manager import VectorStoreManager
from rag_chain import RAGChain
from langchain_core.documents import Document

# ============================================================================
# PAGE CONFIGURATION
# ============================================================================

# Page-level settings: browser title, icon, wide layout, sidebar open on load.
_PAGE_SETTINGS = {
    "page_title": "Multimodal RAG Assistant",
    "page_icon": "📄",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS, injected as raw HTML (hence unsafe_allow_html).
_CUSTOM_CSS = """
<style>
    .main { padding: 2rem; }
    .stTabs [data-baseweb="tab-list"] { gap: 2rem; }
    .metric-card { background-color: #f8f9fa; padding: 15px; border-radius: 5px; }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# ============================================================================
# SESSION STATE INITIALIZATION
# ============================================================================

# Every per-session slot this script uses, paired with its initial value.
# A slot is written only when it is missing, so existing values are kept.
_SESSION_DEFAULTS = (
    ("processor", None),
    ("vector_store", None),
    ("rag_chain", None),
    ("embeddings", None),
    ("documents_processed", 0),
    ("extracted_content", []),
)

for _slot, _initial in _SESSION_DEFAULTS:
    if _slot not in st.session_state:
        st.session_state[_slot] = _initial

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

@st.cache_resource
def init_processor(pdf_dir="./pdfs"):
    """Build the PDF processor for ``pdf_dir``.

    The function is wrapped in ``st.cache_resource``, so the returned
    ``PDFProcessor`` is reused for repeated calls with the same ``pdf_dir``.

    Args:
        pdf_dir: Directory handed to ``PDFProcessor`` as ``pdf_dir``.

    Returns:
        The ``PDFProcessor`` instance.
    """
    processor = PDFProcessor(pdf_dir=pdf_dir)
    return processor

@st.cache_resource
def init_embeddings():
    """Build the CLIP embeddings wrapper (ViT-B-32, "openai" weights).

    The function is wrapped in ``st.cache_resource``, so the same
    ``CLIPLangChainEmbeddings`` instance is returned on repeated calls.

    Returns:
        The ``CLIPLangChainEmbeddings`` instance.
    """
    clip_embeddings = CLIPLangChainEmbeddings(
        model_name="ViT-B-32",
        pretrained="openai",
    )
    return clip_embeddings

# ``st.cache_resource`` hashes every argument to build its cache key. Hashing
# the embeddings wrapper by value would mean walking the whole object (model
# included), which is slow at best and may fail for unhashable members. The
# wrapper comes from the cached ``init_embeddings`` and is therefore a
# long-lived singleton, so its identity is a sufficient and stable key.
@st.cache_resource(hash_funcs={CLIPLangChainEmbeddings: id})
def init_vector_store(embeddings):
    """Build the vector store manager for the ``pdf_documents`` collection.

    Args:
        embeddings: Embeddings object forwarded to ``VectorStoreManager``.
            Instances of ``CLIPLangChainEmbeddings`` are keyed by identity
            for caching purposes.

    Returns:
        A ``VectorStoreManager`` persisting under ``./chroma_db``; the same
        instance is returned for repeated calls with the same embeddings
        object.
    """
    return VectorStoreManager(
        persist_dir="./chroma_db",
        collection_name="pdf_documents",
        embeddings=embeddings,
    )

def save_uploaded_files(uploaded_files, target_dir="./pdfs"):
    """Write uploaded files into ``target_dir`` and return the names written.

    The upload's ``name`` is supplied by the client, so only its final path
    component is used. This keeps names such as ``../../x.pdf`` or
    ``/etc/x.pdf`` from escaping ``target_dir``.

    Args:
        uploaded_files: Iterable of objects exposing a ``name`` attribute and
            a ``getbuffer()`` method returning the file's bytes.
        target_dir: Destination directory; created if it does not exist.

    Returns:
        List of the file names (without directory part) that were written,
        in upload order. Uploads whose name has no usable final component
        are skipped and logged.
    """
    os.makedirs(target_dir, exist_ok=True)
    saved_files = []

    for uploaded_file in uploaded_files:
        # Normalise Windows separators first so basename() also strips
        # "dir\\file.pdf"-style prefixes on POSIX hosts.
        safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            logging.warning(
                "Skipping upload with unusable file name: %r", uploaded_file.name
            )
            continue

        filepath = os.path.join(target_dir, safe_name)
        with open(filepath, "wb") as f:
            f.write(uploaded_file.getbuffer())
        saved_files.append(safe_name)

    return saved_files

def get_document_stats(content):
    """Summarise one extracted document.

    Missing keys and ``None`` values are both counted as empty, so a page
    whose ``text``, ``tables`` or ``images`` entry is ``None`` does not
    raise.

    Args:
        content: Mapping with an optional ``"pages"`` list; each page is a
            mapping with optional ``"text"``, ``"tables"`` and ``"images"``
            entries.

    Returns:
        Dict with ``pages`` (page count), ``total_text`` (summed text
        length in characters), ``tables`` and ``images`` (summed list
        lengths).
    """
    pages = content.get("pages") or []
    stats = {"pages": len(pages), "total_text": 0, "tables": 0, "images": 0}

    # Single pass over the pages instead of one traversal per statistic.
    for page in pages:
        stats["total_text"] += len(page.get("text") or "")
        stats["tables"] += len(page.get("tables") or [])
        stats["images"] += len(page.get("images") or [])

    return stats

# ============================================================================
# MAIN APP
# ============================================================================

# Page heading and one-line description shown above the main content area.
st.title("📄 Multimodal PDF RAG Assistant")
st.markdown("Upload PDFs, extract content, and query with multimodal embeddings.")

# ============================================================================
# SIDEBAR - CONFIGURATION & UPLOAD
# ============================================================================

with st.sidebar:
    st.header("⚙️ Configuration & Upload")

    # API key.
    # The field is deliberately NOT pre-filled from the environment: a
    # widget's default value is delivered to the browser, which would hand a
    # server-side key to every visitor.
    entered_key = st.text_input(
        "OpenAI API Key",
        type="password",
        help="Your OpenAI API key (leave empty to use the server's key, if configured)",
    )
    # Fall back to the server-side key without exposing it. The user's key is
    # not written to os.environ either: the environment is process-global and
    # would be shared with all other sessions. The key is passed to RAGChain
    # explicitly below instead.
    api_key = entered_key or os.getenv("OPENAI_API_KEY", "")

    st.markdown("---")

    # PDF upload section
    st.markdown("### 📤 Upload PDFs")

    uploaded_pdfs = st.file_uploader(
        "Choose PDF files",
        type="pdf",
        accept_multiple_files=True,
        key="pdf_uploader",
        help="Upload one or more PDF files",
    )

    if uploaded_pdfs:
        st.info(f"📦 {len(uploaded_pdfs)} file(s) selected")

        if st.button("💾 Save & Process PDFs", use_container_width=True):
            # One boundary handler for the whole pipeline, so a failure in
            # any step is reported in the UI (and logged) instead of aborting
            # the script run with a raw traceback.
            try:
                with st.spinner("📥 Saving files..."):
                    saved_files = save_uploaded_files(uploaded_pdfs)
                st.success(f"✅ Saved {len(saved_files)} file(s)")

                with st.spinner("🔄 Initializing processor..."):
                    processor = init_processor()
                    st.session_state.processor = processor

                # NOTE(review): process_all_pdfs() takes no arguments, so it
                # presumably re-reads every PDF in the processor's directory
                # on each click, and the chunks are added to the store again
                # below — verify whether this duplicates stored entries.
                with st.spinner("📖 Processing PDFs..."):
                    documents = processor.process_all_pdfs()
                    all_chunks = [
                        chunk
                        for doc_content in documents
                        for chunk in prepare_documents_for_embedding(doc_content)
                    ]

                st.session_state.extracted_content = documents
                st.session_state.documents_processed = len(documents)
                st.success(f"✅ Processed {len(documents)} PDF(s), {len(all_chunks)} chunks")

                if not all_chunks:
                    # Nothing to index; skip the store rather than adding an
                    # empty batch.
                    st.warning("⚠️ No content could be extracted from the PDFs.")
                else:
                    with st.spinner("🔗 Creating vector store..."):
                        embeddings = init_embeddings()
                        st.session_state.embeddings = embeddings

                        vector_store = init_vector_store(embeddings)
                        st.session_state.vector_store = vector_store

                        # Each chunk is a (text, metadata) pair.
                        docs_for_store = [
                            Document(page_content=text, metadata=meta)
                            for text, meta in all_chunks
                        ]
                        vector_store.add_documents(docs_for_store)

                    if api_key:
                        retriever = vector_store.get_retriever()
                        st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)
                        st.success("✅ Ready to query!")
                    else:
                        st.warning(
                            "⚠️ Enter an OpenAI API key and process the PDFs "
                            "again to enable querying."
                        )
            except Exception as exc:
                logging.exception("PDF processing pipeline failed")
                st.error(f"❌ Error: {exc}")

    st.markdown("---")

    # Status
    st.markdown("### 📊 Status")

    if st.session_state.documents_processed > 0:
        st.metric("Documents Processed", st.session_state.documents_processed)

        # Reuse the shared helper instead of re-implementing the aggregation.
        per_doc_stats = [
            get_document_stats(doc)
            for doc in st.session_state.extracted_content
        ]
        st.metric("Total Pages", sum(s["pages"] for s in per_doc_stats))
        st.metric("Total Images", sum(s["images"] for s in per_doc_stats))
    else:
        st.info("Upload and process PDFs to get started")

# ============================================================================
# MAIN CONTENT AREA - TABS
# ============================================================================

# Main content: while no document has been processed only a hint is shown;
# otherwise the four tabs are created here and filled by the indented
# `with tabN:` sections that follow inside this else-branch.
if st.session_state.documents_processed == 0:
    st.warning("👈 Upload PDFs in the sidebar to get started")
else:
    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Query", "📊 Documents", "🖼️ Images", "ℹ️ Info"])
    
    # ====================================================================
    # TAB 1: QUERY
    # ====================================================================
    
    with tab1:
        st.header("🔍 Ask Questions")
        st.markdown("Ask questions about your PDF documents.")

        if st.session_state.rag_chain is None:
            st.warning("⚠️ Please process PDFs first using the sidebar.")
        else:
            col1, col2 = st.columns([5, 1])

            with col1:
                user_query = st.text_input(
                    "Your question:",
                    placeholder="What is this document about?",
                    label_visibility="collapsed"
                )

            with col2:
                search_button = st.button("🔍 Search", use_container_width=True)

            # Ignore whitespace-only input so no blank query is sent.
            query_text = user_query.strip()

            if search_button and query_text:
                with st.spinner("🤖 Searching and generating response..."):
                    try:
                        result = st.session_state.rag_chain.query(query_text)

                        # Display answer
                        st.markdown("### 📝 Answer")
                        st.markdown(result["answer"])

                        # Display sources
                        if result["sources"]:
                            st.markdown("### 📚 Sources")
                            for i, source in enumerate(result["sources"], 1):
                                metadata = source["metadata"]
                                content = source["content"]
                                # Mark the excerpt as shortened only when it
                                # really was cut at 500 characters.
                                ellipsis = "..." if len(content) > 500 else ""

                                with st.expander(f"Source {i} - {metadata.get('filename', 'Unknown')}"):
                                    st.markdown(f"**Type:** {metadata.get('type', 'Unknown')}")
                                    st.markdown(f"**Page:** {metadata.get('page', 'Unknown')}")
                                    st.markdown(f"**Content:** {content[:500]}{ellipsis}")

                    except Exception as e:
                        # UI boundary: keep the traceback in the log and show
                        # a short message to the user.
                        logging.exception("RAG query failed")
                        st.error(f"❌ Error: {str(e)}")
    
    # ====================================================================
    # TAB 2: DOCUMENTS
    # ====================================================================
    
    with tab2:
        st.header("📊 Processed Documents")

        extracted_docs = st.session_state.extracted_content

        if not extracted_docs:
            st.info("No documents processed yet.")
        else:
            # Compute each document's statistics once with the shared helper
            # and reuse them for both the totals and the per-document details.
            per_doc_stats = [get_document_stats(doc) for doc in extracted_docs]

            # Overall statistics
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Documents", len(extracted_docs))

            with col2:
                st.metric("Pages", sum(s["pages"] for s in per_doc_stats))

            with col3:
                st.metric("Images", sum(s["images"] for s in per_doc_stats))

            with col4:
                st.metric("Tables", sum(s["tables"] for s in per_doc_stats))

            st.markdown("---")

            # Document details
            st.markdown("### 📄 Document Details")

            for idx, (doc, stats) in enumerate(zip(extracted_docs, per_doc_stats), 1):
                filename = doc.get("filename", f"Document {idx}")

                with st.expander(f"📑 {filename}"):
                    col1, col2, col3, col4 = st.columns(4)

                    with col1:
                        st.metric("Pages", stats["pages"])
                    with col2:
                        st.metric("Images", stats["images"])
                    with col3:
                        st.metric("Tables", stats["tables"])
                    with col4:
                        st.metric("Text (KB)", round(stats["total_text"] / 1024, 1))

                    # Preview pages
                    st.markdown("#### First 3 Pages Preview:")
                    for page in (doc.get("pages") or [])[:3]:
                        page_num = page.get("page_number")
                        # A missing or None text entry is previewed as empty.
                        full_text = page.get("text") or ""
                        # Show the ellipsis only when the text was really
                        # cut at 200 characters.
                        ellipsis = "..." if len(full_text) > 200 else ""
                        st.write(f"**Page {page_num}:** {full_text[:200]}{ellipsis}")
    
    # ====================================================================
    # TAB 3: IMAGES
    # ====================================================================
    
    with tab3:
        st.header("🖼️ Extracted Images")

        extracted_docs = st.session_state.extracted_content

        if extracted_docs:
            # Number of images that were actually rendered.
            shown = 0

            for doc_number, document in enumerate(extracted_docs, 1):
                doc_label = document.get("filename", f"Document {doc_number}")

                for page in document.get("pages", []):
                    page_images = page.get("images", [])
                    if not page_images:
                        continue

                    st.markdown(f"### 📄 {doc_label} - Page {page.get('page_number')}")

                    # At most two columns; images alternate between them.
                    columns = st.columns(min(len(page_images), 2))

                    for position, picture in enumerate(page_images):
                        with columns[position % 2]:
                            encoded = picture.get("base64")
                            if not encoded:
                                st.warning("No image data available")
                                continue

                            # Render from an inline data URI built from the
                            # stored format and base64 payload.
                            try:
                                st.image(
                                    f"data:image/{picture.get('format', 'png')};base64,{encoded}",
                                    caption=f"Image {picture.get('index')}",
                                    use_column_width=True
                                )
                            except Exception as err:
                                st.warning(f"Could not display image: {err}")
                            else:
                                shown += 1

            if shown == 0:
                st.info("No images were successfully extracted from the PDFs.")
        else:
            st.info("No images extracted yet.")
    
    # ====================================================================
    # TAB 4: INFO
    # ====================================================================
    
    with tab4:
        st.header("ℹ️ System Information")

        st.markdown("### 🎯 Features")

        # (label, description) pairs, rendered one per line in this order.
        feature_rows = (
            ("✅ PDF Upload", "Upload multiple PDFs via UI"),
            ("✅ Text Extraction", "Extract text from documents"),
            ("✅ Table Detection", "Identify and extract tables"),
            ("✅ Image Extraction", "Extract and display images"),
            ("✅ CLIP Embeddings", "Multimodal embeddings"),
            ("✅ Vector Store", "ChromaDB for similarity search"),
            ("✅ RAG Chain", "LangChain with OpenAI"),
            ("✅ Russian Support", "Queries answered in Russian"),
        )
        for label, summary in feature_rows:
            st.markdown(f"**{label}** - {summary}")

        st.markdown("---")

        st.markdown("### 📦 System Status")

        # One column per component: (session value, ready text, missing text).
        status_rows = (
            (st.session_state.processor, "✅ Processor Ready", "⚠️ Processor Not Initialized"),
            (st.session_state.embeddings, "✅ Embeddings Ready", "⚠️ Embeddings Not Initialized"),
            (st.session_state.rag_chain, "✅ RAG Chain Ready", "⚠️ RAG Chain Not Initialized"),
        )
        for column, (component, ready_text, missing_text) in zip(st.columns(3), status_rows):
            with column:
                if component:
                    st.success(ready_text)
                else:
                    st.warning(missing_text)

        st.markdown("---")

        st.markdown("### 🚀 How It Works")

        how_it_works = """
        1. **Upload**: Select one or more PDF files
        2. **Process**: System extracts text, tables, and images
        3. **Embed**: Content converted to multimodal embeddings
        4. **Store**: Vectors stored in ChromaDB
        5. **Query**: Ask questions about documents
        6. **Retrieve**: Relevant content fetched from store
        7. **Generate**: OpenAI creates response
        8. **Display**: Answer and sources shown in UI
        """
        st.markdown(how_it_works)

        st.markdown("---")

        st.markdown("### 🔗 Technology Stack")

        # (area, technology) pairs, rendered one per line in this order.
        stack_rows = (
            ("PDF Processing", "PyMuPDF, pdfplumber"),
            ("Embeddings", "CLIP ViT-B-32 (open-clip-torch)"),
            ("Vector Store", "ChromaDB"),
            ("LLM Framework", "LangChain"),
            ("Language Model", "OpenAI GPT-4o-mini"),
            ("Web UI", "Streamlit"),
        )
        for area, technology in stack_rows:
            st.write(f"**{area}:** {technology}")

# ============================================================================
# FOOTER
# ============================================================================

# Horizontal rule followed by a centered, gray attribution line.
st.markdown("---")

_FOOTER_HTML = "".join([
    "<div style='text-align: center; color: gray; font-size: 0.8rem;'>",
    "Multimodal RAG LLM System | Powered by LangChain, ChromaDB, CLIP, and OpenAI",
    "</div>",
])
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)