Spaces:

dnj0
/

project

Sleeping

App Files Files Community

dnj0 committed on Nov 16, 2025

Commit

16691ee

verified ·

1 Parent(s): 8099442

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +327 -195

src/app.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import streamlit as st
 import os
 from pathlib import Path
 from pdf_parser import PDFParser
 from embedder import ChromaDBManager
 from rag_pipeline import RAGPipeline
-import torch
 # ============================================================================
@@ -27,16 +28,25 @@ st.markdown("""
 .main {
     padding: 2rem;
 }
-.error-box {
-    background-color: #ffcccc;
-    border: 1px solid #ff0000;
     border-radius: 4px;
     padding: 10px;
     margin: 10px 0;
 }
-.warning-box {
-    background-color: #ffffcc;
-    border: 1px solid #ffcc00;
     border-radius: 4px;
     padding: 10px;
     margin: 10px 0;
@@ -61,6 +71,7 @@ def initialize_system():
         st.error(f"Error initializing system: {e}")
         return None, None, None, None
 # Initialize
 pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()
@@ -68,214 +79,233 @@ if pdf_parser is None:
     st.error("Failed to initialize RAG system. Please check your installation.")
     st.stop()
 # ============================================================================
 # MAIN UI
 # ============================================================================
-st.title("📄 Multimodal PDF RAG System (Improved)")
 st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
-st.markdown("*Fixes: Better error handling, token management, robust processing*")
-# Sidebar
-with st.sidebar:
-    st.header("⚙️ Configuration")
-    # PDF directory
-    pdf_dir = st.text_input(
-        "PDF Directory Path",
-        value="./pdf_documents",
-        help="Directory containing PDF files to process"
-    )
-    # Create directory if it doesn't exist
-    os.makedirs(pdf_dir, exist_ok=True)
-    st.divider()
-    # Load/Refresh documents
-    col1, col2 = st.columns(2)
-    with col1:
-        if st.button("📁 Load PDFs", use_container_width=True):
-            with st.spinner("Processing PDFs..."):
-                try:
-                    documents = pdf_parser.process_pdf_directory(pdf_dir)
-                    if documents:
-                        chroma_manager.add_documents(documents)
-                        st.success(f"✅ Loaded {len(documents)} documents!")
-                    else:
-                        st.warning("⚠️ No PDFs found in directory")
-                except Exception as e:
-                    st.error(f"❌ Error loading PDFs: {e}")
-    with col2:
-        if st.button("🔄 Refresh", use_container_width=True):
-            st.rerun()
-    st.divider()
-    # Statistics
-    st.subheader("📊 Statistics")
-    try:
-        collection_info = chroma_manager.get_collection_info()
-        st.metric("Documents in DB", collection_info['document_count'])
-    except Exception as e:
-        st.warning(f"Could not load statistics: {e}")
-    st.divider()
-    # Device info
-    device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
-    st.info(f"Running on: {device_name}")
-# Main content with tabs
-tab1, tab2, tab3, tab4 = st.tabs(["🔍 Ask Question", "📝 Document Summary", "ℹ️ About", "🛠️ Database"])
 # ============================================================================
-# TAB 1: ASK QUESTIONS
 # ============================================================================
-with tab1:
-    st.header("🔍 Ask Questions About Your Documents")
     col1, col2 = st.columns([3, 1])
     with col1:
-        query = st.text_input(
-            "Enter your question (in Russian or English):",
-            placeholder="Например: Какие ключевые моменты описаны в документе?",
-            help="Ask any question about your uploaded documents"
         )
     with col2:
-        n_docs = st.number_input("Retrieved docs:", value=5, min_value=1, max_value=10)
-    if st.button("🚀 Get Answer", use_container_width=True, type="primary"):
-        try:
-            collection_info = chroma_manager.get_collection_info()
-            if collection_info['document_count'] == 0:
-                st.warning("⚠️ No documents loaded. Please load PDFs from the sidebar first.")
-            elif not query:
-                st.warning("⚠️ Please enter a question.")
-            else:
-                with st.spinner("🤖 Generating answer... (this may take 10-60 seconds)"):
-                    result = rag_pipeline.answer_question(
-                        query=query,
-                        n_retrieved=n_docs,
-                        max_new_tokens=512
-                    )
-                    # Check for errors
-                    if "error" in result and result["error"]:
-                        st.error(f"⚠️ {result['error']}")
-                    # Display answer
-                    st.success("✅ Answer Generated")
-                    st.markdown("### Answer")
-                    st.write(result['answer'])
-                    # Display retrieved documents
-                    with st.expander("📚 Retrieved Documents", expanded=False):
-                        st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
-                        for idx, doc in enumerate(result['retrieved_docs'], 1):
-                            with st.container():
-                                col_rel, col_meta = st.columns([3, 1])
-                                with col_rel:
-                                    st.markdown(f"**Document {idx}**")
-                                with col_meta:
-                                    st.caption(f"Relevance: {doc['relevance_score']:.2%}")
-                                # Truncate for display
-                                preview = doc['document'][:300] + "..." if len(doc['document']) > 300 else doc['document']
-                                st.write(preview)
-                                if doc['metadata']:
-                                    st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")
-        except Exception as e:
-            st.error(f"❌ Error processing question: {e}")
-# ============================================================================
-# TAB 2: DOCUMENT SUMMARY
-# ============================================================================
-with tab2:
-    st.header("📝 Document Summary")
-    st.markdown("Generate a summary of all indexed documents")
-    if st.button("📊 Generate Summary of All Documents", use_container_width=True, type="primary"):
-        try:
             collection_info = chroma_manager.get_collection_info()
-            if collection_info['document_count'] == 0:
-                st.warning("⚠️ No documents loaded. Please load PDFs first.")
-            else:
-                with st.spinner("🤖 Generating summary... (this may take 20-60 seconds)"):
-                    summary = rag_pipeline.summarize_all_documents()
-                    st.markdown("### Summary")
-                    st.write(summary)
-        except Exception as e:
-            st.error(f"❌ Error generating summary: {e}")
 # ============================================================================
-# TAB 3: ABOUT
 # ============================================================================
-with tab3:
-    st.header("ℹ️ About This System")
-    st.markdown("""
-    ### Overview
-    This is an **improved Local Multimodal RAG System** with enhanced error handling and token management.
-    ### Key Improvements (Fixed Version)
-    ✅ **Token Management**: Automatic context truncation to prevent model errors
-    ✅ **Error Handling**: Comprehensive try-catch blocks throughout
-    ✅ **Image Extraction**: Fixed PyMuPDF xref handling
-    ✅ **Better Limits**: Resource limits on text, tables, and images
-    ✅ **Performance**: Optimized for large PDFs (400+ pages)
-    ✅ **Robustness**: Graceful degradation on errors
-    ### Core Features
-    - **📄 PDF Processing**: Text, tables, and images extraction
-    - **🔍 Vector Search**: ChromaDB with CLIP embeddings
-    - **🤖 AI Generation**: Qwen2.5-VL-3B model
-    - **🌐 Russian Support**: Full support for Russian language
-    - **💾 Persistent Storage**: Local ChromaDB database
-    - **⚡ Lightweight**: Runs on consumer hardware
-    ### Technology Stack
-    - **LLM Model**: Qwen2.5-VL-3B-Instruct
-    - **Embeddings**: CLIP (clip-vit-base-patch32)
-    - **Vector DB**: ChromaDB with persistent storage
-    - **UI**: Streamlit
-    - **PDF Tools**: pdfplumber + PyMuPDF
-    ### System Requirements
-    - Python 3.9+
-    - RAM: 8GB minimum (12GB+ recommended)
-    - Storage: 15GB for models
-    - GPU optional (CUDA for faster inference)
-    ### Performance
-    - Model Load: ~30 seconds
-    - Query Response (CPU): 20-60 seconds
-    - Query Response (GPU): 5-15 seconds
-    - PDF Processing: 1-2 seconds per page
-    ### What's Fixed
-    - ✅ Token limit errors (uses chunking + truncation)
-    - ✅ Image extraction errors (proper xref handling)
-    - ✅ Memory issues (resource limits on text/tables/images)
-    - ✅ PyTorch GPU loading (fbgemm.dll issues)
-    - ✅ Error reporting (detailed error messages)
-    """)
 # ============================================================================
-# TAB 4: DATABASE MANAGEMENT
 # ============================================================================
-with tab4:
     st.header("🛠️ Database Management")
     col1, col2, col3 = st.columns(3)
@@ -294,10 +324,17 @@ with tab4:
                 all_docs = chroma_manager.collection.get(include=['documents'])
                 if all_docs['ids']:
                     st.write(f"Total documents: {len(all_docs['ids'])}")
-                    for idx, doc_id in enumerate(all_docs['ids'][:15], 1):
-                        st.write(f"{idx}. {doc_id}")
-                    if len(all_docs['ids']) > 15:
-                        st.write(f"... and {len(all_docs['ids']) - 15} more")
                 else:
                     st.info("No documents in database")
             except Exception as e:
@@ -318,14 +355,109 @@ with tab4:
     st.divider()
-    st.markdown("### Quick Stats")
-    stats_col1, stats_col2 = st.columns(2)
-    with stats_col1:
-        st.metric("PDF Extraction Dir", "./pdf_extractions")
-    with stats_col2:
-        st.metric("ChromaDB Location", "./chroma_db")
 # ============================================================================
 # FOOTER
@@ -334,6 +466,6 @@ with tab4:
 st.divider()
 st.markdown("""
 <div style='text-align: center; color: #666; font-size: 0.9rem;'>
-    Multimodal RAG System (Improved) | Qwen2.5-VL + ChromaDB + Streamlit | v1.1
 </div>
 """, unsafe_allow_html=True)

 import streamlit as st
 import os
+import tempfile
 from pathlib import Path
+import torch
 from pdf_parser import PDFParser
 from embedder import ChromaDBManager
 from rag_pipeline import RAGPipeline
 # ============================================================================
 .main {
     padding: 2rem;
 }
+.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
+    font-size: 1.2rem;
+}
+.upload-area {
+    border: 2px dashed #ccc;
+    border-radius: 5px;
+    padding: 20px;
+    text-align: center;
+}
+.success-box {
+    background-color: #d4edda;
+    border: 1px solid #28a745;
     border-radius: 4px;
     padding: 10px;
     margin: 10px 0;
 }
+.error-box {
+    background-color: #f8d7da;
+    border: 1px solid #f5c6cb;
     border-radius: 4px;
     padding: 10px;
     margin: 10px 0;
         st.error(f"Error initializing system: {e}")
         return None, None, None, None
 # Initialize
 pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()
     st.error("Failed to initialize RAG system. Please check your installation.")
     st.stop()
+# Initialize session state for uploaded files
+if 'uploaded_files' not in st.session_state:
+    st.session_state.uploaded_files = []
+if 'processing_status' not in st.session_state:
+    st.session_state.processing_status = {}
 # ============================================================================
 # MAIN UI
 # ============================================================================
+st.title("📄 Multimodal PDF RAG System")
 st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
+st.markdown("*Upload PDFs directly and ask questions about them*")
+# Create main tabs
+tab_upload, tab_query, tab_manage, tab_about = st.tabs(["📤 Upload PDFs", "🔍 Ask Questions", "🛠️ Manage", "ℹ️ About"])
 # ============================================================================
+# TAB 1: UPLOAD PDFs
 # ============================================================================
+with tab_upload:
+    st.header("📤 Upload PDF Documents")
     col1, col2 = st.columns([3, 1])
     with col1:
+        st.markdown("**Upload your PDF files below. They will be automatically processed and stored.**")
+        # File uploader
+        uploaded_files = st.file_uploader(
+            "Choose PDF files",
+            type=["pdf"],
+            accept_multiple_files=True,
+            help="You can upload multiple PDF files at once"
         )
     with col2:
+        st.info(f"📊 Documents in DB: {chroma_manager.get_collection_info()['document_count']}")
+    # Process uploaded files
+    if uploaded_files:
+        st.divider()
+        st.subheader("Processing Uploaded Files")
+        # Create a temporary directory for uploads
+        temp_dir = tempfile.mkdtemp()
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        results_container = st.container()
+        total_files = len(uploaded_files)
+        processed_files = []
+        failed_files = []
+        for idx, uploaded_file in enumerate(uploaded_files):
+            try:
+                # Update progress
+                status_text.text(f"Processing {idx + 1}/{total_files}: {uploaded_file.name}")
+                # Save uploaded file to temp directory
+                temp_file_path = os.path.join(temp_dir, uploaded_file.name)
+                with open(temp_file_path, "wb") as f:
+                    f.write(uploaded_file.getbuffer())
+                # Process PDF
+                with st.spinner(f"Extracting content from {uploaded_file.name}..."):
+                    try:
+                        result = pdf_parser.process_pdf(temp_file_path)
+                        # Add to ChromaDB
+                        chroma_manager.add_documents([result])
+                        processed_files.append({
+                            'name': uploaded_file.name,
+                            'size': uploaded_file.size,
+                            'text_length': len(result.get('text', '')),
+                            'tables': len(result.get('tables', [])),
+                            'images': len(result.get('images', []))
+                        })
+                        st.success(f"✅ {uploaded_file.name} processed successfully")
+                    except Exception as e:
+                        failed_files.append({
+                            'name': uploaded_file.name,
+                            'error': str(e)
+                        })
+                        st.error(f"❌ Error processing {uploaded_file.name}: {e}")
+                # Update progress
+                progress_bar.progress((idx + 1) / total_files)
+            except Exception as e:
+                failed_files.append({
+                    'name': uploaded_file.name,
+                    'error': str(e)
+                })
+                st.error(f"❌ Error with {uploaded_file.name}: {e}")
+        # Show summary
+        st.divider()
+        st.subheader("Upload Summary")
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Successfully Processed", len(processed_files))
+        with col2:
+            st.metric("Failed", len(failed_files))
+        with col3:
             collection_info = chroma_manager.get_collection_info()
+            st.metric("Total in Database", collection_info['document_count'])
+        # Show details of processed files
+        if processed_files:
+            st.markdown("#### ✅ Processed Files:")
+            for file_info in processed_files:
+                col1, col2, col3, col4 = st.columns(4)
+                with col1:
+                    st.text(file_info['name'])
+                with col2:
+                    st.text(f"{file_info['size'] / 1024:.1f} KB")
+                with col3:
+                    st.text(f"{file_info['text_length']:,} chars")
+                with col4:
+                    st.text(f"{file_info['tables']} tables, {file_info['images']} imgs")
+        # Show failed files
+        if failed_files:
+            st.markdown("#### ❌ Failed Files:")
+            for file_info in failed_files:
+                st.error(f"**{file_info['name']}**: {file_info['error']}")
 # ============================================================================
+# TAB 2: ASK QUESTIONS
 # ============================================================================
+with tab_query:
+    st.header("🔍 Ask Questions About Your Documents")
+    collection_info = chroma_manager.get_collection_info()
+    if collection_info['document_count'] == 0:
+        st.warning("⚠️ No documents uploaded yet. Please upload PDFs in the 'Upload PDFs' tab first.")
+    else:
+        st.success(f"✅ {collection_info['document_count']} documents in database")
+        col1, col2, col3 = st.columns([2, 1, 1])
+        with col1:
+            query = st.text_input(
+                "Enter your question:",
+                placeholder="Например: Какие ключевые моменты описаны в документе?",
+                help="Ask any question about your uploaded documents"
+            )
+        with col2:
+            n_docs = st.number_input("Retrieved docs:", value=3, min_value=1, max_value=10)
+        with col3:
+            max_tokens = st.number_input("Max tokens:", value=256, min_value=128, max_value=512, step=128)
+        if st.button("🚀 Get Answer", use_container_width=True, type="primary"):
+            if not query:
+                st.warning("⚠️ Please enter a question.")
+            else:
+                try:
+                    with st.spinner("🤖 Generating answer... (this may take 10-30 seconds)"):
+                        st.info("Processing query - please wait...")
+                        # Generate answer with error handling
+                        try:
+                            result = rag_pipeline.answer_question(
+                                query=query,
+                                n_retrieved=n_docs,
+                                max_new_tokens=max_tokens
+                            )
+                            # Check for errors
+                            if "error" in result and result["error"]:
+                                st.error(f"⚠️ {result['error']}")
+                            # Display answer
+                            st.success("✅ Answer Generated")
+                            st.markdown("### Answer")
+                            st.write(result['answer'])
+                            # Display retrieved documents
+                            with st.expander("📚 Retrieved Documents", expanded=False):
+                                st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
+                                for idx, doc in enumerate(result['retrieved_docs'], 1):
+                                    with st.container():
+                                        col_rel, col_score = st.columns([3, 1])
+                                        with col_rel:
+                                            st.markdown(f"**Document {idx}**")
+                                        with col_score:
+                                            st.caption(f"Score: {doc['relevance_score']:.1%}")
+                                        # Truncate for display
+                                        preview = doc['document'][:400]
+                                        if len(doc['document']) > 400:
+                                            preview += "..."
+                                        st.write(preview)
+                                        if doc['metadata']:
+                                            st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")
+                        except Exception as e:
+                            st.error(f"❌ Error during generation: {e}")
+                            st.info("Possible causes:")
+                            st.write("- Out of memory (try reducing 'Max tokens' or 'Retrieved docs')")
+                            st.write("- Model inference timeout")
+                            st.write("- Invalid input format")
+                except Exception as e:
+                    st.error(f"❌ Unexpected error: {e}")
 # ============================================================================
+# TAB 3: MANAGE DATABASE
 # ============================================================================
+with tab_manage:
     st.header("🛠️ Database Management")
     col1, col2, col3 = st.columns(3)
                 all_docs = chroma_manager.collection.get(include=['documents'])
                 if all_docs['ids']:
                     st.write(f"Total documents: {len(all_docs['ids'])}")
+                    col1_list, col2_list = st.columns(2)
+                    with col1_list:
+                        st.write("**First 10:**")
+                        for idx, doc_id in enumerate(all_docs['ids'][:10], 1):
+                            st.write(f"{idx}. {doc_id[:50]}...")
+                    with col2_list:
+                        if len(all_docs['ids']) > 10:
+                            st.write(f"**... and {len(all_docs['ids']) - 10} more**")
                 else:
                     st.info("No documents in database")
             except Exception as e:
     st.divider()
+    st.markdown("### Storage Information")
+    col1, col2 = st.columns(2)
+    with col1:
+        extraction_size = sum(
+            os.path.getsize(os.path.join(dirpath, filename))
+            for dirpath, dirnames, filenames in os.walk("./pdf_extractions")
+            for filename in filenames
+        ) / (1024 * 1024) if os.path.exists("./pdf_extractions") else 0
+        st.metric("PDF Extractions", f"{extraction_size:.1f} MB")
+    with col2:
+        chroma_size = sum(
+            os.path.getsize(os.path.join(dirpath, filename))
+            for dirpath, dirnames, filenames in os.walk("./chroma_db")
+            for filename in filenames
+        ) / (1024 * 1024) if os.path.exists("./chroma_db") else 0
+        st.metric("ChromaDB Storage", f"{chroma_size:.1f} MB")
+# ============================================================================
+# TAB 4: ABOUT
+# ============================================================================
+with tab_about:
+    st.header("ℹ️ About This System")
+    st.markdown("""
+    ### Multimodal RAG System with PDF Upload
+    This is a **local, privacy-first AI document analysis system** that allows you to:
+    #### ✨ Features
+    - **📤 Easy PDF Upload**: Drag & drop or select multiple PDF files
+    - **🔍 Smart Search**: Semantic search across documents with CLIP embeddings
+    - **🤖 AI-Powered Answers**: Ask questions and get answers from Qwen2.5-VL-3B
+    - **🌐 Russian & English**: Full support for both languages
+    - **💾 Local Storage**: All data stays on your machine
+    - **⚡ Fast Processing**: Automatic caching to avoid re-processing
+    #### 🏗️ How It Works
+    1. Upload PDF documents
+    2. System extracts text, tables, and images
+    3. Content is embedded with CLIP and stored in ChromaDB
+    4. Ask questions about your documents
+    5. AI retrieves relevant sections and generates answers
+    #### 🔐 Privacy & Security
+    - ✅ All processing happens locally
+    - ✅ No internet required (after model download)
+    - ✅ No cloud APIs used
+    - ✅ Full data control
+    - ✅ Open-source code
+    #### 💻 Technology Stack
+    - **LLM**: Qwen2.5-VL-3B (multimodal)
+    - **Embeddings**: CLIP (clip-vit-base-patch32)
+    - **Vector DB**: ChromaDB
+    - **UI**: Streamlit
+    - **PDF Processing**: pdfplumber + PyMuPDF
+    #### 📊 System Info
+    """)
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
+        st.metric("Device", device_name)
+    with col2:
+        collection_info = chroma_manager.get_collection_info()
+        st.metric("Documents in DB", collection_info['document_count'])
+    with col3:
+        st.metric("Version", "1.2 (Upload)")
+    st.divider()
+    st.markdown("""
+    #### 📝 How to Use
+    1. **Upload PDFs**: Go to the "Upload PDFs" tab and select your files
+    2. **Wait for Processing**: System automatically extracts content
+    3. **Ask Questions**: Switch to "Ask Questions" tab and type your query
+    4. **Review Results**: See generated answers and relevant document chunks
+    5. **Manage**: Use "Manage" tab to view or clear database
+    #### ⚙️ Tips for Best Results
+    - Start with smaller PDFs to test
+    - Ask specific questions for better answers
+    - Reduce "Retrieved docs" if responses are slow
+    - Use Russian for Russian documents (better accuracy)
+    #### 🔧 Performance Tuning
+    - **Slow responses**: Reduce "Max tokens" from 512 to 256
+    - **Out of memory**: Use fewer "Retrieved docs" (1-3)
+    - **Better quality**: Increase "Max tokens" to 512
+    #### ❓ Troubleshooting
+    - **App closes**: Reduce "Max tokens" and "Retrieved docs"
+    - **Slow processing**: First upload takes time (model loading)
+    - **Memory issues**: Use CPU mode (edit in sidebar)
+    """)
 # ============================================================================
 # FOOTER
 st.divider()
 st.markdown("""
 <div style='text-align: center; color: #666; font-size: 0.9rem;'>
+    Multimodal RAG System with PDF Upload | Qwen2.5-VL + ChromaDB + Streamlit | v1.2
 </div>
 """, unsafe_allow_html=True)