dnj0 commited on
Commit
8099442
·
verified ·
1 Parent(s): de47ca3

Upload 4 files

Browse files
Files changed (4) hide show
  1. src/app.py +339 -0
  2. src/embedder.py +126 -0
  3. src/pdf_parser.py +257 -0
  4. src/rag_pipeline.py +417 -0
src/app.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from pathlib import Path
4
+ from pdf_parser import PDFParser
5
+ from embedder import ChromaDBManager
6
+ from rag_pipeline import RAGPipeline
7
+ import torch
8
+
9
+
10
# ============================================================================
# PAGE CONFIGURATION
# ============================================================================

st.set_page_config(
    page_title="Multimodal PDF RAG System",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# ============================================================================
# CUSTOM STYLING
# ============================================================================

# NOTE(review): .error-box / .warning-box are defined here but no st.markdown
# call in this file references them — confirm they are used elsewhere or drop.
st.markdown("""
<style>
.main {
    padding: 2rem;
}
.error-box {
    background-color: #ffcccc;
    border: 1px solid #ff0000;
    border-radius: 4px;
    padding: 10px;
    margin: 10px 0;
}
.warning-box {
    background-color: #ffffcc;
    border: 1px solid #ffcc00;
    border-radius: 4px;
    padding: 10px;
    margin: 10px 0;
}
</style>
""", unsafe_allow_html=True)
46
+
47
+ # ============================================================================
48
+ # SESSION STATE INITIALIZATION
49
+ # ============================================================================
50
+
51
@st.cache_resource
def initialize_system():
    """Build and cache the core RAG components.

    Returns:
        Tuple of (PDFParser, ChromaDBManager, RAGPipeline, device string);
        all four are None when any component fails to construct.
    """
    try:
        compute_device = "cuda" if torch.cuda.is_available() else "cpu"
        parser = PDFParser(extraction_dir="./pdf_extractions")
        vector_store = ChromaDBManager(db_dir="./chroma_db")
        pipeline = RAGPipeline(vector_store, device=compute_device)
    except Exception as e:
        # Surface the failure in the UI; callers check for None and st.stop().
        st.error(f"Error initializing system: {e}")
        return None, None, None, None
    return parser, vector_store, pipeline, compute_device
63
+
64
# Initialize core components (cached across reruns by @st.cache_resource).
pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()

if pdf_parser is None:
    st.error("Failed to initialize RAG system. Please check your installation.")
    st.stop()

# ============================================================================
# MAIN UI
# ============================================================================

st.title("📄 Multimodal PDF RAG System (Improved)")
st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
st.markdown("*Fixes: Better error handling, token management, robust processing*")

# Sidebar: directory selection, ingestion, stats, device info.
with st.sidebar:
    st.header("⚙️ Configuration")

    # PDF directory
    pdf_dir = st.text_input(
        "PDF Directory Path",
        value="./pdf_documents",
        help="Directory containing PDF files to process"
    )

    # Create directory if it doesn't exist
    os.makedirs(pdf_dir, exist_ok=True)

    st.divider()

    # Load/Refresh documents
    col1, col2 = st.columns(2)
    with col1:
        if st.button("📁 Load PDFs", use_container_width=True):
            with st.spinner("Processing PDFs..."):
                try:
                    documents = pdf_parser.process_pdf_directory(pdf_dir)

                    if documents:
                        chroma_manager.add_documents(documents)
                        st.success(f"✅ Loaded {len(documents)} documents!")
                    else:
                        st.warning("⚠️ No PDFs found in directory")
                except Exception as e:
                    st.error(f"❌ Error loading PDFs: {e}")

    with col2:
        if st.button("🔄 Refresh", use_container_width=True):
            st.rerun()

    st.divider()

    # Statistics
    st.subheader("📊 Statistics")
    try:
        collection_info = chroma_manager.get_collection_info()
        st.metric("Documents in DB", collection_info['document_count'])
    except Exception as e:
        st.warning(f"Could not load statistics: {e}")

    st.divider()

    # Device info
    device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
    st.info(f"Running on: {device_name}")

# Main content with tabs
tab1, tab2, tab3, tab4 = st.tabs(["🔍 Ask Question", "📝 Document Summary", "ℹ️ About", "🛠️ Database"])

# ============================================================================
# TAB 1: ASK QUESTIONS
# ============================================================================

with tab1:
    st.header("🔍 Ask Questions About Your Documents")

    col1, col2 = st.columns([3, 1])

    with col1:
        query = st.text_input(
            "Enter your question (in Russian or English):",
            placeholder="Например: Какие ключевые моменты описаны в документе?",
            help="Ask any question about your uploaded documents"
        )

    with col2:
        n_docs = st.number_input("Retrieved docs:", value=5, min_value=1, max_value=10)

    if st.button("🚀 Get Answer", use_container_width=True, type="primary"):
        try:
            collection_info = chroma_manager.get_collection_info()

            if collection_info['document_count'] == 0:
                st.warning("⚠️ No documents loaded. Please load PDFs from the sidebar first.")
            elif not query:
                st.warning("⚠️ Please enter a question.")
            else:
                with st.spinner("🤖 Generating answer... (this may take 10-60 seconds)"):
                    result = rag_pipeline.answer_question(
                        query=query,
                        n_retrieved=n_docs,
                        max_new_tokens=512
                    )

                # Check for errors (pipeline reports errors in-band but may
                # still return a partial answer, so display continues below).
                if "error" in result and result["error"]:
                    st.error(f"⚠️ {result['error']}")

                # Display answer
                st.success("✅ Answer Generated")
                st.markdown("### Answer")
                st.write(result['answer'])

                # Display retrieved documents
                with st.expander("📚 Retrieved Documents", expanded=False):
                    st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
                    for idx, doc in enumerate(result['retrieved_docs'], 1):
                        with st.container():
                            col_rel, col_meta = st.columns([3, 1])
                            with col_rel:
                                st.markdown(f"**Document {idx}**")
                            with col_meta:
                                st.caption(f"Relevance: {doc['relevance_score']:.2%}")

                            # Truncate for display
                            preview = doc['document'][:300] + "..." if len(doc['document']) > 300 else doc['document']
                            st.write(preview)
                            if doc['metadata']:
                                st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")

        except Exception as e:
            st.error(f"❌ Error processing question: {e}")

# ============================================================================
# TAB 2: DOCUMENT SUMMARY
# ============================================================================

with tab2:
    st.header("📝 Document Summary")
    st.markdown("Generate a summary of all indexed documents")

    if st.button("📊 Generate Summary of All Documents", use_container_width=True, type="primary"):
        try:
            collection_info = chroma_manager.get_collection_info()

            if collection_info['document_count'] == 0:
                st.warning("⚠️ No documents loaded. Please load PDFs first.")
            else:
                with st.spinner("🤖 Generating summary... (this may take 20-60 seconds)"):
                    summary = rag_pipeline.summarize_all_documents()
                st.markdown("### Summary")
                st.write(summary)
        except Exception as e:
            st.error(f"❌ Error generating summary: {e}")

# ============================================================================
# TAB 3: ABOUT
# ============================================================================

with tab3:
    st.header("ℹ️ About This System")

    st.markdown("""
    ### Overview
    This is an **improved Local Multimodal RAG System** with enhanced error handling and token management.

    ### Key Improvements (Fixed Version)
    ✅ **Token Management**: Automatic context truncation to prevent model errors
    ✅ **Error Handling**: Comprehensive try-catch blocks throughout
    ✅ **Image Extraction**: Fixed PyMuPDF xref handling
    ✅ **Better Limits**: Resource limits on text, tables, and images
    ✅ **Performance**: Optimized for large PDFs (400+ pages)
    ✅ **Robustness**: Graceful degradation on errors

    ### Core Features
    - **📄 PDF Processing**: Text, tables, and images extraction
    - **🔍 Vector Search**: ChromaDB with CLIP embeddings
    - **🤖 AI Generation**: Qwen2.5-VL-3B model
    - **🌐 Russian Support**: Full support for Russian language
    - **💾 Persistent Storage**: Local ChromaDB database
    - **⚡ Lightweight**: Runs on consumer hardware

    ### Technology Stack
    - **LLM Model**: Qwen2.5-VL-3B-Instruct
    - **Embeddings**: CLIP (clip-vit-base-patch32)
    - **Vector DB**: ChromaDB with persistent storage
    - **UI**: Streamlit
    - **PDF Tools**: pdfplumber + PyMuPDF

    ### System Requirements
    - Python 3.9+
    - RAM: 8GB minimum (12GB+ recommended)
    - Storage: 15GB for models
    - GPU optional (CUDA for faster inference)

    ### Performance
    - Model Load: ~30 seconds
    - Query Response (CPU): 20-60 seconds
    - Query Response (GPU): 5-15 seconds
    - PDF Processing: 1-2 seconds per page

    ### What's Fixed
    - ✅ Token limit errors (uses chunking + truncation)
    - ✅ Image extraction errors (proper xref handling)
    - ✅ Memory issues (resource limits on text/tables/images)
    - ✅ PyTorch GPU loading (fbgemm.dll issues)
    - ✅ Error reporting (detailed error messages)
    """)

# ============================================================================
# TAB 4: DATABASE MANAGEMENT
# ============================================================================

with tab4:
    st.header("🛠️ Database Management")

    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("ℹ️ Database Info", use_container_width=True):
            try:
                info = chroma_manager.get_collection_info()
                st.json(info)
            except Exception as e:
                st.error(f"Error: {e}")

    with col2:
        if st.button("📋 List Documents", use_container_width=True):
            try:
                all_docs = chroma_manager.collection.get(include=['documents'])
                if all_docs['ids']:
                    st.write(f"Total documents: {len(all_docs['ids'])}")
                    # Show at most the first 15 ids to keep the panel compact.
                    for idx, doc_id in enumerate(all_docs['ids'][:15], 1):
                        st.write(f"{idx}. {doc_id}")
                    if len(all_docs['ids']) > 15:
                        st.write(f"... and {len(all_docs['ids']) - 15} more")
                else:
                    st.info("No documents in database")
            except Exception as e:
                st.error(f"Error: {e}")

    with col3:
        if st.button("🗑️ Clear Database", use_container_width=True):
            try:
                collection_info = chroma_manager.get_collection_info()
                if collection_info['document_count'] > 0:
                    chroma_manager.clear_collection()
                    st.success("✅ Database cleared!")
                    st.rerun()
                else:
                    st.info("Database is already empty")
            except Exception as e:
                st.error(f"Error: {e}")

    st.divider()

    st.markdown("### Quick Stats")
    stats_col1, stats_col2 = st.columns(2)

    with stats_col1:
        st.metric("PDF Extraction Dir", "./pdf_extractions")

    with stats_col2:
        st.metric("ChromaDB Location", "./chroma_db")

# ============================================================================
# FOOTER
# ============================================================================

st.divider()
st.markdown("""
<div style='text-align: center; color: #666; font-size: 0.9rem;'>
Multimodal RAG System (Improved) | Qwen2.5-VL + ChromaDB + Streamlit | v1.1
</div>
""", unsafe_allow_html=True)
src/embedder.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================================
2
+ # STEP 2: EMBEDDER MODULE
3
+ # Generate embeddings using CLIP and store in ChromaDB
4
+ # ============================================================================
5
+
6
+ import os
7
+ import json
8
+ from typing import List, Dict, Optional
9
+ import chromadb
10
+ from chromadb import Documents, EmbeddingFunction, Embeddings
11
+ from sentence_transformers import SentenceTransformer
12
+ import numpy as np
13
+
14
+
15
class CLIPEmbeddingFunction(EmbeddingFunction):
    """ChromaDB embedding function backed by a CLIP sentence-transformers model."""

    def __init__(self, model_name: str = "sentence-transformers/clip-ViT-B-32"):
        """Load the CLIP model used for every encode call."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, input: Documents) -> Embeddings:
        """Encode the given document(s) into plain-list embedding vectors."""
        # Normalise the input to a batch: a bare string becomes a one-element
        # list, any other iterable is materialised before encoding.
        batch = [input] if isinstance(input, str) else list(input)
        return self.model.encode(batch).tolist()
30
+
31
+
32
class ChromaDBManager:
    """Manage ChromaDB vector storage with persistent data."""

    def __init__(self, db_dir: str = "./chroma_db"):
        """Initialize ChromaDB with persistent storage.

        Args:
            db_dir: Directory for the on-disk database (created if missing).
        """
        self.db_dir = db_dir
        os.makedirs(db_dir, exist_ok=True)

        # Persistent client keeps the index on disk between runs.
        self.client = chromadb.PersistentClient(path=db_dir)

        # CLIP embeddings (text tower of a multimodal model).
        self.embedding_function = CLIPEmbeddingFunction(
            model_name="sentence-transformers/clip-ViT-B-32"
        )

        # Cosine space so a distance maps cleanly to a similarity score.
        self.collection = self.client.get_or_create_collection(
            name="pdf_documents",
            embedding_function=self.embedding_function,
            metadata={"hnsw:space": "cosine"}
        )

        print(f"ChromaDB initialized. Database location: {db_dir}")

    def add_documents(self, documents: List[Dict]) -> None:
        """Add parsed PDF documents (page text + flattened tables) to ChromaDB.

        Args:
            documents: dicts produced by PDFParser.process_pdf, each with
                'filename', 'text' and 'tables' ((page_num, table_text) pairs).
        """
        if not documents:
            print("No documents to add")
            return

        doc_ids = []
        doc_texts = []
        doc_metadatas = []

        for idx, doc in enumerate(documents):
            doc_id = f"doc_{doc.get('filename', 'unknown')}_{idx}"
            # Fold table text into the same embedding document as the text.
            doc_text = doc.get('text', '') + " " + " ".join(
                [table[1] for table in doc.get('tables', [])]
            )

            doc_ids.append(doc_id)
            doc_texts.append(doc_text)
            doc_metadatas.append({
                "filename": doc.get('filename', ''),
                "page": str(doc.get('page', 0)),
                "source": "pdf"
            })

        # FIX: use upsert instead of add — re-loading the same PDF directory
        # regenerates the same deterministic ids, and add() rejects duplicates.
        self.collection.upsert(
            ids=doc_ids,
            documents=doc_texts,
            metadatas=doc_metadatas
        )

        print(f"Added {len(documents)} documents to ChromaDB")

    def search(self, query: str, n_results: int = 5) -> List[Dict]:
        """Return the n_results documents most similar to *query*.

        Each result dict carries 'document', 'distance', 'metadata' and a
        'relevance_score' (1 - cosine distance).
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )

        retrieved_docs = []
        if results['documents']:
            for doc, distance, metadata in zip(
                results['documents'][0],
                results['distances'][0],
                results['metadatas'][0]
            ):
                retrieved_docs.append({
                    'document': doc,
                    'distance': distance,
                    'metadata': metadata,
                    'relevance_score': 1 - distance  # cosine distance -> similarity
                })

        return retrieved_docs

    def get_all_documents_count(self) -> int:
        """Get total number of documents in collection."""
        return self.collection.count()

    def clear_collection(self) -> None:
        """Remove every document from the collection (for reset).

        FIX: ChromaDB rejects delete(where={}) — an empty filter is invalid —
        so fetch the ids and delete them explicitly.
        """
        existing = self.collection.get()
        if existing["ids"]:
            self.collection.delete(ids=existing["ids"])
        print("Collection cleared")

    def get_collection_info(self) -> Dict:
        """Return name, document count and metadata of the collection."""
        return {
            "name": self.collection.name,
            "document_count": self.collection.count(),
            "metadata": self.collection.metadata
        }
src/pdf_parser.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple
5
+ import pdfplumber
6
+ import fitz # PyMuPDF
7
+ from PIL import Image
8
+ import io
9
+
10
+
11
class PDFParser:
    """Parse PDF documents and extract text, tables, and images.

    Extraction results are cached as JSON per PDF, keyed by a cheap
    size+mtime hash, so unchanged files are not re-processed.
    """

    def __init__(self, extraction_dir: str = "./pdf_extractions"):
        self.extraction_dir = extraction_dir
        self.state_file = os.path.join(extraction_dir, "processing_state.json")
        os.makedirs(extraction_dir, exist_ok=True)
        self.processed_files = self._load_processing_state()

    def _load_processing_state(self) -> Dict:
        """Load state of already processed files to avoid re-processing."""
        if os.path.exists(self.state_file):
            try:
                with open(self.state_file, 'r') as f:
                    return json.load(f)
            except Exception as e:
                print(f"Warning: Could not load processing state: {e}")
                return {}
        return {}

    def _save_processing_state(self):
        """Save processing state to disk (best-effort)."""
        try:
            with open(self.state_file, 'w') as f:
                json.dump(self.processed_files, f, indent=2)
        except Exception as e:
            print(f"Warning: Could not save processing state: {e}")

    def _get_file_hash(self, pdf_path: str) -> str:
        """Cheap change-detection hash: file size + modification time.

        Returns "unknown" when the file cannot be stat'ed.
        """
        try:
            stat = os.stat(pdf_path)
            return f"{stat.st_size}_{stat.st_mtime}"
        except Exception as e:
            print(f"Error getting file hash: {e}")
            return "unknown"

    def extract_text_with_pdfplumber(self, pdf_path: str, max_chars: int = 1000000) -> str:
        """Extract text from PDF using pdfplumber (handles complex layouts).

        Caps total output at *max_chars* and each page at 50k chars to keep
        downstream token counts bounded.
        """
        text = ""
        char_count = 0
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    if char_count >= max_chars:
                        print(f"Text extraction reached maximum chars limit ({max_chars})")
                        break

                    try:
                        page_text = page.extract_text()
                        if page_text:
                            # Limit per-page text to avoid token explosion
                            page_text = page_text[:50000]
                            text += f"\n--- Page {page_num} ---\n{page_text}"
                            char_count += len(page_text)
                    except Exception as e:
                        print(f"Error extracting text from page {page_num}: {e}")
                        continue
        except Exception as e:
            print(f"Error opening PDF with pdfplumber: {e}")

        return text

    def extract_tables_from_pdf(self, pdf_path: str, max_tables: int = 50) -> List[Tuple[int, str]]:
        """Extract tables from PDF as (page_number, formatted_text) pairs."""
        tables = []
        table_count = 0
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    if table_count >= max_tables:
                        print(f"Table extraction reached maximum tables limit ({max_tables})")
                        break

                    try:
                        page_tables = page.extract_tables()
                        if page_tables:
                            for table_idx, table in enumerate(page_tables):
                                # Convert table to pipe-separated text lines.
                                table_text = f"TABLE on page {page_num}:\n"
                                for row in table:
                                    row_str = " | ".join([str(cell) if cell else "" for cell in row])
                                    # Limit row length
                                    if len(row_str) > 1000:
                                        row_str = row_str[:1000] + "..."
                                    table_text += row_str + "\n"

                                tables.append((page_num, table_text))
                                table_count += 1
                    except Exception as e:
                        print(f"Error extracting tables from page {page_num}: {e}")
                        continue
        except Exception as e:
            print(f"Error opening PDF for table extraction: {e}")

        return tables

    def extract_images_from_pdf(self, pdf_path: str, output_dir: str = None, max_images: int = 100) -> List[Tuple[int, str]]:
        """Extract embedded images using PyMuPDF.

        get_images() yields tuples (xref, smask, width, height, ...); the
        integer xref in slot 0 is what extract_image() expects.
        Returns (page_number, saved_image_path) pairs.
        """
        if output_dir is None:
            output_dir = os.path.join(self.extraction_dir, "images")

        os.makedirs(output_dir, exist_ok=True)
        images = []
        image_count = 0

        pdf_file = None
        try:
            pdf_name = Path(pdf_path).stem
            pdf_file = fitz.open(pdf_path)

            for page_num in range(len(pdf_file)):
                if image_count >= max_images:
                    print(f"Image extraction reached maximum images limit ({max_images})")
                    break

                try:
                    page = pdf_file[page_num]
                    pix_list = page.get_images()

                    for image_idx, img_info in enumerate(pix_list):
                        if image_count >= max_images:
                            break

                        try:
                            xref = img_info[0]  # xref is the first tuple slot

                            base_image = pdf_file.extract_image(xref)

                            if base_image and "image" in base_image:
                                image_bytes = base_image["image"]
                                image_ext = base_image["ext"]

                                image_name = f"{pdf_name}_page{page_num+1}_img{image_idx}.{image_ext}"
                                image_path = os.path.join(output_dir, image_name)

                                with open(image_path, "wb") as f:
                                    f.write(image_bytes)

                                images.append((page_num + 1, image_path))
                                image_count += 1

                        except Exception as e:
                            print(f"Error extracting image {image_idx} from page {page_num}: {e}")
                            continue

                except Exception as e:
                    print(f"Error processing page {page_num}: {e}")
                    continue

        except Exception as e:
            print(f"Error opening PDF for image extraction: {e}")
        finally:
            # FIX: close the document even when extraction raised mid-loop;
            # previously an exception could leak the file handle.
            if pdf_file is not None:
                pdf_file.close()

        return images

    def process_pdf(self, pdf_path: str) -> Dict:
        """Process one PDF, returning dict with text/tables/images.

        Uses the size+mtime hash to skip files already processed and serve
        the cached JSON instead.
        """
        file_hash = self._get_file_hash(pdf_path)

        # Check if already processed
        if pdf_path in self.processed_files and self.processed_files[pdf_path] == file_hash:
            print(f"File {pdf_path} already processed. Loading cached results.")
            return self._load_cached_results(pdf_path)

        print(f"Processing PDF: {pdf_path}")

        result = {
            "pdf_path": pdf_path,
            "filename": Path(pdf_path).name,
            "text": self.extract_text_with_pdfplumber(pdf_path, max_chars=1000000),
            "tables": self.extract_tables_from_pdf(pdf_path, max_tables=50),
            "images": self.extract_images_from_pdf(pdf_path, max_images=100)
        }

        # Save results to cache
        self._save_cached_results(pdf_path, result)

        # Update processing state
        self.processed_files[pdf_path] = file_hash
        self._save_processing_state()

        return result

    def _save_cached_results(self, pdf_path: str, result: Dict):
        """Save extraction results to a JSON file (image paths not persisted)."""
        safe_name = Path(pdf_path).stem
        cache_file = os.path.join(self.extraction_dir, f"{safe_name}_cache.json")

        # Don't save image paths in cache, just metadata
        cache_data = {
            "pdf_path": result["pdf_path"],
            "filename": result["filename"],
            "text": result["text"],
            "tables": result["tables"],
            "image_count": len(result["images"])
        }

        try:
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(cache_data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"Warning: Could not save cache: {e}")

    def _load_cached_results(self, pdf_path: str) -> Dict:
        """Load cached extraction results, normalising the schema.

        FIX: the on-disk cache stores 'image_count' rather than image paths,
        and a failed load previously lost 'filename'/'pdf_path'. Fill in any
        missing keys so callers always see the same shape process_pdf()
        returns for a fresh file.
        """
        safe_name = Path(pdf_path).stem
        cache_file = os.path.join(self.extraction_dir, f"{safe_name}_cache.json")

        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except Exception as e:
            print(f"Error loading cache: {e}")
            cached = {}

        cached.setdefault("pdf_path", pdf_path)
        cached.setdefault("filename", Path(pdf_path).name)
        cached.setdefault("text", "")
        cached.setdefault("tables", [])
        cached.setdefault("images", [])
        return cached

    def process_pdf_directory(self, pdf_dir: str) -> List[Dict]:
        """Process all *.pdf files in a directory; failures are skipped."""
        results = []
        pdf_files = list(Path(pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print(f"No PDF files found in {pdf_dir}")
            return results

        print(f"Found {len(pdf_files)} PDF files to process")

        for idx, pdf_file in enumerate(pdf_files, 1):
            try:
                print(f"Processing {idx}/{len(pdf_files)}: {pdf_file.name}")
                result = self.process_pdf(str(pdf_file))
                results.append(result)
            except Exception as e:
                print(f"Error processing {pdf_file}: {e}")
                continue

        print(f"Completed processing {len(results)} PDFs")
        return results
src/rag_pipeline.py ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Optional, Tuple
2
+ import torch
3
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, AutoTokenizer
4
+ from qwen_vl_utils import process_vision_info
5
+ from PIL import Image
6
+ import io
7
+
8
+
9
class TokenChunker:
    """Handle token counting and chunking for model context limits."""

    def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-3B-Instruct"):
        """Initialize tokenizer for token counting."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        # Qwen2.5-VL has max context of 131,072 tokens; stay conservative
        # and budget only 100K of it.
        self.max_tokens = 100000

    def count_tokens(self, text: str) -> int:
        """Return the token count of *text*, with a char-based fallback."""
        try:
            return len(self.tokenizer.encode(text, add_special_tokens=False))
        except Exception as e:
            print(f"Error counting tokens: {e}")
            # Rough estimate: 1 token ≈ 4 characters for English/Russian
            return len(text) // 4

    def chunk_text(self, text: str, chunk_size: int = 50000) -> List[str]:
        """Split text into chunks of at most *chunk_size* characters.

        Splits on paragraph boundaries ("\\n\\n") where possible.
        FIX: a single paragraph longer than chunk_size is now hard-split;
        previously it was emitted as one oversized chunk, defeating the limit.
        """
        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        current_chunk = ""

        for paragraph in text.split("\n\n"):
            if len(paragraph) > chunk_size:
                # Flush the accumulator, then hard-split the long paragraph.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
                for start in range(0, len(paragraph), chunk_size):
                    piece = paragraph[start:start + chunk_size]
                    if len(piece) == chunk_size:
                        chunks.append(piece)
                    else:
                        # Short tail may still merge with following paragraphs.
                        current_chunk = piece + "\n\n"
            elif len(current_chunk) + len(paragraph) < chunk_size:
                current_chunk += paragraph + "\n\n"
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = paragraph + "\n\n"

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def truncate_to_token_limit(self, text: str, token_limit: int = 50000) -> str:
        """Truncate text so its token count fits within *token_limit*."""
        current_tokens = self.count_tokens(text)

        if current_tokens <= token_limit:
            return text

        print(f"Text too long ({current_tokens} tokens). Truncating to {token_limit}...")

        # Estimate characters per token, keep 90% of the budget to be safe.
        char_per_token = len(text) / current_tokens
        target_chars = int(token_limit * char_per_token * 0.9)

        return text[:target_chars]
67
+
68
+
69
+ class Qwen25VLInferencer:
70
+ """Handle inference with Qwen2.5-VL-3B model - FIXED meta tensor issue."""
71
+
72
+ class Qwen25VLInferencer:
73
+ """Handle inference with Qwen2.5-VL-3B model - FIXED meta tensor issue."""
74
+
75
+ def __init__(self, model_name: str = "Qwen/Qwen2.5-VL-3B-Instruct", device: str = "cuda"):
76
+ """Initialize Qwen2.5-VL model with proper device handling."""
77
+ self.device = device if torch.cuda.is_available() else "cpu"
78
+ print(f"Loading Qwen2.5-VL-3B model on device: {self.device}")
79
+
80
+ try:
81
+ # FIXED: Load model without device_map first, then move to device
82
+ # This avoids the meta tensor issue
83
+
84
+ # Determine data type based on device
85
+ if self.device == "cuda":
86
+ dtype = torch.float16 # GPU: use half precision
87
+ else:
88
+ dtype = torch.float32 # CPU: use full precision
89
+
90
+ print(f"Using dtype: {dtype}")
91
+
92
+ # Load model
93
+ print("Loading model weights...")
94
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
95
+ model_name,
96
+ torch_dtype=dtype,
97
+ trust_remote_code=True,
98
+ # IMPORTANT: Don't use device_map="auto" here - causes meta tensor issue
99
+ )
100
+
101
+ # Move to device explicitly AFTER loading
102
+ print(f"Moving model to {self.device}...")
103
+ if self.device == "cuda":
104
+ self.model = self.model.to("cuda")
105
+ else:
106
+ self.model = self.model.to("cpu")
107
+
108
+ # Set to evaluation mode
109
+ self.model.eval()
110
+
111
+ print("✅ Model loaded successfully")
112
+
113
+ except RuntimeError as e:
114
+ if "meta tensor" in str(e):
115
+ print(f"⚠️ Meta tensor error detected: {e}")
116
+ print("Falling back to CPU mode...")
117
+ self.device = "cpu"
118
+
119
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
120
+ model_name,
121
+ torch_dtype=torch.float32,
122
+ trust_remote_code=True,
123
+ )
124
+ self.model = self.model.to("cpu")
125
+ self.model.eval()
126
+ print("✅ Model loaded on CPU")
127
+ else:
128
+ raise
129
+
130
+ except Exception as e:
131
+ print(f"❌ Error loading model: {e}")
132
+ print("Trying fallback CPU loading...")
133
+
134
+ self.device = "cpu"
135
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
136
+ model_name,
137
+ torch_dtype=torch.float32,
138
+ trust_remote_code=True,
139
+ )
140
+ self.model = self.model.to("cpu")
141
+ self.model.eval()
142
+
143
+ # Load processor
144
+ print("Loading processor...")
145
+ self.processor = AutoProcessor.from_pretrained(
146
+ model_name,
147
+ trust_remote_code=True
148
+ )
149
+
150
+ # Initialize token chunker
151
+ self.token_chunker = TokenChunker(model_name)
152
+
153
+ print("✅ Model initialization complete")
154
+
155
+ def _prepare_text_message(self, text: str) -> List[Dict]:
156
+ """Prepare text-only message for the model."""
157
+ return [{"type": "text", "text": text}]
158
+
159
+ def _prepare_image_text_message(self, image_path: str, text: str) -> List[Dict]:
160
+ """Prepare message with image and text."""
161
+ return [
162
+ {"type": "image", "image": image_path},
163
+ {"type": "text", "text": text}
164
+ ]
165
+
166
    def generate_answer(
        self,
        query: str,
        retrieved_docs: List[Dict],
        retrieved_images: List[str] = None,
        max_new_tokens: int = 128
    ) -> str:
        """Generate an answer to *query* grounded in the retrieved documents.

        Builds a single Russian-language prompt (system instruction + retrieved
        chunks + question), truncates it to fit the model context via the token
        chunker, runs greedy generation, and decodes only the newly generated
        tokens. Errors at each stage are caught and returned as error strings
        rather than raised.

        Args:
            query: The user's question.
            retrieved_docs: Dicts with at least a 'document' key; an optional
                'relevance_score' is rendered into the context header.
            retrieved_images: Optional image paths; only the first is used.
            max_new_tokens: Requested generation budget (capped at 512 below).

        Returns:
            The decoded model response, or an "Error ..." string on failure.
        """
        # Concatenate retrieved chunks into one context string, tagging each
        # with its relevance score for the model to weigh.
        context = "КОНТЕКСТ ИЗ ДОКУМЕНТОВ:\n"
        for doc in retrieved_docs:
            relevance = doc.get('relevance_score', 0)
            context += f"\n[Релевантность: {relevance:.2f}]\n{doc['document']}\n"

        # First-pass truncation: keep the context within ~50k tokens.
        context = self.token_chunker.truncate_to_token_limit(context, token_limit=50000)

        # System prompt (Russian): "document-analysis assistant, answer in
        # Russian, be brief and precise".
        system_prompt = "Ты помощник для анализа документов. Используй предоставленный контекст для ответа на вопросы. Отвечай на русском языке. Будь кратким и точным."

        # Full prompt = system instruction + context + question.
        full_query = f"{system_prompt}\n\n{context}\n\nВопрос: {query}\n\nОтвет:"

        # Second-pass safety check on the assembled prompt length.
        query_tokens = self.token_chunker.count_tokens(full_query)
        print(f"Query token count: {query_tokens}")

        if query_tokens > 100000:
            print(f"Query exceeds token limit. Reducing context...")
            # Rebuild the context from only the top-3 documents and truncate
            # harder (30k tokens) before re-assembling the prompt.
            context = "КОНТЕКСТ ИЗ ДОКУМЕНТОВ:\n"
            for doc in retrieved_docs[:3]:
                relevance = doc.get('relevance_score', 0)
                context += f"\n[Релевантность: {relevance:.2f}]\n{doc['document']}\n"

            context = self.token_chunker.truncate_to_token_limit(context, token_limit=30000)
            full_query = f"{system_prompt}\n\n{context}\n\nВопрос: {query}\n\nОтвет:"

        # Default message: text-only content part list.
        messages = self._prepare_text_message(full_query)

        # If an image was retrieved, prepend an image+text part before the
        # full prompt text.
        if retrieved_images and len(retrieved_images) > 0:
            try:
                image_message = self._prepare_image_text_message(
                    retrieved_images[0],
                    f"Проанализируй это изображение в контексте вопроса: {query}"
                )
                messages = image_message + [{"type": "text", "text": full_query}]
            except Exception as e:
                print(f"Warning: Could not include images: {e}")

        # Extract image/video tensors for the processor when an image part is
        # present.
        # NOTE(review): `messages` here is a flat list of content parts, while
        # qwen-vl-utils' process_vision_info appears to expect chat-format
        # messages ({"role": ..., "content": [...]}) — if so, this call fails
        # and is silently swallowed by the except below, meaning images are
        # never actually fed to the model. Confirm against qwen_vl_utils docs.
        image_inputs = []
        video_inputs = []

        try:
            if any(msg.get('type') == 'image' for msg in messages):
                image_inputs, video_inputs = process_vision_info(messages)
        except Exception as e:
            print(f"Warning: Could not process images: {e}")

        # Tokenize text (and any vision inputs) into model-ready tensors.
        try:
            inputs = self.processor(
                text=[full_query],
                images=image_inputs if image_inputs else None,
                videos=video_inputs if video_inputs else None,
                padding=True,
                return_tensors='pt',
            )
        except Exception as e:
            print(f"Error preparing inputs: {e}")
            return f"Error preparing inputs: {e}"

        # Move tensors to GPU when the model lives there.
        if self.device == "cuda":
            inputs = inputs.to("cuda")

        # Greedy decoding (no sampling, single beam), generation budget capped
        # at 512 new tokens regardless of the caller's request.
        try:
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=min(max_new_tokens, 512),  # Cap at 512
                    num_beams=1,
                    do_sample=False
                )
        except Exception as e:
            print(f"Error during generation: {e}")
            return f"Error generating response: {e}"

        # Strip the echoed prompt tokens, then decode only the new tokens.
        try:
            generated_ids_trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]

            response = self.processor.batch_decode(
                generated_ids_trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )

            return response[0] if response else "Could not generate response"
        except Exception as e:
            print(f"Error decoding response: {e}")
            return f"Error decoding response: {e}"
278
+
279
+ def summarize_document(
280
+ self,
281
+ document_text: str,
282
+ max_new_tokens: int = 512
283
+ ) -> str:
284
+ """Summarize a document with token limit management."""
285
+
286
+ # FIXED: Truncate document to fit in context
287
+ document_text = self.token_chunker.truncate_to_token_limit(
288
+ document_text,
289
+ token_limit=40000
290
+ )
291
+
292
+ prompt = f"""Пожалуйста, создай подробное резюме следующего документа на русском языке.
293
+
294
+ Документ:
295
+ {document_text}
296
+
297
+ Резюме:"""
298
+
299
+ messages = self._prepare_text_message(prompt)
300
+
301
+ try:
302
+ inputs = self.processor(
303
+ text=[prompt],
304
+ padding=True,
305
+ return_tensors='pt',
306
+ )
307
+
308
+ if self.device == "cuda":
309
+ inputs = inputs.to("cuda")
310
+
311
+ with torch.no_grad():
312
+ generated_ids = self.model.generate(
313
+ **inputs,
314
+ max_new_tokens=min(max_new_tokens, 512),
315
+ num_beams=1,
316
+ do_sample=False
317
+ )
318
+
319
+ generated_ids_trimmed = [
320
+ out_ids[len(in_ids):]
321
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
322
+ ]
323
+
324
+ response = self.processor.batch_decode(
325
+ generated_ids_trimmed,
326
+ skip_special_tokens=True,
327
+ clean_up_tokenization_spaces=False
328
+ )
329
+
330
+ return response[0] if response else "Could not generate summary"
331
+ except Exception as e:
332
+ print(f"Error generating summary: {e}")
333
+ return f"Error: {e}"
334
+
335
+
336
class RAGPipeline:
    """End-to-end RAG pipeline: vector retrieval followed by VLM answer generation."""

    def __init__(self, chroma_manager, device: str = "cuda"):
        """Wire the ChromaDB retriever to a freshly constructed Qwen2.5-VL inferencer."""
        self.chroma_manager = chroma_manager
        self.inferencer = Qwen25VLInferencer(device=device)

    def answer_question(
        self,
        query: str,
        n_retrieved: int = 5,
        max_new_tokens: int = 512
    ) -> Dict:
        """Answer a user question via retrieve-then-generate.

        Retrieves the top *n_retrieved* chunks for *query*, then asks the
        inferencer for an answer grounded in them. Returns a result dict with
        the answer, the retrieved chunks and metadata; when nothing is
        retrieved, the dict carries an "error" key instead.
        """
        hits = self.chroma_manager.search(query, n_results=n_retrieved)

        # Guard clause: nothing retrieved -> report failure without touching the model.
        if not hits:
            return {
                "answer": "Не найдены релевантные документы для ответа на вопрос.",
                "retrieved_docs": [],
                "query": query,
                "error": "No documents found",
            }

        # Image retrieval is not wired up yet; pass an empty list through.
        images = []

        try:
            reply = self.inferencer.generate_answer(
                query=query,
                retrieved_docs=hits,
                retrieved_images=images,
                max_new_tokens=max_new_tokens,
            )
        except Exception as exc:
            reply = f"Error generating answer: {exc}"

        return {
            "answer": reply,
            "retrieved_docs": hits,
            "query": query,
            "model": "Qwen2.5-VL-3B",
            "doc_count": len(hits),
        }

    def summarize_all_documents(self, max_chars: int = 100000) -> str:
        """Summarize up to 10 indexed documents, bounded by *max_chars* characters."""
        info = self.chroma_manager.get_collection_info()
        if info['document_count'] == 0:
            return "No documents in database to summarize."

        try:
            stored = self.chroma_manager.collection.get(include=['documents'])
            docs = stored['documents']
            if not docs:
                return "Could not retrieve documents for summarization."

            # Greedily concatenate documents (at most 10) while the running
            # length — separators included — stays under max_chars; stop at
            # the first document that would overflow.
            combined = ""
            for text in docs[:10]:
                if len(combined) + len(text) >= max_chars:
                    break
                combined += text + "\n\n"

            # Fallback: if even the first document was too long, take its prefix.
            if not combined:
                combined = docs[0][:max_chars]

            return self.inferencer.summarize_document(combined)
        except Exception as exc:
            return f"Error summarizing documents: {exc}"