dnj0 committed on
Commit
555c75a
·
verified ·
1 Parent(s): 44a83f9

Upload 6 files

Browse files
src/app.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import os
from pathlib import Path
from rag_pipeline import RAGPipeline
import shutil  # NOTE(review): appears unused in this file — confirm before removing

# Page configuration
st.set_page_config(
    page_title="Local Multimodal RAG",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded"
)

st.title("📚 Local Multimodal RAG System")
st.markdown("**Analyze PDF documents locally with Mistral + CLIP embeddings**")

# Initialize session state
# uploaded_files: names of PDFs saved via the sidebar uploader this session
if "uploaded_files" not in st.session_state:
    st.session_state.uploaded_files = []
# rag_pipeline: lazily built RAGPipeline instance; None until first init
if "rag_pipeline" not in st.session_state:
    st.session_state.rag_pipeline = None
# needs_reindex: set True after uploads/deletions so the index is rebuilt
if "needs_reindex" not in st.session_state:
    st.session_state.needs_reindex = False
+
26
# Sidebar configuration
with st.sidebar:
    st.header("⚙️ Configuration")

    # Where PDFs live on disk; also the target directory for uploads.
    pdf_dir = st.text_input(
        "📁 PDF Directory",
        value="./pdfs",
        help="Path to directory containing PDF files"
    )

    # Forwarded to model loading in the pipeline.
    device = st.selectbox(
        "🖥️ Device",
        ["cpu", "cuda"],
        help="Device for model inference"
    )

    # How many retrieved chunks are fed to the LLM per question/search.
    n_context_docs = st.slider(
        "📄 Context Documents",
        min_value=1,
        max_value=10,
        value=3,
        help="Number of documents to retrieve for context"
    )

    st.divider()

    # PDF Upload Section
    st.subheader("📤 Upload PDF Files")

    uploaded_pdfs = st.file_uploader(
        "Choose PDF files to upload",
        type="pdf",
        accept_multiple_files=True,
        help="Select one or more PDF files to add to the system"
    )

    if uploaded_pdfs:
        # Create PDF directory if not exists
        os.makedirs(pdf_dir, exist_ok=True)

        upload_button = st.button("⬆️ Upload PDFs", use_container_width=True)

        if upload_button:
            uploaded_count = 0
            for uploaded_file in uploaded_pdfs:
                file_path = os.path.join(pdf_dir, uploaded_file.name)

                # Save file (overwrites any existing file with the same name)
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())

                st.session_state.uploaded_files.append(uploaded_file.name)
                uploaded_count += 1

            st.success(f"✅ Uploaded {uploaded_count} PDF(s) successfully!")
            # New files mean the vector index is stale.
            st.session_state.needs_reindex = True

    st.divider()

    # Display files currently on disk (not just this session's uploads)
    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
    if pdf_files:
        st.subheader(f"📚 Documents ({len(pdf_files)})")
        for pdf_file in pdf_files:
            col1, col2 = st.columns([4, 1])
            with col1:
                st.write(f"• {pdf_file.name}")
            with col2:
                # Per-file delete; key ties the button to this filename.
                if st.button("🗑️", key=f"delete_{pdf_file.name}", help="Delete this file"):
                    os.remove(pdf_file)
                    # Deletion invalidates the index; rerun refreshes the list.
                    st.session_state.needs_reindex = True
                    st.rerun()

        st.divider()

    # Reindex button: drop the cached pipeline and force a rebuild on rerun.
    if st.button("🔄 Reload & Index PDFs", use_container_width=True):
        st.session_state.rag_pipeline = None
        st.session_state.needs_reindex = True
        st.rerun()
106
+
107
+
108
# Initialize pipeline in session state
@st.cache_resource
def init_rag_pipeline(device, pdf_dir):
    """Build (or fetch the cached) RAG pipeline for the given settings.

    Args:
        device: inference device ("cpu" or "cuda").
        pdf_dir: directory containing the PDF corpus.

    Returns:
        (pipeline, error) tuple — exactly one of the two is None.

    Fix: the parameters were previously named ``_device``/``_pdf_dir``; the
    leading underscore tells st.cache_resource NOT to hash them, so the cache
    key never changed and switching the device or PDF directory in the UI had
    no effect until a process restart. Without the underscores, each distinct
    (device, pdf_dir) pair gets its own cached pipeline.
    """
    # Create PDF directory if not exists
    os.makedirs(pdf_dir, exist_ok=True)

    # Check if PDFs exist before paying the model-loading cost
    pdf_files = list(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        return None, f"No PDF files found in {pdf_dir}. Upload PDFs using the sidebar."

    try:
        with st.spinner("⏳ Initializing RAG pipeline..."):
            pipeline = RAGPipeline(pdf_dir=pdf_dir, device=device)
        with st.spinner("⏳ Indexing PDFs..."):
            pipeline.index_pdfs()
        return pipeline, None
    except Exception as e:
        # Surface the failure to the caller instead of crashing the script run
        return None, str(e)
128
+
129
+
130
# Get or initialize pipeline
if st.session_state.rag_pipeline is None or st.session_state.needs_reindex:
    if st.session_state.needs_reindex:
        # Bug fix: init_rag_pipeline is wrapped in @st.cache_resource, so
        # calling it again would hand back the stale cached pipeline and the
        # upload/delete/reload actions would never actually reindex. Clearing
        # the cache forces a fresh parse + index.
        init_rag_pipeline.clear()
    pipeline, error = init_rag_pipeline(device, pdf_dir)
    if error:
        st.error(f"❌ Error: {error}")
        st.info("💡 **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
        st.stop()  # nothing below can run without a pipeline
    st.session_state.rag_pipeline = pipeline
    st.session_state.needs_reindex = False
else:
    # Reuse the pipeline built on a previous script run
    pipeline = st.session_state.rag_pipeline
141
+
142
# Main content
if pipeline:
    # Three workflows over the indexed corpus: ask, summarize, raw search.
    tab1, tab2, tab3 = st.tabs(["❓ Q&A", "📊 Summary", "📖 Retrieval"])

    # Tab 1: Question Answering
    with tab1:
        st.subheader("Ask Questions about Your Documents")

        question = st.text_area(
            "Your question (in Russian or English):",
            height=100,
            placeholder="What is this document about? What are the main points? Etc.",
            key="qa_question"
        )

        col1, col2 = st.columns(2)
        with col1:
            get_answer_btn = st.button("🔍 Get Answer", use_container_width=True)
        with col2:
            clear_btn = st.button("🗑️ Clear", use_container_width=True)

        if clear_btn:
            # Rerun resets this tab's transient output
            st.rerun()

        if get_answer_btn:
            if question.strip():
                with st.spinner("⏳ Retrieving documents and generating answer..."):
                    try:
                        result = pipeline.answer_question(question, n_context_docs=n_context_docs)
                    except Exception as e:
                        st.error(f"Error generating answer: {str(e)}")
                        result = None  # keep the name bound for the check below

                if result and result.get("answer"):
                    st.success("✓ Answer generated!")

                    # Display answer
                    st.subheader("📝 Answer")
                    st.write(result["answer"])

                    # Display sources
                    with st.expander("📚 Sources Used"):
                        for i, source in enumerate(result["sources"], 1):
                            st.write(f"{i}. {source}")

                    # Display stats
                    col1, col2 = st.columns(2)
                    with col1:
                        st.metric("Documents Used", result.get("context_used", 0))
                    with col2:
                        st.metric("Answer Length", len(result["answer"]))
            else:
                st.warning("Please enter a question")

    # Tab 2: Document Summary
    with tab2:
        st.subheader("Summary of Indexed Documents")

        if st.button("📊 Generate Summary", use_container_width=True):
            with st.spinner("⏳ Generating summary..."):
                try:
                    summary = pipeline.summarize_documents()
                    st.success("✓ Summary generated!")
                    st.subheader("📄 Document Summary")
                    st.write(summary)
                except Exception as e:
                    st.error(f"Error generating summary: {str(e)}")

    # Tab 3: Document Retrieval
    with tab3:
        st.subheader("Search and Retrieve Documents")

        search_query = st.text_input(
            "Search query:",
            placeholder="Enter search terms...",
            key="retrieval_search"
        )

        col1, col2 = st.columns(2)
        with col1:
            search_btn = st.button("🔎 Search", use_container_width=True)
        with col2:
            clear_search_btn = st.button("Clear Search", use_container_width=True)

        if clear_search_btn:
            st.rerun()

        if search_btn:
            if search_query.strip():
                with st.spinner("⏳ Searching..."):
                    try:
                        results = pipeline.retrieve_documents(search_query, n_results=n_context_docs)
                    except Exception as e:
                        st.error(f"Search error: {str(e)}")
                        results = []  # keep the name bound for the check below

                if results:
                    st.success(f"✓ Found {len(results)} documents")

                    # First hit expanded by default; the rest collapsed
                    for i, doc in enumerate(results, 1):
                        with st.expander(f"📄 Document {i} - {doc['source']}", expanded=(i==1)):
                            st.write(doc["content"])
                else:
                    st.warning("No documents found matching your query")
            else:
                st.warning("Please enter a search query")

    # Footer
    st.divider()
    with st.expander("ℹ️ System Information"):
        info = pipeline.vector_store.get_collection_info()
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("📚 Documents", info.get("document_count", 0))
        with col2:
            st.metric("🖥️ Device", device.upper())
        with col3:
            st.metric("🔍 Context Docs", n_context_docs)
        with col4:
            pdf_count = len(list(Path(pdf_dir).glob("*.pdf")))
            st.metric("📁 PDF Files", pdf_count)

else:
    # Unreachable in practice (error path calls st.stop()), kept as a guard
    st.error("❌ Failed to initialize RAG pipeline")
    st.info("💡 **How to get started:**\n1. Upload PDF files using the sidebar\n2. Click 'Upload PDFs' to save them\n3. Click 'Reload & Index PDFs' to process them")
src/embeddings.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from typing import List
4
+ from transformers import CLIPModel, CLIPProcessor
5
+
6
class CLIPEmbedder:
    """Text-embedding wrapper around a Hugging Face CLIP checkpoint.

    Produces L2-normalized text feature vectors suitable for cosine-similarity
    retrieval. Instances are callable: ``embedder(["a", "b"])``.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32", device: str = "cpu"):
        self.device = device
        self.model_name = model_name

        print(f"→ Loading CLIP model: {model_name}")

        # Model weights go to the requested device; the processor is CPU-side.
        self.model = CLIPModel.from_pretrained(model_name).to(device)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()  # inference only — disable dropout etc.

        print(f"✓ CLIP model loaded on {device}")

    def encode_text(self, texts: List[str]) -> np.ndarray:
        """Embed a batch of strings into a (batch, dim) unit-norm array."""
        # Tokenize up front; CLIP's text tower accepts at most 77 tokens.
        batch = self.processor(
            text=texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        ).to(self.device)

        with torch.no_grad():
            features = self.model.get_text_features(**batch)
            # Unit-normalize so dot products equal cosine similarity.
            features = features / features.norm(dim=-1, keepdim=True)

        return features.cpu().numpy()

    def encode_single_text(self, text: str) -> np.ndarray:
        """Embed one string; returns a 1-D feature vector."""
        return self.encode_text([text])[0]

    def __call__(self, texts: List[str]) -> np.ndarray:
        """Alias for :meth:`encode_text` so the embedder is callable."""
        return self.encode_text(texts)
src/multimodal_model.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoImageProcessor
3
+ from typing import Optional, Tuple
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
class GemmaVisionModel:
    """Local causal-LM wrapper (Gemma) used for RAG answers and summaries."""

    def __init__(self, model_name: str = "unsloth/gemma-3-1b-pt", device: str = "cpu"):
        """Load the model, preferring 4-bit quantization for memory efficiency.

        Args:
            model_name: Hugging Face model identifier.
            device: target device for tokenized inputs ("cpu"/"cuda").
        """
        self.device = device
        self.model_name = model_name

        print(f"→ Loading {model_name}...")

        # Try 4-bit (bitsandbytes) first; fall back to full-precision CPU
        # weights when quantization is unavailable on this machine.
        try:
            from transformers import BitsAndBytesConfig

            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float32,
                bnb_4bit_use_double_quant=False,
                bnb_4bit_quant_type="nf4"
            )

            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception:  # narrowed from bare except: don't mask KeyboardInterrupt/SystemExit
            # Fallback without quantization
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                device_map="cpu",
                trust_remote_code=True
            )

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model.eval()

        print(f"✓ Model loaded successfully")

    def generate_response(self, prompt: str, max_length: int = 512, temperature: float = 0.7) -> str:
        """Generate a completion for *prompt*.

        Bug fixes: the previous version ignored both parameters — no token
        budget was passed to ``generate`` (so the library default applied) and
        temperature was hardcoded to 0.8 — and the decoded output echoed the
        prompt back to the caller.

        Args:
            prompt: full text prompt.
            max_length: maximum number of NEW tokens to generate.
            temperature: sampling temperature (keep roughly in 0.5–1.5).

        Returns:
            Only the newly generated text (prompt stripped), whitespace-trimmed.
        """
        with torch.no_grad():
            # NOTE(review): with device_map="auto" the model may not live on
            # self.device; confirm input placement on multi-device setups.
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,     # honor the caller's budget
                temperature=temperature,       # honor the caller's setting
                do_sample=True,                # sampling for variety
                top_p=0.95,                    # nucleus sampling
                top_k=50,                      # top-K sampling
                remove_invalid_values=True,    # drop NaN/Inf logits
                repetition_penalty=1.2,        # discourage repetition
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # Decode only the continuation, not the echoed prompt tokens.
        prompt_len = inputs["input_ids"].shape[1]
        response = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        return response.strip()

    def summarize_text(self, text: str, max_length: int = 256) -> str:
        """Summarize *text* (the prompt asks for a Russian summary)."""
        prompt = f"Summarize the following text in Russian:\n\n{text}\n\nSummary:"
        return self.generate_response(prompt, max_length=max_length)

    def answer_question(self, question: str, context: str) -> str:
        """Answer *question* grounded in *context* (prompt asks for Russian)."""
        prompt = f"""Based on the following context, answer the question in Russian.

Context:
{context}

Question: {question}

Answer:"""
        return self.generate_response(prompt, max_length=512)
src/pdf_parser.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pdfplumber
4
+ import hashlib
5
+ from pathlib import Path
6
+ from typing import Dict, List, Tuple
7
+ from PIL import Image
8
+ import io
9
+
10
class PDFParser:
    """Parse PDFs with pdfplumber, caching results keyed by file-content hash."""

    def __init__(self, pdf_dir: str, cache_dir: str = ".pdf_cache"):
        """Args:
            pdf_dir: directory scanned for ``*.pdf`` files.
            cache_dir: where the JSON cache (and image paths) live.
        """
        self.pdf_dir = pdf_dir
        self.cache_dir = cache_dir
        self.cache_file = os.path.join(cache_dir, "processed_files.json")

        # Create cache directory
        os.makedirs(cache_dir, exist_ok=True)

        # Cache shape: {pdf_name: {"hash": md5, "data": parsed_content}}
        self.processed_files = self._load_cache()

    def _load_cache(self) -> Dict:
        """Load the processed-files cache; {} when none exists yet."""
        if os.path.exists(self.cache_file):
            # Explicit UTF-8 so non-ASCII PDF text round-trips on any platform
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        return {}

    def _save_cache(self):
        """Persist the processed-files cache as pretty-printed JSON."""
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            json.dump(self.processed_files, f, indent=2)

    def _get_file_hash(self, filepath: str) -> str:
        """MD5 of the file contents, streamed in 4 KiB chunks.

        Used only for change detection, not security.
        """
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _extract_tables(self, page) -> List[Dict]:
        """Extract tables from a page as pipe-joined text rows (best effort)."""
        tables = []
        try:
            page_tables = page.extract_tables()
            for i, table in enumerate(page_tables):
                # Render each row as "cell | cell | ..."; None cells become ""
                table_text = "\n".join([" | ".join([str(cell) if cell else "" for cell in row]) for row in table])
                tables.append({
                    "type": "table",
                    "index": i,
                    "content": table_text
                })
        except Exception:
            # Table detection can fail on unusual layouts; return what we have
            pass
        return tables

    def _extract_images(self, page, page_num: int, pdf_filename: str) -> List[Dict]:
        """Collect metadata records for images on a page (best effort).

        NOTE(review): no image bytes are actually written to ``path`` — this
        only records where an extracted image *would* live; confirm whether
        real byte extraction was intended.
        """
        images = []
        try:
            for i, img_dict in enumerate(page.images):
                try:
                    img_name = f"{pdf_filename}_p{page_num}_img{i}.png"
                    img_path = os.path.join(self.cache_dir, img_name)

                    # pdfplumber exposes source dimensions under "srcsize";
                    # a missing/empty value means "no usable image" — skip it.
                    if img_dict.get("srcsize"):
                        images.append({
                            "type": "image",
                            "index": i,
                            "path": img_path,
                            "description": f"Image from page {page_num}"
                        })
                except Exception:
                    continue  # skip unreadable image entries
        except Exception:
            pass  # page without an accessible image list
        return images

    def parse_pdf(self, pdf_path: str) -> Dict:
        """Parse one PDF into {"filename", "pages", "total_pages"}.

        Returns the cached result when the file's content hash is unchanged.
        On a parse error the partially filled dict is returned and NOT cached.
        """
        pdf_name = os.path.basename(pdf_path)
        file_hash = self._get_file_hash(pdf_path)

        # Serve from cache when the file has not changed
        if pdf_name in self.processed_files:
            if self.processed_files[pdf_name]["hash"] == file_hash:
                print(f"✓ Skipping {pdf_name} (already processed)")
                return self.processed_files[pdf_name]["data"]

        print(f"→ Processing {pdf_name}...")
        content = {
            "filename": pdf_name,
            "pages": [],
            "total_pages": 0
        }

        try:
            with pdfplumber.open(pdf_path) as pdf:
                content["total_pages"] = len(pdf.pages)

                for page_num, page in enumerate(pdf.pages):
                    page_content = {
                        "page_num": page_num,
                        # extract_text() returns None on image-only pages
                        "text": page.extract_text() or "",
                        "tables": self._extract_tables(page),
                        "images": self._extract_images(page, page_num, pdf_name.replace('.pdf', ''))
                    }
                    content["pages"].append(page_content)

            # Update cache (success path only)
            self.processed_files[pdf_name] = {
                "hash": file_hash,
                "data": content
            }
            self._save_cache()
            print(f"✓ Successfully processed {pdf_name}")

        except Exception as e:
            print(f"✗ Error processing {pdf_name}: {str(e)}")

        return content

    def parse_all_pdfs(self) -> List[Dict]:
        """Parse every ``*.pdf`` in ``self.pdf_dir``; [] when none exist."""
        pdf_files = list(Path(self.pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print(f"No PDF files found in {self.pdf_dir}")
            return []

        all_content = []
        for pdf_path in pdf_files:
            content = self.parse_pdf(str(pdf_path))
            all_content.append(content)

        return all_content
143
+
144
+
145
def extract_text_from_pdfs(pdf_dir: str) -> Tuple[List[str], List[str]]:
    """Parse every PDF in *pdf_dir* and return (chunks, metadatas).

    Page text (with tables folded in between ``[TABLE]`` markers) is split on
    '.' and greedily packed into chunks of just under 1000 characters; each
    chunk's metadata records the source filename and page number.
    """
    parsed = PDFParser(pdf_dir).parse_all_pdfs()

    documents: List[str] = []
    metadatas: List[Dict] = []

    for pdf in parsed:
        source = pdf["filename"]
        for page in pdf["pages"]:
            meta = {"filename": source, "page": page["page_num"]}

            # Fold table rows into the page text, wrapped in [TABLE] markers.
            pieces = [page["text"]]
            pieces.extend(
                "\n\n[TABLE]\n" + tbl["content"] + "\n[/TABLE]\n"
                for tbl in page["tables"]
            )
            combined = "".join(pieces)

            if not combined.strip():
                continue  # nothing extractable on this page

            # Greedy sentence packing: flush when the next piece would push
            # the chunk to 1000+ characters.
            chunk = ""
            for sentence in combined.split('.'):
                if len(chunk) + len(sentence) < 1000:
                    chunk = chunk + sentence + "."
                else:
                    if chunk.strip():
                        documents.append(chunk)
                        metadatas.append(dict(meta))
                    chunk = sentence + "."

            # Flush the final partial chunk for this page.
            if chunk.strip():
                documents.append(chunk)
                metadatas.append(dict(meta))

    return documents, metadatas
src/rag_pipeline.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Optional
2
+ from pdf_parser import extract_text_from_pdfs
3
+ from vector_store import VectorStore
4
+ from embeddings import CLIPEmbedder
5
+ from multimodal_model import GemmaVisionModel
6
+
7
class RAGPipeline:
    """End-to-end local RAG: parse PDFs, index chunks in Chroma, generate with a local LLM."""

    def __init__(self, pdf_dir: str, chroma_dir: str = "./chroma_db", device: str = "cpu"):
        """Load the embedder, vector store and LLM.

        Args:
            pdf_dir: directory containing the source PDF files.
            chroma_dir: persistence directory for the Chroma database.
            device: "cpu" or "cuda", forwarded to the models.
        """
        self.pdf_dir = pdf_dir
        self.device = device

        # Initialize components
        print("→ Initializing RAG Pipeline...")

        # Initialize embedder
        # NOTE(review): this embedder is never passed to VectorStore below, so
        # Chroma indexes/searches with its *default* embedding function —
        # confirm whether CLIP embeddings were actually intended here.
        self.embedder = CLIPEmbedder(model_name="openai/clip-vit-base-patch32", device=device)

        # Initialize vector store
        self.vector_store = VectorStore(persist_dir=chroma_dir)
        self.vector_store.get_or_create_collection()

        # Initialize LLM
        self.llm = GemmaVisionModel(model_name="unsloth/gemma-3-1b-pt", device=device)

        print("✓ RAG Pipeline initialized")

    def index_pdfs(self):
        """Index all PDFs from the directory into the vector store."""
        import hashlib  # local import: only needed to derive chunk ids

        print("→ Indexing PDF documents...")

        # Extract text from PDFs
        documents, metadatas = extract_text_from_pdfs(self.pdf_dir)

        if documents:
            # Content-addressed ids (filename + page + content digest).
            # Bug fix: the old positional ids ("doc_0", "doc_1", ...) collided
            # with ids from earlier indexing runs, so chunks of newly added
            # PDFs were silently dropped by the store's duplicate filter.
            ids = []
            for doc, meta in zip(documents, metadatas):
                digest = hashlib.md5(doc.encode("utf-8")).hexdigest()[:12]
                ids.append(f"{meta['filename']}_p{meta['page']}_{digest}")

            # Add to vector store (embeddings generated automatically)
            self.vector_store.add_documents(documents, metadatas, ids)

            print(f"✓ Indexed {len(documents)} document chunks")
        else:
            print("No documents to index")

    def retrieve_documents(self, query: str, n_results: int = 5) -> List[Dict]:
        """Return up to *n_results* chunks as {"content", "source"} dicts."""
        results = self.vector_store.search(query, n_results=n_results)

        retrieved_docs = []
        # Chroma returns one inner list per query text; we send exactly one.
        for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
            retrieved_docs.append({
                "content": doc,
                "source": f"{metadata.get('filename', 'Unknown')} (p{metadata.get('page', '?')})"
            })

        return retrieved_docs

    def answer_question(self, question: str, n_context_docs: int = 3) -> Dict:
        """Answer *question* using retrieved context.

        Returns:
            dict with "answer" (str), "sources" (list of labels) and
            "context_used" (number of chunks fed to the LLM).
        """
        # Retrieve relevant documents
        retrieved_docs = self.retrieve_documents(question, n_results=n_context_docs)

        # Combine context, labelling each chunk with its origin
        context = "\n\n".join([f"[Source: {doc['source']}]\n{doc['content']}" for doc in retrieved_docs])

        # Generate answer
        answer = self.llm.answer_question(question, context)

        # The LLM may echo the prompt; keep only the text after "Answer:"
        if "Answer:" in answer:
            answer = answer.split("Answer:")[-1].strip()

        return {
            "answer": answer,
            "sources": [doc["source"] for doc in retrieved_docs],
            "context_used": len(retrieved_docs)
        }

    def summarize_documents(self) -> str:
        """Summarize the indexed corpus from a small retrieved sample."""
        collection_info = self.vector_store.get_collection_info()
        doc_count = collection_info.get("document_count", 0)

        if doc_count == 0:
            return "No documents to summarize"

        # Sample a few chunks, truncating EACH to 200 characters.
        # Bug fix: the old expression sliced the *list* of documents with
        # ``docs[:200]`` instead of each document's text, so full chunks were
        # joined and could blow past the LLM's context budget.
        results = self.vector_store.search("main topic summary", n_results=5)
        sampled_content = " ".join(doc[:200] for docs in results["documents"] for doc in docs)

        summary = self.llm.summarize_text(sampled_content)
        return summary
src/vector_store.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.config import Settings
3
+ import os
4
+ from typing import List, Dict, Optional
5
+
6
class VectorStore:
    """Persistent ChromaDB wrapper for storing and querying document chunks."""

    def __init__(self, persist_dir: str = "./chroma_db", embedding_function=None):
        """Open (creating if needed) a persistent Chroma client.

        Args:
            persist_dir: on-disk location of the Chroma database.
            embedding_function: optional Chroma embedding function; when None,
                Chroma's default embedder is used.
        """
        self.persist_dir = persist_dir
        os.makedirs(persist_dir, exist_ok=True)

        # Persistent client: data survives process restarts; telemetry off.
        self.client = chromadb.PersistentClient(
            path=persist_dir,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )

        self.embedding_function = embedding_function
        self.collection = None  # bound lazily by get_or_create_collection()

    def get_or_create_collection(self, collection_name: str = "pdf_documents"):
        """Bind ``self.collection`` to *collection_name*, creating it if missing."""
        try:
            # Try to get existing collection
            self.collection = self.client.get_collection(
                name=collection_name,
                embedding_function=self.embedding_function
            )
            print(f"✓ Loaded existing collection: {collection_name}")
        except Exception:  # narrowed from a bare except (was masking SystemExit too)
            # Create new collection with cosine distance
            self.collection = self.client.create_collection(
                name=collection_name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✓ Created new collection: {collection_name}")

        return self.collection

    def add_documents(self, documents: List[str], metadatas: List[Dict], ids: Optional[List[str]] = None):
        """Add documents, silently skipping ids already present in the store."""
        if not self.collection:
            self.get_or_create_collection()

        if ids is None:
            ids = [f"doc_{i}" for i in range(len(documents))]

        # Existing IDs as a set for O(1) membership — the old list made the
        # dedupe loop below quadratic in collection size.
        try:
            existing_ids = set(self.collection.get()["ids"])
        except Exception:
            existing_ids = set()

        # Filter out documents whose id is already stored
        docs_to_add = []
        meta_to_add = []
        ids_to_add = []

        for doc, meta, doc_id in zip(documents, metadatas, ids):
            if doc_id not in existing_ids:
                docs_to_add.append(doc)
                meta_to_add.append(meta)
                ids_to_add.append(doc_id)

        if docs_to_add:
            self.collection.add(
                documents=docs_to_add,
                metadatas=meta_to_add,
                ids=ids_to_add
            )
            print(f"✓ Added {len(docs_to_add)} new documents to vector store")
        else:
            print("✓ All documents already in vector store")

    def search(self, query: str, n_results: int = 5) -> Dict:
        """Query the collection; empty result lists when no collection is bound."""
        if not self.collection:
            return {"documents": [], "metadatas": [], "distances": []}

        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )

        return results

    def get_collection_info(self) -> Dict:
        """Return {"collection_name", "document_count"}; {} when unbound."""
        if not self.collection:
            return {}

        count = self.collection.count()
        return {
            "collection_name": self.collection.name,
            "document_count": count
        }
+ }