dnj0 commited on
Commit
835ecb4
·
verified ·
1 Parent(s): 2b7f331

Upload 7 files

Browse files
src/.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_MODEL=gpt-4o-mini
src/app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables from src/.env (e.g. OPENAI_API_KEY, OPENAI_MODEL)
load_dotenv()

# Import custom modules (after load_dotenv so they can see the env vars)
from pdf_processor import PDFProcessor, prepare_documents_for_embedding
from embeddings_handler import CLIPLangChainEmbeddings
from vectorstore_manager import VectorStoreManager
from image_summarizer import ImageSummarizer, process_images_in_documents
from rag_chain import RAGChain
from langchain_core.documents import Document

# Page configuration
st.set_page_config(
    page_title="Multimodal RAG Assistant",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Global CSS tweaks for the main area and chat messages
st.markdown("""
<style>
.main {
padding: 2rem;
}
.stChatMessage {
padding: 1rem;
border-radius: 0.5rem;
margin-bottom: 1rem;
}
</style>
""", unsafe_allow_html=True)

# Initialize session state: the store, chain, and count survive Streamlit reruns
if "vector_store" not in st.session_state:
    st.session_state.vector_store = None
if "rag_chain" not in st.session_state:
    st.session_state.rag_chain = None
if "document_count" not in st.session_state:
    st.session_state.document_count = 0

# Sidebar configuration
st.sidebar.title("⚙️ Configuration")
st.sidebar.markdown("---")

# OpenAI API Key (pre-filled from the environment when available)
api_key = st.sidebar.text_input(
    "OpenAI API Key",
    type="password",
    value=os.getenv("OPENAI_API_KEY", ""),
    help="Enter your OpenAI API key"
)

# Export the key so modules that read the env var directly can find it
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key

# PDF directory setup
pdf_dir = st.sidebar.text_input(
    "PDF Directory",
    value="./pdfs",
    help="Directory containing PDF files"
)

# Vector store settings
st.sidebar.markdown("### Vector Store")
collection_name = st.sidebar.text_input(
    "Collection Name",
    value="pdf_documents",
    help="ChromaDB collection name"
)

persist_dir = st.sidebar.text_input(
    "Persist Directory",
    value="./chroma_db",
    help="Directory for ChromaDB storage"
)

# Initialize vector store button
if st.sidebar.button("🔄 Initialize Vector Store", use_container_width=True):
    with st.spinner("Initializing vector store..."):
        try:
            # CLIP embeddings used for all stored content
            embeddings = CLIPLangChainEmbeddings(
                model_name="ViT-B-32",
                pretrained="openai"
            )

            # Persistent ChromaDB-backed store
            st.session_state.vector_store = VectorStoreManager(
                persist_dir=persist_dir,
                collection_name=collection_name,
                embeddings=embeddings
            )

            # Build the RAG chain on top of the store's retriever
            retriever = st.session_state.vector_store.get_retriever()
            st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)

            st.session_state.document_count = st.session_state.vector_store.collection_count()
            st.success("✅ Vector store initialized!")

        except Exception as e:
            st.error(f"❌ Error initializing vector store: {str(e)}")

# Load and process PDFs button
if st.sidebar.button("📥 Load & Process PDFs", use_container_width=True):
    if not api_key:
        st.error("Please enter OpenAI API Key first")
    elif st.session_state.vector_store is None:
        st.error("Please initialize vector store first")
    else:
        with st.spinner("Processing PDFs..."):
            try:
                # Extract text/tables/images from every PDF in the directory
                pdf_processor = PDFProcessor(pdf_dir=pdf_dir)
                documents_data = pdf_processor.process_all_pdfs()

                if not documents_data:
                    st.warning(f"No PDFs found in {pdf_dir}")
                else:
                    # Generate LLM summaries for the extracted images
                    image_summarizer = ImageSummarizer(api_key=api_key)
                    documents_data = process_images_in_documents(
                        documents_data,
                        image_summarizer
                    )

                    # Flatten pages into LangChain Documents for embedding
                    all_documents = []
                    for doc_data in documents_data:
                        doc_tuples = prepare_documents_for_embedding(doc_data)
                        for text, metadata in doc_tuples:
                            all_documents.append(
                                Document(page_content=text, metadata=metadata)
                            )

                    # Add to vector store
                    st.session_state.vector_store.add_documents(all_documents)
                    st.session_state.document_count = st.session_state.vector_store.collection_count()

                    # Reinitialize RAG chain so it retrieves over the new documents
                    retriever = st.session_state.vector_store.get_retriever()
                    st.session_state.rag_chain = RAGChain(retriever, api_key=api_key)

                    st.success(f"✅ Processed {len(documents_data)} PDFs with {len(all_documents)} chunks")
                    st.info(f"Total documents in store: {st.session_state.document_count}")

            except Exception as e:
                st.error(f"❌ Error processing PDFs: {str(e)}")

# Display vector store status
st.sidebar.markdown("### Status")
if st.session_state.vector_store:
    doc_count = st.session_state.vector_store.collection_count()
    st.sidebar.success(f"✅ Vector Store Ready")
    st.sidebar.metric("Documents in Store", doc_count)
else:
    st.sidebar.warning("⚠️ Vector Store Not Initialized")

# Main content area
st.title("📄 Multimodal PDF RAG Assistant")
st.markdown("Ask questions about your PDF documents. Responses will be provided in Russian.")

# Show onboarding steps until the RAG chain exists
if st.session_state.rag_chain is None:
    st.info("""
### Getting Started:
1. Enter your OpenAI API Key in the sidebar
2. Click "Initialize Vector Store"
3. Place PDF files in the configured directory
4. Click "Load & Process PDFs"
5. Ask questions in the chat below
""")
else:
    # Chat interface
    st.markdown("---")
    st.markdown("### Ask a Question")

    col1, col2 = st.columns([1, 0.15])

    with col1:
        user_question = st.text_input(
            "Your question:",
            placeholder="Ask about your documents...",
            label_visibility="collapsed"
        )

    with col2:
        search_button = st.button("🔍 Search", use_container_width=True)

    # Process question
    if search_button and user_question:
        with st.spinner("🤖 Searching documents and generating response..."):
            try:
                result = st.session_state.rag_chain.query(user_question)

                # Display answer
                st.markdown("### Answer")
                st.markdown(result["answer"])

                # Display sources with metadata in collapsible panels
                if result["sources"]:
                    st.markdown("### Sources")
                    for i, source in enumerate(result["sources"], 1):
                        with st.expander(f"Source {i} - {source['metadata'].get('filename', 'Unknown')}"):
                            st.markdown(f"**Type:** {source['metadata'].get('type', 'Unknown')}")
                            st.markdown(f"**Page:** {source['metadata'].get('page', 'Unknown')}")
                            st.markdown(f"**Content:** {source['content']}")

            except Exception as e:
                st.error(f"Error processing question: {str(e)}")

# Footer
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: gray; font-size: 0.8rem;">
Powered by LangChain, ChromaDB, CLIP, and OpenAI
</div>
""", unsafe_allow_html=True)
src/embeddings_handler.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import open_clip
3
+ from typing import List
4
+ import numpy as np
5
+
6
class CLIPEmbeddingsHandler:
    """Generates normalized CLIP embeddings for text and base64-encoded images."""

    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai"):
        """Load the CLIP model, eval preprocessing transform, and tokenizer.

        Args:
            model_name: open_clip architecture name.
            pretrained: Pretrained weight tag.

        Raises:
            Exception: re-raised from open_clip if the model cannot be loaded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        try:
            # create_model_and_transforms returns (model, train_preprocess,
            # eval_preprocess) — we keep only the eval transform.
            self.model, _, self.preprocess = open_clip.create_model_and_transforms(
                model_name,
                pretrained=pretrained,
                device=self.device
            )

            self.tokenizer = open_clip.get_tokenizer(model_name)
            self.model.eval()  # inference only: disables dropout/batch-norm updates

            print(f"✅ CLIP model loaded on {self.device}")
            print(f" Model: {model_name}")

        except Exception as e:
            print(f"❌ Error loading CLIP model: {e}")
            raise

    def embed_text(self, texts: List[str]) -> np.ndarray:
        """Embed each string; returns an array of shape (len(texts), dim).

        Texts are L2-normalized so cosine similarity equals the dot product.
        A text that fails to embed yields a zero vector instead of aborting.
        """
        embeddings = []

        with torch.no_grad():
            for text in texts:
                try:
                    tokens = self.tokenizer(text).to(self.device)
                    text_features = self.model.encode_text(tokens)
                    text_features /= text_features.norm(dim=-1, keepdim=True)
                    embeddings.append(text_features.cpu().numpy())
                except Exception as e:
                    print(f"⚠️ Error embedding text: {e}")
                    # FIX: successful encodings have shape (1, dim); the old
                    # (512,)-shaped fallback made np.array() below fail to
                    # stack. ViT-B-32 emits 512-dim features — TODO confirm
                    # if other model_name values are used.
                    embeddings.append(np.zeros((1, 512)))

        result = np.array(embeddings).squeeze()
        # A single input squeezes to 1-D; restore the (1, dim) batch shape.
        if len(result.shape) == 1:
            result = np.expand_dims(result, axis=0)
        return result

    def embed_image_base64(self, image_base64: str) -> np.ndarray:
        """Embed one base64-encoded image; returns a (dim,) normalized vector.

        Returns a zero vector on any decoding/encoding failure.
        """
        import base64
        import io
        from PIL import Image

        try:
            image_data = base64.b64decode(image_base64)
            image = Image.open(io.BytesIO(image_data)).convert("RGB")

            # Use the evaluation preprocessing; unsqueeze adds the batch dim.
            image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)

            with torch.no_grad():
                image_features = self.model.encode_image(image_tensor)
                image_features /= image_features.norm(dim=-1, keepdim=True)

            return image_features.cpu().numpy().squeeze()

        except Exception as e:
            print(f"❌ Error embedding image: {e}")
            return np.zeros(512)
72
+
73
+
74
+ # LangChain wrapper
75
+ from langchain_core.embeddings import Embeddings
76
+
77
class CLIPLangChainEmbeddings(Embeddings):
    """LangChain Embeddings adapter backed by CLIPEmbeddingsHandler."""

    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai"):
        self.handler = CLIPEmbeddingsHandler(model_name, pretrained)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs."""
        matrix = self.handler.embed_text(texts)
        # A single text may come back squeezed to 1-D; re-wrap it as one row.
        return [matrix.tolist()] if matrix.ndim == 1 else matrix.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Embed query text."""
        vector = self.handler.embed_text([text])
        return vector.tolist() if vector.ndim == 1 else vector[0].tolist()
src/image_summarizer.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ from typing import Optional
4
+ from openai import OpenAI
5
+
6
class ImageSummarizer:
    """Summarizes images using OpenAI's vision-capable chat API."""

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the OpenAI client (falls back to OPENAI_API_KEY env var)."""
        self.client = OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))

    def summarize_image_base64(self,
                               image_base64: str,
                               image_format: str = "png") -> str:
        """
        Summarize an image using OpenAI vision.

        Args:
            image_base64: Base64 encoded image
            image_format: Image format (png, jpg, etc.)

        Returns:
            Image description/summary (in Russian, per the prompt), or a
            Russian placeholder string if the API call fails.
        """
        try:
            response = self.client.chat.completions.create(
                # FIX: honor OPENAI_MODEL from the environment — src/.env
                # ships this setting but it was never read; the previous
                # hard-coded value remains the fallback.
                model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/{image_format};base64,{image_base64}"
                                }
                            },
                            {
                                "type": "text",
                                "text": "Пожалуйста, опишите детально содержание этого изображения на русском языке. Укажите все видимые объекты, текст, диаграммы, графики и их взаимосвязь."
                            }
                        ]
                    }
                ],
                max_tokens=500
            )

            return response.choices[0].message.content

        except Exception as e:
            # Best-effort: a failed summary must not abort PDF processing.
            print(f"Error summarizing image: {e}")
            return f"Изображение на странице (ошибка обработки: {str(e)})"
54
+
55
+
56
def process_images_in_documents(documents_data: list,
                                image_summarizer: ImageSummarizer) -> list:
    """
    Attach an LLM-generated summary to every extracted image.

    Mutates each image dict in place, adding a "summary" key, and returns
    the same documents_data list for convenient chaining.

    Args:
        documents_data: List of document content dictionaries
        image_summarizer: ImageSummarizer instance

    Returns:
        Updated documents with image summaries
    """
    for document in documents_data:
        for page in document.get("pages", []):
            for img in page.get("images", []):
                encoded = img.get("base64")
                # Skip images whose bytes could not be extracted.
                if not encoded:
                    continue
                print(f"Summarizing image from page {page.get('page_number')}")
                img["summary"] = image_summarizer.summarize_image_base64(
                    encoded,
                    img.get("format", "png")
                )

    return documents_data
src/pdf_processor.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import base64
4
+ import hashlib
5
+ from pathlib import Path
6
+ from typing import List, Dict, Tuple
7
+ import pdfplumber
8
+ import pymupdf
9
+ from PIL import Image
10
+ import io
11
+
12
class PDFProcessor:
    """Extracts text, tables, and images from PDFs, with a JSON result cache."""

    def __init__(self, pdf_dir: str = "./pdfs", cache_file: str = ".pdf_cache.json"):
        """
        Args:
            pdf_dir: Directory scanned for *.pdf files (created if missing).
            cache_file: JSON file holding extracted content keyed by filename.
        """
        self.pdf_dir = pdf_dir
        self.cache_file = cache_file
        self.cache = self._load_cache()
        os.makedirs(pdf_dir, exist_ok=True)

    def _load_cache(self) -> Dict:
        """Load the processing cache; tolerate a missing or corrupt cache file."""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                # FIX: a corrupt cache file previously crashed __init__;
                # start with an empty cache and rebuild instead.
                print(f"⚠️ Ignoring unreadable cache {self.cache_file}: {e}")
        return {}

    def _save_cache(self):
        """Persist the processing cache to disk."""
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f, indent=2)

    def _get_file_hash(self, filepath: str) -> str:
        """MD5 of the file contents — change detection only, not security."""
        hash_md5 = hashlib.md5()
        with open(filepath, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def _extract_images_from_page(self, pdf_path: str, page_num: int) -> List[Dict]:
        """Extract embedded images from one page (0-based) as base64 dicts."""
        images = []
        try:
            # FIX: the context manager guarantees the document is closed even
            # if an unexpected exception escapes the loop below (the original
            # leaked the handle in that case).
            with pymupdf.open(pdf_path) as doc:
                if page_num >= len(doc):
                    print(f"⚠️ Page {page_num} does not exist")
                    return images

                page = doc[page_num]
                image_list = page.get_images()  # list of xref tuples
                if not image_list:
                    return images

                print(f"Found {len(image_list)} images on page {page_num}")

                for img_index, img_info in enumerate(image_list):
                    xref = img_info[0]  # first tuple element is the xref
                    try:
                        if not isinstance(xref, int):
                            print(f"⚠️ Invalid xref type: {type(xref).__name__}")
                            continue

                        img_data = doc.extract_image(xref)
                        if not img_data or "image" not in img_data:
                            print(f"⚠️ No image data at xref {xref}")
                            continue

                        images.append({
                            "type": "image",
                            "format": img_data.get("ext", "png"),
                            "base64": base64.b64encode(img_data["image"]).decode(),
                            "page": page_num,
                            "index": img_index,
                            "xref": xref
                        })
                        print(f"✅ Image {img_index + 1}/{len(image_list)}")

                    except ValueError as e:
                        # pymupdf raises ValueError for bad/damaged xrefs
                        if "bad xref" in str(e).lower():
                            print(f"⚠️ Bad xref {xref}: {e}")
                        else:
                            print(f"⚠️ Error at xref {xref}: {e}")
                        continue
                    except Exception as e:
                        print(f"⚠️ Error extracting image {img_index}: {e}")
                        continue

        except Exception as e:
            print(f"❌ Error in _extract_images_from_page: {e}")

        return images

    def _extract_tables_from_page(self, pdf_path: str, page_num: int) -> List[Dict]:
        """Extract tables from one page (0-based) as markdown dicts."""
        tables = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page = pdf.pages[page_num]
                extracted_tables = page.extract_tables()

                for table_idx, table in enumerate(extracted_tables or []):
                    tables.append({
                        "type": "table",
                        "content": self._table_to_markdown(table),
                        "page": page_num,
                        "index": table_idx
                    })
        except Exception as e:
            print(f"Error extracting tables from page {page_num}: {e}")

        return tables

    def _table_to_markdown(self, table: List[List]) -> str:
        """Render a row-list table as markdown (first row is the header)."""
        if not table:
            return ""

        # None cells render as empty strings.
        md = "| " + " | ".join(str(cell or "") for cell in table[0]) + " |\n"
        md += "| " + " | ".join(["---"] * len(table[0])) + " |\n"

        for row in table[1:]:
            md += "| " + " | ".join(str(cell or "") for cell in row) + " |\n"

        return md

    def extract_pdf_content(self, pdf_path: str) -> Dict:
        """
        Extract all content (text, tables, images) from one PDF.

        Returns cached content when the file hash is unchanged.
        Returns None if the PDF cannot be processed.
        """
        pdf_name = os.path.basename(pdf_path)
        file_hash = self._get_file_hash(pdf_path)

        # Reuse the cached result when the file is unchanged.
        if pdf_name in self.cache and self.cache[pdf_name].get("hash") == file_hash:
            print(f"Using cached data for {pdf_name}")
            return self.cache[pdf_name]["content"]

        print(f"Processing PDF: {pdf_name}")
        content = {"filename": pdf_name, "pages": []}

        try:
            # FIX: open the PDF once and iterate its pages. The original
            # reopened the file for the page count, again per page for the
            # text, and yet again per page for the tables.
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages):
                    try:
                        raw_tables = page.extract_tables() or []
                    except Exception as e:
                        # Keep the original per-page tolerance for table errors.
                        print(f"Error extracting tables from page {page_num}: {e}")
                        raw_tables = []

                    tables = [
                        {
                            "type": "table",
                            "content": self._table_to_markdown(table),
                            "page": page_num,
                            "index": table_idx
                        }
                        for table_idx, table in enumerate(raw_tables)
                    ]

                    content["pages"].append({
                        "page_number": page_num + 1,  # 1-based for display
                        "text": page.extract_text() or "",
                        "tables": tables,
                        # Images need pymupdf, hence the separate per-page open.
                        "images": self._extract_images_from_page(pdf_path, page_num)
                    })

        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            return None

        # Cache the result keyed by filename + content hash.
        self.cache[pdf_name] = {
            "hash": file_hash,
            "content": content
        }
        self._save_cache()

        return content

    def process_all_pdfs(self, pdf_dir: str = None) -> List[Dict]:
        """Process every *.pdf in the directory; skips PDFs that fail."""
        if pdf_dir is None:
            pdf_dir = self.pdf_dir

        all_content = []
        pdf_files = list(Path(pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print(f"No PDF files found in {pdf_dir}")
            return all_content

        for pdf_file in pdf_files:
            content = self.extract_pdf_content(str(pdf_file))
            if content:
                all_content.append(content)

        return all_content
228
+
229
+
230
def prepare_documents_for_embedding(pdf_content: Dict) -> List[Tuple[str, Dict]]:
    """
    Flatten extracted PDF content into (text, metadata) tuples for embedding.

    Args:
        pdf_content: Output of PDFProcessor.extract_pdf_content().

    Returns:
        List of (text, metadata) tuples covering page text, tables, and images.
    """
    documents = []

    for page in pdf_content.get("pages", []):
        page_num = page.get("page_number")
        filename = pdf_content.get("filename")

        # Page text
        if page.get("text"):
            documents.append((
                page["text"],
                {
                    "type": "text",
                    "page": page_num,
                    "filename": filename
                }
            ))

        # Tables rendered as markdown
        for table in page.get("tables", []):
            documents.append((
                f"Table on page {page_num}:\n{table['content']}",
                {
                    "type": "table",
                    "page": page_num,
                    "filename": filename
                }
            ))

        # Images. FIX: embed the LLM-generated summary when present — the
        # original ignored image["summary"] (set by
        # process_images_in_documents), so every image was embedded as the
        # same uninformative placeholder and the summaries were never used.
        for image in page.get("images", []):
            summary = image.get("summary")
            image_text = (f"Image on page {page_num}: {summary}"
                          if summary else f"Image on page {page_num}")
            documents.append((
                image_text,
                {
                    "type": "image",
                    "page": page_num,
                    "filename": filename,
                    "image_base64": image.get("base64"),
                    "image_format": image.get("format")
                }
            ))

    return documents
src/rag_chain.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ from langchain.chains import RetrievalQA
3
+ from langchain_core.prompts import PromptTemplate
4
+ from typing import Optional
5
+ import os
6
+
7
class RAGChain:
    """RAG chain over an OpenAI chat model, prompting for Russian answers."""

    def __init__(self,
                 retriever,
                 model_name: str = "gpt-4o-mini",
                 temperature: float = 0.3,
                 api_key: Optional[str] = None):
        """
        Initialize RAG chain.

        Args:
            retriever: LangChain retriever (from vector store)
            model_name: OpenAI model name
            temperature: Temperature for LLM
            api_key: OpenAI API key (falls back to OPENAI_API_KEY env var)
        """
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=temperature,
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            max_tokens=1024
        )

        self.retriever = retriever

        # Custom prompt instructing the model to answer in Russian,
        # grounded strictly in the retrieved context.
        self.prompt_template = PromptTemplate(
            template="""Вы - полезный ассистент, специализирующийся на анализе документов.

Используя следующий контекст из документов, ответьте на вопрос.

Контекст:
{context}

Вопрос: {question}

Инструкции:
1. Ответьте только на основе информации из контекста
2. Если информация не найдена в контексте, скажите "Информация не найдена в документах"
3. Ответьте на русском языке
4. Будьте кратким и точным
5. Цитируйте источники если возможно

Ответ:""",
            input_variables=["context", "question"]
        )

        # "stuff" chain: all retrieved chunks are pasted into one prompt.
        self.chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": self.prompt_template}
        )

    def query(self, question: str) -> dict:
        """
        Query the RAG chain.

        Args:
            question: User question (can be in any language)

        Returns:
            Dict with "answer" (str) and "sources" (list of content/metadata
            dicts). Errors are reported in the answer text, never raised.
        """
        try:
            result = self.chain.invoke({"query": question})

            return {
                "answer": result.get("result", ""),
                "sources": [
                    {
                        "content": doc.page_content[:200],  # preview only
                        "metadata": doc.metadata
                    }
                    for doc in result.get("source_documents", [])
                ]
            }

        except Exception as e:
            return {
                "answer": f"Ошибка при обработке запроса: {str(e)}",
                "sources": []
            }

    def query_with_context(self, question: str, context_limit: int = 5) -> dict:
        """
        Query with explicit context retrieval.

        Args:
            question: User question
            context_limit: Max number of context chunks to use

        Returns:
            Dict with "answer" and "context_documents".
        """
        # FIX: the original passed search_kwargs={"k": context_limit} as a
        # runtime kwarg to get_relevant_documents, which ignores it (k is
        # fixed when the retriever is constructed) — so context_limit had
        # no effect. Retrieve, then truncate to the requested limit.
        relevant_docs = self.retriever.get_relevant_documents(question)[:context_limit]

        # Format context with per-chunk provenance.
        context = "\n\n".join([
            f"Источник: {doc.metadata}\n{doc.page_content}"
            for doc in relevant_docs
        ])

        # Fill the shared prompt template and call the LLM directly.
        prompt = self.prompt_template.format(context=context, question=question)
        response = self.llm.invoke(prompt)

        return {
            "answer": response.content,
            "context_documents": [
                {
                    "content": doc.page_content[:300],
                    "metadata": doc.metadata
                }
                for doc in relevant_docs
            ]
        }
src/vectorstore_manager.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.config import Settings
3
+ from langchain_chroma import Chroma
4
+ from langchain_core.documents import Document
5
+ from typing import List, Dict, Optional
6
+ import os
7
+
8
class VectorStoreManager:
    """Manages a persistent ChromaDB-backed vector store for the RAG app."""

    def __init__(self,
                 persist_dir: str = "./chroma_db",
                 collection_name: str = "pdf_documents",
                 embeddings=None):
        """
        Set up the persistent Chroma client and its LangChain wrapper.

        Args:
            persist_dir: Directory for persistent storage (created if missing).
            collection_name: Name of the collection.
            embeddings: LangChain embeddings instance.
        """
        self.persist_dir = persist_dir
        self.collection_name = collection_name
        self.embeddings = embeddings

        os.makedirs(persist_dir, exist_ok=True)

        # On-disk Chroma client plus the LangChain wrapper around it.
        self.client = chromadb.PersistentClient(path=persist_dir)
        self.vector_store = Chroma(
            client=self.client,
            collection_name=collection_name,
            embedding_function=embeddings,
            persist_directory=persist_dir
        )

        print(f"Vector store initialized: {persist_dir}/{collection_name}")

    def add_documents(self, documents: List[Document], batch_size: int = 50):
        """
        Add documents to the store in batches; a failed batch is logged
        and skipped rather than aborting the whole upload.

        Args:
            documents: List of LangChain Document objects
            batch_size: Number of documents per batch
        """
        for batch_no, start in enumerate(range(0, len(documents), batch_size), start=1):
            batch = documents[start:start + batch_size]
            try:
                self.vector_store.add_documents(batch)
                print(f"Added {len(batch)} documents (batch {batch_no})")
            except Exception as e:
                print(f"Error adding documents: {e}")

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """
        Similarity search returning content/metadata/score dicts.

        Args:
            query: Search query
            k: Number of results to return

        Returns:
            List of result dicts. NOTE(review): the "similarity" value is
            whatever similarity_search_with_score returns — for Chroma that
            looks like a distance (lower = closer); confirm before ranking.
        """
        scored = self.vector_store.similarity_search_with_score(query, k=k)
        return [
            {
                "content": doc.page_content,
                "metadata": doc.metadata,
                "similarity": score
            }
            for doc, score in scored
        ]

    def get_retriever(self, search_kwargs: Optional[Dict] = None):
        """Build a retriever for the RAG chain (defaults to k=5)."""
        if search_kwargs is None:
            search_kwargs = {"k": 5}
        return self.vector_store.as_retriever(search_kwargs=search_kwargs)

    def collection_count(self) -> int:
        """Number of documents in the collection; 0 if it cannot be read."""
        try:
            return self.client.get_collection(self.collection_name).count()
        except Exception as e:
            print(f"Error getting collection count: {e}")
            return 0

    def clear_collection(self):
        """Delete the collection and rebuild an empty wrapper around it."""
        try:
            self.client.delete_collection(self.collection_name)
            self.vector_store = Chroma(
                client=self.client,
                collection_name=self.collection_name,
                embedding_function=self.embeddings,
                persist_directory=self.persist_dir
            )
            print(f"Collection cleared: {self.collection_name}")
        except Exception as e:
            print(f"Error clearing collection: {e}")