mokhles commited on
Commit
af37875
·
1 Parent(s): b891d55

Initial commit: Insurance RAG API

Browse files
Files changed (7) hide show
  1. Dockerfile +16 -0
  2. README.md +5 -4
  3. app.py +23 -0
  4. chroma.py +302 -0
  5. requirements.txt +19 -0
  6. retrieval.py +208 -0
  7. vector_store.py +228 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python 3.11 base image for the Insurance RAG API.
FROM python:3.11-slim

WORKDIR /app

# build-essential lets pip compile native wheels when no prebuilt one exists;
# the apt cache is purged in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy and install requirements before the source so this layer is cached
# across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY . .

# 7860 is the port Hugging Face Spaces expects a Docker Space to serve on.
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- title: RAG Insurance
3
- emoji: 😻
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Insurance Rag Api
3
+ emoji: 🌖
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: docker
7
  pinned: false
8
+ short_description: FastAPI Retrieval-Augmented Generation (RAG) API
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
"""FastAPI entry point for the Insurance RAG API.

Mounts the /retrieval router and configures permissive CORS for
browser-based clients.
"""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from retrieval import router as retrieval_router


app = FastAPI(title="Insurance RAG API", version="1.0.0")

# FIX: the CORS spec forbids `Access-Control-Allow-Origin: *` on
# credentialed requests, so the previous allow_origins=["*"] +
# allow_credentials=True combination made browsers reject credentialed
# cross-origin calls. Credentials are disabled for the wildcard origin;
# list explicit origins here if cookies/auth headers are ever needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(retrieval_router)


@app.get("/")
async def root():
    """Health/landing endpoint pointing clients at the interactive docs."""
    return {"message": "Insurance RAG API is running", "docs": "/docs"}
chroma.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # chroma.py (minimal, no visualization, WITH sentence-transformers, with .env)
2
+
3
+ import os
4
+ import warnings
5
+ from pathlib import Path
6
+ from typing import List, Dict
7
+
8
+ import pandas as pd # (currently unused but kept if you need it later)
9
+ from dotenv import load_dotenv
10
+
11
+ from llama_parse import LlamaParse
12
+ from llama_index.core.node_parser import SentenceSplitter
13
+
14
+ import chromadb
15
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
16
+ from openai import OpenAI
17
+
18
+ import nest_asyncio
19
+ nest_asyncio.apply()
20
+
21
+ warnings.filterwarnings("ignore")
22
+
23
# ---------- LOAD .env ----------
load_dotenv()

# ---------- CONFIG ----------
CONFIG = {
    # FIX: the PDF directory was hard-coded to one developer's Windows path,
    # which can never exist inside the Docker image. It is now overridable
    # via PDF_DIRECTORY (default preserved for backward compatibility).
    "pdf_directory": os.getenv(
        "PDF_DIRECTORY", r"C:\Users\Legion\Documents\Ominimo Job\Pdfs for RAG"
    ),
    # Where the persistent ChromaDB store (and other artifacts) are written.
    "output_directory": os.getenv("OUTPUT_DIRECTORY", "./output/"),
    "llm_model": "gpt-4.1-mini",
    "chunk_size": 512,
    "chunk_overlap": 50,
    "top_k_retrieval": 3,

    # SentenceTransformer embedding model (384-D for MiniLM).
    # Must match the retrieval-side embedding model.
    "embedding_model": "all-MiniLM-L6-v2",

    # Optional: force device ("cpu" or "cuda")
    "embedding_device": os.getenv("EMB_DEVICE", "cpu"),
}

Path(CONFIG["output_directory"]).mkdir(parents=True, exist_ok=True)

# ---------- OPENAI CLIENT (for summaries only) ----------
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set in the environment or .env file.")

client = OpenAI(api_key=OPENAI_API_KEY)
# Per-PDF summaries keyed by filename, filled in during parsing.
document_summaries: Dict[str, str] = {}
52
+
53
+
54
def summarize_document(text: str, client: OpenAI, model: str) -> str:
    """Generate a concise summary of *text* via the OpenAI chat API.

    Only the first 4000 characters are sent, keeping the prompt bounded.
    """
    system_msg = {
        "role": "system",
        "content": (
            "You are a helpful assistant that creates concise "
            "summaries of documents."
        ),
    }
    user_msg = {
        "role": "user",
        "content": (
            "Please provide a comprehensive summary of the "
            "following document:\n\n"
            f"{text[:4000]}"
        ),
    }
    completion = client.chat.completions.create(
        model=model,
        messages=[system_msg, user_msg],
        temperature=0.3,
        max_tokens=500,
    )
    return completion.choices[0].message.content
79
+
80
+
81
# ---------- PDF PARSING ----------
def parse_pdfs_with_llamaparse(pdf_directory: str) -> List[Dict]:
    """Parse PDFs using LlamaParse with batch processing.

    Returns a list of ``{"text", "source", "metadata"}`` dicts, one per
    parsed document section, and fills the module-level
    ``document_summaries`` with an OpenAI summary per PDF file.

    Raises:
        RuntimeError: if LLAMA_CLOUD_API_KEY is not configured.
    """
    pdf_files = list(Path(pdf_directory).glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")

    llama_key = os.environ.get("LLAMA_CLOUD_API_KEY")
    if not llama_key:
        raise RuntimeError("LLAMA_CLOUD_API_KEY is not set in the environment or .env.")

    parser = LlamaParse(
        api_key=llama_key,
        result_type="markdown",
        verbose=True,
        language="en",
        num_workers=4,
    )

    all_documents: List[Dict] = []

    try:
        print("\nParsing all PDFs in batch...")
        pdf_paths = [str(pdf) for pdf in pdf_files]
        documents_batch = parser.load_data(pdf_paths)
        print(f"✓ Successfully parsed {len(documents_batch)} document sections")

        # Re-associate the flat batch output with its source PDFs.
        # NOTE(review): this assumes load_data returns sections grouped in
        # input-file order; "file_path" metadata, when present, is used to
        # detect the boundary between files — confirm against LlamaParse docs.
        doc_index = 0
        for pdf_path in pdf_files:
            print(f"\nProcessing: {pdf_path.name}")
            pdf_docs = []

            while doc_index < len(documents_batch):
                doc = documents_batch[doc_index]

                if hasattr(doc, "metadata") and doc.metadata.get("file_path"):
                    # Section names its file: consume it only while it still
                    # belongs to the current PDF; otherwise move to next PDF.
                    if pdf_path.name in doc.metadata.get("file_path", ""):
                        pdf_docs.append(doc)
                        doc_index += 1
                    else:
                        break
                else:
                    # No file_path metadata: fall back to positional grouping —
                    # metadata-less sections accumulate into the current PDF.
                    pdf_docs.append(doc)
                    doc_index += 1
                    if doc_index >= len(documents_batch):
                        break

            if pdf_docs:
                # One summary per source PDF, over its concatenated text.
                full_text = " ".join([d.text for d in pdf_docs])
                summary = summarize_document(full_text, client, CONFIG["llm_model"])
                document_summaries[pdf_path.name] = summary

                print(f"Summary for {pdf_path.name}:")
                print(summary[:200] + "...\n")

                for d in pdf_docs:
                    all_documents.append(
                        {
                            "text": d.text,
                            "source": pdf_path.name,
                            "metadata": d.metadata if hasattr(d, "metadata") else {},
                        }
                    )
            else:
                print(f"Warning: No content extracted from {pdf_path.name}")
                document_summaries[pdf_path.name] = "No content extracted"

    except Exception as e:
        # Batch call failed (e.g. API error): retry one file at a time,
        # throttled, so a single bad PDF cannot sink the whole run.
        print(f"Batch processing failed: {str(e)}")
        print("\nFalling back to individual file processing with sleep delays...")

        import time

        for pdf_path in pdf_files:
            print(f"\nParsing: {pdf_path.name}")

            try:
                time.sleep(2)  # throttle requests to avoid rate limiting
                documents = parser.load_data(str(pdf_path))

                if documents:
                    full_text = " ".join([d.text for d in documents])
                    summary = summarize_document(full_text, client, CONFIG["llm_model"])
                    document_summaries[pdf_path.name] = summary

                    print(f"Summary for {pdf_path.name}:")
                    print(summary[:200] + "...\n")

                    for d in documents:
                        all_documents.append(
                            {
                                "text": d.text,
                                "source": pdf_path.name,
                                "metadata": d.metadata if hasattr(d, "metadata") else {},
                            }
                        )
                else:
                    print(f"Warning: No content extracted from {pdf_path.name}")
                    document_summaries[pdf_path.name] = "No content extracted"

            except Exception as e2:
                # Record the failure in the summaries map and keep going.
                print(f"Error parsing {pdf_path.name}: {str(e2)}")
                document_summaries[pdf_path.name] = f"Failed to parse: {str(e2)}"
                continue

    return all_documents
186
+
187
+
188
# ---------- CHUNKING ----------
def chunk_documents(
    documents: List[Dict],
    chunk_size: int = 512,
    chunk_overlap: int = 50,
) -> List[Dict]:
    """Chunk documents using semantic splitting."""
    splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    chunked: List[Dict] = []
    next_id = 0

    for doc in documents:
        # Split each document's text, carrying source/metadata onto every
        # resulting chunk; chunk ids are globally sequential.
        for piece in splitter.split_text(doc["text"]):
            chunked.append(
                {
                    "chunk_id": f"chunk_{next_id}",
                    "text": piece,
                    "source": doc["source"],
                    "metadata": doc["metadata"],
                }
            )
            next_id += 1

    return chunked
218
+
219
+
220
# ---------- CHROMA (SBERT EMBEDDINGS, 384-D) ----------
def create_chromadb_collection(
    chunks: List[Dict],
    collection_name: str = "rag_documents",
) -> chromadb.Collection:
    """Create and populate ChromaDB collection using SentenceTransformer embeddings.

    Any pre-existing collection of the same name is dropped first, so stale
    vectors of a different dimensionality cannot linger; chunks are then
    inserted in batches of 100.
    """

    sbert_ef = SentenceTransformerEmbeddingFunction(
        model_name=CONFIG["embedding_model"],
        device=CONFIG["embedding_device"],
    )

    client_db = chromadb.PersistentClient(
        path=os.path.join(CONFIG["output_directory"], "chromadb")
    )

    # Delete existing collection to avoid old 1536-D vectors.
    try:
        client_db.delete_collection(collection_name)
        print(f"Deleted existing collection: {collection_name}")
    except Exception:
        pass  # collection did not exist yet — nothing to delete

    collection = client_db.create_collection(
        name=collection_name,
        metadata={
            "description": "RAG document chunks",
            "embedding_model": CONFIG["embedding_model"],
            "embedding_dim": 384,  # MiniLM dim
        },
        embedding_function=sbert_ef,
    )

    # Column-parallel lists expected by Chroma's add().
    ids = [chunk["chunk_id"] for chunk in chunks]
    documents = [chunk["text"] for chunk in chunks]
    metadatas = [
        {"source": chunk["source"], **(chunk["metadata"] or {})}
        for chunk in chunks
    ]

    # Insert in batches; embeddings are computed by the collection's
    # embedding function at add() time.
    batch_size = 100
    for i in range(0, len(ids), batch_size):
        batch_end = min(i + batch_size, len(ids))

        collection.add(
            ids=ids[i:batch_end],
            documents=documents[i:batch_end],
            metadatas=metadatas[i:batch_end],
        )

        print(
            f"Added batch {i // batch_size + 1}/"
            f"{(len(ids) - 1) // batch_size + 1}"
        )

    print(f"✓ ChromaDB collection created with {len(ids)} chunks")
    return collection
277
+
278
+
279
# ---------- MAIN ----------
def main():
    """Run the ingestion pipeline: parse PDFs, chunk them, index in Chroma."""
    print("✓ Starting pipeline with .env configuration (SentenceTransformer embeddings)")

    print("Starting PDF parsing...")
    docs = parse_pdfs_with_llamaparse(CONFIG["pdf_directory"])
    print(f"\n✓ Parsed {len(docs)} document sections from PDFs")

    doc_chunks = chunk_documents(
        docs,
        CONFIG["chunk_size"],
        CONFIG["chunk_overlap"],
    )
    print(f"✓ Created {len(doc_chunks)} chunks")
    if doc_chunks:
        print("\nSample chunk:")
        print(doc_chunks[0])

    create_chromadb_collection(doc_chunks)
    print("ChromaDB collection ready for querying.")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+
4
+ pydantic
5
+ python-dotenv
6
+
7
+ pandas
8
+
9
+ llama-index-core
10
+ llama-parse
11
+
12
+ chromadb
13
+ sentence-transformers
14
+ rank-bm25
15
+
16
+ openai
17
+ nest-asyncio
18
+
19
+ numpy
retrieval.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, status, Query
2
+ from pydantic import BaseModel, Field, computed_field
3
+ from typing import List, Optional, Dict, Any
4
+ import logging
5
+ import numpy as np
6
+ from sentence_transformers import CrossEncoder
7
+
8
+ from vector_store import get_vector_store, VectorStoreManager
9
+
10
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/retrieval", tags=["retrieval"])

# Lazily-loaded cross-encoder shared by all requests; loading the model is
# slow, so it is deferred to first use and cached at module scope.
_reranker = None


def get_reranker():
    """Return the module-wide CrossEncoder, loading it on first call.

    NOTE(review): not guarded by a lock — concurrent first calls could each
    load the model (wasteful but harmless); confirm whether that matters.
    """
    global _reranker
    if _reranker is None:
        logger.info("Loading cross-encoder reranker...")
        _reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    return _reranker
22
+
23
+
24
class RetrievalRequest(BaseModel):
    """Request schema for POST /retrieval/search."""

    # The natural-language question to retrieve for.
    question: str = Field(..., min_length=1, max_length=500)
    # Maximum number of documents to return.
    top_k: int = Field(default=5, ge=1, le=20)

    # Optional exact-match metadata filters.
    filter_by_cluster: Optional[str] = None
    filter_by_source: Optional[str] = None
    filter_by_topic: Optional[str] = None
    # Optional substring the document text must contain.
    contains_text: Optional[str] = None

    # Candidates whose distance exceeds this are dropped (lower = stricter).
    similarity_threshold: float = Field(default=1.0, ge=0.0, le=2.0)

    # Hybrid retrieval toggles
    enable_bm25: bool = Field(
        default=False,
        description="Enable BM25 + semantic hybrid retrieval",
    )
    bm25_k: int = Field(
        default=20,
        ge=5,
        le=100,
        description="How many BM25 candidates to consider",
    )
    hybrid_alpha: float = Field(
        default=0.4,
        ge=0.0,
        le=1.0,
        description="Dense weight in hybrid fusion (alpha=1 => semantic only)",
    )

    # Cross-encoder reranking toggles.
    enable_rerank: bool = Field(default=False)
    rerank_top_k: int = Field(default=3, ge=1, le=10)
56
+
57
+
58
class DocumentResult(BaseModel):
    """A single retrieved chunk with its retrieval scores."""

    chunk_id: str
    text: str
    source: str
    topic: Optional[str]
    cluster: Optional[str]
    # Embedding-space distance; lower is better. BM25-only candidates are
    # assigned 1.5 upstream before construction.
    distance: float
    # Cross-encoder score, set only when reranking was applied.
    rerank_score: Optional[float] = None

    @computed_field
    @property
    def relevance_label(self) -> str:
        """Human-readable bucket derived from the distance thresholds."""
        if self.distance < 0.8:
            return "Highly Relevant"
        elif self.distance < 1.0:
            return "Relevant"
        elif self.distance < 1.5:
            return "Somewhat Relevant"
        return "Low Relevance"
77
+
78
+
79
class RetrievalResponse(BaseModel):
    """Response schema for POST /retrieval/search."""

    documents: List[DocumentResult]
    # Convenience: equals len(documents).
    count: int
    # Echo of the original question.
    query: str
    # The filter values the request carried (including defaults).
    filters_applied: Dict[str, Any]
    # Diagnostics: method used, counts, and distance statistics.
    retrieval_stats: Dict[str, Any]
85
+
86
+
87
def rerank_documents(query: str, documents: List[DocumentResult], top_k: int = 3):
    """Re-score *documents* against *query* with the cross-encoder.

    Sets each document's ``rerank_score`` in place and returns the best
    *top_k* by that score. On any failure the original order is returned,
    truncated to *top_k*. Document text is capped at 1500 characters per
    scoring pair.
    """
    if not documents or len(documents) <= 1:
        return documents

    try:
        model = get_reranker()
        candidate_pairs = [[query, doc.text[:1500]] for doc in documents]

        predictions = model.predict(candidate_pairs)

        for doc, pred in zip(documents, predictions):
            doc.rerank_score = float(pred)

        by_score = sorted(documents, key=lambda d: d.rerank_score or 0.0, reverse=True)
        return by_score[:top_k]

    except Exception as e:
        logger.error(f"Reranking failed: {str(e)}, returning original results")
        return documents[:top_k]
106
+
107
+
108
@router.post("/search", response_model=RetrievalResponse)
async def retrieve_documents_endpoint(
    request: RetrievalRequest,
    vector_store: VectorStoreManager = Depends(get_vector_store),
):
    """Retrieve document chunks relevant to a question.

    Pipeline: (1) build metadata/text filters, (2) fetch candidates from the
    vector store (semantic or hybrid), (3) drop candidates above the
    similarity threshold, (4) optionally rerank with the cross-encoder, and
    (5) return the top results with retrieval statistics.

    Raises HTTP 500 on any unexpected failure.
    """
    try:
        logger.info(f"Processing query: '{request.question}' top_k={request.top_k}")

        # Exact-match metadata filters (Chroma `where` clause).
        where_filters: Dict[str, Any] = {}
        if request.filter_by_cluster:
            where_filters["cluster"] = request.filter_by_cluster
        if request.filter_by_source:
            where_filters["source"] = request.filter_by_source
        if request.filter_by_topic:
            where_filters["topic"] = request.filter_by_topic

        where_document = {"$contains": request.contains_text} if request.contains_text else None

        # If reranking or hybrid, fetch more candidates (3x) so the
        # threshold/rerank stages have something to prune.
        n_candidates = request.top_k * 3 if (request.enable_rerank or request.enable_bm25) else request.top_k

        candidates = vector_store.retrieve_documents(
            question=request.question,
            n_results=n_candidates,
            where_filters=where_filters if where_filters else None,
            where_document=where_document,
            enable_bm25=request.enable_bm25,
            bm25_k=request.bm25_k,
            alpha=request.hybrid_alpha,
        )

        documents: List[DocumentResult] = []
        filtered_count = 0  # candidates rejected by the similarity threshold

        for c in candidates:
            distance = c.get("distance")
            # if candidate came only from BM25, distance may be None
            if distance is None:
                distance = 1.5  # treat as weak semantic match

            if distance <= request.similarity_threshold:
                meta = c.get("metadata") or {}
                documents.append(
                    DocumentResult(
                        chunk_id=c["id"],
                        text=c["text"],
                        source=meta.get("source", "Unknown"),
                        topic=meta.get("topic"),
                        cluster=meta.get("cluster"),
                        distance=float(distance),
                    )
                )
            else:
                filtered_count += 1

        total_retrieved = len(candidates)

        # Rerank if enabled (only worthwhile with >1 surviving document);
        # otherwise just truncate to the requested top_k.
        if request.enable_rerank and len(documents) > 1:
            documents = rerank_documents(request.question, documents, request.rerank_top_k)
            retrieval_method = "hybrid_with_rerank" if request.enable_bm25 else "semantic_with_rerank"
        else:
            documents = documents[:request.top_k]
            retrieval_method = "hybrid" if request.enable_bm25 else "semantic"

        # Distance statistics over the documents actually returned.
        distances = [d.distance for d in documents]
        avg_distance = float(np.mean(distances)) if distances else None
        best_distance = min(distances) if distances else None

        return RetrievalResponse(
            documents=documents,
            count=len(documents),
            query=request.question,
            filters_applied={
                "cluster": request.filter_by_cluster,
                "source": request.filter_by_source,
                "topic": request.filter_by_topic,
                "contains_text": request.contains_text,
                "similarity_threshold": request.similarity_threshold,
                "enable_bm25": request.enable_bm25,
                "bm25_k": request.bm25_k,
                "hybrid_alpha": request.hybrid_alpha,
            },
            retrieval_stats={
                "method": retrieval_method,
                "total_retrieved": total_retrieved,
                "filtered_by_threshold": filtered_count,
                "returned": len(documents),
                "best_distance": best_distance,
                "avg_distance": avg_distance,
                "reranking_applied": request.enable_rerank,
                "bm25_applied": request.enable_bm25,
            },
        )

    except Exception as e:
        logger.error(f"Retrieval failed: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Retrieval failed: {str(e)}",
        )
+ )
vector_store.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Dict, Any, List
3
+ import threading
4
+ import re
5
+
6
+ import numpy as np
7
+ import chromadb
8
+ from rank_bm25 import BM25Okapi
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class VectorStoreManager:
    """Process-wide singleton over the persistent Chroma 'rag_documents' store.

    On first construction it opens the Chroma client, loads the collection,
    and builds an in-memory BM25 index over every stored chunk.
    ``retrieve_documents`` then serves semantic-only or hybrid
    (semantic + BM25, fused with Reciprocal Rank Fusion) retrieval.
    """

    # Singleton state shared by every instantiation in this process.
    _instance = None
    _lock = threading.Lock()
    _initialized = False

    def __new__(cls):
        # Creation always happens under the class lock, so exactly one
        # instance exists per process.
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
            return cls._instance

    def __init__(self):
        # Run the expensive initialization only once; later constructions
        # reuse the already-initialized singleton.
        with self._lock:
            if not self._initialized:
                self._initialize()
                VectorStoreManager._initialized = True

    def _initialize(self):
        """Initialize vector store with single collection + BM25 index"""
        try:
            logger.info("Initializing vector store components...")

            self.client = None
            self.collection = None

            db_path = "output/chromadb"  # Match your pipeline path
            self.client = chromadb.PersistentClient(path=db_path)
            logger.info(f"ChromaDB client initialized at path: {db_path}")

            available_collections = [col.name for col in self.client.list_collections()]
            logger.info(f"Available collections: {available_collections}")

            try:
                self.collection = self.client.get_collection("rag_documents")
                collection_count = self.collection.count()
                logger.info(
                    f"Collection 'rag_documents' loaded with {collection_count} documents"
                )
            except Exception as e:
                # Fail fast with the list of collections that DO exist.
                logger.error(f"Collection 'rag_documents' not found: {str(e)}")
                raise ValueError(
                    "Required collection 'rag_documents' not found. "
                    f"Available: {available_collections}"
                )

            # ---- Build BM25 index from all stored docs ----
            # Pulls the whole collection into memory; ids come back
            # alongside the requested documents/metadatas.
            logger.info("Building BM25 index from Chroma collection...")
            data = self.collection.get(include=["documents", "metadatas"])

            # Parallel lists: index i describes one chunk in all three.
            self.all_ids: List[str] = data["ids"]
            self.all_docs: List[str] = data["documents"]
            self.all_metas: List[Dict[str, Any]] = data["metadatas"]

            self.tokenized_corpus = [self._tokenize(d) for d in self.all_docs]
            self.bm25 = BM25Okapi(self.tokenized_corpus)

            logger.info(f"BM25 index ready with {len(self.all_docs)} chunks")
            logger.info("Vector store initialized successfully")

        except Exception as e:
            logger.error(f"Failed to initialize vector store: {str(e)}")
            # Reset the flag so a later instantiation can retry.
            VectorStoreManager._initialized = False
            raise

    # ----------------- Helpers -----------------
    def _tokenize(self, text: str) -> List[str]:
        # Lowercased word tokens; safe on None/empty input.
        return re.findall(r"\w+", (text or "").lower())

    def _matches_filters(
        self,
        meta: Dict[str, Any],
        doc_text: str,
        where_filters: Optional[Dict[str, Any]],
        where_document: Optional[Dict[str, Any]],
    ) -> bool:
        """Return True when a chunk passes the metadata and text filters.

        Mirrors the subset of Chroma filter semantics this service uses on
        the BM25 side: exact-match metadata keys plus a case-insensitive
        ``$contains`` document filter.
        """
        if where_filters:
            for k, v in where_filters.items():
                if meta.get(k) != v:
                    return False

        if where_document:
            # you only use {"$contains": "..."}
            contains = where_document.get("$contains")
            if contains and contains.lower() not in (doc_text or "").lower():
                return False

        return True

    def _rrf_fuse(
        self,
        dense_ranked: List[Dict[str, Any]],
        sparse_ranked: List[Dict[str, Any]],
        k: int = 60,
        w_dense: float = 0.6,
        w_sparse: float = 0.4,
    ) -> List[Dict[str, Any]]:
        """
        Reciprocal Rank Fusion
        score = w_dense/(k+rank_dense) + w_sparse/(k+rank_sparse)
        """
        scores: Dict[str, Dict[str, Any]] = {}

        # setdefault keeps the first-seen item dict per id (dense wins on
        # overlap), while the RRF score accumulates over both rankings.
        for rank, item in enumerate(dense_ranked):
            doc_id = item["id"]
            scores.setdefault(doc_id, {"score": 0.0, "item": item})
            scores[doc_id]["score"] += w_dense / (k + rank + 1)

        for rank, item in enumerate(sparse_ranked):
            doc_id = item["id"]
            scores.setdefault(doc_id, {"score": 0.0, "item": item})
            scores[doc_id]["score"] += w_sparse / (k + rank + 1)

        fused = sorted(scores.values(), key=lambda x: x["score"], reverse=True)
        return [x["item"] for x in fused]

    # ----------------- Main retrieval -----------------
    def retrieve_documents(
        self,
        question: str,
        n_results: int = 5,
        where_filters: Optional[Dict[str, Any]] = None,
        where_document: Optional[Dict[str, Any]] = None,
        enable_bm25: bool = False,
        bm25_k: Optional[int] = None,
        alpha: float = 0.6,  # dense weight in hybrid fusion
    ) -> List[Dict[str, Any]]:
        """
        Retrieve documents using:
          - semantic-only (Chroma)
          - or hybrid semantic + BM25 (RRF fusion)

        Returns a list of dicts:
          {id, text, metadata, distance, bm25_score(optional)}

        Raises RuntimeError when the singleton failed to initialize.
        """
        if not self._initialized or self.collection is None:
            raise RuntimeError("VectorStoreManager not properly initialized")

        logger.info(f"Retrieving documents for query: {question[:50]}...")
        dense_k = n_results
        bm25_k = bm25_k or n_results

        # ----- Dense retrieval (semantic via Chroma) -----
        try:
            dense_res = self.collection.query(
                query_texts=[question],
                n_results=dense_k,
                include=["documents", "metadatas", "distances"],
                where=where_filters if where_filters else None,
                where_document=where_document if where_document else None,
            )
        except Exception as e:
            logger.error(f"Dense retrieval failed: {str(e)}")
            raise

        # Chroma returns column-parallel lists nested per query; unpack the
        # single query's columns into row dicts.
        dense_ranked: List[Dict[str, Any]] = []
        if dense_res and dense_res.get("documents") and dense_res["documents"][0]:
            for i in range(len(dense_res["documents"][0])):
                meta = dense_res["metadatas"][0][i]
                dense_ranked.append({
                    "id": dense_res["ids"][0][i],
                    "text": dense_res["documents"][0][i],
                    "metadata": meta,
                    "distance": float(dense_res["distances"][0][i]),
                    "source": meta.get("source", "Unknown"),
                })

        if not enable_bm25:
            logger.info(f"Semantic-only retrieved {len(dense_ranked)} docs")
            return dense_ranked

        # ----- Sparse retrieval (BM25) -----
        q_tokens = self._tokenize(question)
        scores = self.bm25.get_scores(q_tokens)

        # Apply same filters to BM25 corpus
        valid_indices = []
        for idx, (doc, meta) in enumerate(zip(self.all_docs, self.all_metas)):
            if self._matches_filters(meta, doc, where_filters, where_document):
                valid_indices.append(idx)

        # take top bm25_k from valid indices
        valid_scores = [(idx, scores[idx]) for idx in valid_indices]
        valid_scores.sort(key=lambda x: x[1], reverse=True)
        top_sparse = valid_scores[:bm25_k]

        sparse_ranked: List[Dict[str, Any]] = []
        for idx, s in top_sparse:
            meta = self.all_metas[idx]
            sparse_ranked.append({
                "id": self.all_ids[idx],
                "text": self.all_docs[idx],
                "metadata": meta,
                "bm25_score": float(s),
                "distance": None,  # may be absent if not in dense top-k
                "source": meta.get("source", "Unknown"),
            })

        # ----- Fuse dense + sparse -----
        fused = self._rrf_fuse(
            dense_ranked,
            sparse_ranked,
            w_dense=alpha,
            w_sparse=1.0 - alpha,
        )

        logger.info(
            f"Hybrid retrieved dense={len(dense_ranked)} sparse={len(sparse_ranked)} "
            f"fused={len(fused)}"
        )
        return fused
224
+
225
+
226
def get_vector_store() -> VectorStoreManager:
    """FastAPI dependency for injecting VectorStoreManager"""
    # Constructing the singleton returns the shared, already-initialized
    # instance after the first call.
    store = VectorStoreManager()
    return store