"""Vector store with ChromaDB support and TF-IDF fallback.

Provides semantic search capabilities using production vector DB or local fallback.
"""

import logging
import os
from typing import Any

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

logger = logging.getLogger(__name__)


class VectorStore:
    """Vector store with ChromaDB support and TF-IDF fallback.

    When the ``CHROMA_DB_URL`` environment variable is set, the store tries to
    use a ChromaDB collection named ``zenith_documents``. If ChromaDB cannot be
    initialized, or a later indexing call fails, the store uses an in-memory
    TF-IDF index built with scikit-learn's ``TfidfVectorizer`` instead.

    Attributes:
        documents: Raw texts held by the TF-IDF fallback index.
        ids: Document ids, parallel to ``documents``.
        vectorizer: The ``TfidfVectorizer`` refitted whenever the fallback
            index changes.
    """

    def __init__(self):
        """Create an empty store and connect to ChromaDB when configured."""
        self.documents: list[str] = []
        self.ids: list[str] = []
        self.vectorizer = TfidfVectorizer()
        self._matrix = None  # TF-IDF document-term matrix; None while empty
        self._chroma_client = None
        self._collection = None
        self._use_chroma = False

        # Try to initialize ChromaDB
        chroma_url = os.getenv("CHROMA_DB_URL")
        if chroma_url:
            self._init_chromadb(chroma_url)

    def _init_chromadb(self, url: str) -> bool:
        """Initialize the ChromaDB client and collection.

        Any failure (missing ``chromadb`` package, connection error, ...) is
        logged as a warning and leaves the store in TF-IDF mode.

        Args:
            url: Value passed as ``host`` to ``chromadb.HttpClient``.
                NOTE(review): confirm the installed chromadb version accepts
                a full URL (rather than a bare hostname) for ``host``.

        Returns:
            True when ChromaDB is ready for use, False otherwise.
        """
        try:
            # Imported lazily so chromadb stays an optional dependency.
            import chromadb

            self._chroma_client = chromadb.HttpClient(host=url)

            # Try to get or create collection
            try:
                self._collection = self._chroma_client.get_collection(
                    "zenith_documents"
                )
            except Exception:
                self._collection = self._chroma_client.create_collection(
                    "zenith_documents"
                )

            self._use_chroma = True
            logger.info("ChromaDB initialized at %s", url)
            return True
        except Exception as e:
            logger.warning("ChromaDB not available, using TF-IDF fallback: %s", e)
            self._use_chroma = False
            return False

    def index(self, doc_id: str, text: str, metadata: dict[str, Any] | None = None):
        """Index a document for semantic search.

        In ChromaDB mode the document is added to the collection. If that
        fails, the error is logged, ChromaDB is disabled for the rest of this
        instance's lifetime, and the document is indexed with TF-IDF instead.

        Args:
            doc_id: Identifier returned by ``query`` for this document.
            text: Document text to index.
            metadata: Optional metadata, forwarded to ChromaDB only.

        Raises:
            ValueError: Propagated from ``TfidfVectorizer.fit_transform`` when
                the fallback index cannot be fitted (e.g. the texts yield an
                empty vocabulary). The fallback index is left unchanged.
        """
        if self._use_chroma and self._collection is not None:
            try:
                self._collection.add(
                    documents=[text], ids=[doc_id], metadatas=[metadata or {}]
                )
                return
            except Exception as e:
                logger.error("ChromaDB indexing failed: %s", e)
                self._use_chroma = False

        # Fallback to TF-IDF. Fit before mutating state so a failed fit cannot
        # leave ids/documents out of sync with the matrix.
        matrix = self.vectorizer.fit_transform([*self.documents, text])
        self.ids.append(doc_id)
        self.documents.append(text)
        self._matrix = matrix

    def query(self, text: str, top_k: int = 5) -> list[tuple[str, float]]:
        """Query for similar documents.

        NOTE: the score semantics differ by backend. The ChromaDB path returns
        the distances reported by the collection, while the TF-IDF path
        returns the dot product of TF-IDF vectors and drops documents whose
        score is not positive.

        Args:
            text: Query text.
            top_k: Maximum number of results; non-positive values yield ``[]``.

        Returns:
            ``(doc_id, score)`` pairs, at most ``top_k`` of them. Empty when
            nothing is indexed or nothing matches.
        """
        if top_k <= 0:
            return []

        if self._use_chroma and self._collection is not None:
            try:
                results = self._collection.query(query_texts=[text], n_results=top_k)
                id_batches = results.get("ids") if results else None
                if id_batches:
                    # Results are nested: one inner list per query text. Only
                    # one text was sent, so take the first batch of each field.
                    doc_ids = id_batches[0]
                    distance_batches = results.get("distances")
                    distances = (
                        distance_batches[0]
                        if distance_batches
                        else [0.0] * len(doc_ids)
                    )
                    return [
                        (doc_id, float(dist))
                        for doc_id, dist in zip(doc_ids, distances)
                    ]
            except Exception as e:
                logger.error("ChromaDB query failed: %s", e)

        # Fallback to TF-IDF. Compare against None explicitly: the truth value
        # of a SciPy sparse matrix is ambiguous and raises ValueError.
        if self._matrix is None or not self.documents:
            return []
        q_vec = self.vectorizer.transform([text])
        sims = (self._matrix @ q_vec.T).toarray().ravel()
        idxs = np.argsort(-sims)[:top_k]
        return [(self.ids[i], float(sims[i])) for i in idxs if sims[i] > 0]

    def delete(self, doc_id: str) -> bool:
        """Delete a document from the index.

        Args:
            doc_id: Identifier of the document to remove.

        Returns:
            In ChromaDB mode, True when the delete call did not raise and
            False when it did (the error is logged). In TF-IDF mode, True when
            ``doc_id`` was present and removed, False otherwise.
        """
        if self._use_chroma and self._collection is not None:
            try:
                self._collection.delete(ids=[doc_id])
                return True
            except Exception as e:
                logger.error("ChromaDB delete failed: %s", e)
                return False

        # TF-IDF fallback
        if doc_id not in self.ids:
            return False

        idx = self.ids.index(doc_id)
        self.ids.pop(idx)
        self.documents.pop(idx)
        if self._matrix is not None and self.documents:
            # Refit so matrix rows stay aligned with the remaining ids.
            self._matrix = self.vectorizer.fit_transform(self.documents)
        else:
            self._matrix = None
        return True

    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics.

        Returns:
            A dict with ``total_documents`` (number of ids held by the TF-IDF
            fallback index; documents stored in ChromaDB are not counted),
            ``using_chromadb`` and ``matrix_shape`` (None while the fallback
            matrix is empty).
        """
        return {
            "total_documents": len(self.ids),
            "using_chromadb": self._use_chroma,
            "matrix_shape": self._matrix.shape if self._matrix is not None else None,
        }