File size: 4,731 Bytes
4a2ab42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ae946d
 
 
4a2ab42
4ae946d
 
 
4a2ab42
 
 
 
 
 
 
 
 
 
 
 
4ae946d
 
 
4a2ab42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ae946d
 
 
 
 
 
 
 
 
 
 
4a2ab42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""Vector store with ChromaDB support and TF-IDF fallback.
Provides semantic search capabilities using production vector DB or local fallback.
"""

import logging
import os
from typing import Any

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Module-level logger named after this module; VectorStore uses it to report
# ChromaDB initialization status and backend failures.
logger = logging.getLogger(__name__)


class VectorStore:
    """
    Vector store with ChromaDB support and TF-IDF fallback.

    When CHROMA_DB_URL is configured, uses ChromaDB for production-grade
    semantic search. Falls back to TF-IDF for local development.
    """

    def __init__(self):
        self.documents: list[str] = []
        self.ids: list[str] = []
        self.vectorizer = TfidfVectorizer()
        self._matrix = None
        self._chroma_client = None
        self._collection = None
        self._use_chroma = False

        # Try to initialize ChromaDB
        chroma_url = os.getenv("CHROMA_DB_URL")
        if chroma_url:
            self._init_chromadb(chroma_url)

    def _init_chromadb(self, url: str) -> bool:
        """Initialize ChromaDB client."""
        try:
            import chromadb

            self._chroma_client = chromadb.HttpClient(host=url)
            # Try to get or create collection
            try:
                self._collection = self._chroma_client.get_collection(
                    "zenith_documents"
                )
            except Exception:
                self._collection = self._chroma_client.create_collection(
                    "zenith_documents"
                )
            self._use_chroma = True
            logger.info(f"ChromaDB initialized at {url}")
            return True
        except Exception as e:
            logger.warning(f"ChromaDB not available, using TF-IDF fallback: {e}")
            self._use_chroma = False
            return False

    def index(self, doc_id: str, text: str, metadata: dict[str, Any] | None = None):
        """Index a document for semantic search."""
        if self._use_chroma and self._collection:
            try:
                self._collection.add(
                    documents=[text], ids=[doc_id], metadatas=[metadata or {}]
                )
                return
            except Exception as e:
                logger.error(f"ChromaDB indexing failed: {e}")
                self._use_chroma = False

        # Fallback to TF-IDF
        self.ids.append(doc_id)
        self.documents.append(text)
        self._matrix = self.vectorizer.fit_transform(self.documents)

    def query(self, text: str, top_k: int = 5) -> list[tuple[str, float]]:
        """Query for similar documents."""
        if self._use_chroma and self._collection:
            try:
                results = self._collection.query(query_texts=[text], n_results=top_k)
                if results and results.get("ids"):
                    return list(
                        zip(
                            results["ids"][0],
                            [
                                float(s)
                                for s in results.get(
                                    "distances", [0] * len(results["ids"][0])
                                )
                            ],
                        )
                    )
            except Exception as e:
                logger.error(f"ChromaDB query failed: {e}")

        # Fallback to TF-IDF
        if not self._matrix or len(self.documents) == 0:
            return []

        q_vec = self.vectorizer.transform([text])
        sims = (self._matrix @ q_vec.T).toarray().ravel()
        idxs = np.argsort(-sims)[:top_k]
        return [(self.ids[i], float(sims[i])) for i in idxs if sims[i] > 0]

    def delete(self, doc_id: str) -> bool:
        """Delete a document from the index."""
        if self._use_chroma and self._collection:
            try:
                self._collection.delete(ids=[doc_id])
                return True
            except Exception as e:
                logger.error(f"ChromaDB delete failed: {e}")
                return False

        # TF-IDF fallback
        if doc_id in self.ids:
            idx = self.ids.index(doc_id)
            self.ids.pop(idx)
            self.documents.pop(idx)
            if self._matrix is not None and len(self.documents) > 0:
                self._matrix = self.vectorizer.fit_transform(self.documents)
            else:
                self._matrix = None
            return True
        return False

    def get_stats(self) -> dict[str, Any]:
        """Get vector store statistics."""
        return {
            "total_documents": len(self.ids),
            "using_chromadb": self._use_chroma,
            "matrix_shape": self._matrix.shape if self._matrix is not None else None,
        }