doc-ingestion / src /core /vector_search.py
Vamshi Pokala
chore(ci): fix Ruff lint and Mypy for GitHub Actions
40db081
"""Vector similarity search facade over VectorDatabase (spec: vector_search.py)."""
from __future__ import annotations
from typing import Any, Dict, List, Optional, cast
from src.utils.database import VectorDatabase
class VectorSearch:
    """Thin search facade bound to one collection of a ``VectorDatabase``.

    Every method delegates storage and embedding work to the wrapped database
    object; this class only shapes the results and applies post-filters.
    """

    def __init__(self, db: VectorDatabase, collection_name: str):
        """Bind the facade to ``db`` and to the collection named ``collection_name``."""
        self._db = db
        self._collection = collection_name

    def embed_query(self, query: str) -> List[float]:
        """Return the embedding of ``query`` as produced by ``db.generate_embedding``."""
        return self._db.generate_embedding(query)

    @staticmethod
    def _first_batch(results: Any, key: str, size: Optional[int] = None) -> List[Any]:
        """Return the first per-query batch stored under ``key`` in ``results``.

        A missing or ``None`` field yields an empty batch. When ``size`` is
        given, a shorter batch is right-padded with ``None`` so it can be
        zipped against the ids without losing hits.
        """
        batches = results.get(key) or [[]]
        batch: List[Any] = list(batches[0] or [])
        if size is not None and len(batch) < size:
            batch.extend([None] * (size - len(batch)))
        return batch

    def similarity_search(self, embedding: List[float], k: int = 50) -> List[Dict]:
        """Query the collection by a precomputed embedding (Chroma path).

        Args:
            embedding: Query vector, sent as a single-query batch.
            k: Maximum number of results requested from the collection.

        Returns:
            One dict per hit with keys ``id``, ``text``, ``metadata`` and
            ``distance``. ``metadata`` is always a dict. ``text`` and
            ``distance`` are ``None`` when the backend did not return that
            field for the hit.

        Raises:
            NotImplementedError: If the database is not in ``"dev"`` mode.
        """
        if self._db.mode != "dev":
            raise NotImplementedError("similarity_search by raw embedding is only wired for Chroma dev mode")
        # NOTE(review): get_or_create_collection presumably creates the
        # collection when it does not exist yet - confirm that is intended
        # on a read path.
        collection = self._db.chroma_client.get_or_create_collection(name=self._collection)
        results = collection.query(
            query_embeddings=cast(Any, [embedding]),
            n_results=k,
        )
        ids: List[str] = self._first_batch(results, "ids")
        hit_count = len(ids)
        # Align every optional field on the ids: zip() stops at the shortest
        # input, so one absent field would otherwise discard every hit.
        docs: List[Optional[str]] = self._first_batch(results, "documents", hit_count)
        metas: List[Optional[Dict[str, Any]]] = self._first_batch(results, "metadatas", hit_count)
        dists: List[Optional[float]] = self._first_batch(results, "distances", hit_count)
        return [
            {"id": id_, "text": doc, "metadata": meta or {}, "distance": dist}
            for id_, doc, meta, dist in zip(ids, docs, metas, dists)
        ]

    def search(
        self,
        query: str,
        k: int = 50,
        filters: Optional[Dict] = None,
        max_distance: Optional[float] = None,
        min_similarity: Optional[float] = None,
    ) -> List[Dict]:
        """Run a text query through the database and apply distance thresholds.

        Args:
            query: Text passed to ``db.query_documents``.
            k: Forwarded as ``top_k``.
            filters: Forwarded unchanged as ``filters``.
            max_distance: When set, drop hits whose distance is greater.
            min_similarity: When set, drop hits whose ``1 - distance`` is
                smaller. NOTE(review): this treats ``1 - distance`` as a
                similarity, which presumably assumes a cosine-style metric -
                confirm against the collection configuration.

        Returns:
            The surviving hits, in the order the database returned them. Hits
            that carry no ``"distance"`` value are never filtered out.
        """
        hits = self._db.query_documents(self._collection, query, top_k=k, filters=filters)
        kept: List[Dict] = []
        for hit in hits:
            raw_distance = hit.get("distance")
            if raw_distance is not None:
                # Convert once so both thresholds compare the same float value.
                distance = float(raw_distance)
                if max_distance is not None and distance > max_distance:
                    continue
                if min_similarity is not None and 1.0 - distance < min_similarity:
                    continue
            kept.append(hit)
        return kept

    def filter_by_metadata(self, results: List[Dict], filters: Dict) -> List[Dict]:
        """Keep only results whose metadata equals ``filters`` on every given key.

        An empty ``filters`` returns ``results`` itself (not a copy). A result
        with missing or ``None`` metadata is treated as having no keys.
        """
        if not filters:
            return results
        return [
            result
            for result in results
            if all(
                (result.get("metadata") or {}).get(key) == expected
                for key, expected in filters.items()
            )
        ]