Spaces:

hashirama7x
/

multimodal-rag

Build error

multimodal-rag / src /retrieval /vector_db.py

itachi

Initial deployment

a809248 2 months ago

20.2 kB

	"""
	Vector Database Module.
	Supports PostgreSQL+pgvector and FAISS for vector storage and retrieval.
	"""

	import json
	from abc import ABC, abstractmethod
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Dict, List, Optional, Tuple, Union
	import numpy as np

	from ..utils import get_logger, get_config, LoggerMixin

	# Module-level logger, obtained by passing this module's name to the
	# project's get_logger helper (imported from ..utils above).
	logger = get_logger(__name__)
	# Project configuration object. The store constructors below read their
	# defaults from it (config.embedding.* and config.database.*).
	config = get_config()


	@dataclass
	class Document:
	    """A piece of text with an id, an optional embedding and free-form metadata."""

	    id: str
	    text: str
	    embedding: Optional[np.ndarray] = None
	    metadata: Dict = field(default_factory=dict)

	    def to_dict(self) -> Dict:
	        """Return the id, text and metadata as a plain dict (the embedding is left out)."""
	        return dict(id=self.id, text=self.text, metadata=self.metadata)


	@dataclass
	class SearchResult:
	    """A retrieved document together with its score and rank."""

	    document: Document
	    score: float
	    rank: int = 0

	    def to_dict(self) -> Dict:
	        """Flatten the hit into one dict: the document's id/text/metadata plus score and rank."""
	        doc = self.document
	        return dict(
	            id=doc.id,
	            text=doc.text,
	            score=self.score,
	            rank=self.rank,
	            metadata=doc.metadata,
	        )


	class VectorStore(ABC, LoggerMixin):
	    """Interface that every vector store backend in this module implements."""

	    @abstractmethod
	    def add_documents(
	        self,
	        documents: List[Document],
	        embeddings: Optional[np.ndarray] = None,
	    ) -> List[str]:
	        """Store the given documents, optionally with separately supplied embeddings."""

	    @abstractmethod
	    def search(
	        self,
	        query_embedding: np.ndarray,
	        top_k: int = 10,
	    ) -> List[SearchResult]:
	        """Return up to ``top_k`` results for the query embedding."""

	    @abstractmethod
	    def delete(self, document_ids: List[str]) -> int:
	        """Remove the documents with the given ids and return a count."""

	    @abstractmethod
	    def get_document(self, document_id: str) -> Optional[Document]:
	        """Look up a single document by id; ``None`` is an allowed result."""

	    @property
	    @abstractmethod
	    def count(self) -> int:
	        """Number of documents currently held by the store."""


	class PostgresVectorStore(VectorStore):
	    """
	    PostgreSQL + pgvector vector store.

	    Features:
	    - ACID compliance
	    - SQL filtering
	    - IVFFlat/HNSW indexing
	    - Full-text search support

	    Every database operation runs inside ``with self.conn:`` so psycopg2
	    commits on success and rolls back on error. Without the rollback a
	    single failed statement would leave the connection in an aborted
	    transaction and break every later call.

	    NOTE: ``table_name`` is interpolated into the SQL text as an identifier
	    and is not escaped, so it must come from trusted code or configuration,
	    never from user input.
	    """

	    def __init__(
	        self,
	        connection_string: Optional[str] = None,
	        table_name: str = "document_embeddings",
	        embedding_dim: Optional[int] = None,
	        index_type: Optional[str] = None
	    ):
	        """
	        Initialize PostgreSQL vector store (no connection is opened yet).

	        Args:
	            connection_string: PostgreSQL connection string. When omitted it
	                is built from ``config.database``.
	            table_name: Name of the embeddings table
	            embedding_dim: Dimension of embeddings (defaults to
	                ``config.embedding.embedding_dim``)
	            index_type: Index type ("ivfflat" or "hnsw"); any other value
	                means no vector index is created by ``initialize``.
	        """
	        self.table_name = table_name
	        self.embedding_dim = embedding_dim or config.embedding.embedding_dim
	        self.index_type = index_type or config.database.index_type

	        if connection_string:
	            self.connection_string = connection_string
	        else:
	            from urllib.parse import quote

	            db = config.database
	            # Percent-encode the credentials: characters such as "@", ":"
	            # or "/" in a password would otherwise corrupt the URI.
	            user = quote(str(db.pg_user), safe="")
	            password = quote(str(db.pg_password), safe="")
	            self.connection_string = (
	                f"postgresql://{user}:{password}@"
	                f"{db.pg_host}:{db.pg_port}/{db.pg_database}"
	            )

	        self.conn = None
	        self._initialized = False

	    def _connect(self):
	        """
	        Open the connection on first use and register the pgvector adapter.

	        Raises:
	            ImportError: psycopg2 or pgvector is not installed.
	            Exception: whatever psycopg2/pgvector raise on failure; the
	                partially set-up connection is closed before re-raising.
	        """
	        if self.conn is not None:
	            return

	        try:
	            import psycopg2
	            from pgvector.psycopg2 import register_vector
	        except ImportError:
	            self.logger.error("psycopg2 or pgvector not installed")
	            raise

	        conn = None
	        try:
	            conn = psycopg2.connect(self.connection_string)
	            # register_vector() looks up the "vector" type and fails when
	            # the extension is missing, so make sure it exists first.
	            with conn.cursor() as cur:
	                cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
	            conn.commit()
	            register_vector(conn)
	            conn.commit()
	        except Exception as e:
	            # Do not keep a half-configured connection around: a later
	            # _connect() call would otherwise silently reuse it.
	            if conn is not None:
	                conn.close()
	            self.logger.error(f"Failed to connect: {e}")
	            raise

	        self.conn = conn
	        self.logger.info("Connected to PostgreSQL")

	    @staticmethod
	    def _parse_metadata(raw) -> Dict:
	        """Turn a metadata column value (dict, JSON text or NULL) into a dict."""
	        if raw is None:
	            return {}
	        if isinstance(raw, dict):
	            return raw
	        return json.loads(raw)

	    def _rows_to_results(self, rows) -> List[SearchResult]:
	        """Convert ``(id, text, metadata, score)`` rows into ranked results."""
	        return [
	            SearchResult(
	                document=Document(
	                    id=doc_id,
	                    text=text,
	                    metadata=self._parse_metadata(metadata)
	                ),
	                score=float(score),
	                rank=rank
	            )
	            for rank, (doc_id, text, metadata, score) in enumerate(rows)
	        ]

	    def initialize(self):
	        """Create the extension, table and indexes if they do not exist yet."""
	        self._connect()
	        index_name = f"{self.table_name}_embedding_idx"

	        with self.conn, self.conn.cursor() as cur:
	            # Enable pgvector extension
	            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")

	            # Create table
	            cur.execute(f"""
	                CREATE TABLE IF NOT EXISTS {self.table_name} (
	                    id TEXT PRIMARY KEY,
	                    chunk_text TEXT NOT NULL,
	                    embedding vector({self.embedding_dim}),
	                    metadata JSONB DEFAULT '{{}}',
	                    full_text_search tsvector GENERATED ALWAYS AS (
	                        to_tsvector('english', chunk_text)
	                    ) STORED,
	                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
	                )
	            """)

	            # Create vector index
	            if self.index_type == "ivfflat":
	                cur.execute(f"""
	                    CREATE INDEX IF NOT EXISTS {index_name}
	                    ON {self.table_name}
	                    USING ivfflat (embedding vector_cosine_ops)
	                    WITH (lists = {config.database.num_lists})
	                """)
	            elif self.index_type == "hnsw":
	                cur.execute(f"""
	                    CREATE INDEX IF NOT EXISTS {index_name}
	                    ON {self.table_name}
	                    USING hnsw (embedding vector_cosine_ops)
	                """)

	            # Create GIN index for full-text search
	            cur.execute(f"""
	                CREATE INDEX IF NOT EXISTS {self.table_name}_fts_idx
	                ON {self.table_name}
	                USING gin(full_text_search)
	            """)

	        self._initialized = True
	        self.logger.info(f"Initialized table {self.table_name}")

	    def add_documents(
	        self,
	        documents: List[Document],
	        embeddings: Optional[np.ndarray] = None
	    ) -> List[str]:
	        """
	        Insert or update documents with their embeddings.

	        Args:
	            documents: Documents to store.
	            embeddings: Optional array with one row per document; when
	                omitted each document's own ``embedding`` is used.

	        Returns:
	            Ids of the documents that were written. Documents without an
	            embedding are skipped with a warning.

	        Raises:
	            ValueError: ``embeddings`` does not have one row per document.
	        """
	        if embeddings is not None and len(embeddings) != len(documents):
	            raise ValueError(
	                f"Got {len(embeddings)} embeddings for {len(documents)} documents"
	            )

	        self._connect()

	        if not self._initialized:
	            self.initialize()

	        ids = []
	        # One transaction for the whole batch: all rows are committed
	        # together, or none are if any insert fails.
	        with self.conn, self.conn.cursor() as cur:
	            for i, doc in enumerate(documents):
	                embedding = embeddings[i] if embeddings is not None else doc.embedding

	                if embedding is None:
	                    self.logger.warning(f"No embedding for document {doc.id}")
	                    continue

	                cur.execute(f"""
	                    INSERT INTO {self.table_name} (id, chunk_text, embedding, metadata)
	                    VALUES (%s, %s, %s::vector, %s)
	                    ON CONFLICT (id) DO UPDATE SET
	                        chunk_text = EXCLUDED.chunk_text,
	                        embedding = EXCLUDED.embedding,
	                        metadata = EXCLUDED.metadata
	                """, (
	                    doc.id,
	                    doc.text,
	                    np.asarray(embedding).ravel().tolist(),
	                    json.dumps(doc.metadata)
	                ))
	                ids.append(doc.id)

	        self.logger.info(f"Added {len(ids)} documents")
	        return ids

	    def search(
	        self,
	        query_embedding: np.ndarray,
	        top_k: int = 10,
	        filter_metadata: Optional[Dict] = None
	    ) -> List[SearchResult]:
	        """
	        Return the ``top_k`` rows closest to the query by cosine distance.

	        Args:
	            query_embedding: Query vector.
	            top_k: Maximum number of results.
	            filter_metadata: Optional dict; only rows whose JSONB metadata
	                contains all of these key/value pairs (``@>``) are returned.

	        Returns:
	            Results ordered by distance, scored as ``1 - cosine distance``.
	        """
	        self._connect()

	        # A Python list is sent as a SQL array; the explicit ::vector cast
	        # is required because no "vector <=> numeric[]" operator exists.
	        vector = np.asarray(query_embedding).ravel().tolist()

	        query = f"""
	            SELECT id, chunk_text, metadata,
	                   1 - (embedding <=> %s::vector) AS similarity
	            FROM {self.table_name}
	        """
	        params = [vector]

	        if filter_metadata:
	            # JSONB containment keeps keys and values out of the SQL text
	            # and compares typed JSON values (strings, numbers, booleans).
	            query += " WHERE metadata @> %s::jsonb"
	            params.append(json.dumps(filter_metadata))

	        query += " ORDER BY embedding <=> %s::vector LIMIT %s"
	        params.extend([vector, int(top_k)])

	        with self.conn, self.conn.cursor() as cur:
	            cur.execute(query, params)
	            rows = cur.fetchall()

	        return self._rows_to_results(rows)

	    def full_text_search(
	        self,
	        query: str,
	        top_k: int = 10
	    ) -> List[SearchResult]:
	        """Perform full-text search using PostgreSQL FTS, ranked by ``ts_rank``."""
	        self._connect()

	        with self.conn, self.conn.cursor() as cur:
	            cur.execute(f"""
	                SELECT id, chunk_text, metadata,
	                       ts_rank(full_text_search, plainto_tsquery('english', %s)) AS score
	                FROM {self.table_name}
	                WHERE full_text_search @@ plainto_tsquery('english', %s)
	                ORDER BY score DESC
	                LIMIT %s
	            """, (query, query, int(top_k)))
	            rows = cur.fetchall()

	        return self._rows_to_results(rows)

	    def delete(self, document_ids: List[str]) -> int:
	        """Delete documents by ID and return the number of rows removed."""
	        self._connect()

	        with self.conn, self.conn.cursor() as cur:
	            # psycopg2 only adapts a list (not a tuple) to a SQL array,
	            # which is what ANY() needs.
	            cur.execute(f"""
	                DELETE FROM {self.table_name}
	                WHERE id = ANY(%s)
	            """, (list(document_ids),))
	            deleted = cur.rowcount

	        self.logger.info(f"Deleted {deleted} documents")
	        return deleted

	    def get_document(self, document_id: str) -> Optional[Document]:
	        """Get document by ID, or ``None`` when no row has that id."""
	        self._connect()

	        with self.conn, self.conn.cursor() as cur:
	            cur.execute(f"""
	                SELECT id, chunk_text, metadata
	                FROM {self.table_name}
	                WHERE id = %s
	            """, (document_id,))
	            row = cur.fetchone()

	        if row is None:
	            return None
	        return Document(
	            id=row[0],
	            text=row[1],
	            metadata=self._parse_metadata(row[2])
	        )

	    @property
	    def count(self) -> int:
	        """Return number of documents (rows in the table)."""
	        self._connect()

	        with self.conn, self.conn.cursor() as cur:
	            cur.execute(f"SELECT COUNT(*) FROM {self.table_name}")
	            return cur.fetchone()[0]

	    def close(self):
	        """Close database connection (safe to call when not connected)."""
	        if self.conn:
	            self.conn.close()
	            self.conn = None
	            self.logger.info("Closed PostgreSQL connection")


	class FAISSVectorStore(VectorStore):
	    """
	    FAISS vector store for fast similarity search.

	    Features:
	    - Fast approximate nearest neighbor search
	    - GPU acceleration support
	    - Multiple index types (Flat, IVF, HNSW)

	    Vectors are L2-normalized before they are added or searched and every
	    index type uses the inner-product metric, so scores are cosine
	    similarities. ``self.documents`` maps a vector's position in the FAISS
	    index to its document; deleting (or replacing) a document only removes
	    that mapping, the vector itself stays in the index.
	    """

	    def __init__(
	        self,
	        embedding_dim: Optional[int] = None,
	        index_type: Optional[str] = None,
	        nlist: Optional[int] = None,
	        nprobe: Optional[int] = None
	    ):
	        """
	        Initialize FAISS vector store.

	        Args:
	            embedding_dim: Dimension of embeddings
	            index_type: Index type ("flat", "ivf", "hnsw")
	            nlist: Number of clusters for IVF
	            nprobe: Number of clusters to search

	        Omitted arguments fall back to the values in ``config``.
	        """
	        self.embedding_dim = embedding_dim or config.embedding.embedding_dim
	        self.index_type = index_type or config.database.faiss_index_type
	        self.nlist = nlist or config.database.faiss_nlist
	        self.nprobe = nprobe or config.database.faiss_nprobe

	        self.index = None
	        self.documents: Dict[int, Document] = {}
	        self.id_to_idx: Dict[str, int] = {}
	        self.idx_to_id: Dict[int, str] = {}
	        self.current_idx = 0
	        # Only the IVF index needs training; _init_index flips this on.
	        self._needs_training = False

	        self._init_index()

	    def _init_index(self):
	        """
	        Import faiss and build the index selected by ``self.index_type``.

	        Raises:
	            ImportError: faiss is not installed.
	            ValueError: ``self.index_type`` is not "flat", "ivf" or "hnsw".
	        """
	        try:
	            import faiss
	        except ImportError as err:
	            self.logger.error("faiss not installed")
	            raise ImportError("Install faiss: pip install faiss-cpu") from err
	        self.faiss = faiss

	        if self.index_type == "flat":
	            self.index = faiss.IndexFlatIP(self.embedding_dim)  # Inner product

	        elif self.index_type == "ivf":
	            quantizer = faiss.IndexFlatIP(self.embedding_dim)
	            self.index = faiss.IndexIVFFlat(
	                quantizer,
	                self.embedding_dim,
	                self.nlist,
	                faiss.METRIC_INNER_PRODUCT
	            )
	            self._needs_training = True

	        elif self.index_type == "hnsw":
	            # IndexHNSWFlat defaults to L2; request inner product so the
	            # scores are similarities like those of the flat/IVF indexes.
	            self.index = faiss.IndexHNSWFlat(
	                self.embedding_dim, 32, faiss.METRIC_INNER_PRODUCT
	            )
	            self.index.hnsw.efConstruction = 200

	        else:
	            raise ValueError(f"Unknown index type: {self.index_type}")

	        self.logger.info(f"Initialized FAISS {self.index_type} index")

	    def add_documents(
	        self,
	        documents: List[Document],
	        embeddings: Optional[np.ndarray] = None
	    ) -> List[str]:
	        """
	        Add documents with embeddings.

	        Args:
	            documents: Documents to store.
	            embeddings: Optional array with one row per document; when
	                omitted each document's own ``embedding`` is used and
	                documents without one are skipped with a warning.

	        Returns:
	            Ids of the documents that were added. Adding an id that is
	            already present replaces the previous entry.

	        Raises:
	            ValueError: the embeddings do not have shape
	                ``(len(documents), index dimension)``.
	        """
	        documents = list(documents)

	        if embeddings is None:
	            with_vectors = []
	            for doc in documents:
	                if doc.embedding is None:
	                    self.logger.warning(f"No embedding for document {doc.id}")
	                else:
	                    with_vectors.append(doc)
	            documents = with_vectors
	            if documents:
	                embeddings = np.vstack([doc.embedding for doc in documents])

	        if not documents:
	            return []

	        vectors = np.atleast_2d(np.asarray(embeddings, dtype=float))
	        expected_shape = (len(documents), int(self.index.d))
	        if vectors.shape != expected_shape:
	            # A mismatch would silently misalign index positions and documents.
	            raise ValueError(
	                f"Expected embeddings of shape {expected_shape}, got {vectors.shape}"
	            )

	        # Normalize for cosine similarity; zero vectors are left as zeros
	        # instead of turning into NaN.
	        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
	        norms[norms == 0] = 1.0
	        vectors = np.ascontiguousarray(vectors / norms, dtype=np.float32)

	        # Train IVF index if needed
	        if self._needs_training:
	            if vectors.shape[0] >= self.nlist:
	                self.index.train(vectors)
	            else:
	                self.logger.warning(
	                    f"Not enough vectors ({vectors.shape[0]}) to train IVF index "
	                    f"(need {self.nlist}). Using flat index."
	                )
	                self.index = self.faiss.IndexFlatIP(self.embedding_dim)
	            self._needs_training = False

	        # FAISS numbers vectors sequentially from ntotal, so take the first
	        # position from the index itself rather than from a separate counter.
	        position = int(self.index.ntotal)
	        self.index.add(vectors)

	        # Store document mapping
	        ids = []
	        for doc in documents:
	            stale = self.id_to_idx.pop(doc.id, None)
	            if stale is not None:
	                # Same id added before: drop the old mapping so the old
	                # vector is no longer returned or counted.
	                self.documents.pop(stale, None)
	                self.idx_to_id.pop(stale, None)
	            self.documents[position] = doc
	            self.id_to_idx[doc.id] = position
	            self.idx_to_id[position] = doc.id
	            ids.append(doc.id)
	            position += 1
	        self.current_idx = position

	        self.logger.info(f"Added {len(ids)} documents to FAISS")
	        return ids

	    def search(
	        self,
	        query_embedding: np.ndarray,
	        top_k: int = 10
	    ) -> List[SearchResult]:
	        """
	        Search for similar documents.

	        Args:
	            query_embedding: Query vector.
	            top_k: Maximum number of results.

	        Returns:
	            Up to ``top_k`` results in the order FAISS returns them, with
	            consecutive ranks starting at 0. Vectors whose document has
	            been deleted or replaced are skipped.
	        """
	        if top_k <= 0 or not self.documents or self.index.ntotal == 0:
	            return []

	        # Normalize query (a zero vector is passed through unchanged)
	        query = np.asarray(query_embedding, dtype=float).reshape(1, -1)
	        norm = np.linalg.norm(query)
	        if norm > 0:
	            query = query / norm
	        query = np.ascontiguousarray(query, dtype=np.float32)

	        # Set search parameters
	        if self.index_type == "ivf" and hasattr(self.index, 'nprobe'):
	            self.index.nprobe = self.nprobe

	        # Ask for extra neighbours to make up for vectors that are still in
	        # the index but no longer map to a document.
	        total = int(self.index.ntotal)
	        orphaned = max(total - len(self.documents), 0)
	        scores, indices = self.index.search(query, min(top_k + orphaned, total))

	        results = []
	        for idx, score in zip(indices[0], scores[0]):
	            if idx == -1:  # FAISS returns -1 for empty slots
	                continue

	            doc = self.documents.get(int(idx))
	            if doc is None:
	                continue

	            results.append(SearchResult(
	                document=doc,
	                score=float(score),
	                rank=len(results)
	            ))
	            if len(results) == top_k:
	                break

	        return results

	    def delete(self, document_ids: List[str]) -> int:
	        """Delete documents by ID (not well supported in FAISS).

	        Only the id/position mappings are removed; the vectors stay in the
	        index. Returns the number of ids that were found and removed.
	        """
	        deleted = 0
	        for doc_id in document_ids:
	            idx = self.id_to_idx.pop(doc_id, None)
	            if idx is None:
	                continue
	            self.documents.pop(idx, None)
	            self.idx_to_id.pop(idx, None)
	            deleted += 1

	        self.logger.warning(
	            f"Marked {deleted} documents as deleted. "
	            "Note: FAISS doesn't support true deletion. Rebuild index for cleanup."
	        )
	        return deleted

	    def get_document(self, document_id: str) -> Optional[Document]:
	        """Get document by ID, or ``None`` when the id is unknown."""
	        idx = self.id_to_idx.get(document_id)
	        if idx is not None:
	            return self.documents.get(idx)
	        return None

	    def get_all_documents(self) -> List[Document]:
	        """Get all documents in the store."""
	        return list(self.documents.values())

	    @property
	    def count(self) -> int:
	        """Return number of documents."""
	        return len(self.documents)

	    def save(self, path: Union[str, Path]):
	        """Save the index to ``path/index.faiss`` and the mappings to ``path/documents.pkl``."""
	        path = Path(path)
	        path.mkdir(parents=True, exist_ok=True)

	        # Save FAISS index
	        self.faiss.write_index(self.index, str(path / "index.faiss"))

	        # Save documents and mappings
	        import pickle
	        with open(path / "documents.pkl", 'wb') as f:
	            pickle.dump({
	                'documents': self.documents,
	                'id_to_idx': self.id_to_idx,
	                'idx_to_id': self.idx_to_id,
	                'current_idx': self.current_idx
	            }, f)

	        self.logger.info(f"Saved FAISS index to {path}")

	    def load(self, path: Union[str, Path]):
	        """Load the index and mappings previously written by ``save``."""
	        path = Path(path)

	        # Load FAISS index
	        self.index = self.faiss.read_index(str(path / "index.faiss"))
	        # Keep the bookkeeping in step with the index that was just loaded.
	        self.embedding_dim = int(self.index.d)
	        self._needs_training = not getattr(self.index, "is_trained", True)

	        # Load documents and mappings
	        # SECURITY: pickle.load executes arbitrary code from the file; only
	        # load directories written by a trusted save() call.
	        import pickle
	        with open(path / "documents.pkl", 'rb') as f:
	            data = pickle.load(f)

	        self.documents = data['documents']
	        self.id_to_idx = data['id_to_idx']
	        self.idx_to_id = data['idx_to_id']
	        self.current_idx = data['current_idx']

	        self.logger.info(f"Loaded FAISS index from {path}")


	if __name__ == "__main__":
	    import argparse

	    # Command-line entry point: --test runs an in-memory FAISS demo,
	    # --init creates the PostgreSQL table and indexes.
	    cli = argparse.ArgumentParser(description="Vector DB Test")
	    cli.add_argument("--test", action="store_true", help="Run test mode")
	    cli.add_argument("--init", action="store_true", help="Initialize PostgreSQL")
	    options = cli.parse_args()

	    if options.test:
	        print("Vector Store Test (FAISS)\n" + "=" * 50)

	        # Seed NumPy so the random demo vectors are the same on every run.
	        np.random.seed(42)
	        sample_docs = []
	        for n in range(100):
	            sample_docs.append(
	                Document(
	                    id=f"doc_{n}",
	                    text=f"Sample document {n}",
	                    embedding=np.random.randn(768),
	                )
	            )

	        # Index the samples in a flat FAISS store.
	        faiss_store = FAISSVectorStore(embedding_dim=768, index_type="flat")
	        faiss_store.add_documents(sample_docs)

	        print(f"Documents in store: {faiss_store.count}")

	        # Query with one more random vector and show the five best hits.
	        query_vector = np.random.randn(768)
	        hits = faiss_store.search(query_vector, top_k=5)

	        print("\nTop 5 results:")
	        for hit in hits:
	            print(f" {hit.document.id}: score={hit.score:.4f}")

	    if options.init:
	        print("Initializing PostgreSQL Vector Store...")
	        pg_store = PostgresVectorStore()
	        pg_store.initialize()
	        print("Done!")