code-crawler / src /vector_store.py
juliaturc's picture
Add code for indexing and chatting.
559dd34
raw
history blame
1.85 kB
"""Vector store abstraction and implementations."""
from abc import ABC, abstractmethod
from typing import Dict, Generator, List, Tuple
from pinecone import Pinecone
Vector = Tuple[Dict, List[float]] # (metadata, embedding)
class VectorStore(ABC):
"""Abstract class for a vector store."""
@abstractmethod
def ensure_exists(self):
"""Ensures that the vector store exists. Creates it if it doesn't."""
@abstractmethod
def upsert_batch(self, vectors: List[Vector]):
"""Upserts a batch of vectors."""
def upsert(self, vectors: Generator[Vector, None, None]):
"""Upserts in batches of 100, since vector stores have a limit on upsert size."""
batch = []
for metadata, embedding in vectors:
batch.append((metadata, embedding))
if len(batch) == 100:
self.upsert_batch(batch)
batch = []
if batch:
self.upsert_batch(batch)
class PineconeVectorStore(VectorStore):
"""Vector store implementation using Pinecone."""
def __init__(self, index_name: str, dimension: int, namespace: str):
self.index_name = index_name
self.dimension = dimension
self.client = Pinecone()
self.index = self.client.Index(self.index_name)
self.namespace = namespace
def ensure_exists(self):
if self.index_name not in self.client.list_indexes().names():
self.client.create_index(
name=self.index_name, dimension=self.dimension, metric="cosine"
)
def upsert_batch(self, vectors: List[Vector]):
pinecone_vectors = [
(metadata.get("id", str(i)), embedding, metadata)
for i, (metadata, embedding) in enumerate(vectors)
]
self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)