Spaces:
Running
Running
File size: 1,853 Bytes
559dd34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | """Vector store abstraction and implementations."""
from abc import ABC, abstractmethod
from typing import Dict, Generator, List, Tuple
from pinecone import Pinecone
Vector = Tuple[Dict, List[float]] # (metadata, embedding)
class VectorStore(ABC):
"""Abstract class for a vector store."""
@abstractmethod
def ensure_exists(self):
"""Ensures that the vector store exists. Creates it if it doesn't."""
@abstractmethod
def upsert_batch(self, vectors: List[Vector]):
"""Upserts a batch of vectors."""
def upsert(self, vectors: Generator[Vector, None, None]):
"""Upserts in batches of 100, since vector stores have a limit on upsert size."""
batch = []
for metadata, embedding in vectors:
batch.append((metadata, embedding))
if len(batch) == 100:
self.upsert_batch(batch)
batch = []
if batch:
self.upsert_batch(batch)
class PineconeVectorStore(VectorStore):
"""Vector store implementation using Pinecone."""
def __init__(self, index_name: str, dimension: int, namespace: str):
self.index_name = index_name
self.dimension = dimension
self.client = Pinecone()
self.index = self.client.Index(self.index_name)
self.namespace = namespace
def ensure_exists(self):
if self.index_name not in self.client.list_indexes().names():
self.client.create_index(
name=self.index_name, dimension=self.dimension, metric="cosine"
)
def upsert_batch(self, vectors: List[Vector]):
pinecone_vectors = [
(metadata.get("id", str(i)), embedding, metadata)
for i, (metadata, embedding) in enumerate(vectors)
]
self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)
|