juliaturc committed on
Commit
21f5cea
·
1 Parent(s): 3190b7d

Take "namespace" out of the vector store constructors.

Browse files
Files changed (3) hide show
  1. sage/index.py +4 -2
  2. sage/retriever.py +3 -2
  3. sage/vector_store.py +16 -17
sage/index.py CHANGED
@@ -92,7 +92,7 @@ def main():
92
  logging.info("Moving embeddings to the repo vector store...")
93
  repo_vector_store = build_vector_store_from_args(args, repo_manager)
94
  repo_vector_store.ensure_exists()
95
- repo_vector_store.upsert(repo_embedder.download_embeddings(repo_jobs_file))
96
 
97
  if issues_embedder is not None:
98
  logging.info("Waiting for issue embeddings to be ready...")
@@ -103,7 +103,9 @@ def main():
103
  logging.info("Moving embeddings to the issues vector store...")
104
  issues_vector_store = build_vector_store_from_args(args, issues_manager)
105
  issues_vector_store.ensure_exists()
106
- issues_vector_store.upsert(issues_embedder.download_embeddings(issues_jobs_file))
 
 
107
 
108
  logging.info("Done!")
109
 
 
92
  logging.info("Moving embeddings to the repo vector store...")
93
  repo_vector_store = build_vector_store_from_args(args, repo_manager)
94
  repo_vector_store.ensure_exists()
95
+ repo_vector_store.upsert(repo_embedder.download_embeddings(repo_jobs_file), namespace=args.index_namespace)
96
 
97
  if issues_embedder is not None:
98
  logging.info("Waiting for issue embeddings to be ready...")
 
103
  logging.info("Moving embeddings to the issues vector store...")
104
  issues_vector_store = build_vector_store_from_args(args, issues_manager)
105
  issues_vector_store.ensure_exists()
106
+ issues_vector_store.upsert(
107
+ issues_embedder.download_embeddings(issues_jobs_file), namespace=args.index_namespace
108
+ )
109
 
110
  logging.info("Done!")
111
 
sage/retriever.py CHANGED
@@ -2,7 +2,6 @@ from langchain.retrievers import ContextualCompressionRetriever
2
  from langchain_openai import OpenAIEmbeddings
3
  from langchain_voyageai import VoyageAIEmbeddings
4
 
5
-
6
  from sage.reranker import build_reranker
7
  from sage.vector_store import build_vector_store_from_args
8
 
@@ -17,7 +16,9 @@ def build_retriever_from_args(args):
17
  else:
18
  embeddings = None
19
 
20
- retriever = build_vector_store_from_args(args).as_retriever(top_k=args.retriever_top_k, embeddings=embeddings)
 
 
21
 
22
  reranker = build_reranker(args.reranker_provider, args.reranker_model, args.reranker_top_k)
23
  if reranker:
 
2
  from langchain_openai import OpenAIEmbeddings
3
  from langchain_voyageai import VoyageAIEmbeddings
4
 
 
5
  from sage.reranker import build_reranker
6
  from sage.vector_store import build_vector_store_from_args
7
 
 
16
  else:
17
  embeddings = None
18
 
19
+ retriever = build_vector_store_from_args(args).as_retriever(
20
+ top_k=args.retriever_top_k, embeddings=embeddings, namespace=args.index_namespace
21
+ )
22
 
23
  reranker = build_reranker(args.reranker_provider, args.reranker_model, args.reranker_top_k)
24
  if reranker:
sage/vector_store.py CHANGED
@@ -1,7 +1,7 @@
1
  """Vector store abstraction and implementations."""
2
 
3
- import os
4
  import logging
 
5
  from abc import ABC, abstractmethod
6
  from functools import cached_property
7
  from typing import Dict, Generator, List, Optional, Tuple
@@ -29,33 +29,32 @@ class VectorStore(ABC):
29
  """Ensures that the vector store exists. Creates it if it doesn't."""
30
 
31
  @abstractmethod
32
- def upsert_batch(self, vectors: List[Vector]):
33
  """Upserts a batch of vectors."""
34
 
35
- def upsert(self, vectors: Generator[Vector, None, None]):
36
  """Upserts in batches of 100, since vector stores have a limit on upsert size."""
37
  batch = []
38
  for metadata, embedding in vectors:
39
  batch.append((metadata, embedding))
40
  if len(batch) == 100:
41
- self.upsert_batch(batch)
42
  batch = []
43
  if batch:
44
- self.upsert_batch(batch)
45
 
46
  @abstractmethod
47
- def as_retriever(self, top_k: int, embeddings: Embeddings):
48
  """Converts the vector store to a LangChain retriever object."""
49
 
50
 
51
  class PineconeVectorStore(VectorStore):
52
  """Vector store implementation using Pinecone."""
53
 
54
- def __init__(self, index_name: str, namespace: str, dimension: int, alpha: float, bm25_cache: Optional[str] = None):
55
  """
56
  Args:
57
  index_name: The name of the Pinecone index to use. If it doesn't exist already, we'll create it.
58
- namespace: The namespace within the index to use.
59
  dimension: The dimension of the vectors.
60
  alpha: The alpha parameter for hybrid search: alpha == 1.0 means pure dense search, alpha == 0.0 means pure
61
  BM25, and 0.0 < alpha < 1.0 means a hybrid of the two.
@@ -65,7 +64,6 @@ class PineconeVectorStore(VectorStore):
65
  self.index_name = index_name
66
  self.dimension = dimension
67
  self.client = Pinecone()
68
- self.namespace = namespace
69
  self.alpha = alpha
70
 
71
  if alpha < 1.0:
@@ -107,7 +105,7 @@ class PineconeVectorStore(VectorStore):
107
  spec=ServerlessSpec(cloud="aws", region="us-east-1"),
108
  )
109
 
110
- def upsert_batch(self, vectors: List[Vector]):
111
  pinecone_vectors = []
112
  for i, (metadata, embedding) in enumerate(vectors):
113
  vector = {"id": metadata.get("id", str(i)), "values": embedding, "metadata": metadata}
@@ -115,21 +113,21 @@ class PineconeVectorStore(VectorStore):
115
  vector["sparse_values"] = self.bm25_encoder.encode_documents(metadata[TEXT_FIELD])
116
  pinecone_vectors.append(vector)
117
 
118
- self.index.upsert(vectors=pinecone_vectors, namespace=self.namespace)
119
 
120
- def as_retriever(self, top_k: int, embeddings: Embeddings):
121
  if self.bm25_encoder:
122
  return PineconeHybridSearchRetriever(
123
  embeddings=embeddings,
124
  sparse_encoder=self.bm25_encoder,
125
  index=self.index,
126
- namespace=self.namespace,
127
  top_k=top_k,
128
  alpha=self.alpha,
129
  )
130
 
131
  return LangChainPinecone.from_existing_index(
132
- index_name=self.index_name, embedding=embeddings, namespace=self.namespace
133
  ).as_retriever(search_kwargs={"k": top_k})
134
 
135
 
@@ -143,12 +141,14 @@ class MarqoVectorStore(VectorStore):
143
  def ensure_exists(self):
144
  pass
145
 
146
- def upsert_batch(self, vectors: List[Vector]):
147
  # Since Marqo is both an embedder and a vector store, the embedder is already doing the upsert.
148
  pass
149
 
150
- def as_retriever(self, top_k: int, embeddings: Embeddings = None):
151
  del embeddings # Unused; The Marqo vector store is also an embedder.
 
 
152
  vectorstore = Marqo(client=self.client, index_name=self.index_name)
153
 
154
  # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in
@@ -188,7 +188,6 @@ def build_vector_store_from_args(args: dict, data_manager: Optional[DataManager]
188
 
189
  return PineconeVectorStore(
190
  index_name=args.pinecone_index_name,
191
- namespace=args.index_namespace,
192
  dimension=args.embedding_size if "embedding_size" in args else None,
193
  alpha=args.retrieval_alpha,
194
  bm25_cache=bm25_cache,
 
1
  """Vector store abstraction and implementations."""
2
 
 
3
  import logging
4
+ import os
5
  from abc import ABC, abstractmethod
6
  from functools import cached_property
7
  from typing import Dict, Generator, List, Optional, Tuple
 
29
  """Ensures that the vector store exists. Creates it if it doesn't."""
30
 
31
  @abstractmethod
32
+ def upsert_batch(self, vectors: List[Vector], namespace: str):
33
  """Upserts a batch of vectors."""
34
 
35
+ def upsert(self, vectors: Generator[Vector, None, None], namespace: str):
36
  """Upserts in batches of 100, since vector stores have a limit on upsert size."""
37
  batch = []
38
  for metadata, embedding in vectors:
39
  batch.append((metadata, embedding))
40
  if len(batch) == 100:
41
+ self.upsert_batch(batch, namespace)
42
  batch = []
43
  if batch:
44
+ self.upsert_batch(batch, namespace)
45
 
46
  @abstractmethod
47
+ def as_retriever(self, top_k: int, embeddings: Embeddings, namespace: str):
48
  """Converts the vector store to a LangChain retriever object."""
49
 
50
 
51
  class PineconeVectorStore(VectorStore):
52
  """Vector store implementation using Pinecone."""
53
 
54
+ def __init__(self, index_name: str, dimension: int, alpha: float, bm25_cache: Optional[str] = None):
55
  """
56
  Args:
57
  index_name: The name of the Pinecone index to use. If it doesn't exist already, we'll create it.
 
58
  dimension: The dimension of the vectors.
59
  alpha: The alpha parameter for hybrid search: alpha == 1.0 means pure dense search, alpha == 0.0 means pure
60
  BM25, and 0.0 < alpha < 1.0 means a hybrid of the two.
 
64
  self.index_name = index_name
65
  self.dimension = dimension
66
  self.client = Pinecone()
 
67
  self.alpha = alpha
68
 
69
  if alpha < 1.0:
 
105
  spec=ServerlessSpec(cloud="aws", region="us-east-1"),
106
  )
107
 
108
+ def upsert_batch(self, vectors: List[Vector], namespace: str):
109
  pinecone_vectors = []
110
  for i, (metadata, embedding) in enumerate(vectors):
111
  vector = {"id": metadata.get("id", str(i)), "values": embedding, "metadata": metadata}
 
113
  vector["sparse_values"] = self.bm25_encoder.encode_documents(metadata[TEXT_FIELD])
114
  pinecone_vectors.append(vector)
115
 
116
+ self.index.upsert(vectors=pinecone_vectors, namespace=namespace)
117
 
118
+ def as_retriever(self, top_k: int, embeddings: Embeddings, namespace: str):
119
  if self.bm25_encoder:
120
  return PineconeHybridSearchRetriever(
121
  embeddings=embeddings,
122
  sparse_encoder=self.bm25_encoder,
123
  index=self.index,
124
+ namespace=namespace,
125
  top_k=top_k,
126
  alpha=self.alpha,
127
  )
128
 
129
  return LangChainPinecone.from_existing_index(
130
+ index_name=self.index_name, embedding=embeddings, namespace=namespace
131
  ).as_retriever(search_kwargs={"k": top_k})
132
 
133
 
 
141
  def ensure_exists(self):
142
  pass
143
 
144
+ def upsert_batch(self, vectors: List[Vector], namespace: str):
145
  # Since Marqo is both an embedder and a vector store, the embedder is already doing the upsert.
146
  pass
147
 
148
+ def as_retriever(self, top_k: int, embeddings: Embeddings = None, namespace: str = None):
149
  del embeddings # Unused; The Marqo vector store is also an embedder.
150
+ del namespace # Unused; Unlike Pinecone, Marqo doesn't differentiate between index name and namespace.
151
+
152
  vectorstore = Marqo(client=self.client, index_name=self.index_name)
153
 
154
  # Monkey-patch the _construct_documents_from_results_without_score method to not expect a "metadata" field in
 
188
 
189
  return PineconeVectorStore(
190
  index_name=args.pinecone_index_name,
 
191
  dimension=args.embedding_size if "embedding_size" in args else None,
192
  alpha=args.retrieval_alpha,
193
  bm25_cache=bm25_cache,