juliaturc committed
Commit 4c99f56 · Parent: a520549

Allow variable embedding size for the OpenAI embedder. (#19)

Files changed (3)
  1. src/embedder.py +9 -4
  2. src/index.py +38 -8
  3. src/vector_store.py +2 -3
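
With this change, the OpenAI embedding model and its dimensionality become command-line options. A hypothetical invocation (the repo ID and index name are placeholders):

    python src/index.py owner/repo \
        --embedder_type openai \
        --embedding_model text-embedding-3-large \
        --embedding_size 256 \
        --index_name my-index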
src/embedder.py CHANGED
@@ -35,10 +35,14 @@ class BatchEmbedder(ABC):
 class OpenAIBatchEmbedder(BatchEmbedder):
     """Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
 
-    def __init__(self, repo_manager: RepoManager, chunker: Chunker, local_dir: str):
+    def __init__(
+        self, repo_manager: RepoManager, chunker: Chunker, local_dir: str, embedding_model: str, embedding_size: int
+    ):
         self.repo_manager = repo_manager
         self.chunker = chunker
         self.local_dir = local_dir
+        self.embedding_model = embedding_model
+        self.embedding_size = embedding_size
         # IDs issued by OpenAI for each batch job mapped to metadata about the chunks.
         self.openai_batch_ids = {}
         self.client = OpenAI()
@@ -124,7 +128,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
         logging.info("Issuing job for batch %s with %d chunks.", batch_id, len(chunks))
 
         # Create a .jsonl file with the batch.
-        request = OpenAIBatchEmbedder._chunks_to_request(chunks, batch_id)
+        request = OpenAIBatchEmbedder._chunks_to_request(chunks, batch_id, self.embedding_model, self.embedding_size)
         input_file = os.path.join(self.local_dir, f"batch_{batch_id}.jsonl")
         OpenAIBatchEmbedder._export_to_jsonl([request], input_file)
 
@@ -160,14 +164,15 @@ class OpenAIBatchEmbedder(BatchEmbedder):
             f.write("\n")
 
     @staticmethod
-    def _chunks_to_request(chunks: List[Chunk], batch_id: str):
+    def _chunks_to_request(chunks: List[Chunk], batch_id: str, model: str, dimensions: int):
         """Convert a list of chunks to a batch request."""
         return {
             "custom_id": batch_id,
             "method": "POST",
             "url": "/v1/embeddings",
             "body": {
-                "model": "text-embedding-ada-002",
+                "model": model,
+                "dimensions": dimensions,
                 "input": [chunk.content for chunk in chunks],
             },
         }
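
Each batch is serialized as one JSON object per line in batch_<id>.jsonl. For reference, a minimal sketch of the request this method produces (the batch ID and chunk contents are illustrative). One caveat worth noting: the OpenAI API accepts the `dimensions` field only for the `text-embedding-3-*` models, not for `text-embedding-ada-002`:

    import json

    # Shape of a single request, mirroring _chunks_to_request's return value.
    request = {
        "custom_id": "batch_0",
        "method": "POST",
        "url": "/v1/embeddings",
        "body": {
            "model": "text-embedding-3-small",
            "dimensions": 1536,  # Accepted by text-embedding-3-* models only.
            "input": ["def add(a, b):\n    return a + b"],
        },
    }
    print(json.dumps(request))  # _export_to_jsonl writes one such line per request.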
src/index.py CHANGED
@@ -15,6 +15,15 @@ MAX_TOKENS_PER_CHUNK = 8192  # The ADA embedder from OpenAI has a maximum of 819
 MAX_CHUNKS_PER_BATCH = 2048  # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
 MAX_TOKENS_PER_JOB = 3_000_000  # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
 
+# Note that OpenAI embedding models have fixed dimensions; however, taking a slice of them is possible.
+# See "Reducing embedding dimensions" under https://platform.openai.com/docs/guides/embeddings/use-cases and
+# https://platform.openai.com/docs/api-reference/embeddings/create#embeddings-create-dimensions
+OPENAI_DEFAULT_EMBEDDING_SIZE = {
+    "text-embedding-ada-002": 1536,
+    "text-embedding-3-small": 1536,
+    "text-embedding-3-large": 3072,
+}
+
 
 def _read_extensions(path):
     with open(path, "r") as f:
@@ -25,6 +34,20 @@ def main():
     parser = argparse.ArgumentParser(description="Batch-embeds a repository")
     parser.add_argument("repo_id", help="The ID of the repository to index")
     parser.add_argument("--embedder_type", default="openai", choices=["openai", "marqo"])
+    parser.add_argument(
+        "--embedding_model",
+        type=str,
+        default=None,
+        help="The embedding model. Defaults to `text-embedding-ada-002` for OpenAI and `hf/e5-base-v2` for Marqo.",
+    )
+    parser.add_argument(
+        "--embedding_size",
+        type=int,
+        default=None,
+        help="The embedding size to use for OpenAI; defaults to the model's native size (e.g. 1536 for "
+        "`text-embedding-3-small` and 3072 for `text-embedding-3-large`). Note that OpenAI allows users to reduce "
+        "these defaults. No need to specify an embedding size for Marqo, since the embedding model determines it.",
+    )
     parser.add_argument("--vector_store_type", default="pinecone", choices=["pinecone", "marqo"])
     parser.add_argument(
         "--local_dir",
@@ -43,7 +66,11 @@ def main():
         default=2000,
         help="Maximum chunks per batch. We recommend 2000 for the OpenAI embedder. Marqo enforces a limit of 64.",
     )
-    parser.add_argument("--index_name", required=True, help="Vector store index name")
+    parser.add_argument(
+        "--index_name",
+        required=True,
+        help="Vector store index name. For Pinecone, make sure to create it with the right embedding size.",
+    )
     parser.add_argument(
         "--include",
         help="Path to a file containing a list of extensions to include. One extension per line.",
@@ -64,11 +91,6 @@ def main():
         default="http://localhost:8882",
         help="URL for the Marqo server. Required if using Marqo as embedder or vector store.",
     )
-    parser.add_argument(
-        "--marqo_embedding_model",
-        default="hf/e5-base-v2",
-        help="The embedding model to use for Marqo.",
-    )
     args = parser.parse_args()
 
     # Validate embedder and vector store compatibility.
@@ -90,6 +112,15 @@ def main():
     if args.include and args.exclude:
         parser.error("At most one of --include and --exclude can be specified.")
 
+    # Set default values based on other arguments.
+    if args.embedding_model is None:
+        args.embedding_model = "text-embedding-ada-002" if args.embedder_type == "openai" else "hf/e5-base-v2"
+    if args.embedding_size is None and args.embedder_type == "openai":
+        args.embedding_size = OPENAI_DEFAULT_EMBEDDING_SIZE.get(args.embedding_model)
+    elif args.embedding_size is not None and args.embedder_type == "marqo":
+        # No need to set embedding_size for Marqo, since the embedding model determines it.
+        logging.warning("--embedding_size is ignored for the Marqo embedder.")
+
     included_extensions = _read_extensions(args.include) if args.include else None
     excluded_extensions = _read_extensions(args.exclude) if args.exclude else None
@@ -106,10 +137,10 @@ def main():
     chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
 
     if args.embedder_type == "openai":
-        embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
+        embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir, args.embedding_model, args.embedding_size)
     elif args.embedder_type == "marqo":
        embedder = MarqoEmbedder(
-            repo_manager, chunker, index_name=args.index_name, url=args.marqo_url, model=args.marqo_embedding_model
+            repo_manager, chunker, index_name=args.index_name, url=args.marqo_url, model=args.embedding_model
        )
     else:
         raise ValueError(f"Unrecognized embedder type {args.embedder_type}")
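
The "slice" mentioned in the OPENAI_DEFAULT_EMBEDDING_SIZE comment corresponds to truncating the embedding and re-normalizing it to unit length, which is what the API's `dimensions` parameter does server-side for the `text-embedding-3-*` models. A minimal sketch of the client-side equivalent (the function name is hypothetical):

    import numpy as np

    def shorten_embedding(embedding: list[float], dimensions: int) -> list[float]:
        """Truncate an embedding to `dimensions` entries and re-normalize to unit length."""
        truncated = np.asarray(embedding[:dimensions], dtype=float)
        return (truncated / np.linalg.norm(truncated)).tolist()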
src/vector_store.py CHANGED
@@ -9,7 +9,6 @@ from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
 from pinecone import Pinecone
 
-OPENAI_EMBEDDING_SIZE = 1536
 Vector = Tuple[Dict, List[float]]  # (metadata, embedding)
 
 
@@ -43,7 +42,7 @@ class VectorStore(ABC):
 class PineconeVectorStore(VectorStore):
     """Vector store implementation using Pinecone."""
 
-    def __init__(self, index_name: str, namespace: str, dimension: int = OPENAI_EMBEDDING_SIZE):
+    def __init__(self, index_name: str, namespace: str, dimension: int):
         self.index_name = index_name
         self.dimension = dimension
         self.client = Pinecone()
@@ -100,7 +99,7 @@ class MarqoVectorStore(VectorStore):
 def build_from_args(args: dict) -> VectorStore:
     """Builds a vector store from the given command-line arguments."""
     if args.vector_store_type == "pinecone":
-        return PineconeVectorStore(index_name=args.index_name, namespace=args.repo_id)
+        return PineconeVectorStore(index_name=args.index_name, namespace=args.repo_id, dimension=args.embedding_size)
     elif args.vector_store_type == "marqo":
         return MarqoVectorStore(url=args.marqo_url, index_name=args.index_name)
     else:
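
Since PineconeVectorStore no longer defaults to a 1536-dimensional index, the Pinecone index must be created with a dimension matching --embedding_size before indexing. A sketch using the pinecone client, assuming a serverless index (the index name, cloud, and region are placeholders):

    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone()  # Reads PINECONE_API_KEY from the environment.
    pc.create_index(
        name="my-index",
        dimension=256,  # Must match the --embedding_size passed to src/index.py.
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )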