Spaces:
Running
Running
Allow variable embedding size for the OpenAI embedder. (#19)
Browse files- src/embedder.py +9 -4
- src/index.py +38 -8
- src/vector_store.py +2 -3
src/embedder.py
CHANGED
|
@@ -35,10 +35,14 @@ class BatchEmbedder(ABC):
|
|
| 35 |
class OpenAIBatchEmbedder(BatchEmbedder):
|
| 36 |
"""Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
|
| 37 |
|
| 38 |
-
def __init__(
|
|
|
|
|
|
|
| 39 |
self.repo_manager = repo_manager
|
| 40 |
self.chunker = chunker
|
| 41 |
self.local_dir = local_dir
|
|
|
|
|
|
|
| 42 |
# IDs issued by OpenAI for each batch job mapped to metadata about the chunks.
|
| 43 |
self.openai_batch_ids = {}
|
| 44 |
self.client = OpenAI()
|
|
@@ -124,7 +128,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 124 |
logging.info("Issuing job for batch %s with %d chunks.", batch_id, len(chunks))
|
| 125 |
|
| 126 |
# Create a .jsonl file with the batch.
|
| 127 |
-
request = OpenAIBatchEmbedder._chunks_to_request(chunks, batch_id)
|
| 128 |
input_file = os.path.join(self.local_dir, f"batch_{batch_id}.jsonl")
|
| 129 |
OpenAIBatchEmbedder._export_to_jsonl([request], input_file)
|
| 130 |
|
|
@@ -160,14 +164,15 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 160 |
f.write("\n")
|
| 161 |
|
| 162 |
@staticmethod
|
| 163 |
-
def _chunks_to_request(chunks: List[Chunk], batch_id: str):
|
| 164 |
"""Convert a list of chunks to a batch request."""
|
| 165 |
return {
|
| 166 |
"custom_id": batch_id,
|
| 167 |
"method": "POST",
|
| 168 |
"url": "/v1/embeddings",
|
| 169 |
"body": {
|
| 170 |
-
"model":
|
|
|
|
| 171 |
"input": [chunk.content for chunk in chunks],
|
| 172 |
},
|
| 173 |
}
|
|
|
|
| 35 |
class OpenAIBatchEmbedder(BatchEmbedder):
|
| 36 |
"""Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
|
| 37 |
|
| 38 |
+
def __init__(
|
| 39 |
+
self, repo_manager: RepoManager, chunker: Chunker, local_dir: str, embedding_model: str, embedding_size: int
|
| 40 |
+
):
|
| 41 |
self.repo_manager = repo_manager
|
| 42 |
self.chunker = chunker
|
| 43 |
self.local_dir = local_dir
|
| 44 |
+
self.embedding_model = embedding_model
|
| 45 |
+
self.embedding_size = embedding_size
|
| 46 |
# IDs issued by OpenAI for each batch job mapped to metadata about the chunks.
|
| 47 |
self.openai_batch_ids = {}
|
| 48 |
self.client = OpenAI()
|
|
|
|
| 128 |
logging.info("Issuing job for batch %s with %d chunks.", batch_id, len(chunks))
|
| 129 |
|
| 130 |
# Create a .jsonl file with the batch.
|
| 131 |
+
request = OpenAIBatchEmbedder._chunks_to_request(chunks, batch_id, self.embedding_model, self.embedding_size)
|
| 132 |
input_file = os.path.join(self.local_dir, f"batch_{batch_id}.jsonl")
|
| 133 |
OpenAIBatchEmbedder._export_to_jsonl([request], input_file)
|
| 134 |
|
|
|
|
| 164 |
f.write("\n")
|
| 165 |
|
| 166 |
@staticmethod
|
| 167 |
+
def _chunks_to_request(chunks: List[Chunk], batch_id: str, model: str, dimensions: int):
|
| 168 |
"""Convert a list of chunks to a batch request."""
|
| 169 |
return {
|
| 170 |
"custom_id": batch_id,
|
| 171 |
"method": "POST",
|
| 172 |
"url": "/v1/embeddings",
|
| 173 |
"body": {
|
| 174 |
+
"model": model,
|
| 175 |
+
"dimensions": dimensions,
|
| 176 |
"input": [chunk.content for chunk in chunks],
|
| 177 |
},
|
| 178 |
}
|
src/index.py
CHANGED
|
@@ -15,6 +15,15 @@ MAX_TOKENS_PER_CHUNK = 8192 # The ADA embedder from OpenAI has a maximum of 819
|
|
| 15 |
MAX_CHUNKS_PER_BATCH = 2048 # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
|
| 16 |
MAX_TOKENS_PER_JOB = 3_000_000 # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def _read_extensions(path):
|
| 20 |
with open(path, "r") as f:
|
|
@@ -25,6 +34,20 @@ def main():
|
|
| 25 |
parser = argparse.ArgumentParser(description="Batch-embeds a repository")
|
| 26 |
parser.add_argument("repo_id", help="The ID of the repository to index")
|
| 27 |
parser.add_argument("--embedder_type", default="openai", choices=["openai", "marqo"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
parser.add_argument("--vector_store_type", default="pinecone", choices=["pinecone", "marqo"])
|
| 29 |
parser.add_argument(
|
| 30 |
"--local_dir",
|
|
@@ -43,7 +66,11 @@ def main():
|
|
| 43 |
default=2000,
|
| 44 |
help="Maximum chunks per batch. We recommend 2000 for the OpenAI embedder. Marqo enforces a limit of 64.",
|
| 45 |
)
|
| 46 |
-
parser.add_argument(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
parser.add_argument(
|
| 48 |
"--include",
|
| 49 |
help="Path to a file containing a list of extensions to include. One extension per line.",
|
|
@@ -64,11 +91,6 @@ def main():
|
|
| 64 |
default="http://localhost:8882",
|
| 65 |
help="URL for the Marqo server. Required if using Marqo as embedder or vector store.",
|
| 66 |
)
|
| 67 |
-
parser.add_argument(
|
| 68 |
-
"--marqo_embedding_model",
|
| 69 |
-
default="hf/e5-base-v2",
|
| 70 |
-
help="The embedding model to use for Marqo.",
|
| 71 |
-
)
|
| 72 |
args = parser.parse_args()
|
| 73 |
|
| 74 |
# Validate embedder and vector store compatibility.
|
|
@@ -90,6 +112,14 @@ def main():
|
|
| 90 |
if args.include and args.exclude:
|
| 91 |
parser.error("At most one of --include and --exclude can be specified.")
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
included_extensions = _read_extensions(args.include) if args.include else None
|
| 94 |
excluded_extensions = _read_extensions(args.exclude) if args.exclude else None
|
| 95 |
|
|
@@ -106,10 +136,10 @@ def main():
|
|
| 106 |
chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
|
| 107 |
|
| 108 |
if args.embedder_type == "openai":
|
| 109 |
-
embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
|
| 110 |
elif args.embedder_type == "marqo":
|
| 111 |
embedder = MarqoEmbedder(
|
| 112 |
-
repo_manager, chunker, index_name=args.index_name, url=args.marqo_url, model=args.
|
| 113 |
)
|
| 114 |
else:
|
| 115 |
raise ValueError(f"Unrecognized embedder type {args.embedder_type}")
|
|
|
|
| 15 |
MAX_CHUNKS_PER_BATCH = 2048 # The OpenAI batch embedding API enforces a maximum of 2048 chunks per batch.
|
| 16 |
MAX_TOKENS_PER_JOB = 3_000_000 # The OpenAI batch embedding API enforces a maximum of 3M tokens processed at once.
|
| 17 |
|
| 18 |
+
# Native embedding size of each known OpenAI embedding model; used as the default when
# --embedding_size is not given. Each model has a fixed native size, but a shorter embedding can be
# requested through the API's `dimensions` parameter.
# NOTE(review): the API reference describes `dimensions` as supported only by text-embedding-3 and
# later models -- confirm how text-embedding-ada-002 behaves when it is passed.
# See "Reducing embedding dimensions" under https://platform.openai.com/docs/guides/embeddings/use-cases and
# https://platform.openai.com/docs/api-reference/embeddings/create#embeddings-create-dimensions
OPENAI_DEFAULT_EMBEDDING_SIZE = {
    "text-embedding-ada-002": 1536,
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072,
}
|
| 26 |
+
|
| 27 |
|
| 28 |
def _read_extensions(path):
|
| 29 |
with open(path, "r") as f:
|
|
|
|
| 34 |
parser = argparse.ArgumentParser(description="Batch-embeds a repository")
|
| 35 |
parser.add_argument("repo_id", help="The ID of the repository to index")
|
| 36 |
parser.add_argument("--embedder_type", default="openai", choices=["openai", "marqo"])
|
| 37 |
+
parser.add_argument(
|
| 38 |
+
"--embedding_model",
|
| 39 |
+
type=str,
|
| 40 |
+
default=None,
|
| 41 |
+
help="The embedding model. Defaults to `text-embedding-ada-002` for OpenAI and `hf/e5-base-v2` for Marqo.",
|
| 42 |
+
)
|
| 43 |
+
parser.add_argument(
|
| 44 |
+
"--embedding_size",
|
| 45 |
+
type=int,
|
| 46 |
+
default=None,
|
| 47 |
+
help="The embedding size to use for OpenAI; defaults to OpenAI defaults (e.g. 1536 for `text-embedding-3-small`"
|
| 48 |
+
" and 3072 for `text-embedding-3-large`). Note that OpenAI allows users to reduce these default dimensions. "
|
| 49 |
+
"No need to specify an embedding size for Marqo, since the embedding model determines it.",
|
| 50 |
+
)
|
| 51 |
parser.add_argument("--vector_store_type", default="pinecone", choices=["pinecone", "marqo"])
|
| 52 |
parser.add_argument(
|
| 53 |
"--local_dir",
|
|
|
|
| 66 |
default=2000,
|
| 67 |
help="Maximum chunks per batch. We recommend 2000 for the OpenAI embedder. Marqo enforces a limit of 64.",
|
| 68 |
)
|
| 69 |
+
parser.add_argument(
|
| 70 |
+
"--index_name",
|
| 71 |
+
required=True,
|
| 72 |
+
help="Vector store index name. For Pinecone, make sure to create it with the right embedding size.",
|
| 73 |
+
)
|
| 74 |
parser.add_argument(
|
| 75 |
"--include",
|
| 76 |
help="Path to a file containing a list of extensions to include. One extension per line.",
|
|
|
|
| 91 |
default="http://localhost:8882",
|
| 92 |
help="URL for the Marqo server. Required if using Marqo as embedder or vector store.",
|
| 93 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
args = parser.parse_args()
|
| 95 |
|
| 96 |
# Validate embedder and vector store compatibility.
|
|
|
|
| 112 |
if args.include and args.exclude:
|
| 113 |
parser.error("At most one of --include and --exclude can be specified.")
|
| 114 |
|
| 115 |
+
# Fill in defaults that depend on other arguments.
if args.embedding_model is None:
    # --embedder_type always has a value (its argparse default is "openai"), so the flag that can
    # actually be missing here is --embedding_model.
    args.embedding_model = "text-embedding-ada-002" if args.embedder_type == "openai" else "hf/e5-base-v2"
if args.embedder_type == "openai":
    if args.embedding_size is None:
        # Stays None for a model that is not listed in OPENAI_DEFAULT_EMBEDDING_SIZE.
        args.embedding_size = OPENAI_DEFAULT_EMBEDDING_SIZE.get(args.embedding_model)
elif args.embedding_size is not None:
    # For Marqo the embedding model determines the embedding size, so only warn when the user
    # explicitly passed a size that will have no effect.
    logging.warning("--embedding_size is ignored for Marqo embedder.")
|
| 122 |
+
|
| 123 |
included_extensions = _read_extensions(args.include) if args.include else None
|
| 124 |
excluded_extensions = _read_extensions(args.exclude) if args.exclude else None
|
| 125 |
|
|
|
|
| 136 |
chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
|
| 137 |
|
| 138 |
if args.embedder_type == "openai":
|
| 139 |
+
embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir, args.embedding_model, args.embedding_size)
|
| 140 |
elif args.embedder_type == "marqo":
|
| 141 |
embedder = MarqoEmbedder(
|
| 142 |
+
repo_manager, chunker, index_name=args.index_name, url=args.marqo_url, model=args.embedding_model
|
| 143 |
)
|
| 144 |
else:
|
| 145 |
raise ValueError(f"Unrecognized embedder type {args.embedder_type}")
|
src/vector_store.py
CHANGED
|
@@ -9,7 +9,6 @@ from langchain_core.documents import Document
|
|
| 9 |
from langchain_openai import OpenAIEmbeddings
|
| 10 |
from pinecone import Pinecone
|
| 11 |
|
| 12 |
-
OPENAI_EMBEDDING_SIZE = 1536
|
| 13 |
Vector = Tuple[Dict, List[float]] # (metadata, embedding)
|
| 14 |
|
| 15 |
|
|
@@ -43,7 +42,7 @@ class VectorStore(ABC):
|
|
| 43 |
class PineconeVectorStore(VectorStore):
|
| 44 |
"""Vector store implementation using Pinecone."""
|
| 45 |
|
| 46 |
-
def __init__(self, index_name: str, namespace: str, dimension: int
|
| 47 |
self.index_name = index_name
|
| 48 |
self.dimension = dimension
|
| 49 |
self.client = Pinecone()
|
|
@@ -100,7 +99,7 @@ class MarqoVectorStore(VectorStore):
|
|
| 100 |
def build_from_args(args: dict) -> VectorStore:
|
| 101 |
"""Builds a vector store from the given command-line arguments."""
|
| 102 |
if args.vector_store_type == "pinecone":
|
| 103 |
-
return PineconeVectorStore(index_name=args.index_name, namespace=args.repo_id)
|
| 104 |
elif args.vector_store_type == "marqo":
|
| 105 |
return MarqoVectorStore(url=args.marqo_url, index_name=args.index_name)
|
| 106 |
else:
|
|
|
|
| 9 |
from langchain_openai import OpenAIEmbeddings
|
| 10 |
from pinecone import Pinecone
|
| 11 |
|
|
|
|
| 12 |
Vector = Tuple[Dict, List[float]] # (metadata, embedding)
|
| 13 |
|
| 14 |
|
|
|
|
| 42 |
class PineconeVectorStore(VectorStore):
|
| 43 |
"""Vector store implementation using Pinecone."""
|
| 44 |
|
| 45 |
+
def __init__(self, index_name: str, namespace: str, dimension: int):
|
| 46 |
self.index_name = index_name
|
| 47 |
self.dimension = dimension
|
| 48 |
self.client = Pinecone()
|
|
|
|
| 99 |
def build_from_args(args: dict) -> VectorStore:
|
| 100 |
"""Builds a vector store from the given command-line arguments."""
|
| 101 |
if args.vector_store_type == "pinecone":
|
| 102 |
+
return PineconeVectorStore(index_name=args.index_name, namespace=args.repo_id, dimension=args.embedding_size)
|
| 103 |
elif args.vector_store_type == "marqo":
|
| 104 |
return MarqoVectorStore(url=args.marqo_url, index_name=args.index_name)
|
| 105 |
else:
|