Spaces:
Running
Running
Add max_embedding_jobs flag.
Browse files- src/embedder.py +13 -5
- src/index.py +10 -3
src/embedder.py
CHANGED
|
@@ -34,9 +34,7 @@ class BatchEmbedder(ABC):
|
|
| 34 |
class OpenAIBatchEmbedder(BatchEmbedder):
|
| 35 |
"""Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
|
| 36 |
|
| 37 |
-
def __init__(
|
| 38 |
-
self, repo_manager: RepoManager, chunker: Chunker, local_dir: str
|
| 39 |
-
):
|
| 40 |
self.repo_manager = repo_manager
|
| 41 |
self.chunker = chunker
|
| 42 |
self.local_dir = local_dir
|
|
@@ -44,7 +42,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 44 |
self.openai_batch_ids = {}
|
| 45 |
self.client = OpenAI()
|
| 46 |
|
| 47 |
-
def embed_repo(self, chunks_per_batch: int):
|
| 48 |
"""Issues batch embedding jobs for the entire repository."""
|
| 49 |
if self.openai_batch_ids:
|
| 50 |
raise ValueError("Embeddings are in progress.")
|
|
@@ -67,6 +65,14 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 67 |
self.openai_batch_ids[openai_batch_id] = self._metadata_for_chunks(
|
| 68 |
sub_batch
|
| 69 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
batch = []
|
| 71 |
|
| 72 |
# Finally, commit the last batch.
|
|
@@ -133,7 +139,9 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 133 |
OpenAIBatchEmbedder._export_to_jsonl([request], input_file)
|
| 134 |
|
| 135 |
# Upload the file and issue the embedding job.
|
| 136 |
-
batch_input_file = self.client.files.create(
|
|
|
|
|
|
|
| 137 |
batch_status = self._create_batch_job(batch_input_file.id)
|
| 138 |
logging.info("Created job with ID %s", batch_status.id)
|
| 139 |
return batch_status.id
|
|
|
|
| 34 |
class OpenAIBatchEmbedder(BatchEmbedder):
|
| 35 |
"""Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
|
| 36 |
|
| 37 |
+
def __init__(self, repo_manager: RepoManager, chunker: Chunker, local_dir: str):
|
|
|
|
|
|
|
| 38 |
self.repo_manager = repo_manager
|
| 39 |
self.chunker = chunker
|
| 40 |
self.local_dir = local_dir
|
|
|
|
| 42 |
self.openai_batch_ids = {}
|
| 43 |
self.client = OpenAI()
|
| 44 |
|
| 45 |
+
def embed_repo(self, chunks_per_batch: int, max_embedding_jobs: int = None):
|
| 46 |
"""Issues batch embedding jobs for the entire repository."""
|
| 47 |
if self.openai_batch_ids:
|
| 48 |
raise ValueError("Embeddings are in progress.")
|
|
|
|
| 65 |
self.openai_batch_ids[openai_batch_id] = self._metadata_for_chunks(
|
| 66 |
sub_batch
|
| 67 |
)
|
| 68 |
+
if (
|
| 69 |
+
max_embedding_jobs
|
| 70 |
+
and len(self.openai_batch_ids) >= max_embedding_jobs
|
| 71 |
+
):
|
| 72 |
+
logging.info(
|
| 73 |
+
"Reached the maximum number of embedding jobs. Stopping."
|
| 74 |
+
)
|
| 75 |
+
return
|
| 76 |
batch = []
|
| 77 |
|
| 78 |
# Finally, commit the last batch.
|
|
|
|
| 139 |
OpenAIBatchEmbedder._export_to_jsonl([request], input_file)
|
| 140 |
|
| 141 |
# Upload the file and issue the embedding job.
|
| 142 |
+
batch_input_file = self.client.files.create(
|
| 143 |
+
file=open(input_file, "rb"), purpose="batch"
|
| 144 |
+
)
|
| 145 |
batch_status = self._create_batch_job(batch_input_file.id)
|
| 146 |
logging.info("Created job with ID %s", batch_status.id)
|
| 147 |
return batch_status.id
|
src/index.py
CHANGED
|
@@ -47,10 +47,17 @@ def main():
|
|
| 47 |
"--pinecone_index_name", required=True, help="Pinecone index name"
|
| 48 |
)
|
| 49 |
parser.add_argument(
|
| 50 |
-
"--include",
|
|
|
|
| 51 |
)
|
| 52 |
parser.add_argument(
|
| 53 |
-
"--exclude",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
|
| 56 |
args = parser.parse_args()
|
|
@@ -84,7 +91,7 @@ def main():
|
|
| 84 |
logging.info("Issuing embedding jobs...")
|
| 85 |
chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
|
| 86 |
embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
|
| 87 |
-
embedder.embed_repo(args.chunks_per_batch)
|
| 88 |
|
| 89 |
logging.info("Waiting for embeddings to be ready...")
|
| 90 |
while not embedder.embeddings_are_ready():
|
|
|
|
| 47 |
"--pinecone_index_name", required=True, help="Pinecone index name"
|
| 48 |
)
|
| 49 |
parser.add_argument(
|
| 50 |
+
"--include",
|
| 51 |
+
help="Path to a file containing a list of extensions to include. One extension per line.",
|
| 52 |
)
|
| 53 |
parser.add_argument(
|
| 54 |
+
"--exclude",
|
| 55 |
+
help="Path to a file containing a list of extensions to exclude. One extension per line.",
|
| 56 |
+
)
|
| 57 |
+
parser.add_argument(
|
| 58 |
+
"--max_embedding_jobs", type=int,
|
| 59 |
+
help="Maximum number of embedding jobs to run. Specifying this might result in "
|
| 60 |
+
"indexing only part of the repository, but prevents you from burning through OpenAI credits.",
|
| 61 |
)
|
| 62 |
|
| 63 |
args = parser.parse_args()
|
|
|
|
| 91 |
logging.info("Issuing embedding jobs...")
|
| 92 |
chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
|
| 93 |
embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
|
| 94 |
+
embedder.embed_repo(args.chunks_per_batch, args.max_embedding_jobs)
|
| 95 |
|
| 96 |
logging.info("Waiting for embeddings to be ready...")
|
| 97 |
while not embedder.embeddings_are_ready():
|