juliaturc committed on
Commit
a8c35cd
·
2 Parent(s): 5cf92d3 77a0875

Merge pull request #8 from Storia-AI/julia/fixes

Browse files
pyproject.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [tool.black]
2
+ line-length = 120
src/embedder.py CHANGED
@@ -34,9 +34,7 @@ class BatchEmbedder(ABC):
34
  class OpenAIBatchEmbedder(BatchEmbedder):
35
  """Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
36
 
37
- def __init__(
38
- self, repo_manager: RepoManager, chunker: Chunker, local_dir: str
39
- ):
40
  self.repo_manager = repo_manager
41
  self.chunker = chunker
42
  self.local_dir = local_dir
@@ -44,7 +42,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
44
  self.openai_batch_ids = {}
45
  self.client = OpenAI()
46
 
47
- def embed_repo(self, chunks_per_batch: int):
48
  """Issues batch embedding jobs for the entire repository."""
49
  if self.openai_batch_ids:
50
  raise ValueError("Embeddings are in progress.")
@@ -60,24 +58,21 @@ class OpenAIBatchEmbedder(BatchEmbedder):
60
 
61
  if len(batch) > chunks_per_batch:
62
  for i in range(0, len(batch), chunks_per_batch):
63
- batch = batch[i : i + chunks_per_batch]
64
  openai_batch_id = self._issue_job_for_chunks(
65
- batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
66
- )
67
- self.openai_batch_ids[openai_batch_id] = self._metadata_for_chunks(
68
- batch
69
  )
 
 
 
 
70
  batch = []
71
 
72
  # Finally, commit the last batch.
73
  if batch:
74
- openai_batch_id = self._issue_job_for_chunks(
75
- batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
76
- )
77
  self.openai_batch_ids[openai_batch_id] = self._metadata_for_chunks(batch)
78
- logging.info(
79
- "Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count
80
- )
81
 
82
  # Save the job IDs to a file, just in case this script is terminated by mistake.
83
  metadata_file = os.path.join(self.local_dir, "openai_batch_ids.json")
@@ -149,9 +144,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
149
  metadata={},
150
  )
151
  except Exception as e:
152
- print(
153
- f"Failed to create batch job with input_file_id={input_file_id}. Error: {e}"
154
- )
155
  return None
156
 
157
  @staticmethod
 
34
  class OpenAIBatchEmbedder(BatchEmbedder):
35
  """Batch embedder that calls OpenAI. See https://platform.openai.com/docs/guides/batch/overview."""
36
 
37
+ def __init__(self, repo_manager: RepoManager, chunker: Chunker, local_dir: str):
 
 
38
  self.repo_manager = repo_manager
39
  self.chunker = chunker
40
  self.local_dir = local_dir
 
42
  self.openai_batch_ids = {}
43
  self.client = OpenAI()
44
 
45
+ def embed_repo(self, chunks_per_batch: int, max_embedding_jobs: int = None):
46
  """Issues batch embedding jobs for the entire repository."""
47
  if self.openai_batch_ids:
48
  raise ValueError("Embeddings are in progress.")
 
58
 
59
  if len(batch) > chunks_per_batch:
60
  for i in range(0, len(batch), chunks_per_batch):
61
+ sub_batch = batch[i : i + chunks_per_batch]
62
  openai_batch_id = self._issue_job_for_chunks(
63
+ sub_batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
 
 
 
64
  )
65
+ self.openai_batch_ids[openai_batch_id] = self._metadata_for_chunks(sub_batch)
66
+ if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
67
+ logging.info("Reached the maximum number of embedding jobs. Stopping.")
68
+ return
69
  batch = []
70
 
71
  # Finally, commit the last batch.
72
  if batch:
73
+ openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}")
 
 
74
  self.openai_batch_ids[openai_batch_id] = self._metadata_for_chunks(batch)
75
+ logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
 
 
76
 
77
  # Save the job IDs to a file, just in case this script is terminated by mistake.
78
  metadata_file = os.path.join(self.local_dir, "openai_batch_ids.json")
 
144
  metadata={},
145
  )
146
  except Exception as e:
147
+ print(f"Failed to create batch job with input_file_id={input_file_id}. Error: {e}")
 
 
148
  return None
149
 
150
  @staticmethod
src/index.py CHANGED
@@ -47,10 +47,17 @@ def main():
47
  "--pinecone_index_name", required=True, help="Pinecone index name"
48
  )
49
  parser.add_argument(
50
- "--include", help="Path to a file containing a list of extensions to include. One extension per line."
 
51
  )
52
  parser.add_argument(
53
- "--exclude", help="Path to a file containing a list of extensions to exclude. One extension per line."
 
 
 
 
 
 
54
  )
55
 
56
  args = parser.parse_args()
@@ -84,7 +91,7 @@ def main():
84
  logging.info("Issuing embedding jobs...")
85
  chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
86
  embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
87
- embedder.embed_repo(args.chunks_per_batch)
88
 
89
  logging.info("Waiting for embeddings to be ready...")
90
  while not embedder.embeddings_are_ready():
 
47
  "--pinecone_index_name", required=True, help="Pinecone index name"
48
  )
49
  parser.add_argument(
50
+ "--include",
51
+ help="Path to a file containing a list of extensions to include. One extension per line.",
52
  )
53
  parser.add_argument(
54
+ "--exclude",
55
+ help="Path to a file containing a list of extensions to exclude. One extension per line.",
56
+ )
57
+ parser.add_argument(
58
+ "--max_embedding_jobs", type=int,
59
+ help="Maximum number of embedding jobs to run. Specifying this might result in "
60
+ "indexing only part of the repository, but prevents you from burning through OpenAI credits.",
61
  )
62
 
63
  args = parser.parse_args()
 
91
  logging.info("Issuing embedding jobs...")
92
  chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
93
  embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
94
+ embedder.embed_repo(args.chunks_per_batch, args.max_embedding_jobs)
95
 
96
  logging.info("Waiting for embeddings to be ready...")
97
  while not embedder.embeddings_are_ready():
src/repo_manager.py CHANGED
@@ -89,6 +89,8 @@ class RepoManager:
89
 
90
  def _should_include(self, file_path: str) -> bool:
91
  """Checks whether the file should be indexed, based on the included and excluded extensions."""
 
 
92
  _, extension = os.path.splitext(file_path)
93
  extension = extension.lower()
94
  if self.included_extensions and extension not in self.included_extensions:
 
89
 
90
  def _should_include(self, file_path: str) -> bool:
91
  """Checks whether the file should be indexed, based on the included and excluded extensions."""
92
+ if os.path.islink(file_path):
93
+ return False
94
  _, extension = os.path.splitext(file_path)
95
  extension = extension.lower()
96
  if self.included_extensions and extension not in self.included_extensions:
src/sample-exclude.txt CHANGED
@@ -41,6 +41,7 @@
41
  .pt
42
  .ptl
43
  .s
 
44
  .sqlite
45
  .stl
46
  .sum
 
41
  .pt
42
  .ptl
43
  .s
44
+ .so
45
  .sqlite
46
  .stl
47
  .sum