Spaces:
Running
Running
Support for indexing with Marqo
Browse files- requirements.txt +1 -0
- src/chunker.py +17 -0
- src/embedder.py +70 -21
- src/index.py +39 -6
requirements.txt
CHANGED
|
@@ -4,6 +4,7 @@ gradio==4.42.0
|
|
| 4 |
langchain==0.2.14
|
| 5 |
langchain-community==0.2.12
|
| 6 |
langchain-openai==0.1.22
|
|
|
|
| 7 |
nbformat==5.10.4
|
| 8 |
openai==1.42.0
|
| 9 |
pinecone==5.0.1
|
|
|
|
| 4 |
langchain==0.2.14
|
| 5 |
langchain-community==0.2.12
|
| 6 |
langchain-openai==0.1.22
|
| 7 |
+
marqo==3.7.0
|
| 8 |
nbformat==5.10.4
|
| 9 |
openai==1.42.0
|
| 10 |
pinecone==5.0.1
|
src/chunker.py
CHANGED
|
@@ -30,6 +30,23 @@ class Chunk:
|
|
| 30 |
"""The text content to be embedded. Might contain information beyond just the text snippet from the file."""
|
| 31 |
return self._content
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def populate_content(self, file_content: str):
|
| 34 |
"""Populates the content of the chunk with the file path and file content."""
|
| 35 |
self._content = (
|
|
|
|
| 30 |
"""The text content to be embedded. Might contain information beyond just the text snippet from the file."""
|
| 31 |
return self._content
|
| 32 |
|
| 33 |
+
@property
|
| 34 |
+
def to_dict(self):
|
| 35 |
+
"""Converts the chunk to a dictionary that can be passed to a vector store."""
|
| 36 |
+
# Some vector stores require the IDs to be ASCII.
|
| 37 |
+
filename_ascii = self.filename.encode("ascii", "ignore").decode("ascii")
|
| 38 |
+
return {
|
| 39 |
+
# Some vector stores require the IDs to be ASCII.
|
| 40 |
+
"id": f"{filename_ascii}_{self.start_byte}_{self.end_byte}",
|
| 41 |
+
"filename": self.filename,
|
| 42 |
+
"start_byte": self.start_byte,
|
| 43 |
+
"end_byte": self.end_byte,
|
| 44 |
+
# Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
|
| 45 |
+
# size limit. In that case, you can simply store the start/end bytes above, and fetch the content
|
| 46 |
+
# directly from the repository when needed.
|
| 47 |
+
"text": self.content,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
def populate_content(self, file_content: str):
|
| 51 |
"""Populates the content of the chunk with the file path and file content."""
|
| 52 |
self._content = (
|
src/embedder.py
CHANGED
|
@@ -11,6 +11,7 @@ from openai import OpenAI
|
|
| 11 |
|
| 12 |
from chunker import Chunk, Chunker
|
| 13 |
from repo_manager import RepoManager
|
|
|
|
| 14 |
|
| 15 |
Vector = Tuple[Dict, List[float]] # (metadata, embedding)
|
| 16 |
|
|
@@ -19,7 +20,7 @@ class BatchEmbedder(ABC):
|
|
| 19 |
"""Abstract class for batch embedding of a repository."""
|
| 20 |
|
| 21 |
@abstractmethod
|
| 22 |
-
def embed_repo(self, chunks_per_batch: int):
|
| 23 |
"""Issues batch embedding jobs for the entire repository."""
|
| 24 |
|
| 25 |
@abstractmethod
|
|
@@ -62,7 +63,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 62 |
openai_batch_id = self._issue_job_for_chunks(
|
| 63 |
sub_batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
|
| 64 |
)
|
| 65 |
-
self.openai_batch_ids[openai_batch_id] =
|
| 66 |
if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
|
| 67 |
logging.info("Reached the maximum number of embedding jobs. Stopping.")
|
| 68 |
return
|
|
@@ -71,7 +72,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 71 |
# Finally, commit the last batch.
|
| 72 |
if batch:
|
| 73 |
openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}")
|
| 74 |
-
self.openai_batch_ids[openai_batch_id] =
|
| 75 |
logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
|
| 76 |
|
| 77 |
# Save the job IDs to a file, just in case this script is terminated by mistake.
|
|
@@ -171,22 +172,70 @@ class OpenAIBatchEmbedder(BatchEmbedder):
|
|
| 171 |
},
|
| 172 |
}
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
)
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
from chunker import Chunk, Chunker
|
| 13 |
from repo_manager import RepoManager
|
| 14 |
+
import marqo
|
| 15 |
|
| 16 |
Vector = Tuple[Dict, List[float]] # (metadata, embedding)
|
| 17 |
|
|
|
|
| 20 |
"""Abstract class for batch embedding of a repository."""
|
| 21 |
|
| 22 |
@abstractmethod
|
| 23 |
+
def embed_repo(self, chunks_per_batch: int, max_embedding_jobs: int = None):
|
| 24 |
"""Issues batch embedding jobs for the entire repository."""
|
| 25 |
|
| 26 |
@abstractmethod
|
|
|
|
| 63 |
openai_batch_id = self._issue_job_for_chunks(
|
| 64 |
sub_batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}"
|
| 65 |
)
|
| 66 |
+
self.openai_batch_ids[openai_batch_id] = [chunk.to_dict for chunk in sub_batch]
|
| 67 |
if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
|
| 68 |
logging.info("Reached the maximum number of embedding jobs. Stopping.")
|
| 69 |
return
|
|
|
|
| 72 |
# Finally, commit the last batch.
|
| 73 |
if batch:
|
| 74 |
openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{repo_name}/{len(self.openai_batch_ids)}")
|
| 75 |
+
self.openai_batch_ids[openai_batch_id] = [chunk.to_dict for chunk in batch]
|
| 76 |
logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
|
| 77 |
|
| 78 |
# Save the job IDs to a file, just in case this script is terminated by mistake.
|
|
|
|
| 172 |
},
|
| 173 |
}
|
| 174 |
|
| 175 |
+
|
class MarqoEmbedder(BatchEmbedder):
    """Embedder that uses the open-source Marqo vector search engine.

    Embeddings can be stored locally (in which case `url` in the constructor should
    point to localhost) or in the cloud.
    """

    def __init__(self,
                 repo_manager: RepoManager,
                 chunker: Chunker,
                 index_name: str,
                 url: str,
                 model="hf/e5-base-v2"):
        self.repo_manager = repo_manager
        self.chunker = chunker
        self.client = marqo.Client(url=url)
        self.index = self.client.index(index_name)

        # Create the index on first use; Marqo binds the embedding model at creation time.
        all_index_names = [result["indexName"] for result in self.client.get_indexes()["results"]]
        if index_name not in all_index_names:
            self.client.create_index(index_name, model=model)

    def embed_repo(self, chunks_per_batch: int, max_embedding_jobs: int = None):
        """Issues batch embedding jobs for the entire repository.

        Args:
            chunks_per_batch: Number of chunks per `add_documents` call (Marqo caps this at 64).
            max_embedding_jobs: Optional cap on the number of indexing calls; indexing stops
                early once reached, which may leave part of the repository unindexed.

        Raises:
            ValueError: If `chunks_per_batch` exceeds Marqo's limit of 64.
        """
        if chunks_per_batch > 64:
            raise ValueError("Marqo enforces a limit of 64 chunks per batch.")

        chunk_count = 0
        batch = []
        # Bugfix: the original checked `len(self.openai_batch_ids)` here, an attribute that
        # only OpenAIBatchEmbedder defines — passing max_embedding_jobs would raise
        # AttributeError. Track the number of issued jobs locally instead.
        job_count = 0

        for filepath, content in self.repo_manager.walk():
            chunks = self.chunker.chunk(filepath, content)
            chunk_count += len(chunks)
            batch.extend(chunks)

            if len(batch) > chunks_per_batch:
                for i in range(0, len(batch), chunks_per_batch):
                    sub_batch = batch[i : i + chunks_per_batch]
                    logging.info("Indexing %d chunks...", len(sub_batch))
                    # `chunk.to_dict` is a property, hence no parentheses.
                    self.index.add_documents(
                        documents=[chunk.to_dict for chunk in sub_batch],
                        tensor_fields=["text"]
                    )
                    job_count += 1

                    if max_embedding_jobs and job_count >= max_embedding_jobs:
                        logging.info("Reached the maximum number of embedding jobs. Stopping.")
                        return
                batch = []

        # Finally, commit the last batch.
        if batch:
            self.index.add_documents(
                documents=[chunk.to_dict for chunk in batch],
                tensor_fields=["text"]
            )
        logging.info(f"Successfully embedded {chunk_count} chunks.")

    def embeddings_are_ready(self) -> bool:
        """Checks whether the batch embedding jobs are done."""
        # Marqo indexes documents synchronously, so once embed_repo() returns, the embeddings are ready.
        return True

    def download_embeddings(self) -> Generator[Vector, None, None]:
        """Yields (chunk_metadata, embedding) pairs for each chunk in the repository."""
        # Marqo stores embeddings as they are created, so they're already in the vector store. No need to download them
        # as we would with e.g. OpenAI, Cohere, or some other cloud-based embedding service.
        return []
|
src/index.py
CHANGED
|
@@ -5,7 +5,7 @@ import logging
|
|
| 5 |
import time
|
| 6 |
|
| 7 |
from chunker import UniversalChunker
|
| 8 |
-
from embedder import OpenAIBatchEmbedder
|
| 9 |
from repo_manager import RepoManager
|
| 10 |
from vector_store import PineconeVectorStore
|
| 11 |
|
|
@@ -29,6 +29,8 @@ def _read_extensions(path):
|
|
| 29 |
def main():
|
| 30 |
parser = argparse.ArgumentParser(description="Batch-embeds a repository")
|
| 31 |
parser.add_argument("repo_id", help="The ID of the repository to index")
|
|
|
|
|
|
|
| 32 |
parser.add_argument(
|
| 33 |
"--local_dir",
|
| 34 |
default="repos",
|
|
@@ -44,7 +46,7 @@ def main():
|
|
| 44 |
"--chunks_per_batch", type=int, default=2000, help="Maximum chunks per batch"
|
| 45 |
)
|
| 46 |
parser.add_argument(
|
| 47 |
-
"--
|
| 48 |
)
|
| 49 |
parser.add_argument(
|
| 50 |
"--include",
|
|
@@ -60,10 +62,25 @@ def main():
|
|
| 60 |
help="Maximum number of embedding jobs to run. Specifying this might result in "
|
| 61 |
"indexing only part of the repository, but prevents you from burning through OpenAI credits.",
|
| 62 |
)
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
args = parser.parse_args()
|
| 65 |
|
| 66 |
-
# Validate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
if args.tokens_per_chunk > MAX_TOKENS_PER_CHUNK:
|
| 68 |
parser.error(
|
| 69 |
f"The maximum number of tokens per chunk is {MAX_TOKENS_PER_CHUNK}."
|
|
@@ -91,9 +108,25 @@ def main():
|
|
| 91 |
|
| 92 |
logging.info("Issuing embedding jobs...")
|
| 93 |
chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
embedder.embed_repo(args.chunks_per_batch, args.max_embedding_jobs)
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
logging.info("Waiting for embeddings to be ready...")
|
| 98 |
while not embedder.embeddings_are_ready():
|
| 99 |
logging.info("Sleeping for 30 seconds...")
|
|
@@ -102,7 +135,7 @@ def main():
|
|
| 102 |
logging.info("Moving embeddings to the vector store...")
|
| 103 |
# Note to developer: Replace this with your preferred vector store.
|
| 104 |
vector_store = PineconeVectorStore(
|
| 105 |
-
index_name=args.
|
| 106 |
dimension=OPENAI_EMBEDDING_SIZE,
|
| 107 |
namespace=repo_manager.repo_id,
|
| 108 |
)
|
|
|
|
| 5 |
import time
|
| 6 |
|
| 7 |
from chunker import UniversalChunker
|
| 8 |
+
from embedder import OpenAIBatchEmbedder, MarqoEmbedder
|
| 9 |
from repo_manager import RepoManager
|
| 10 |
from vector_store import PineconeVectorStore
|
| 11 |
|
|
|
|
| 29 |
def main():
|
| 30 |
parser = argparse.ArgumentParser(description="Batch-embeds a repository")
|
| 31 |
parser.add_argument("repo_id", help="The ID of the repository to index")
|
| 32 |
+
parser.add_argument("--embedder_type", default="openai", choices=["openai", "marqo"])
|
| 33 |
+
parser.add_argument("--vector_store_type", default="pinecone", choices=["pinecone", "marqo"])
|
| 34 |
parser.add_argument(
|
| 35 |
"--local_dir",
|
| 36 |
default="repos",
|
|
|
|
| 46 |
"--chunks_per_batch", type=int, default=2000, help="Maximum chunks per batch"
|
| 47 |
)
|
| 48 |
parser.add_argument(
|
| 49 |
+
"--index_name", required=True, help="Vector store index name"
|
| 50 |
)
|
| 51 |
parser.add_argument(
|
| 52 |
"--include",
|
|
|
|
| 62 |
help="Maximum number of embedding jobs to run. Specifying this might result in "
|
| 63 |
"indexing only part of the repository, but prevents you from burning through OpenAI credits.",
|
| 64 |
)
|
| 65 |
+
parser.add_argument(
|
| 66 |
+
"--marqo_url",
|
| 67 |
+
default="http://localhost:8882",
|
| 68 |
+
help="URL for the Marqo server. Required if using Marqo as embedder or vector store.",
|
| 69 |
+
)
|
| 70 |
+
parser.add_argument(
|
| 71 |
+
"--marqo_embedding_model",
|
| 72 |
+
default="hf/e5-base-v2",
|
| 73 |
+
help="The embedding model to use for Marqo.",
|
| 74 |
+
)
|
| 75 |
args = parser.parse_args()
|
| 76 |
|
| 77 |
+
# Validate embedder and vector store compatibility.
|
| 78 |
+
if args.embedder_type == "openai" and args.vector_store_type != "pinecone":
|
| 79 |
+
parser.error("When using OpenAI embedder, the vector store type must be Pinecone.")
|
| 80 |
+
if args.embedder_type == "marqo" and args.vector_store_type != "marqo":
|
| 81 |
+
parser.error("When using the marqo embedder, the vector store type must also be marqo.")
|
| 82 |
+
|
| 83 |
+
# Validate other arguments.
|
| 84 |
if args.tokens_per_chunk > MAX_TOKENS_PER_CHUNK:
|
| 85 |
parser.error(
|
| 86 |
f"The maximum number of tokens per chunk is {MAX_TOKENS_PER_CHUNK}."
|
|
|
|
| 108 |
|
| 109 |
logging.info("Issuing embedding jobs...")
|
| 110 |
chunker = UniversalChunker(max_tokens=args.tokens_per_chunk)
|
| 111 |
+
|
| 112 |
+
if args.embedder_type == "openai":
|
| 113 |
+
embedder = OpenAIBatchEmbedder(repo_manager, chunker, args.local_dir)
|
| 114 |
+
elif args.embedder_type == "marqo":
|
| 115 |
+
embedder = MarqoEmbedder(repo_manager,
|
| 116 |
+
chunker,
|
| 117 |
+
index_name=args.index_name,
|
| 118 |
+
url=args.marqo_url,
|
| 119 |
+
model=args.marqo_embedding_model)
|
| 120 |
+
else:
|
| 121 |
+
raise ValueError(f"Unrecognized embedder type {args.embedder_type}")
|
| 122 |
+
|
| 123 |
embedder.embed_repo(args.chunks_per_batch, args.max_embedding_jobs)
|
| 124 |
|
| 125 |
+
if args.vector_store_type == "marqo":
|
| 126 |
+
# Marqo computes embeddings and stores them in the vector store at once, so we're done.
|
| 127 |
+
logging.info("Done!")
|
| 128 |
+
return
|
| 129 |
+
|
| 130 |
logging.info("Waiting for embeddings to be ready...")
|
| 131 |
while not embedder.embeddings_are_ready():
|
| 132 |
logging.info("Sleeping for 30 seconds...")
|
|
|
|
| 135 |
logging.info("Moving embeddings to the vector store...")
|
| 136 |
# Note to developer: Replace this with your preferred vector store.
|
| 137 |
vector_store = PineconeVectorStore(
|
| 138 |
+
index_name=args.index_name,
|
| 139 |
dimension=OPENAI_EMBEDDING_SIZE,
|
| 140 |
namespace=repo_manager.repo_id,
|
| 141 |
)
|