juliaturc commited on
Commit
39898b4
·
1 Parent(s): 90af3bf

Add option to not store file chunk content in the vector store

Browse files
README.md CHANGED
@@ -104,7 +104,7 @@ If you are planning on indexing GitHub issues in addition to the codebase, you w
104
  2. Index the repository. This might take a few minutes, depending on its size.
105
  ```
106
  r2v-index $GITHUB_REPO \
107
- --embedder-type=openai
108
  --vector-store=pinecone \
109
  --index-name=$PINECONE_INDEX_NAME
110
  ```
 
104
  2. Index the repository. This might take a few minutes, depending on its size.
105
  ```
106
  r2v-index $GITHUB_REPO \
107
+ --embedder-type=openai \
108
  --vector-store=pinecone \
109
  --index-name=$PINECONE_INDEX_NAME
110
  ```
repo2vec/chat.py CHANGED
@@ -125,5 +125,6 @@ def main():
125
  examples=["What does this repo do?", "Give me some sample code."],
126
  ).launch(share=args.share)
127
 
 
128
  if __name__ == "__main__":
129
  main()
 
125
  examples=["What does this repo do?", "Give me some sample code."],
126
  ).launch(share=args.share)
127
 
128
+
129
  if __name__ == "__main__":
130
  main()
repo2vec/chunker.py CHANGED
@@ -31,7 +31,7 @@ class Chunk:
31
  class FileChunk(Chunk):
32
  """A chunk of code or text extracted from a file in the repository."""
33
 
34
- file_content: str # The content of the entire file, not just this chunk.
35
  file_metadata: Dict # Metadata of the entire file, not just this chunk.
36
  start_byte: int
37
  end_byte: int
@@ -57,6 +57,7 @@ class FileChunk(Chunk):
57
  "id": f"{filename_ascii}_{self.start_byte}_{self.end_byte}",
58
  "start_byte": self.start_byte,
59
  "end_byte": self.end_byte,
 
60
  # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
61
  # size limit. In that case, you can simply store the start/end bytes above, and fetch the content
62
  # directly from the repository when needed.
@@ -202,7 +203,9 @@ class CodeFileChunker(Chunker):
202
  for chunk in file_chunks:
203
  # Make sure that the chunk has content and doesn't exceed the max_tokens limit. Otherwise there must be
204
  # a bug in the code.
205
- assert chunk.num_tokens <= self.max_tokens, f"Chunk size {chunk.num_tokens} exceeds max_tokens {self.max_tokens}."
 
 
206
 
207
  return file_chunks
208
 
 
31
  class FileChunk(Chunk):
32
  """A chunk of code or text extracted from a file in the repository."""
33
 
34
+ file_content: str # The content of the entire file, not just this chunk.
35
  file_metadata: Dict # Metadata of the entire file, not just this chunk.
36
  start_byte: int
37
  end_byte: int
 
57
  "id": f"{filename_ascii}_{self.start_byte}_{self.end_byte}",
58
  "start_byte": self.start_byte,
59
  "end_byte": self.end_byte,
60
+ "length": self.end_byte - self.start_byte,
61
  # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
62
  # size limit. In that case, you can simply store the start/end bytes above, and fetch the content
63
  # directly from the repository when needed.
 
203
  for chunk in file_chunks:
204
  # Make sure that the chunk has content and doesn't exceed the max_tokens limit. Otherwise there must be
205
  # a bug in the code.
206
+ assert (
207
+ chunk.num_tokens <= self.max_tokens
208
+ ), f"Chunk size {chunk.num_tokens} exceeds max_tokens {self.max_tokens}."
209
 
210
  return file_chunks
211
 
repo2vec/data_manager.py CHANGED
@@ -155,15 +155,15 @@ class GitHubRepoManager(DataManager):
155
 
156
  if self.inclusions:
157
  return (
158
- extension in self.inclusions.get("ext", []) or
159
- file_name in self.inclusions.get("file", []) or
160
- any(d in dirs for d in self.inclusions.get("dir", []))
161
  )
162
  elif self.exclusions:
163
  return (
164
- extension not in self.exclusions.get("ext", []) and
165
- file_name not in self.exclusions.get("file", []) and
166
- all(d not in dirs for d in self.exclusions.get("dir", []))
167
  )
168
  return True
169
 
 
155
 
156
  if self.inclusions:
157
  return (
158
+ extension in self.inclusions.get("ext", [])
159
+ or file_name in self.inclusions.get("file", [])
160
+ or any(d in dirs for d in self.inclusions.get("dir", []))
161
  )
162
  elif self.exclusions:
163
  return (
164
+ extension not in self.exclusions.get("ext", [])
165
+ and file_name not in self.exclusions.get("file", [])
166
+ and all(d not in dirs for d in self.exclusions.get("dir", []))
167
  )
168
  return True
169
 
repo2vec/embedder.py CHANGED
@@ -3,6 +3,7 @@
3
  import json
4
  import logging
5
  import os
 
6
  from abc import ABC, abstractmethod
7
  from collections import Counter
8
  from typing import Dict, Generator, List, Optional, Tuple
@@ -43,18 +44,14 @@ class OpenAIBatchEmbedder(BatchEmbedder):
43
  self.local_dir = local_dir
44
  self.embedding_model = embedding_model
45
  self.embedding_size = embedding_size
46
- # IDs issued by OpenAI for each batch job mapped to metadata about the chunks.
47
- self.openai_batch_ids = {}
48
  self.client = OpenAI()
49
 
50
- def embed_dataset(self, chunks_per_batch: int, max_embedding_jobs: int = None):
51
- """Issues batch embedding jobs for the entire dataset."""
52
- if self.openai_batch_ids:
53
- raise ValueError("Embeddings are in progress.")
54
-
55
  batch = []
 
56
  chunk_count = 0
57
- dataset_name = self.data_manager.dataset_id.split("/")[-1]
58
 
59
  for content, metadata in self.data_manager.walk():
60
  chunks = self.chunker.chunk(content, metadata)
@@ -64,41 +61,58 @@ class OpenAIBatchEmbedder(BatchEmbedder):
64
  if len(batch) > chunks_per_batch:
65
  for i in range(0, len(batch), chunks_per_batch):
66
  sub_batch = batch[i : i + chunks_per_batch]
67
- openai_batch_id = self._issue_job_for_chunks(
68
- sub_batch, batch_id=f"{dataset_name}/{len(self.openai_batch_ids)}"
69
- )
70
- self.openai_batch_ids[openai_batch_id] = [chunk.metadata for chunk in sub_batch]
71
- if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
72
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
73
  return
74
  batch = []
75
 
76
  # Finally, commit the last batch.
77
  if batch:
78
- openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{dataset_name}/{len(self.openai_batch_ids)}")
79
- self.openai_batch_ids[openai_batch_id] = [chunk.metadata for chunk in batch]
80
- logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
81
 
82
- # Save the job IDs to a file, just in case this script is terminated by mistake.
83
- metadata_file = os.path.join(self.local_dir, "openai_batch_ids.json")
84
  with open(metadata_file, "w") as f:
85
- json.dump(self.openai_batch_ids, f)
86
  logging.info("Job metadata saved at %s", metadata_file)
 
87
 
88
- def embeddings_are_ready(self) -> bool:
89
- """Checks whether the embeddings jobs are done (either completed or failed)."""
90
- if not self.openai_batch_ids:
91
- raise ValueError("No embeddings in progress.")
92
- job_ids = self.openai_batch_ids.keys()
 
 
 
 
 
93
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
94
  are_ready = all(status.status in ["completed", "failed"] for status in statuses)
95
  status_counts = Counter(status.status for status in statuses)
96
  logging.info("Job statuses: %s", status_counts)
97
  return are_ready
98
 
99
- def download_embeddings(self) -> Generator[Vector, None, None]:
100
- """Yield a (chunk_metadata, embedding) pair for each chunk in the dataset."""
101
- job_ids = self.openai_batch_ids.keys()
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
103
 
104
  for idx, status in enumerate(statuses):
@@ -111,7 +125,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
111
  logging.error("Job %s failed with error: %s", status.id, error.text)
112
  continue
113
 
114
- batch_metadata = self.openai_batch_ids[status.id]
115
  file_response = self.client.files.content(status.output_file_id)
116
  data = json.loads(file_response.text)["response"]["body"]["data"]
117
  logging.info("Job %s generated %d embeddings.", status.id, len(data))
@@ -119,6 +133,13 @@ class OpenAIBatchEmbedder(BatchEmbedder):
119
  for datum in data:
120
  idx = int(datum["index"])
121
  metadata = batch_metadata[idx]
 
 
 
 
 
 
 
122
  embedding = datum["embedding"]
123
  yield (metadata, embedding)
124
 
@@ -206,6 +227,7 @@ class MarqoEmbedder(BatchEmbedder):
206
 
207
  chunk_count = 0
208
  batch = []
 
209
 
210
  for content, metadata in self.data_manager.walk():
211
  chunks = self.chunker.chunk(content, metadata)
@@ -220,8 +242,9 @@ class MarqoEmbedder(BatchEmbedder):
220
  documents=[chunk.metadata for chunk in sub_batch],
221
  tensor_fields=["text"],
222
  )
 
223
 
224
- if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
225
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
226
  return
227
  batch = []
 
3
  import json
4
  import logging
5
  import os
6
+ import time
7
  from abc import ABC, abstractmethod
8
  from collections import Counter
9
  from typing import Dict, Generator, List, Optional, Tuple
 
44
  self.local_dir = local_dir
45
  self.embedding_model = embedding_model
46
  self.embedding_size = embedding_size
 
 
47
  self.client = OpenAI()
48
 
49
+ def embed_dataset(self, chunks_per_batch: int, max_embedding_jobs: int = None) -> str:
50
+ """Issues batch embedding jobs for the entire dataset. Returns the filename containing the job IDs."""
 
 
 
51
  batch = []
52
+ batch_ids = {} # job_id -> metadata
53
  chunk_count = 0
54
+ dataset_name = self.data_manager.dataset_id.replace("/", "_")
55
 
56
  for content, metadata in self.data_manager.walk():
57
  chunks = self.chunker.chunk(content, metadata)
 
61
  if len(batch) > chunks_per_batch:
62
  for i in range(0, len(batch), chunks_per_batch):
63
  sub_batch = batch[i : i + chunks_per_batch]
64
+ openai_batch_id = self._issue_job_for_chunks(sub_batch, batch_id=f"{dataset_name}/{len(batch_ids)}")
65
+ batch_ids[openai_batch_id] = [chunk.metadata for chunk in sub_batch]
66
+ if max_embedding_jobs and len(batch_ids) >= max_embedding_jobs:
 
 
67
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
68
  return
69
  batch = []
70
 
71
  # Finally, commit the last batch.
72
  if batch:
73
+ openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{dataset_name}/{len(batch_ids)}")
74
+ batch_ids[openai_batch_id] = [chunk.metadata for chunk in batch]
75
+ logging.info("Issued %d jobs for %d chunks.", len(batch_ids), chunk_count)
76
 
77
+ timestamp = int(time.time())
78
+ metadata_file = os.path.join(self.local_dir, f"{dataset_name}_openai_batch_ids_{timestamp}.json")
79
  with open(metadata_file, "w") as f:
80
+ json.dump(batch_ids, f)
81
  logging.info("Job metadata saved at %s", metadata_file)
82
+ return metadata_file
83
 
84
+ def embeddings_are_ready(self, metadata_file: str) -> bool:
85
+ """Checks whether the embedding jobs are done (either completed or failed).
86
+
87
+ Args:
88
+ metadata_file: Path to the file containing the job metadata (output of self.embed_dataset).
89
+ """
90
+ with open(metadata_file, "r") as f:
91
+ batch_ids = json.load(f)
92
+
93
+ job_ids = batch_ids.keys()
94
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
95
  are_ready = all(status.status in ["completed", "failed"] for status in statuses)
96
  status_counts = Counter(status.status for status in statuses)
97
  logging.info("Job statuses: %s", status_counts)
98
  return are_ready
99
 
100
+ def download_embeddings(
101
+ self, metadata_file: str, store_file_chunk_content: bool = True
102
+ ) -> Generator[Vector, None, None]:
103
+ """Yields a (chunk_metadata, embedding) pair for each chunk in the dataset.
104
+
105
+ Args:
106
+ metadata_file: Path to the file containing the job metadata (output of self.embed_dataset).
107
+ store_file_chunk_content: Whether to store the text content in the metadata for file chunks. Set this to
108
+ False if you want to save space in the vector store. After retrieval, the content of a file chunk can be
109
+ reconstructed based on the file_path, start_byte and end_byte fields in the metadata. This will not
110
+ affect other types of chunks (e.g. GitHub issues) for which the content is harder to reconstruct.
111
+ """
112
+ with open(metadata_file, "r") as f:
113
+ batch_ids = json.load(f)
114
+
115
+ job_ids = batch_ids.keys()
116
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
117
 
118
  for idx, status in enumerate(statuses):
 
125
  logging.error("Job %s failed with error: %s", status.id, error.text)
126
  continue
127
 
128
+ batch_metadata = batch_ids[status.id]
129
  file_response = self.client.files.content(status.output_file_id)
130
  data = json.loads(file_response.text)["response"]["body"]["data"]
131
  logging.info("Job %s generated %d embeddings.", status.id, len(data))
 
133
  for datum in data:
134
  idx = int(datum["index"])
135
  metadata = batch_metadata[idx]
136
+ if (
137
+ not store_file_chunk_content
138
+ and "file_path" in metadata
139
+ and "start_byte" in metadata
140
+ and "end_byte" in metadata
141
+ ):
142
+ metadata.pop("text", None)
143
  embedding = datum["embedding"]
144
  yield (metadata, embedding)
145
 
 
227
 
228
  chunk_count = 0
229
  batch = []
230
+ job_count = 0
231
 
232
  for content, metadata in self.data_manager.walk():
233
  chunks = self.chunker.chunk(content, metadata)
 
242
  documents=[chunk.metadata for chunk in sub_batch],
243
  tensor_fields=["text"],
244
  )
245
+ job_count += 1
246
 
247
+ if max_embedding_jobs and job_count >= max_embedding_jobs:
248
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
249
  return
250
  batch = []
repo2vec/github.py CHANGED
@@ -1,10 +1,10 @@
1
  """GitHub-specific implementations for DataManager and Chunker."""
2
 
 
3
  import os
4
  from dataclasses import dataclass
5
  from typing import Any, Dict, Generator, List, Tuple
6
 
7
- import logging
8
  import requests
9
  import tiktoken
10
 
@@ -234,7 +234,8 @@ class GitHubIssuesChunker(Chunker):
234
  issue=issue,
235
  start_comment=comment_idx,
236
  end_comment=comment_idx + 1,
237
- ))
 
238
  else:
239
  # Add the comment to the existing chunk.
240
  chunks[-1].end_comment = comment_idx + 1
 
1
  """GitHub-specific implementations for DataManager and Chunker."""
2
 
3
+ import logging
4
  import os
5
  from dataclasses import dataclass
6
  from typing import Any, Dict, Generator, List, Tuple
7
 
 
8
  import requests
9
  import tiktoken
10
 
 
234
  issue=issue,
235
  start_comment=comment_idx,
236
  end_comment=comment_idx + 1,
237
+ )
238
+ )
239
  else:
240
  # Add the comment to the existing chunk.
241
  chunks[-1].end_comment = comment_idx + 1
repo2vec/index.py CHANGED
@@ -3,9 +3,10 @@
3
  import argparse
4
  import logging
5
  import os
6
- import pkg_resources
7
  import time
8
 
 
 
9
  from repo2vec.chunker import UniversalFileChunker
10
  from repo2vec.data_manager import GitHubRepoManager
11
  from repo2vec.embedder import build_batch_embedder_from_flags
@@ -202,7 +203,7 @@ def main():
202
  logging.info("Embedding the repo...")
203
  chunker = UniversalFileChunker(max_tokens=args.tokens_per_chunk)
204
  repo_embedder = build_batch_embedder_from_flags(repo_manager, chunker, args)
205
- repo_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
206
 
207
  # Index the GitHub issues.
208
  issues_embedder = None
@@ -213,7 +214,7 @@ def main():
213
  logging.info("Embedding GitHub issues...")
214
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
215
  issues_embedder = build_batch_embedder_from_flags(issues_manager, chunker, args)
216
- issues_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
217
 
218
  ########################
219
  # Step 2: Vector Store #
@@ -226,25 +227,25 @@ def main():
226
 
227
  if repo_embedder is not None:
228
  logging.info("Waiting for repo embeddings to be ready...")
229
- while not repo_embedder.embeddings_are_ready():
230
  logging.info("Sleeping for 30 seconds...")
231
  time.sleep(30)
232
 
233
  logging.info("Moving embeddings to the repo vector store...")
234
  repo_vector_store = build_from_args(args)
235
  repo_vector_store.ensure_exists()
236
- repo_vector_store.upsert(repo_embedder.download_embeddings())
237
 
238
  if issues_embedder is not None:
239
  logging.info("Waiting for issue embeddings to be ready...")
240
- while not issues_embedder.embeddings_are_ready():
241
  logging.info("Sleeping for 30 seconds...")
242
  time.sleep(30)
243
 
244
  logging.info("Moving embeddings to the issues vector store...")
245
  issues_vector_store = build_from_args(args)
246
  issues_vector_store.ensure_exists()
247
- issues_vector_store.upsert(issues_embedder.download_embeddings())
248
 
249
  logging.info("Done!")
250
 
 
3
  import argparse
4
  import logging
5
  import os
 
6
  import time
7
 
8
+ import pkg_resources
9
+
10
  from repo2vec.chunker import UniversalFileChunker
11
  from repo2vec.data_manager import GitHubRepoManager
12
  from repo2vec.embedder import build_batch_embedder_from_flags
 
203
  logging.info("Embedding the repo...")
204
  chunker = UniversalFileChunker(max_tokens=args.tokens_per_chunk)
205
  repo_embedder = build_batch_embedder_from_flags(repo_manager, chunker, args)
206
+ repo_jobs_file = repo_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
207
 
208
  # Index the GitHub issues.
209
  issues_embedder = None
 
214
  logging.info("Embedding GitHub issues...")
215
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
216
  issues_embedder = build_batch_embedder_from_flags(issues_manager, chunker, args)
217
+ issues_jobs_file = issues_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
218
 
219
  ########################
220
  # Step 2: Vector Store #
 
227
 
228
  if repo_embedder is not None:
229
  logging.info("Waiting for repo embeddings to be ready...")
230
+ while not repo_embedder.embeddings_are_ready(repo_jobs_file):
231
  logging.info("Sleeping for 30 seconds...")
232
  time.sleep(30)
233
 
234
  logging.info("Moving embeddings to the repo vector store...")
235
  repo_vector_store = build_from_args(args)
236
  repo_vector_store.ensure_exists()
237
+ repo_vector_store.upsert(repo_embedder.download_embeddings(repo_jobs_file))
238
 
239
  if issues_embedder is not None:
240
  logging.info("Waiting for issue embeddings to be ready...")
241
+ while not issues_embedder.embeddings_are_ready(issues_jobs_file):
242
  logging.info("Sleeping for 30 seconds...")
243
  time.sleep(30)
244
 
245
  logging.info("Moving embeddings to the issues vector store...")
246
  issues_vector_store = build_from_args(args)
247
  issues_vector_store.ensure_exists()
248
+ issues_vector_store.upsert(issues_embedder.download_embeddings(issues_jobs_file))
249
 
250
  logging.info("Done!")
251
 
repo2vec/vector_store.py CHANGED
@@ -4,7 +4,8 @@ from abc import ABC, abstractmethod
4
  from typing import Dict, Generator, List, Tuple
5
 
6
  import marqo
7
- from langchain_community.vectorstores import Marqo, Pinecone as LangChainPinecone
 
8
  from langchain_core.documents import Document
9
  from langchain_openai import OpenAIEmbeddings
10
  from pinecone import Pinecone
 
4
  from typing import Dict, Generator, List, Tuple
5
 
6
  import marqo
7
+ from langchain_community.vectorstores import Marqo
8
+ from langchain_community.vectorstores import Pinecone as LangChainPinecone
9
  from langchain_core.documents import Document
10
  from langchain_openai import OpenAIEmbeddings
11
  from pinecone import Pinecone
setup.py CHANGED
@@ -1,9 +1,11 @@
1
- from setuptools import setup, find_packages
 
2
 
3
  def readfile(filename):
4
- with open(filename, 'r+') as f:
5
  return f.read()
6
 
 
7
  setup(
8
  name="repo2vec",
9
  version="0.1.6",
@@ -30,5 +32,5 @@ setup(
30
  "License :: OSI Approved :: MIT License",
31
  "Operating System :: OS Independent",
32
  ],
33
- python_requires='>=3.9',
34
- )
 
1
+ from setuptools import find_packages, setup
2
+
3
 
4
  def readfile(filename):
5
+ with open(filename, "r+") as f:
6
  return f.read()
7
 
8
+
9
  setup(
10
  name="repo2vec",
11
  version="0.1.6",
 
32
  "License :: OSI Approved :: MIT License",
33
  "Operating System :: OS Independent",
34
  ],
35
+ python_requires=">=3.9",
36
+ )
tests/conftest.py CHANGED
@@ -1,4 +1,4 @@
1
- import sys
2
  import os
 
3
 
4
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../repo2vec')))
 
 
1
  import os
2
+ import sys
3
 
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../repo2vec")))