juliaturc commited on
Commit
39898b4
·
1 Parent(s): 90af3bf

Add option to not store file chunk content in the vector store

Browse files
README.md CHANGED
@@ -104,7 +104,7 @@ If you are planning on indexing GitHub issues in addition to the codebase, you w
104
  2. Index the repository. This might take a few minutes, depending on its size.
105
  ```
106
  r2v-index $GITHUB_REPO \
107
- --embedder-type=openai
108
  --vector-store=pinecone \
109
  --index-name=$PINECONE_INDEX_NAME
110
  ```
 
104
  2. Index the repository. This might take a few minutes, depending on its size.
105
  ```
106
  r2v-index $GITHUB_REPO \
107
+ --embedder-type=openai \
108
  --vector-store=pinecone \
109
  --index-name=$PINECONE_INDEX_NAME
110
  ```
repo2vec/chat.py CHANGED
@@ -125,5 +125,6 @@ def main():
125
  examples=["What does this repo do?", "Give me some sample code."],
126
  ).launch(share=args.share)
127
 
 
128
  if __name__ == "__main__":
129
  main()
 
125
  examples=["What does this repo do?", "Give me some sample code."],
126
  ).launch(share=args.share)
127
 
128
+
129
  if __name__ == "__main__":
130
  main()
repo2vec/chunker.py CHANGED
@@ -31,7 +31,7 @@ class Chunk:
31
  class FileChunk(Chunk):
32
  """A chunk of code or text extracted from a file in the repository."""
33
 
34
- file_content: str # The content of the entire file, not just this chunk.
35
  file_metadata: Dict # Metadata of the entire file, not just this chunk.
36
  start_byte: int
37
  end_byte: int
@@ -57,6 +57,7 @@ class FileChunk(Chunk):
57
  "id": f"{filename_ascii}_{self.start_byte}_{self.end_byte}",
58
  "start_byte": self.start_byte,
59
  "end_byte": self.end_byte,
 
60
  # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
61
  # size limit. In that case, you can simply store the start/end bytes above, and fetch the content
62
  # directly from the repository when needed.
@@ -202,7 +203,9 @@ class CodeFileChunker(Chunker):
202
  for chunk in file_chunks:
203
  # Make sure that the chunk has content and doesn't exceed the max_tokens limit. Otherwise there must be
204
  # a bug in the code.
205
- assert chunk.num_tokens <= self.max_tokens, f"Chunk size {chunk.num_tokens} exceeds max_tokens {self.max_tokens}."
 
 
206
 
207
  return file_chunks
208
 
 
31
  class FileChunk(Chunk):
32
  """A chunk of code or text extracted from a file in the repository."""
33
 
34
+ file_content: str # The content of the entire file, not just this chunk.
35
  file_metadata: Dict # Metadata of the entire file, not just this chunk.
36
  start_byte: int
37
  end_byte: int
 
57
  "id": f"{filename_ascii}_{self.start_byte}_{self.end_byte}",
58
  "start_byte": self.start_byte,
59
  "end_byte": self.end_byte,
60
+ "length": self.end_byte - self.start_byte,
61
  # Note to developer: When choosing a large chunk size, you might exceed the vector store's metadata
62
  # size limit. In that case, you can simply store the start/end bytes above, and fetch the content
63
  # directly from the repository when needed.
 
203
  for chunk in file_chunks:
204
  # Make sure that the chunk has content and doesn't exceed the max_tokens limit. Otherwise there must be
205
  # a bug in the code.
206
+ assert (
207
+ chunk.num_tokens <= self.max_tokens
208
+ ), f"Chunk size {chunk.num_tokens} exceeds max_tokens {self.max_tokens}."
209
 
210
  return file_chunks
211
 
repo2vec/data_manager.py CHANGED
@@ -155,15 +155,15 @@ class GitHubRepoManager(DataManager):
155
 
156
  if self.inclusions:
157
  return (
158
- extension in self.inclusions.get("ext", []) or
159
- file_name in self.inclusions.get("file", []) or
160
- any(d in dirs for d in self.inclusions.get("dir", []))
161
  )
162
  elif self.exclusions:
163
  return (
164
- extension not in self.exclusions.get("ext", []) and
165
- file_name not in self.exclusions.get("file", []) and
166
- all(d not in dirs for d in self.exclusions.get("dir", []))
167
  )
168
  return True
169
 
 
155
 
156
  if self.inclusions:
157
  return (
158
+ extension in self.inclusions.get("ext", [])
159
+ or file_name in self.inclusions.get("file", [])
160
+ or any(d in dirs for d in self.inclusions.get("dir", []))
161
  )
162
  elif self.exclusions:
163
  return (
164
+ extension not in self.exclusions.get("ext", [])
165
+ and file_name not in self.exclusions.get("file", [])
166
+ and all(d not in dirs for d in self.exclusions.get("dir", []))
167
  )
168
  return True
169
 
repo2vec/embedder.py CHANGED
@@ -3,6 +3,7 @@
3
  import json
4
  import logging
5
  import os
 
6
  from abc import ABC, abstractmethod
7
  from collections import Counter
8
  from typing import Dict, Generator, List, Optional, Tuple
@@ -43,18 +44,14 @@ class OpenAIBatchEmbedder(BatchEmbedder):
43
  self.local_dir = local_dir
44
  self.embedding_model = embedding_model
45
  self.embedding_size = embedding_size
46
- # IDs issued by OpenAI for each batch job mapped to metadata about the chunks.
47
- self.openai_batch_ids = {}
48
  self.client = OpenAI()
49
 
50
- def embed_dataset(self, chunks_per_batch: int, max_embedding_jobs: int = None):
51
- """Issues batch embedding jobs for the entire dataset."""
52
- if self.openai_batch_ids:
53
- raise ValueError("Embeddings are in progress.")
54
-
55
  batch = []
 
56
  chunk_count = 0
57
- dataset_name = self.data_manager.dataset_id.split("/")[-1]
58
 
59
  for content, metadata in self.data_manager.walk():
60
  chunks = self.chunker.chunk(content, metadata)
@@ -64,41 +61,58 @@ class OpenAIBatchEmbedder(BatchEmbedder):
64
  if len(batch) > chunks_per_batch:
65
  for i in range(0, len(batch), chunks_per_batch):
66
  sub_batch = batch[i : i + chunks_per_batch]
67
- openai_batch_id = self._issue_job_for_chunks(
68
- sub_batch, batch_id=f"{dataset_name}/{len(self.openai_batch_ids)}"
69
- )
70
- self.openai_batch_ids[openai_batch_id] = [chunk.metadata for chunk in sub_batch]
71
- if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
72
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
73
  return
74
  batch = []
75
 
76
  # Finally, commit the last batch.
77
  if batch:
78
- openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{dataset_name}/{len(self.openai_batch_ids)}")
79
- self.openai_batch_ids[openai_batch_id] = [chunk.metadata for chunk in batch]
80
- logging.info("Issued %d jobs for %d chunks.", len(self.openai_batch_ids), chunk_count)
81
 
82
- # Save the job IDs to a file, just in case this script is terminated by mistake.
83
- metadata_file = os.path.join(self.local_dir, "openai_batch_ids.json")
84
  with open(metadata_file, "w") as f:
85
- json.dump(self.openai_batch_ids, f)
86
  logging.info("Job metadata saved at %s", metadata_file)
 
87
 
88
- def embeddings_are_ready(self) -> bool:
89
- """Checks whether the embeddings jobs are done (either completed or failed)."""
90
- if not self.openai_batch_ids:
91
- raise ValueError("No embeddings in progress.")
92
- job_ids = self.openai_batch_ids.keys()
 
 
 
 
 
93
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
94
  are_ready = all(status.status in ["completed", "failed"] for status in statuses)
95
  status_counts = Counter(status.status for status in statuses)
96
  logging.info("Job statuses: %s", status_counts)
97
  return are_ready
98
 
99
- def download_embeddings(self) -> Generator[Vector, None, None]:
100
- """Yield a (chunk_metadata, embedding) pair for each chunk in the dataset."""
101
- job_ids = self.openai_batch_ids.keys()
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
103
 
104
  for idx, status in enumerate(statuses):
@@ -111,7 +125,7 @@ class OpenAIBatchEmbedder(BatchEmbedder):
111
  logging.error("Job %s failed with error: %s", status.id, error.text)
112
  continue
113
 
114
- batch_metadata = self.openai_batch_ids[status.id]
115
  file_response = self.client.files.content(status.output_file_id)
116
  data = json.loads(file_response.text)["response"]["body"]["data"]
117
  logging.info("Job %s generated %d embeddings.", status.id, len(data))
@@ -119,6 +133,13 @@ class OpenAIBatchEmbedder(BatchEmbedder):
119
  for datum in data:
120
  idx = int(datum["index"])
121
  metadata = batch_metadata[idx]
 
 
 
 
 
 
 
122
  embedding = datum["embedding"]
123
  yield (metadata, embedding)
124
 
@@ -206,6 +227,7 @@ class MarqoEmbedder(BatchEmbedder):
206
 
207
  chunk_count = 0
208
  batch = []
 
209
 
210
  for content, metadata in self.data_manager.walk():
211
  chunks = self.chunker.chunk(content, metadata)
@@ -220,8 +242,9 @@ class MarqoEmbedder(BatchEmbedder):
220
  documents=[chunk.metadata for chunk in sub_batch],
221
  tensor_fields=["text"],
222
  )
 
223
 
224
- if max_embedding_jobs and len(self.openai_batch_ids) >= max_embedding_jobs:
225
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
226
  return
227
  batch = []
 
3
  import json
4
  import logging
5
  import os
6
+ import time
7
  from abc import ABC, abstractmethod
8
  from collections import Counter
9
  from typing import Dict, Generator, List, Optional, Tuple
 
44
  self.local_dir = local_dir
45
  self.embedding_model = embedding_model
46
  self.embedding_size = embedding_size
 
 
47
  self.client = OpenAI()
48
 
49
+ def embed_dataset(self, chunks_per_batch: int, max_embedding_jobs: int = None) -> str:
50
+ """Issues batch embedding jobs for the entire dataset. Returns the filename containing the job IDs."""
 
 
 
51
  batch = []
52
+ batch_ids = {} # job_id -> metadata
53
  chunk_count = 0
54
+ dataset_name = self.data_manager.dataset_id.replace("/", "_")
55
 
56
  for content, metadata in self.data_manager.walk():
57
  chunks = self.chunker.chunk(content, metadata)
 
61
  if len(batch) > chunks_per_batch:
62
  for i in range(0, len(batch), chunks_per_batch):
63
  sub_batch = batch[i : i + chunks_per_batch]
64
+ openai_batch_id = self._issue_job_for_chunks(sub_batch, batch_id=f"{dataset_name}/{len(batch_ids)}")
65
+ batch_ids[openai_batch_id] = [chunk.metadata for chunk in sub_batch]
66
+ if max_embedding_jobs and len(batch_ids) >= max_embedding_jobs:
 
 
67
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
68
  return
69
  batch = []
70
 
71
  # Finally, commit the last batch.
72
  if batch:
73
+ openai_batch_id = self._issue_job_for_chunks(batch, batch_id=f"{dataset_name}/{len(batch_ids)}")
74
+ batch_ids[openai_batch_id] = [chunk.metadata for chunk in batch]
75
+ logging.info("Issued %d jobs for %d chunks.", len(batch_ids), chunk_count)
76
 
77
+ timestamp = int(time.time())
78
+ metadata_file = os.path.join(self.local_dir, f"{dataset_name}_openai_batch_ids_{timestamp}.json")
79
  with open(metadata_file, "w") as f:
80
+ json.dump(batch_ids, f)
81
  logging.info("Job metadata saved at %s", metadata_file)
82
+ return metadata_file
83
 
84
+ def embeddings_are_ready(self, metadata_file: str) -> bool:
85
+ """Checks whether the embedding jobs are done (either completed or failed).
86
+
87
+ Args:
88
+ metadata_file: Path to the file containing the job metadata (output of self.embed_dataset).
89
+ """
90
+ with open(metadata_file, "r") as f:
91
+ batch_ids = json.load(f)
92
+
93
+ job_ids = batch_ids.keys()
94
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
95
  are_ready = all(status.status in ["completed", "failed"] for status in statuses)
96
  status_counts = Counter(status.status for status in statuses)
97
  logging.info("Job statuses: %s", status_counts)
98
  return are_ready
99
 
100
+ def download_embeddings(
101
+ self, metadata_file: str, store_file_chunk_content: bool = True
102
+ ) -> Generator[Vector, None, None]:
103
+ """Yields a (chunk_metadata, embedding) pair for each chunk in the dataset.
104
+
105
+ Args:
106
+ metadata_file: Path to the file containing the job metadata (output of self.embed_dataset).
107
+ store_file_chunk_content: Whether to store the text content in the metadata for file chunks. Set this to
108
+ False if you want to save space in the vector store. After retrieval, the content of a file chunk can be
109
+ reconstructed based on the file_path, start_byte and end_byte fields in the metadata. This will not
110
+ affect other types of chunks (e.g. GitHub issues) for which the content is harder to reconstruct.
111
+ """
112
+ with open(metadata_file, "r") as f:
113
+ batch_ids = json.load(f)
114
+
115
+ job_ids = batch_ids.keys()
116
  statuses = [self.client.batches.retrieve(job_id.strip()) for job_id in job_ids]
117
 
118
  for idx, status in enumerate(statuses):
 
125
  logging.error("Job %s failed with error: %s", status.id, error.text)
126
  continue
127
 
128
+ batch_metadata = batch_ids[status.id]
129
  file_response = self.client.files.content(status.output_file_id)
130
  data = json.loads(file_response.text)["response"]["body"]["data"]
131
  logging.info("Job %s generated %d embeddings.", status.id, len(data))
 
133
  for datum in data:
134
  idx = int(datum["index"])
135
  metadata = batch_metadata[idx]
136
+ if (
137
+ not store_file_chunk_content
138
+ and "file_path" in metadata
139
+ and "start_byte" in metadata
140
+ and "end_byte" in metadata
141
+ ):
142
+ metadata.pop("text", None)
143
  embedding = datum["embedding"]
144
  yield (metadata, embedding)
145
 
 
227
 
228
  chunk_count = 0
229
  batch = []
230
+ job_count = 0
231
 
232
  for content, metadata in self.data_manager.walk():
233
  chunks = self.chunker.chunk(content, metadata)
 
242
  documents=[chunk.metadata for chunk in sub_batch],
243
  tensor_fields=["text"],
244
  )
245
+ job_count += 1
246
 
247
+ if max_embedding_jobs and job_count >= max_embedding_jobs:
248
  logging.info("Reached the maximum number of embedding jobs. Stopping.")
249
  return
250
  batch = []
repo2vec/github.py CHANGED
@@ -1,10 +1,10 @@
1
  """GitHub-specific implementations for DataManager and Chunker."""
2
 
 
3
  import os
4
  from dataclasses import dataclass
5
  from typing import Any, Dict, Generator, List, Tuple
6
 
7
- import logging
8
  import requests
9
  import tiktoken
10
 
@@ -234,7 +234,8 @@ class GitHubIssuesChunker(Chunker):
234
  issue=issue,
235
  start_comment=comment_idx,
236
  end_comment=comment_idx + 1,
237
- ))
 
238
  else:
239
  # Add the comment to the existing chunk.
240
  chunks[-1].end_comment = comment_idx + 1
 
1
  """GitHub-specific implementations for DataManager and Chunker."""
2
 
3
+ import logging
4
  import os
5
  from dataclasses import dataclass
6
  from typing import Any, Dict, Generator, List, Tuple
7
 
 
8
  import requests
9
  import tiktoken
10
 
 
234
  issue=issue,
235
  start_comment=comment_idx,
236
  end_comment=comment_idx + 1,
237
+ )
238
+ )
239
  else:
240
  # Add the comment to the existing chunk.
241
  chunks[-1].end_comment = comment_idx + 1
repo2vec/index.py CHANGED
@@ -3,9 +3,10 @@
3
  import argparse
4
  import logging
5
  import os
6
- import pkg_resources
7
  import time
8
 
 
 
9
  from repo2vec.chunker import UniversalFileChunker
10
  from repo2vec.data_manager import GitHubRepoManager
11
  from repo2vec.embedder import build_batch_embedder_from_flags
@@ -202,7 +203,7 @@ def main():
202
  logging.info("Embedding the repo...")
203
  chunker = UniversalFileChunker(max_tokens=args.tokens_per_chunk)
204
  repo_embedder = build_batch_embedder_from_flags(repo_manager, chunker, args)
205
- repo_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
206
 
207
  # Index the GitHub issues.
208
  issues_embedder = None
@@ -213,7 +214,7 @@ def main():
213
  logging.info("Embedding GitHub issues...")
214
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
215
  issues_embedder = build_batch_embedder_from_flags(issues_manager, chunker, args)
216
- issues_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
217
 
218
  ########################
219
  # Step 2: Vector Store #
@@ -226,25 +227,25 @@ def main():
226
 
227
  if repo_embedder is not None:
228
  logging.info("Waiting for repo embeddings to be ready...")
229
- while not repo_embedder.embeddings_are_ready():
230
  logging.info("Sleeping for 30 seconds...")
231
  time.sleep(30)
232
 
233
  logging.info("Moving embeddings to the repo vector store...")
234
  repo_vector_store = build_from_args(args)
235
  repo_vector_store.ensure_exists()
236
- repo_vector_store.upsert(repo_embedder.download_embeddings())
237
 
238
  if issues_embedder is not None:
239
  logging.info("Waiting for issue embeddings to be ready...")
240
- while not issues_embedder.embeddings_are_ready():
241
  logging.info("Sleeping for 30 seconds...")
242
  time.sleep(30)
243
 
244
  logging.info("Moving embeddings to the issues vector store...")
245
  issues_vector_store = build_from_args(args)
246
  issues_vector_store.ensure_exists()
247
- issues_vector_store.upsert(issues_embedder.download_embeddings())
248
 
249
  logging.info("Done!")
250
 
 
3
  import argparse
4
  import logging
5
  import os
 
6
  import time
7
 
8
+ import pkg_resources
9
+
10
  from repo2vec.chunker import UniversalFileChunker
11
  from repo2vec.data_manager import GitHubRepoManager
12
  from repo2vec.embedder import build_batch_embedder_from_flags
 
203
  logging.info("Embedding the repo...")
204
  chunker = UniversalFileChunker(max_tokens=args.tokens_per_chunk)
205
  repo_embedder = build_batch_embedder_from_flags(repo_manager, chunker, args)
206
+ repo_jobs_file = repo_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
207
 
208
  # Index the GitHub issues.
209
  issues_embedder = None
 
214
  logging.info("Embedding GitHub issues...")
215
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
216
  issues_embedder = build_batch_embedder_from_flags(issues_manager, chunker, args)
217
+ issues_jobs_file = issues_embedder.embed_dataset(args.chunks_per_batch, args.max_embedding_jobs)
218
 
219
  ########################
220
  # Step 2: Vector Store #
 
227
 
228
  if repo_embedder is not None:
229
  logging.info("Waiting for repo embeddings to be ready...")
230
+ while not repo_embedder.embeddings_are_ready(repo_jobs_file):
231
  logging.info("Sleeping for 30 seconds...")
232
  time.sleep(30)
233
 
234
  logging.info("Moving embeddings to the repo vector store...")
235
  repo_vector_store = build_from_args(args)
236
  repo_vector_store.ensure_exists()
237
+ repo_vector_store.upsert(repo_embedder.download_embeddings(repo_jobs_file))
238
 
239
  if issues_embedder is not None:
240
  logging.info("Waiting for issue embeddings to be ready...")
241
+ while not issues_embedder.embeddings_are_ready(issues_jobs_file):
242
  logging.info("Sleeping for 30 seconds...")
243
  time.sleep(30)
244
 
245
  logging.info("Moving embeddings to the issues vector store...")
246
  issues_vector_store = build_from_args(args)
247
  issues_vector_store.ensure_exists()
248
+ issues_vector_store.upsert(issues_embedder.download_embeddings(issues_jobs_file))
249
 
250
  logging.info("Done!")
251
 
repo2vec/vector_store.py CHANGED
@@ -4,7 +4,8 @@ from abc import ABC, abstractmethod
4
  from typing import Dict, Generator, List, Tuple
5
 
6
  import marqo
7
- from langchain_community.vectorstores import Marqo, Pinecone as LangChainPinecone
 
8
  from langchain_core.documents import Document
9
  from langchain_openai import OpenAIEmbeddings
10
  from pinecone import Pinecone
 
4
  from typing import Dict, Generator, List, Tuple
5
 
6
  import marqo
7
+ from langchain_community.vectorstores import Marqo
8
+ from langchain_community.vectorstores import Pinecone as LangChainPinecone
9
  from langchain_core.documents import Document
10
  from langchain_openai import OpenAIEmbeddings
11
  from pinecone import Pinecone
setup.py CHANGED
@@ -1,9 +1,11 @@
1
- from setuptools import setup, find_packages
 
2
 
3
  def readfile(filename):
4
- with open(filename, 'r+') as f:
5
  return f.read()
6
 
 
7
  setup(
8
  name="repo2vec",
9
  version="0.1.6",
@@ -30,5 +32,5 @@ setup(
30
  "License :: OSI Approved :: MIT License",
31
  "Operating System :: OS Independent",
32
  ],
33
- python_requires='>=3.9',
34
- )
 
1
+ from setuptools import find_packages, setup
2
+
3
 
4
  def readfile(filename):
5
+ with open(filename, "r+") as f:
6
  return f.read()
7
 
8
+
9
  setup(
10
  name="repo2vec",
11
  version="0.1.6",
 
32
  "License :: OSI Approved :: MIT License",
33
  "Operating System :: OS Independent",
34
  ],
35
+ python_requires=">=3.9",
36
+ )
tests/conftest.py CHANGED
@@ -1,4 +1,4 @@
1
- import sys
2
  import os
 
3
 
4
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../repo2vec')))
 
 
1
  import os
2
+ import sys
3
 
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../repo2vec")))