juliaturc committed on
Commit
6ac6bd5
·
1 Parent(s): 59fd872

Improvements for GitHub issue indexing

Browse files
Files changed (2) hide show
  1. repo2vec/github.py +21 -6
  2. repo2vec/index.py +11 -2
repo2vec/github.py CHANGED
@@ -46,9 +46,10 @@ class GitHubIssue:
46
  class GitHubIssuesManager(DataManager):
47
  """Class to manage the GitHub issues of a particular repository."""
48
 
49
- def __init__(self, repo_id: str, max_issues: int = None):
50
  super().__init__(dataset_id=repo_id + "/issues")
51
  self.repo_id = repo_id
 
52
  self.max_issues = max_issues
53
  self.access_token = os.getenv("GITHUB_TOKEN")
54
  if not self.access_token:
@@ -60,7 +61,7 @@ class GitHubIssuesManager(DataManager):
60
  per_page = min(self.max_issues or 100, 100) # 100 is maximum per page
61
  url = f"https://api.github.com/repos/{self.repo_id}/issues?per_page={per_page}"
62
  while url:
63
- print(f"Fetching issues from {url}")
64
  response = self._get_page_of_issues(url)
65
  response.raise_for_status()
66
  for issue in response.json():
@@ -72,7 +73,7 @@ class GitHubIssuesManager(DataManager):
72
  title=issue["title"],
73
  # When there's no body, issue["body"] is None.
74
  body=issue["body"] or "",
75
- comments=self._get_comments(issue["comments_url"]),
76
  )
77
  )
78
  if self.max_issues and len(self.issues) >= self.max_issues:
@@ -125,7 +126,7 @@ class GitHubIssuesManager(DataManager):
125
  "X-GitHub-Api-Version": "2022-11-28",
126
  },
127
  )
128
- except requests.exceptions.ConnectionTimeout:
129
  logging.warn(f"Timeout fetching comments from {comments_url}")
130
  return []
131
  comments = []
@@ -203,8 +204,22 @@ class GitHubIssuesChunker(Chunker):
203
 
204
  chunks = []
205
 
206
- # First, create a chunk for the issue body.
207
- issue_body_chunk = IssueChunk(issue, 0, 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  chunks.append(issue_body_chunk)
209
 
210
  for comment_idx, comment in enumerate(issue.comments):
 
46
  class GitHubIssuesManager(DataManager):
47
  """Class to manage the GitHub issues of a particular repository."""
48
 
49
+ def __init__(self, repo_id: str, index_comments: bool = False, max_issues: int = None):
50
  super().__init__(dataset_id=repo_id + "/issues")
51
  self.repo_id = repo_id
52
+ self.index_comments = index_comments
53
  self.max_issues = max_issues
54
  self.access_token = os.getenv("GITHUB_TOKEN")
55
  if not self.access_token:
 
61
  per_page = min(self.max_issues or 100, 100) # 100 is maximum per page
62
  url = f"https://api.github.com/repos/{self.repo_id}/issues?per_page={per_page}"
63
  while url:
64
+ logging.info(f"Fetching issues from {url}")
65
  response = self._get_page_of_issues(url)
66
  response.raise_for_status()
67
  for issue in response.json():
 
73
  title=issue["title"],
74
  # When there's no body, issue["body"] is None.
75
  body=issue["body"] or "",
76
+ comments=self._get_comments(issue["comments_url"]) if self.index_comments else [],
77
  )
78
  )
79
  if self.max_issues and len(self.issues) >= self.max_issues:
 
126
  "X-GitHub-Api-Version": "2022-11-28",
127
  },
128
  )
129
+ except requests.exceptions.ConnectTimeout:
130
  logging.warn(f"Timeout fetching comments from {comments_url}")
131
  return []
132
  comments = []
 
204
 
205
  chunks = []
206
 
207
+ # First, create a chunk for the body of the issue. If it's too long, then truncate it.
208
+ if len(tokenizer.encode(issue.pretty, disallowed_special=())) > self.max_tokens:
209
+ title_len = len(tokenizer.encode(issue.title, disallowed_special=()))
210
+ target_body_len = self.max_tokens - title_len - 20 # 20 for buffer
211
+ trimmed_body = tokenizer.decode(tokenizer.encode(issue.body, disallowed_special=())[:target_body_len])
212
+ trimmed_issue = GitHubIssue(
213
+ url=issue.url,
214
+ html_url=issue.html_url,
215
+ title=issue.title,
216
+ body=trimmed_body,
217
+ comments=issue.comments,
218
+ )
219
+ issue_body_chunk = IssueChunk(trimmed_issue, 0, 0)
220
+ else:
221
+ issue_body_chunk = IssueChunk(issue, 0, 0)
222
+
223
  chunks.append(issue_body_chunk)
224
 
225
  for comment_idx, comment in enumerate(issue.comments):
repo2vec/index.py CHANGED
@@ -81,7 +81,6 @@ def main():
81
  )
82
  parser.add_argument(
83
  "--exclude",
84
- default=pkg_resources.resource_filename(__name__, "sample-exclude.txt"),
85
  help="Path to a file containing a list of extensions to exclude. One extension per line.",
86
  )
87
  parser.add_argument(
@@ -110,6 +109,14 @@ def main():
110
  help="Whether to index GitHub issues. At least one of --index-repo and --index-issues must be True. When "
111
  "--index-issues is set, you must also set a GITHUB_TOKEN environment variable.",
112
  )
 
 
 
 
 
 
 
 
113
  args = parser.parse_args()
114
 
115
  # Validate embedder and vector store compatibility.
@@ -130,6 +137,8 @@ def main():
130
  parser.error(f"The maximum number of chunks per job is {MAX_TOKENS_PER_JOB}.")
131
  if args.include and args.exclude:
132
  parser.error("At most one of --include and --exclude can be specified.")
 
 
133
  if not args.index_repo and not args.index_issues:
134
  parser.error("At least one of --index-repo and --index-issues must be true.")
135
 
@@ -174,7 +183,7 @@ def main():
174
  issues_embedder = None
175
  if args.index_issues:
176
  logging.info("Issuing embedding jobs for GitHub issues...")
177
- issues_manager = GitHubIssuesManager(args.repo_id)
178
  issues_manager.download()
179
  logging.info("Embedding GitHub issues...")
180
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
 
81
  )
82
  parser.add_argument(
83
  "--exclude",
 
84
  help="Path to a file containing a list of extensions to exclude. One extension per line.",
85
  )
86
  parser.add_argument(
 
109
  help="Whether to index GitHub issues. At least one of --index-repo and --index-issues must be True. When "
110
  "--index-issues is set, you must also set a GITHUB_TOKEN environment variable.",
111
  )
112
+ parser.add_argument(
113
+ "--index-issue-comments",
114
+ action=argparse.BooleanOptionalAction,
115
+ default=False,
116
+ help="Whether to index the comments of GitHub issues. This is only relevant if --index-issues is set. "
117
+ "GitHub's API for downloading comments is quite slow. Indexing solely the body of an issue seems to bring most "
118
+ "of the gains anyway.",
119
+ )
120
  args = parser.parse_args()
121
 
122
  # Validate embedder and vector store compatibility.
 
137
  parser.error(f"The maximum number of chunks per job is {MAX_TOKENS_PER_JOB}.")
138
  if args.include and args.exclude:
139
  parser.error("At most one of --include and --exclude can be specified.")
140
+ if not args.include and not args.exclude:
141
+ args.exclude = pkg_resources.resource_filename(__name__, "sample-exclude.txt")
142
  if not args.index_repo and not args.index_issues:
143
  parser.error("At least one of --index-repo and --index-issues must be true.")
144
 
 
183
  issues_embedder = None
184
  if args.index_issues:
185
  logging.info("Issuing embedding jobs for GitHub issues...")
186
+ issues_manager = GitHubIssuesManager(args.repo_id, index_comments=args.index_issue_comments)
187
  issues_manager.download()
188
  logging.info("Embedding GitHub issues...")
189
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)