juliaturc committed on
Commit
6ac6bd5
·
1 Parent(s): 59fd872

Improvements for GitHub issue indexing

Browse files
Files changed (2) hide show
  1. repo2vec/github.py +21 -6
  2. repo2vec/index.py +11 -2
repo2vec/github.py CHANGED
@@ -46,9 +46,10 @@ class GitHubIssue:
46
  class GitHubIssuesManager(DataManager):
47
  """Class to manage the GitHub issues of a particular repository."""
48
 
49
- def __init__(self, repo_id: str, max_issues: int = None):
50
  super().__init__(dataset_id=repo_id + "/issues")
51
  self.repo_id = repo_id
 
52
  self.max_issues = max_issues
53
  self.access_token = os.getenv("GITHUB_TOKEN")
54
  if not self.access_token:
@@ -60,7 +61,7 @@ class GitHubIssuesManager(DataManager):
60
  per_page = min(self.max_issues or 100, 100) # 100 is maximum per page
61
  url = f"https://api.github.com/repos/{self.repo_id}/issues?per_page={per_page}"
62
  while url:
63
- print(f"Fetching issues from {url}")
64
  response = self._get_page_of_issues(url)
65
  response.raise_for_status()
66
  for issue in response.json():
@@ -72,7 +73,7 @@ class GitHubIssuesManager(DataManager):
72
  title=issue["title"],
73
  # When there's no body, issue["body"] is None.
74
  body=issue["body"] or "",
75
- comments=self._get_comments(issue["comments_url"]),
76
  )
77
  )
78
  if self.max_issues and len(self.issues) >= self.max_issues:
@@ -125,7 +126,7 @@ class GitHubIssuesManager(DataManager):
125
  "X-GitHub-Api-Version": "2022-11-28",
126
  },
127
  )
128
- except requests.exceptions.ConnectionTimeout:
129
  logging.warn(f"Timeout fetching comments from {comments_url}")
130
  return []
131
  comments = []
@@ -203,8 +204,22 @@ class GitHubIssuesChunker(Chunker):
203
 
204
  chunks = []
205
 
206
- # First, create a chunk for the issue body.
207
- issue_body_chunk = IssueChunk(issue, 0, 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  chunks.append(issue_body_chunk)
209
 
210
  for comment_idx, comment in enumerate(issue.comments):
 
46
  class GitHubIssuesManager(DataManager):
47
  """Class to manage the GitHub issues of a particular repository."""
48
 
49
+ def __init__(self, repo_id: str, index_comments: bool = False, max_issues: int = None):
50
  super().__init__(dataset_id=repo_id + "/issues")
51
  self.repo_id = repo_id
52
+ self.index_comments = index_comments
53
  self.max_issues = max_issues
54
  self.access_token = os.getenv("GITHUB_TOKEN")
55
  if not self.access_token:
 
61
  per_page = min(self.max_issues or 100, 100) # 100 is maximum per page
62
  url = f"https://api.github.com/repos/{self.repo_id}/issues?per_page={per_page}"
63
  while url:
64
+ logging.info(f"Fetching issues from {url}")
65
  response = self._get_page_of_issues(url)
66
  response.raise_for_status()
67
  for issue in response.json():
 
73
  title=issue["title"],
74
  # When there's no body, issue["body"] is None.
75
  body=issue["body"] or "",
76
+ comments=self._get_comments(issue["comments_url"]) if self.index_comments else [],
77
  )
78
  )
79
  if self.max_issues and len(self.issues) >= self.max_issues:
 
126
  "X-GitHub-Api-Version": "2022-11-28",
127
  },
128
  )
129
+ except requests.exceptions.ConnectTimeout:
130
  logging.warn(f"Timeout fetching comments from {comments_url}")
131
  return []
132
  comments = []
 
204
 
205
  chunks = []
206
 
207
+ # First, create a chunk for the body of the issue. If it's too long, then truncate it.
208
+ if len(tokenizer.encode(issue.pretty, disallowed_special=())) > self.max_tokens:
209
+ title_len = len(tokenizer.encode(issue.title, disallowed_special=()))
210
+ target_body_len = self.max_tokens - title_len - 20 # 20 for buffer
211
+ trimmed_body = tokenizer.decode(tokenizer.encode(issue.body, disallowed_special=())[:target_body_len])
212
+ trimmed_issue = GitHubIssue(
213
+ url=issue.url,
214
+ html_url=issue.html_url,
215
+ title=issue.title,
216
+ body=trimmed_body,
217
+ comments=issue.comments,
218
+ )
219
+ issue_body_chunk = IssueChunk(trimmed_issue, 0, 0)
220
+ else:
221
+ issue_body_chunk = IssueChunk(issue, 0, 0)
222
+
223
  chunks.append(issue_body_chunk)
224
 
225
  for comment_idx, comment in enumerate(issue.comments):
repo2vec/index.py CHANGED
@@ -81,7 +81,6 @@ def main():
81
  )
82
  parser.add_argument(
83
  "--exclude",
84
- default=pkg_resources.resource_filename(__name__, "sample-exclude.txt"),
85
  help="Path to a file containing a list of extensions to exclude. One extension per line.",
86
  )
87
  parser.add_argument(
@@ -110,6 +109,14 @@ def main():
110
  help="Whether to index GitHub issues. At least one of --index-repo and --index-issues must be True. When "
111
  "--index-issues is set, you must also set a GITHUB_TOKEN environment variable.",
112
  )
 
 
 
 
 
 
 
 
113
  args = parser.parse_args()
114
 
115
  # Validate embedder and vector store compatibility.
@@ -130,6 +137,8 @@ def main():
130
  parser.error(f"The maximum number of chunks per job is {MAX_TOKENS_PER_JOB}.")
131
  if args.include and args.exclude:
132
  parser.error("At most one of --include and --exclude can be specified.")
 
 
133
  if not args.index_repo and not args.index_issues:
134
  parser.error("At least one of --index-repo and --index-issues must be true.")
135
 
@@ -174,7 +183,7 @@ def main():
174
  issues_embedder = None
175
  if args.index_issues:
176
  logging.info("Issuing embedding jobs for GitHub issues...")
177
- issues_manager = GitHubIssuesManager(args.repo_id)
178
  issues_manager.download()
179
  logging.info("Embedding GitHub issues...")
180
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
 
81
  )
82
  parser.add_argument(
83
  "--exclude",
 
84
  help="Path to a file containing a list of extensions to exclude. One extension per line.",
85
  )
86
  parser.add_argument(
 
109
  help="Whether to index GitHub issues. At least one of --index-repo and --index-issues must be True. When "
110
  "--index-issues is set, you must also set a GITHUB_TOKEN environment variable.",
111
  )
112
+ parser.add_argument(
113
+ "--index-issue-comments",
114
+ action=argparse.BooleanOptionalAction,
115
+ default=False,
116
+ help="Whether to index the comments of GitHub issues. This is only relevant if --index-issues is set. "
117
+ "GitHub's API for downloading comments is quite slow. Indexing solely the body of an issue seems to bring most "
118
+ "of the gains anyway.",
119
+ )
120
  args = parser.parse_args()
121
 
122
  # Validate embedder and vector store compatibility.
 
137
  parser.error(f"The maximum number of chunks per job is {MAX_TOKENS_PER_JOB}.")
138
  if args.include and args.exclude:
139
  parser.error("At most one of --include and --exclude can be specified.")
140
+ if not args.include and not args.exclude:
141
+ args.exclude = pkg_resources.resource_filename(__name__, "sample-exclude.txt")
142
  if not args.index_repo and not args.index_issues:
143
  parser.error("At least one of --index-repo and --index-issues must be true.")
144
 
 
183
  issues_embedder = None
184
  if args.index_issues:
185
  logging.info("Issuing embedding jobs for GitHub issues...")
186
+ issues_manager = GitHubIssuesManager(args.repo_id, index_comments=args.index_issue_comments)
187
  issues_manager.download()
188
  logging.info("Embedding GitHub issues...")
189
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)