juliaturc committed on
Commit
8b42d65
·
1 Parent(s): d0e7851

Pass access_token in the constructor of DataManagers.

Browse files
Files changed (4) hide show
  1. sage/data_manager.py +3 -2
  2. sage/github.py +2 -3
  3. sage/index.py +5 -2
  4. setup.py +1 -1
sage/data_manager.py CHANGED
@@ -30,6 +30,7 @@ class GitHubRepoManager(DataManager):
30
  self,
31
  repo_id: str,
32
  commit_hash: str = None,
 
33
  local_dir: str = None,
34
  inclusion_file: str = None,
35
  exclusion_file: str = None,
@@ -38,6 +39,7 @@ class GitHubRepoManager(DataManager):
38
  Args:
39
  repo_id: The identifier of the repository in owner/repo format, e.g. "Storia-AI/sage".
40
  commit_hash: Optional commit hash to checkout. If not specified, we pull the latest version of the repo.
 
41
  local_dir: The local directory where the repository will be cloned.
42
  inclusion_file: A file with a lists of files/directories/extensions to include. Each line must be in one of
43
  the following formats: "ext:.my-extension", "file:my-file.py", or "dir:my-directory".
@@ -47,6 +49,7 @@ class GitHubRepoManager(DataManager):
47
  super().__init__(dataset_id=repo_id)
48
  self.repo_id = repo_id
49
  self.commit_hash = commit_hash
 
50
 
51
  self.local_dir = local_dir or "/tmp/"
52
  if not os.path.exists(self.local_dir):
@@ -57,8 +60,6 @@ class GitHubRepoManager(DataManager):
57
  if not os.path.exists(self.log_dir):
58
  os.makedirs(self.log_dir)
59
 
60
- self.access_token = os.getenv("GITHUB_TOKEN")
61
-
62
  if inclusion_file and exclusion_file:
63
  raise ValueError("Only one of inclusion_file or exclusion_file should be provided.")
64
 
 
30
  self,
31
  repo_id: str,
32
  commit_hash: str = None,
33
+ access_token: str = None,
34
  local_dir: str = None,
35
  inclusion_file: str = None,
36
  exclusion_file: str = None,
 
39
  Args:
40
  repo_id: The identifier of the repository in owner/repo format, e.g. "Storia-AI/sage".
41
  commit_hash: Optional commit hash to checkout. If not specified, we pull the latest version of the repo.
42
+ access_token: A GitHub access token to use for cloning private repositories. Not needed for public repos.
43
  local_dir: The local directory where the repository will be cloned.
44
  inclusion_file: A file with a lists of files/directories/extensions to include. Each line must be in one of
45
  the following formats: "ext:.my-extension", "file:my-file.py", or "dir:my-directory".
 
49
  super().__init__(dataset_id=repo_id)
50
  self.repo_id = repo_id
51
  self.commit_hash = commit_hash
52
+ self.access_token = access_token
53
 
54
  self.local_dir = local_dir or "/tmp/"
55
  if not os.path.exists(self.local_dir):
 
60
  if not os.path.exists(self.log_dir):
61
  os.makedirs(self.log_dir)
62
 
 
 
63
  if inclusion_file and exclusion_file:
64
  raise ValueError("Only one of inclusion_file or exclusion_file should be provided.")
65
 
sage/github.py CHANGED
@@ -1,7 +1,6 @@
1
  """GitHub-specific implementations for DataManager and Chunker."""
2
 
3
  import logging
4
- import os
5
  from dataclasses import dataclass
6
  from typing import Any, Dict, Generator, List, Tuple
7
 
@@ -47,12 +46,12 @@ class GitHubIssue:
47
  class GitHubIssuesManager(DataManager):
48
  """Class to manage the GitHub issues of a particular repository."""
49
 
50
- def __init__(self, repo_id: str, index_comments: bool = False, max_issues: int = None):
51
  super().__init__(dataset_id=repo_id + "/issues")
52
  self.repo_id = repo_id
53
  self.index_comments = index_comments
54
  self.max_issues = max_issues
55
- self.access_token = os.getenv("GITHUB_TOKEN")
56
  if not self.access_token:
57
  raise ValueError("Please set the GITHUB_TOKEN environment variable when indexing GitHub issues.")
58
  self.issues = []
 
1
  """GitHub-specific implementations for DataManager and Chunker."""
2
 
3
  import logging
 
4
  from dataclasses import dataclass
5
  from typing import Any, Dict, Generator, List, Tuple
6
 
 
46
  class GitHubIssuesManager(DataManager):
47
  """Class to manage the GitHub issues of a particular repository."""
48
 
49
+ def __init__(self, repo_id: str, access_token: str, index_comments: bool = False, max_issues: int = None):
50
  super().__init__(dataset_id=repo_id + "/issues")
51
  self.repo_id = repo_id
52
  self.index_comments = index_comments
53
  self.max_issues = max_issues
54
+ self.access_token = access_token
55
  if not self.access_token:
56
  raise ValueError("Please set the GITHUB_TOKEN environment variable when indexing GitHub issues.")
57
  self.issues = []
sage/index.py CHANGED
@@ -1,10 +1,10 @@
1
  """Runs a batch job to compute embeddings for an entire repo and stores them into a vector store."""
2
 
3
  import logging
 
4
  import time
5
 
6
  import configargparse
7
- import pkg_resources
8
 
9
  import sage.config as sage_config
10
  from sage.chunker import UniversalFileChunker
@@ -53,6 +53,7 @@ def main():
53
  repo_manager = GitHubRepoManager(
54
  args.repo_id,
55
  commit_hash=args.commit_hash,
 
56
  local_dir=args.local_dir,
57
  inclusion_file=args.include,
58
  exclusion_file=args.exclude,
@@ -67,7 +68,9 @@ def main():
67
  issues_embedder = None
68
  if args.index_issues:
69
  logging.info("Issuing embedding jobs for GitHub issues...")
70
- issues_manager = GitHubIssuesManager(args.repo_id, index_comments=args.index_issue_comments)
 
 
71
  issues_manager.download()
72
  logging.info("Embedding GitHub issues...")
73
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
 
1
  """Runs a batch job to compute embeddings for an entire repo and stores them into a vector store."""
2
 
3
  import logging
4
+ import os
5
  import time
6
 
7
  import configargparse
 
8
 
9
  import sage.config as sage_config
10
  from sage.chunker import UniversalFileChunker
 
53
  repo_manager = GitHubRepoManager(
54
  args.repo_id,
55
  commit_hash=args.commit_hash,
56
+ access_token=os.getenv("GITHUB_TOKEN"),
57
  local_dir=args.local_dir,
58
  inclusion_file=args.include,
59
  exclusion_file=args.exclude,
 
68
  issues_embedder = None
69
  if args.index_issues:
70
  logging.info("Issuing embedding jobs for GitHub issues...")
71
+ issues_manager = GitHubIssuesManager(
72
+ args.repo_id, access_token=os.getenv("GITHUB_TOKEN"), index_comments=args.index_issue_comments
73
+ )
74
  issues_manager.download()
75
  logging.info("Embedding GitHub issues...")
76
  chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
setup.py CHANGED
@@ -8,7 +8,7 @@ def readfile(filename):
8
 
9
  setup(
10
  name="sage",
11
- version="0.1.1",
12
  packages=find_packages(),
13
  include_package_data=True,
14
  package_data={
 
8
 
9
  setup(
10
  name="sage",
11
+ version="0.1.2",
12
  packages=find_packages(),
13
  include_package_data=True,
14
  package_data={