Spaces:
Running
Running
Pass access_token in the constructor of DataManagers.
Browse files
- sage/data_manager.py +3 -2
- sage/github.py +2 -3
- sage/index.py +5 -2
- setup.py +1 -1
sage/data_manager.py
CHANGED
|
@@ -30,6 +30,7 @@ class GitHubRepoManager(DataManager):
|
|
| 30 |
self,
|
| 31 |
repo_id: str,
|
| 32 |
commit_hash: str = None,
|
|
|
|
| 33 |
local_dir: str = None,
|
| 34 |
inclusion_file: str = None,
|
| 35 |
exclusion_file: str = None,
|
|
@@ -38,6 +39,7 @@ class GitHubRepoManager(DataManager):
|
|
| 38 |
Args:
|
| 39 |
repo_id: The identifier of the repository in owner/repo format, e.g. "Storia-AI/sage".
|
| 40 |
commit_hash: Optional commit hash to checkout. If not specified, we pull the latest version of the repo.
|
|
|
|
| 41 |
local_dir: The local directory where the repository will be cloned.
|
| 42 |
inclusion_file: A file with a list of files/directories/extensions to include. Each line must be in one of
|
| 43 |
the following formats: "ext:.my-extension", "file:my-file.py", or "dir:my-directory".
|
|
@@ -47,6 +49,7 @@ class GitHubRepoManager(DataManager):
|
|
| 47 |
super().__init__(dataset_id=repo_id)
|
| 48 |
self.repo_id = repo_id
|
| 49 |
self.commit_hash = commit_hash
|
|
|
|
| 50 |
|
| 51 |
self.local_dir = local_dir or "/tmp/"
|
| 52 |
if not os.path.exists(self.local_dir):
|
|
@@ -57,8 +60,6 @@ class GitHubRepoManager(DataManager):
|
|
| 57 |
if not os.path.exists(self.log_dir):
|
| 58 |
os.makedirs(self.log_dir)
|
| 59 |
|
| 60 |
-
self.access_token = os.getenv("GITHUB_TOKEN")
|
| 61 |
-
|
| 62 |
if inclusion_file and exclusion_file:
|
| 63 |
raise ValueError("Only one of inclusion_file or exclusion_file should be provided.")
|
| 64 |
|
|
|
|
| 30 |
self,
|
| 31 |
repo_id: str,
|
| 32 |
commit_hash: str = None,
|
| 33 |
+
access_token: str = None,
|
| 34 |
local_dir: str = None,
|
| 35 |
inclusion_file: str = None,
|
| 36 |
exclusion_file: str = None,
|
|
|
|
| 39 |
Args:
|
| 40 |
repo_id: The identifier of the repository in owner/repo format, e.g. "Storia-AI/sage".
|
| 41 |
commit_hash: Optional commit hash to checkout. If not specified, we pull the latest version of the repo.
|
| 42 |
+
access_token: A GitHub access token to use for cloning private repositories. Not needed for public repos.
|
| 43 |
local_dir: The local directory where the repository will be cloned.
|
| 44 |
inclusion_file: A file with a list of files/directories/extensions to include. Each line must be in one of
|
| 45 |
the following formats: "ext:.my-extension", "file:my-file.py", or "dir:my-directory".
|
|
|
|
| 49 |
super().__init__(dataset_id=repo_id)
|
| 50 |
self.repo_id = repo_id
|
| 51 |
self.commit_hash = commit_hash
|
| 52 |
+
self.access_token = access_token
|
| 53 |
|
| 54 |
self.local_dir = local_dir or "/tmp/"
|
| 55 |
if not os.path.exists(self.local_dir):
|
|
|
|
| 60 |
if not os.path.exists(self.log_dir):
|
| 61 |
os.makedirs(self.log_dir)
|
| 62 |
|
|
|
|
|
|
|
| 63 |
if inclusion_file and exclusion_file:
|
| 64 |
raise ValueError("Only one of inclusion_file or exclusion_file should be provided.")
|
| 65 |
|
sage/github.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
"""GitHub-specific implementations for DataManager and Chunker."""
|
| 2 |
|
| 3 |
import logging
|
| 4 |
-
import os
|
| 5 |
from dataclasses import dataclass
|
| 6 |
from typing import Any, Dict, Generator, List, Tuple
|
| 7 |
|
|
@@ -47,12 +46,12 @@ class GitHubIssue:
|
|
| 47 |
class GitHubIssuesManager(DataManager):
|
| 48 |
"""Class to manage the GitHub issues of a particular repository."""
|
| 49 |
|
| 50 |
-
def __init__(self, repo_id: str, index_comments: bool = False, max_issues: int = None):
|
| 51 |
super().__init__(dataset_id=repo_id + "/issues")
|
| 52 |
self.repo_id = repo_id
|
| 53 |
self.index_comments = index_comments
|
| 54 |
self.max_issues = max_issues
|
| 55 |
-
self.access_token = os.getenv("GITHUB_TOKEN")
|
| 56 |
if not self.access_token:
|
| 57 |
raise ValueError("Please set the GITHUB_TOKEN environment variable when indexing GitHub issues.")
|
| 58 |
self.issues = []
|
|
|
|
| 1 |
"""GitHub-specific implementations for DataManager and Chunker."""
|
| 2 |
|
| 3 |
import logging
|
|
|
|
| 4 |
from dataclasses import dataclass
|
| 5 |
from typing import Any, Dict, Generator, List, Tuple
|
| 6 |
|
|
|
|
| 46 |
class GitHubIssuesManager(DataManager):
|
| 47 |
"""Class to manage the GitHub issues of a particular repository."""
|
| 48 |
|
| 49 |
+
def __init__(self, repo_id: str, access_token: str, index_comments: bool = False, max_issues: int = None):
|
| 50 |
super().__init__(dataset_id=repo_id + "/issues")
|
| 51 |
self.repo_id = repo_id
|
| 52 |
self.index_comments = index_comments
|
| 53 |
self.max_issues = max_issues
|
| 54 |
+
self.access_token = access_token
|
| 55 |
if not self.access_token:
|
| 56 |
raise ValueError("Please set the GITHUB_TOKEN environment variable when indexing GitHub issues.")
|
| 57 |
self.issues = []
|
sage/index.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
"""Runs a batch job to compute embeddings for an entire repo and stores them into a vector store."""
|
| 2 |
|
| 3 |
import logging
|
|
|
|
| 4 |
import time
|
| 5 |
|
| 6 |
import configargparse
|
| 7 |
-
import pkg_resources
|
| 8 |
|
| 9 |
import sage.config as sage_config
|
| 10 |
from sage.chunker import UniversalFileChunker
|
|
@@ -53,6 +53,7 @@ def main():
|
|
| 53 |
repo_manager = GitHubRepoManager(
|
| 54 |
args.repo_id,
|
| 55 |
commit_hash=args.commit_hash,
|
|
|
|
| 56 |
local_dir=args.local_dir,
|
| 57 |
inclusion_file=args.include,
|
| 58 |
exclusion_file=args.exclude,
|
|
@@ -67,7 +68,9 @@ def main():
|
|
| 67 |
issues_embedder = None
|
| 68 |
if args.index_issues:
|
| 69 |
logging.info("Issuing embedding jobs for GitHub issues...")
|
| 70 |
-
issues_manager = GitHubIssuesManager(args.repo_id, index_comments=args.index_issue_comments)
|
|
|
|
|
|
|
| 71 |
issues_manager.download()
|
| 72 |
logging.info("Embedding GitHub issues...")
|
| 73 |
chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
|
|
|
|
| 1 |
"""Runs a batch job to compute embeddings for an entire repo and stores them into a vector store."""
|
| 2 |
|
| 3 |
import logging
|
| 4 |
+
import os
|
| 5 |
import time
|
| 6 |
|
| 7 |
import configargparse
|
|
|
|
| 8 |
|
| 9 |
import sage.config as sage_config
|
| 10 |
from sage.chunker import UniversalFileChunker
|
|
|
|
| 53 |
repo_manager = GitHubRepoManager(
|
| 54 |
args.repo_id,
|
| 55 |
commit_hash=args.commit_hash,
|
| 56 |
+
access_token=os.getenv("GITHUB_TOKEN"),
|
| 57 |
local_dir=args.local_dir,
|
| 58 |
inclusion_file=args.include,
|
| 59 |
exclusion_file=args.exclude,
|
|
|
|
| 68 |
issues_embedder = None
|
| 69 |
if args.index_issues:
|
| 70 |
logging.info("Issuing embedding jobs for GitHub issues...")
|
| 71 |
+
issues_manager = GitHubIssuesManager(
|
| 72 |
+
args.repo_id, access_token=os.getenv("GITHUB_TOKEN"), index_comments=args.index_issue_comments
|
| 73 |
+
)
|
| 74 |
issues_manager.download()
|
| 75 |
logging.info("Embedding GitHub issues...")
|
| 76 |
chunker = GitHubIssuesChunker(max_tokens=args.tokens_per_chunk)
|
setup.py
CHANGED
|
@@ -8,7 +8,7 @@ def readfile(filename):
|
|
| 8 |
|
| 9 |
setup(
|
| 10 |
name="sage",
|
| 11 |
-
version="0.1.1",
|
| 12 |
packages=find_packages(),
|
| 13 |
include_package_data=True,
|
| 14 |
package_data={
|
|
|
|
| 8 |
|
| 9 |
setup(
|
| 10 |
name="sage",
|
| 11 |
+
version="0.1.2",
|
| 12 |
packages=find_packages(),
|
| 13 |
include_package_data=True,
|
| 14 |
package_data={
|