Sebastiangmz's picture
Initial CodeRAG deploy
d557d77
raw
history blame
4.23 kB
"""Repository loading and cloning."""
import shutil
from pathlib import Path
from typing import Callable, Optional

from git import Repo, GitCommandError
from git.exc import InvalidGitRepositoryError

from coderag.config import get_settings
from coderag.logging import get_logger
from coderag.ingestion.validator import GitHubRepoInfo
# Module-level logger, created by the project's logging helper and named after this module.
logger = get_logger(__name__)
# Signature of the optional progress hook accepted by the loader: callback(message, value).
# Within this file the int is only ever passed as 0, 30 or 100 -- presumably a
# completion percentage; confirm against the callers that render it.
ProgressCallback = Callable[[str, int], None]
class LoaderError(Exception):
    """Raised when a repository cannot be cloned or updated."""
class RepositoryLoader:
    """Clones GitHub repositories into a local cache and keeps them updated.

    Each repository is checked out at ``<cache_dir>/<owner>/<name>``.
    """

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Create the loader and make sure the cache directory exists.

        Args:
            cache_dir: Directory that holds the checkouts. When omitted, the
                ``ingestion.repos_cache_dir`` value from the settings is used.
        """
        settings = get_settings()
        self.cache_dir = cache_dir or settings.ingestion.repos_cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the cache location for *repo_info* (``cache_dir/owner/name``)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    @staticmethod
    def _branch_candidates(branch: Optional[str], repo_info: GitHubRepoInfo) -> list:
        """Return the branches to try, in priority order and without duplicates.

        Order: the explicitly requested branch, the branch recorded on
        *repo_info*, then ``main`` and ``master``. Empty/``None`` entries are
        skipped.
        """
        candidates = []
        for name in (branch, repo_info.branch, "main", "master"):
            if name and name not in candidates:
                candidates.append(name)
        return candidates

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Clone the repository into the cache, or update the cached checkout.

        If a checkout already exists it is updated in place. When that update
        fails, the stale checkout is discarded and a fresh clone is attempted.
        Fresh clones are shallow (``depth=1``) and single-branch; candidate
        branches are tried in order until one succeeds.

        Args:
            repo_info: Repository description (owner, name, clone URL, branch).
            branch: Preferred branch; tried before any other candidate.
            progress_callback: Optional hook called as ``callback(message, value)``.

        Returns:
            Path of the local checkout.

        Raises:
            LoaderError: If no candidate branch could be cloned.
        """
        repo_path = self.get_repo_path(repo_info)
        branches_to_try = self._branch_candidates(branch, repo_info)

        if repo_path.exists():
            logger.info("Repository exists, updating", path=str(repo_path))
            try:
                return self._update_repository(
                    repo_path, branches_to_try[0], progress_callback
                )
            except LoaderError:
                # _update_repository has already logged the failure and removed
                # the stale checkout (this can happen, for instance, when the
                # cached single-branch clone was made from a different
                # candidate branch). Fall through to a fresh clone instead of
                # failing the caller.
                pass

        if progress_callback:
            progress_callback("Cloning repository", 0)
        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error = None
        for try_branch in branches_to_try:
            logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
            try:
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                )
            except GitCommandError as e:
                last_error = e
                logger.debug(
                    "Branch not found, trying next", branch=try_branch, error=str(e)
                )
                # Remove whatever a failed clone left behind so the next
                # attempt starts from an empty target directory.
                shutil.rmtree(repo_path, ignore_errors=True)
                continue

            if progress_callback:
                progress_callback("Clone complete", 100)
            logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
            return repo_path

        raise LoaderError(
            f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}"
        ) from last_error

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch, check out *branch* and pull in an existing checkout.

        Args:
            repo_path: Location of the existing checkout.
            branch: Branch to check out before pulling.
            progress_callback: Optional hook called as ``callback(message, value)``.

        Returns:
            *repo_path*, once the update succeeded.

        Raises:
            LoaderError: If the directory is not a usable git repository or a
                git command fails. The checkout is deleted before raising so a
                subsequent clone starts clean.
        """
        try:
            repo = Repo(repo_path)
            try:
                if progress_callback:
                    progress_callback("Fetching updates", 30)
                repo.remotes.origin.fetch()
                repo.git.checkout(branch)
                repo.remotes.origin.pull()
            finally:
                # Release the handles held by the Repo object before the
                # directory is possibly deleted below.
                repo.close()
        except (GitCommandError, InvalidGitRepositoryError) as e:
            logger.warning("Update failed, re-cloning", error=str(e))
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}") from e

        if progress_callback:
            progress_callback("Update complete", 100)
        logger.info("Repository updated", path=str(repo_path))
        return repo_path

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return ``True`` if a checkout directory exists for *repo_info*."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Delete the cached checkout for *repo_info*; no-op if none exists."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))