""" GitHub utilities for repository operations. """ import os import re import subprocess import tempfile from pathlib import Path from typing import Optional, Dict, Any, List def clone_repository( repo_url: str, target_dir: Optional[str] = None, depth: int = 1, timeout: int = 60 ) -> Optional[str]: """ Clone a GitHub repository. Args: repo_url: GitHub repository URL target_dir: Target directory (optional, creates temp if None) depth: Clone depth (1 for shallow clone) timeout: Timeout in seconds Returns: Path to cloned repository, or None if failed """ try: # Create target directory if target_dir is None: target_dir = tempfile.mkdtemp(prefix="reproagent_repo_") else: Path(target_dir).mkdir(parents=True, exist_ok=True) print(f"📥 Cloning {repo_url} to {target_dir}...") # Clone with git cmd = ['git', 'clone', '--depth', str(depth), repo_url, target_dir] result = subprocess.run( cmd, capture_output=True, text=True, timeout=timeout ) if result.returncode == 0: print(f"✅ Repository cloned successfully") return target_dir else: print(f"❌ Clone failed: {result.stderr}") return None except subprocess.TimeoutExpired: print(f"❌ Clone timeout after {timeout}s") return None except Exception as e: print(f"❌ Clone error: {e}") return None def get_repo_info(repo_path: str) -> Dict[str, Any]: """ Get information about a git repository. Args: repo_path: Path to repository Returns: Dictionary with repo info """ info = { 'path': repo_path, 'exists': False, 'is_git_repo': False, 'remote_url': None, 'branch': None, 'last_commit': None, 'file_count': 0, 'size_mb': 0 } repo_dir = Path(repo_path) if not repo_dir.exists(): return info info['exists'] = True # Check if git repo git_dir = repo_dir / '.git' if not git_dir.exists(): return info info['is_git_repo'] = True # Get remote URL try: result = subprocess.run( ['git', '-C', repo_path, 'config', '--get', 'remote.origin.url'], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: info['remote_url'] = result.stdout.strip() except: pass # Get current branch try: result = subprocess.run( ['git', '-C', repo_path, 'rev-parse', '--abbrev-ref', 'HEAD'], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: info['branch'] = result.stdout.strip() except: pass # Get last commit try: result = subprocess.run( ['git', '-C', repo_path, 'log', '-1', '--pretty=format:%H %s'], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: info['last_commit'] = result.stdout.strip() except: pass # Count files try: file_count = sum(1 for _ in repo_dir.rglob('*') if _.is_file()) info['file_count'] = file_count except: pass # Calculate size try: total_size = sum(f.stat().st_size for f in repo_dir.rglob('*') if f.is_file()) info['size_mb'] = total_size / (1024 * 1024) except: pass return info def extract_github_urls(text: str) -> List[str]: """ Extract GitHub URLs from text using regex. Args: text: Text to search Returns: List of GitHub URLs """ pattern = r'https?://github\.com/[\w\-]+/[\w\-.]+' matches = re.findall(pattern, text) # Remove duplicates and clean urls = [] for url in matches: # Remove trailing punctuation url = re.sub(r'[.,;)\]]+$', '', url) if url not in urls: urls.append(url) return urls def parse_github_url(url: str) -> Optional[Dict[str, str]]: """ Parse GitHub URL into components. Args: url: GitHub URL Returns: Dict with owner, repo, etc., or None if invalid """ pattern = r'https?://github\.com/(?P[\w\-]+)/(?P[\w\-\.]+)' match = re.match(pattern, url) if match: return { 'owner': match.group('owner'), 'repo': match.group('repo'), 'url': url } return None def find_python_files(repo_path: str) -> List[str]: """ Find all Python files in repository. Args: repo_path: Path to repository Returns: List of Python file paths (relative) """ repo_dir = Path(repo_path) if not repo_dir.exists(): return [] python_files = [] for py_file in repo_dir.rglob('*.py'): # Skip hidden directories and common non-code dirs parts = py_file.parts if any(p.startswith('.') or p in ['__pycache__', 'venv', 'env', 'build', 'dist'] for p in parts): continue rel_path = py_file.relative_to(repo_dir) python_files.append(str(rel_path)) return python_files def find_config_files(repo_path: str) -> Dict[str, Optional[str]]: """ Find common configuration files. Args: repo_path: Path to repository Returns: Dict mapping config type to path """ repo_dir = Path(repo_path) config_files = { 'requirements': None, 'setup': None, 'pyproject': None, 'dockerfile': None, 'readme': None, 'license': None } if not repo_dir.exists(): return config_files # Check for each type if (repo_dir / 'requirements.txt').exists(): config_files['requirements'] = 'requirements.txt' if (repo_dir / 'setup.py').exists(): config_files['setup'] = 'setup.py' if (repo_dir / 'pyproject.toml').exists(): config_files['pyproject'] = 'pyproject.toml' if (repo_dir / 'Dockerfile').exists(): config_files['dockerfile'] = 'Dockerfile' # README (check multiple variants) for readme_name in ['README.md', 'README.rst', 'README.txt', 'README']: if (repo_dir / readme_name).exists(): config_files['readme'] = readme_name break # LICENSE for license_name in ['LICENSE', 'LICENSE.md', 'LICENSE.txt']: if (repo_dir / license_name).exists(): config_files['license'] = license_name break return config_files # Test if __name__ == "__main__": # Test URL extraction test_text = """ Check out our code at https://github.com/example/awesome-repo Also see https://github.com/another/project. """ urls = extract_github_urls(test_text) print("Found URLs:", urls) for url in urls: parsed = parse_github_url(url) print(f"Parsed: {parsed}")