"""Fetches repository structure and key file contents from GitHub API.""" import os import re import requests _GITHUB_API = "https://api.github.com" _SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build", ".next", ".nuxt", "coverage", "htmlcov"} _CODE_EXTS = {".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".rs", ".java", ".rb", ".php", ".cs", ".cpp", ".c", ".h", ".swift", ".kt"} _DOC_EXTS = {".md", ".rst", ".txt", ".yaml", ".yml", ".toml", ".json"} _PRIORITY = ["README.md", "readme.md", "README.rst", "main.py", "app.py", "index.js", "index.ts", "main.go", "src/main.rs", "setup.py", "pyproject.toml", "package.json", "go.mod", "Cargo.toml"] def _headers(): token = os.environ.get("GITHUB_TOKEN", "") h = {"Accept": "application/vnd.github.v3+json"} if token: h["Authorization"] = f"Bearer {token}" return h def parse_repo_url(url: str) -> tuple[str, str]: """Return (owner, repo) from a GitHub URL or owner/repo string.""" url = url.strip().rstrip("/") # Match github.com/owner/repo — ignore /tree/, /blob/, /issues/ etc. m = re.search(r"github\.com/([^/]+)/([^/?#\s]+)", url) if m: repo = m.group(2) if repo.endswith(".git"): repo = repo[:-4] return m.group(1), repo # Plain "owner/repo" shorthand parts = url.split("/") if len(parts) == 2 and parts[0] and parts[1]: return parts[0], parts[1] raise ValueError(f"Cannot parse GitHub URL: {url!r}") def get_repo_info(owner: str, repo: str) -> dict: r = requests.get(f"{_GITHUB_API}/repos/{owner}/{repo}", headers=_headers(), timeout=15) r.raise_for_status() d = r.json() return { "full_name": d.get("full_name", ""), "description": d.get("description", ""), "language": d.get("language", ""), "stars": d.get("stargazers_count", 0), "forks": d.get("forks_count", 0), "topics": d.get("topics", []), "default_branch": d.get("default_branch", "main"), "url": d.get("html_url", ""), } def get_file_tree(owner: str, repo: str, branch: str = "main", max_files: int = 150) -> list[str]: """Return flat list of file paths, priority files first.""" r = requests.get( f"{_GITHUB_API}/repos/{owner}/{repo}/git/trees/{branch}?recursive=1", headers=_headers(), timeout=20) if r.status_code == 404: # Try main vs master alt = "master" if branch == "main" else "main" r = requests.get( f"{_GITHUB_API}/repos/{owner}/{repo}/git/trees/{alt}?recursive=1", headers=_headers(), timeout=20) r.raise_for_status() blobs = [item["path"] for item in r.json().get("tree", []) if item["type"] == "blob" and not any(seg in _SKIP_DIRS for seg in item["path"].split("/"))] # Sort: priority first, then code, then docs, then rest def rank(p): name = p.split("/")[-1] if p in _PRIORITY or name in _PRIORITY: return 0 ext = os.path.splitext(p)[1].lower() if ext in _CODE_EXTS: return 1 if ext in _DOC_EXTS: return 2 return 3 return sorted(blobs, key=rank)[:max_files] def fetch_file(owner: str, repo: str, path: str, branch: str = "main") -> str: """Fetch raw content of a single file (max 50KB).""" r = requests.get( f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}", headers=_headers(), timeout=15) if r.status_code != 200: return "" text = r.text return text[:50_000] # cap at 50KB per file def fetch_key_files(owner: str, repo: str, tree: list[str], branch: str = "main", max_chars: int = 60_000) -> dict[str, str]: """Fetch the most important files up to max_chars total.""" results: dict[str, str] = {} total = 0 # Always try priority files first ordered = [p for p in _PRIORITY if p in tree] + [p for p in tree if p not in _PRIORITY] for path in ordered: if total >= max_chars: break content = fetch_file(owner, repo, path, branch) if content: results[path] = content total += len(content) return results