""" GitHub API Client ================== This module handles all communication with GitHub's REST API. It provides methods to: 1. Fetch PR diff (the raw unified diff showing what changed) 2. Fetch file contents (full source code for context/RAG) 3. Fetch changed file list (which files were modified) 4. Post a PR review with inline comments (anchored to specific lines) 5. Post a summary comment on the PR conversation GitHub API Authentication: - We authenticate using installation access tokens (from auth.py) - Every request includes the token in the Authorization header - The token is scoped to the specific repos where our app is installed GitHub API Versioning: - We pin to version "2022-11-28" via X-GitHub-Api-Version header - This ensures our code doesn't break when GitHub ships API changes - This is a best practice for any API integration in production Rate Limits: - GitHub Apps get 5,000 requests/hour per installation - That's plenty for our use case (~10-20 API calls per PR review) Reference: https://docs.github.com/en/rest """ from __future__ import annotations import base64 from dataclasses import dataclass import httpx import structlog from app.github.auth import get_installation_token logger = structlog.get_logger() GITHUB_API = "https://api.github.com" @dataclass class PRData: """ All the data we fetch about a PR, bundled together. This is passed to the agent orchestrator so agents have full context. A dataclass (vs a dict) gives us type safety and autocomplete in the IDE. """ repo_full_name: str # e.g. "ninjacode911/myapp" pr_number: int commit_sha: str # HEAD commit of the PR title: str diff: str # Raw unified diff (the actual code changes) changed_files: list[dict] # List of {filename, status, additions, deletions, patch} file_contents: dict[str, str] # {filepath: full_file_content} for changed files class GitHubClient: """ Async GitHub API client for a specific installation. Usage: client = GitHubClient(installation_id=12345) pr_data = await client.fetch_pr_data("ninjacode911/myapp", 42) await client.post_review_comment(...) Why a class instead of standalone functions? - The installation_id and token are shared across all API calls for one webhook event - A class groups these related operations together with shared state - Makes it easy to test by mocking one object """ def __init__(self, installation_id: int): self.installation_id = installation_id async def _get_headers(self) -> dict[str, str]: """ Build the authorization headers for GitHub API requests. Delegates to auth.py which handles token caching and refresh. No client-level cache — auth.py's cache is the single source of truth. """ token = await get_installation_token(self.installation_id) return { "Authorization": f"token {token}", "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", } async def fetch_pr_data(self, repo_full_name: str, pr_number: int) -> PRData: """ Fetch all data needed to review a PR in one method. This makes 3 API calls: 1. GET /repos/{owner}/{repo}/pulls/{pr_number} — PR metadata + diff 2. GET /repos/{owner}/{repo}/pulls/{pr_number}/files — list of changed files 3. GET /repos/{owner}/{repo}/contents/{path} — full content per changed file We fetch full file contents (not just the diff) because our agents need surrounding context. The diff alone doesn't show imports, class definitions, or the rest of the function — all critical for understanding security and performance implications. Args: repo_full_name: "owner/repo" format (e.g. "ninjacode911/myapp") pr_number: The PR number Returns: PRData with diff, changed files, and full file contents """ headers = await self._get_headers() async with httpx.AsyncClient(timeout=30.0) as http: # --- 1. Fetch PR metadata --- pr_response = await http.get( f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}", headers=headers, ) pr_response.raise_for_status() pr_json = pr_response.json() commit_sha = pr_json["head"]["sha"] title = pr_json["title"] # --- 2. Fetch the raw diff --- # By setting Accept to "application/vnd.github.diff", GitHub returns # the raw unified diff instead of JSON. This is the same format you # see with `git diff` — it's what our agents will analyze. diff_response = await http.get( f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}", headers={**headers, "Accept": "application/vnd.github.diff"}, ) diff_response.raise_for_status() diff = diff_response.text # --- 3. Fetch list of changed files --- # This gives us structured data: filename, status (added/modified/removed), # number of additions/deletions, and the patch (per-file diff). # We paginate because large PRs can have 100+ files. changed_files = [] page = 1 while page <= 30: # Cap at 3000 files to prevent runaway loops files_response = await http.get( f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/files", headers=headers, params={"per_page": 100, "page": page}, ) files_response.raise_for_status() batch = files_response.json() if not batch: break changed_files.extend(batch) if len(batch) < 100: break page += 1 # --- 4. Fetch full file contents for each changed file --- # We need the complete source code (not just the diff) for RAG context. # The agents can then understand imports, class hierarchy, etc. file_contents = {} for file_info in changed_files: filename = file_info["filename"] status = file_info["status"] # Skip deleted files and binary files — no content to review if status == "removed": continue try: content = await self._fetch_file_content( http, headers, repo_full_name, filename, commit_sha ) if content is not None: file_contents[filename] = content except Exception as e: # Non-fatal: if we can't fetch one file, continue with the rest logger.warning( "Failed to fetch file content", filename=filename, error=str(e), ) logger.info( "Fetched PR data", repo=repo_full_name, pr=pr_number, changed_files=len(changed_files), files_with_content=len(file_contents), ) return PRData( repo_full_name=repo_full_name, pr_number=pr_number, commit_sha=commit_sha, title=title, diff=diff, changed_files=changed_files, file_contents=file_contents, ) async def _fetch_file_content( self, http: httpx.AsyncClient, headers: dict, repo_full_name: str, filepath: str, ref: str, ) -> str | None: """ Fetch the full content of a single file at a specific commit. GitHub's Contents API returns file content as base64-encoded string. We decode it to get the actual source code text. Why base64? Because GitHub's API is JSON-based, and JSON can't safely contain arbitrary binary content. Base64 encodes binary as ASCII text. This is the same encoding used in email attachments (MIME). Args: http: The httpx client (reused for connection pooling) headers: Auth headers repo_full_name: "owner/repo" filepath: Path to the file in the repo ref: Git ref (commit SHA) to fetch the file at Returns: The file content as a string, or None if the file is binary/too large """ response = await http.get( f"{GITHUB_API}/repos/{repo_full_name}/contents/{filepath}", headers=headers, params={"ref": ref}, ) if response.status_code == 404: return None response.raise_for_status() data = response.json() # GitHub returns "file" type for regular files. # Skip directories, symlinks, or submodules. if data.get("type") != "file": return None # Files > 1MB use a different API (Blobs). Skip for now — these are # usually auto-generated or binary files, not worth reviewing. if data.get("size", 0) > 1_000_000: logger.info("Skipping large file", filepath=filepath, size=data["size"]) return None # Decode the base64-encoded content content_b64 = data.get("content", "") try: return base64.b64decode(content_b64).decode("utf-8") except (UnicodeDecodeError, Exception): # Binary file — can't decode as UTF-8 return None async def post_review( self, repo_full_name: str, pr_number: int, commit_sha: str, body: str, comments: list[dict], ) -> dict: """ Post a pull request review with inline comments. This is the core output mechanism of CodeProbe. A "review" in GitHub terms is a batch of inline comments submitted together, optionally with a top-level body and an event type (APPROVE, REQUEST_CHANGES, COMMENT). Each inline comment is anchored to a specific file and line, so it appears right next to the relevant code — just like a human reviewer would comment. GitHub's review API is atomic: either all comments post successfully, or none do. This prevents partial reviews that would confuse developers. Args: repo_full_name: "owner/repo" pr_number: PR number commit_sha: The exact commit SHA these comments reference body: The top-level review summary (shown above inline comments) comments: List of dicts with keys: - path: file path (e.g. "src/auth/login.py") - line: line number in the diff (the new file's line number) - body: the comment text (Markdown supported) Returns: The GitHub API response as a dict """ headers = await self._get_headers() # We use "COMMENT" event — this posts the review without approving or # requesting changes. Our bot shouldn't block PRs at the GitHub level; # instead, we indicate blocking via the Health Score in the summary. review_payload = { "commit_id": commit_sha, "body": body, "event": "COMMENT", "comments": comments, } async with httpx.AsyncClient(timeout=30.0) as http: response = await http.post( f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/reviews", headers=headers, json=review_payload, ) response.raise_for_status() logger.info( "Posted PR review", repo=repo_full_name, pr=pr_number, inline_comments=len(comments), ) return response.json() async def post_comment( self, repo_full_name: str, pr_number: int, body: str ) -> dict: """ Post a standalone comment on the PR conversation (not inline). Used for the summary comment (Health Score, finding counts, executive summary) when we don't have inline comments, or as a fallback. This uses the Issues API (PRs are issues in GitHub's data model) rather than the Pull Request Review API. Args: repo_full_name: "owner/repo" pr_number: PR number body: Comment text (Markdown) Returns: The GitHub API response as a dict """ headers = await self._get_headers() async with httpx.AsyncClient(timeout=30.0) as http: response = await http.post( f"{GITHUB_API}/repos/{repo_full_name}/issues/{pr_number}/comments", headers=headers, json={"body": body}, ) response.raise_for_status() logger.info("Posted PR comment", repo=repo_full_name, pr=pr_number) return response.json()