Spaces:
Running
Running
| """ | |
| GitHub API Client | |
| ================== | |
| This module handles all communication with GitHub's REST API. It provides | |
| methods to: | |
| 1. Fetch PR diff (the raw unified diff showing what changed) | |
| 2. Fetch file contents (full source code for context/RAG) | |
| 3. Fetch changed file list (which files were modified) | |
| 4. Post a PR review with inline comments (anchored to specific lines) | |
| 5. Post a summary comment on the PR conversation | |
| GitHub API Authentication: | |
| - We authenticate using installation access tokens (from auth.py) | |
| - Every request includes the token in the Authorization header | |
| - The token is scoped to the specific repos where our app is installed | |
| GitHub API Versioning: | |
| - We pin to version "2022-11-28" via X-GitHub-Api-Version header | |
| - This ensures our code doesn't break when GitHub ships API changes | |
| - This is a best practice for any API integration in production | |
| Rate Limits: | |
| - GitHub Apps get 5,000 requests/hour per installation | |
| - That's plenty for our use case (~10-20 API calls per PR review) | |
| Reference: https://docs.github.com/en/rest | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| from dataclasses import dataclass | |
| import httpx | |
| import structlog | |
| from app.github.auth import get_installation_token | |
| logger = structlog.get_logger() | |
| GITHUB_API = "https://api.github.com" | |
| class PRData: | |
| """ | |
| All the data we fetch about a PR, bundled together. | |
| This is passed to the agent orchestrator so agents have full context. | |
| A dataclass (vs a dict) gives us type safety and autocomplete in the IDE. | |
| """ | |
| repo_full_name: str # e.g. "ninjacode911/myapp" | |
| pr_number: int | |
| commit_sha: str # HEAD commit of the PR | |
| title: str | |
| diff: str # Raw unified diff (the actual code changes) | |
| changed_files: list[dict] # List of {filename, status, additions, deletions, patch} | |
| file_contents: dict[str, str] # {filepath: full_file_content} for changed files | |
| class GitHubClient: | |
| """ | |
| Async GitHub API client for a specific installation. | |
| Usage: | |
| client = GitHubClient(installation_id=12345) | |
| pr_data = await client.fetch_pr_data("ninjacode911/myapp", 42) | |
| await client.post_review_comment(...) | |
| Why a class instead of standalone functions? | |
| - The installation_id and token are shared across all API calls for one webhook event | |
| - A class groups these related operations together with shared state | |
| - Makes it easy to test by mocking one object | |
| """ | |
| def __init__(self, installation_id: int): | |
| self.installation_id = installation_id | |
| async def _get_headers(self) -> dict[str, str]: | |
| """ | |
| Build the authorization headers for GitHub API requests. | |
| Delegates to auth.py which handles token caching and refresh. | |
| No client-level cache — auth.py's cache is the single source of truth. | |
| """ | |
| token = await get_installation_token(self.installation_id) | |
| return { | |
| "Authorization": f"token {token}", | |
| "Accept": "application/vnd.github+json", | |
| "X-GitHub-Api-Version": "2022-11-28", | |
| } | |
| async def fetch_pr_data(self, repo_full_name: str, pr_number: int) -> PRData: | |
| """ | |
| Fetch all data needed to review a PR in one method. | |
| This makes 3 API calls: | |
| 1. GET /repos/{owner}/{repo}/pulls/{pr_number} — PR metadata + diff | |
| 2. GET /repos/{owner}/{repo}/pulls/{pr_number}/files — list of changed files | |
| 3. GET /repos/{owner}/{repo}/contents/{path} — full content per changed file | |
| We fetch full file contents (not just the diff) because our agents need | |
| surrounding context. The diff alone doesn't show imports, class definitions, | |
| or the rest of the function — all critical for understanding security and | |
| performance implications. | |
| Args: | |
| repo_full_name: "owner/repo" format (e.g. "ninjacode911/myapp") | |
| pr_number: The PR number | |
| Returns: | |
| PRData with diff, changed files, and full file contents | |
| """ | |
| headers = await self._get_headers() | |
| async with httpx.AsyncClient(timeout=30.0) as http: | |
| # --- 1. Fetch PR metadata --- | |
| pr_response = await http.get( | |
| f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}", | |
| headers=headers, | |
| ) | |
| pr_response.raise_for_status() | |
| pr_json = pr_response.json() | |
| commit_sha = pr_json["head"]["sha"] | |
| title = pr_json["title"] | |
| # --- 2. Fetch the raw diff --- | |
| # By setting Accept to "application/vnd.github.diff", GitHub returns | |
| # the raw unified diff instead of JSON. This is the same format you | |
| # see with `git diff` — it's what our agents will analyze. | |
| diff_response = await http.get( | |
| f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}", | |
| headers={**headers, "Accept": "application/vnd.github.diff"}, | |
| ) | |
| diff_response.raise_for_status() | |
| diff = diff_response.text | |
| # --- 3. Fetch list of changed files --- | |
| # This gives us structured data: filename, status (added/modified/removed), | |
| # number of additions/deletions, and the patch (per-file diff). | |
| # We paginate because large PRs can have 100+ files. | |
| changed_files = [] | |
| page = 1 | |
| while page <= 30: # Cap at 3000 files to prevent runaway loops | |
| files_response = await http.get( | |
| f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/files", | |
| headers=headers, | |
| params={"per_page": 100, "page": page}, | |
| ) | |
| files_response.raise_for_status() | |
| batch = files_response.json() | |
| if not batch: | |
| break | |
| changed_files.extend(batch) | |
| if len(batch) < 100: | |
| break | |
| page += 1 | |
| # --- 4. Fetch full file contents for each changed file --- | |
| # We need the complete source code (not just the diff) for RAG context. | |
| # The agents can then understand imports, class hierarchy, etc. | |
| file_contents = {} | |
| for file_info in changed_files: | |
| filename = file_info["filename"] | |
| status = file_info["status"] | |
| # Skip deleted files and binary files — no content to review | |
| if status == "removed": | |
| continue | |
| try: | |
| content = await self._fetch_file_content( | |
| http, headers, repo_full_name, filename, commit_sha | |
| ) | |
| if content is not None: | |
| file_contents[filename] = content | |
| except Exception as e: | |
| # Non-fatal: if we can't fetch one file, continue with the rest | |
| logger.warning( | |
| "Failed to fetch file content", | |
| filename=filename, | |
| error=str(e), | |
| ) | |
| logger.info( | |
| "Fetched PR data", | |
| repo=repo_full_name, | |
| pr=pr_number, | |
| changed_files=len(changed_files), | |
| files_with_content=len(file_contents), | |
| ) | |
| return PRData( | |
| repo_full_name=repo_full_name, | |
| pr_number=pr_number, | |
| commit_sha=commit_sha, | |
| title=title, | |
| diff=diff, | |
| changed_files=changed_files, | |
| file_contents=file_contents, | |
| ) | |
| async def _fetch_file_content( | |
| self, | |
| http: httpx.AsyncClient, | |
| headers: dict, | |
| repo_full_name: str, | |
| filepath: str, | |
| ref: str, | |
| ) -> str | None: | |
| """ | |
| Fetch the full content of a single file at a specific commit. | |
| GitHub's Contents API returns file content as base64-encoded string. | |
| We decode it to get the actual source code text. | |
| Why base64? Because GitHub's API is JSON-based, and JSON can't safely | |
| contain arbitrary binary content. Base64 encodes binary as ASCII text. | |
| This is the same encoding used in email attachments (MIME). | |
| Args: | |
| http: The httpx client (reused for connection pooling) | |
| headers: Auth headers | |
| repo_full_name: "owner/repo" | |
| filepath: Path to the file in the repo | |
| ref: Git ref (commit SHA) to fetch the file at | |
| Returns: | |
| The file content as a string, or None if the file is binary/too large | |
| """ | |
| response = await http.get( | |
| f"{GITHUB_API}/repos/{repo_full_name}/contents/{filepath}", | |
| headers=headers, | |
| params={"ref": ref}, | |
| ) | |
| if response.status_code == 404: | |
| return None | |
| response.raise_for_status() | |
| data = response.json() | |
| # GitHub returns "file" type for regular files. | |
| # Skip directories, symlinks, or submodules. | |
| if data.get("type") != "file": | |
| return None | |
| # Files > 1MB use a different API (Blobs). Skip for now — these are | |
| # usually auto-generated or binary files, not worth reviewing. | |
| if data.get("size", 0) > 1_000_000: | |
| logger.info("Skipping large file", filepath=filepath, size=data["size"]) | |
| return None | |
| # Decode the base64-encoded content | |
| content_b64 = data.get("content", "") | |
| try: | |
| return base64.b64decode(content_b64).decode("utf-8") | |
| except (UnicodeDecodeError, Exception): | |
| # Binary file — can't decode as UTF-8 | |
| return None | |
| async def post_review( | |
| self, | |
| repo_full_name: str, | |
| pr_number: int, | |
| commit_sha: str, | |
| body: str, | |
| comments: list[dict], | |
| ) -> dict: | |
| """ | |
| Post a pull request review with inline comments. | |
| This is the core output mechanism of CodeProbe. A "review" in GitHub terms | |
| is a batch of inline comments submitted together, optionally with a top-level | |
| body and an event type (APPROVE, REQUEST_CHANGES, COMMENT). | |
| Each inline comment is anchored to a specific file and line, so it appears | |
| right next to the relevant code — just like a human reviewer would comment. | |
| GitHub's review API is atomic: either all comments post successfully, or | |
| none do. This prevents partial reviews that would confuse developers. | |
| Args: | |
| repo_full_name: "owner/repo" | |
| pr_number: PR number | |
| commit_sha: The exact commit SHA these comments reference | |
| body: The top-level review summary (shown above inline comments) | |
| comments: List of dicts with keys: | |
| - path: file path (e.g. "src/auth/login.py") | |
| - line: line number in the diff (the new file's line number) | |
| - body: the comment text (Markdown supported) | |
| Returns: | |
| The GitHub API response as a dict | |
| """ | |
| headers = await self._get_headers() | |
| # We use "COMMENT" event — this posts the review without approving or | |
| # requesting changes. Our bot shouldn't block PRs at the GitHub level; | |
| # instead, we indicate blocking via the Health Score in the summary. | |
| review_payload = { | |
| "commit_id": commit_sha, | |
| "body": body, | |
| "event": "COMMENT", | |
| "comments": comments, | |
| } | |
| async with httpx.AsyncClient(timeout=30.0) as http: | |
| response = await http.post( | |
| f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/reviews", | |
| headers=headers, | |
| json=review_payload, | |
| ) | |
| response.raise_for_status() | |
| logger.info( | |
| "Posted PR review", | |
| repo=repo_full_name, | |
| pr=pr_number, | |
| inline_comments=len(comments), | |
| ) | |
| return response.json() | |
| async def post_comment( | |
| self, repo_full_name: str, pr_number: int, body: str | |
| ) -> dict: | |
| """ | |
| Post a standalone comment on the PR conversation (not inline). | |
| Used for the summary comment (Health Score, finding counts, executive summary) | |
| when we don't have inline comments, or as a fallback. | |
| This uses the Issues API (PRs are issues in GitHub's data model) rather | |
| than the Pull Request Review API. | |
| Args: | |
| repo_full_name: "owner/repo" | |
| pr_number: PR number | |
| body: Comment text (Markdown) | |
| Returns: | |
| The GitHub API response as a dict | |
| """ | |
| headers = await self._get_headers() | |
| async with httpx.AsyncClient(timeout=30.0) as http: | |
| response = await http.post( | |
| f"{GITHUB_API}/repos/{repo_full_name}/issues/{pr_number}/comments", | |
| headers=headers, | |
| json={"body": body}, | |
| ) | |
| response.raise_for_status() | |
| logger.info("Posted PR comment", repo=repo_full_name, pr=pr_number) | |
| return response.json() | |