ninja-code-guard / app /github /client.py
NinjainPJs's picture
initial - commit
4b445f6
"""
GitHub API Client
==================
This module handles all communication with GitHub's REST API. It provides
methods to:
1. Fetch PR diff (the raw unified diff showing what changed)
2. Fetch file contents (full source code for context/RAG)
3. Fetch changed file list (which files were modified)
4. Post a PR review with inline comments (anchored to specific lines)
5. Post a summary comment on the PR conversation
GitHub API Authentication:
- We authenticate using installation access tokens (from auth.py)
- Every request includes the token in the Authorization header
- The token is scoped to the specific repos where our app is installed
GitHub API Versioning:
- We pin to version "2022-11-28" via X-GitHub-Api-Version header
- This ensures our code doesn't break when GitHub ships API changes
- This is a best practice for any API integration in production
Rate Limits:
- GitHub Apps get 5,000 requests/hour per installation
- That's plenty for our use case (~10-20 API calls per PR review)
Reference: https://docs.github.com/en/rest
"""
from __future__ import annotations

import base64
from dataclasses import dataclass
from urllib.parse import quote

import httpx
import structlog

from app.github.auth import get_installation_token
# Module-level structured logger used by the client methods in this file.
logger = structlog.get_logger()
# Base URL that every REST endpoint path in this module is appended to.
GITHUB_API = "https://api.github.com"
@dataclass
class PRData:
    """
    Snapshot of a pull request as fetched from GitHub.

    One instance bundles everything `GitHubClient.fetch_pr_data` collects, so
    downstream consumers receive a single typed object instead of a loose
    dict. Field order is part of the constructor signature; keep it stable.
    """

    # "owner/repo" slug, e.g. "ninjacode911/myapp"
    repo_full_name: str
    pr_number: int
    # SHA of the PR's HEAD commit
    commit_sha: str
    title: str
    # Raw unified diff text for the whole PR
    diff: str
    # One dict per changed file, as returned by the "list PR files" endpoint
    # (filename, status, additions, deletions, patch, ...)
    changed_files: list[dict]
    # Maps file path -> full decoded file text for the changed files
    file_contents: dict[str, str]
class GitHubClient:
    """
    Async GitHub REST API client bound to one GitHub App installation.

    Usage:
        client = GitHubClient(installation_id=12345)
        pr_data = await client.fetch_pr_data("ninjacode911/myapp", 42)
        await client.post_review(...)
        await client.post_comment(...)

    Why a class instead of standalone functions?
    - The installation_id is shared across all API calls for one webhook event
    - A class groups these related operations together with shared state
    - Makes it easy to test by substituting one object
    """

    # Per-request timeout (seconds) for every httpx client this class opens.
    _REQUEST_TIMEOUT = 30.0
    # Page size requested from the "list PR files" endpoint.
    _FILES_PER_PAGE = 100
    # Hard cap on pages fetched (30 * 100 = 3000 files) so pagination can
    # never turn into a runaway loop.
    _MAX_FILE_PAGES = 30
    # Files whose reported size exceeds this many bytes are not downloaded.
    _MAX_FILE_SIZE = 1_000_000

    def __init__(self, installation_id: int):
        """Remember which installation's token to use for every request."""
        self.installation_id = installation_id

    @staticmethod
    def _quote_path(filepath: str) -> str:
        """
        Percent-encode a repository file path for use inside a URL path.

        "/" is kept as-is so directory separators survive; characters such
        as spaces, "#" and "?" are escaped so they are not mistaken for URL
        syntax (fragment / query string).
        """
        return quote(filepath, safe="/")

    @staticmethod
    def _decode_base64_text(content_b64: object) -> str | None:
        """
        Decode a base64 payload into UTF-8 text.

        Returns:
            The decoded text, or None when the payload is not a string, is
            not valid base64, or does not decode as UTF-8 (i.e. binary data).
        """
        if not isinstance(content_b64, str):
            return None
        try:
            return base64.b64decode(content_b64).decode("utf-8")
        except ValueError:
            # binascii.Error (bad base64) and UnicodeDecodeError (binary
            # content) are both ValueError subclasses.
            return None

    async def _get_headers(self) -> dict[str, str]:
        """
        Build the authorization headers for GitHub API requests.

        Delegates to get_installation_token() from app.github.auth for the
        token; nothing is cached on this object.
        """
        token = await get_installation_token(self.installation_id)
        return {
            "Authorization": f"token {token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }

    async def fetch_pr_data(self, repo_full_name: str, pr_number: int) -> PRData:
        """
        Fetch all data needed to review a PR in one method.

        Requests issued, in order:
        1. GET /repos/{owner}/{repo}/pulls/{pr_number}        (PR metadata)
        2. GET /repos/{owner}/{repo}/pulls/{pr_number}        (raw diff, via
           the "application/vnd.github.diff" Accept header)
        3. GET /repos/{owner}/{repo}/pulls/{pr_number}/files  (paginated)
        4. GET /repos/{owner}/{repo}/contents/{path}          (once per
           changed file that was not removed)

        Full file contents are fetched in addition to the diff so that
        consumers can see the code surrounding each change.

        Args:
            repo_full_name: "owner/repo" format (e.g. "ninjacode911/myapp")
            pr_number: The PR number

        Returns:
            PRData with diff, changed files, and full file contents

        Raises:
            httpx.HTTPStatusError: if the metadata, diff, or file-list
                request returns an error status. Failures on individual
                file-content requests are logged and skipped instead.
        """
        headers = await self._get_headers()
        pr_url = f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}"

        async with httpx.AsyncClient(timeout=self._REQUEST_TIMEOUT) as http:
            # --- 1. PR metadata ---
            pr_response = await http.get(pr_url, headers=headers)
            pr_response.raise_for_status()
            pr_json = pr_response.json()
            commit_sha = pr_json["head"]["sha"]
            title = pr_json["title"]

            # --- 2. Raw unified diff ---
            # Same URL, different Accept header: GitHub then answers with
            # the diff text instead of JSON.
            diff_response = await http.get(
                pr_url,
                headers={**headers, "Accept": "application/vnd.github.diff"},
            )
            diff_response.raise_for_status()
            diff = diff_response.text

            # --- 3. Structured list of changed files ---
            changed_files = await self._fetch_changed_files(
                http, headers, repo_full_name, pr_number
            )

            # --- 4. Full contents of each changed file ---
            file_contents = await self._fetch_all_file_contents(
                http, headers, repo_full_name, changed_files, commit_sha
            )

        logger.info(
            "Fetched PR data",
            repo=repo_full_name,
            pr=pr_number,
            changed_files=len(changed_files),
            files_with_content=len(file_contents),
        )
        return PRData(
            repo_full_name=repo_full_name,
            pr_number=pr_number,
            commit_sha=commit_sha,
            title=title,
            diff=diff,
            changed_files=changed_files,
            file_contents=file_contents,
        )

    async def _fetch_changed_files(
        self,
        http: httpx.AsyncClient,
        headers: dict,
        repo_full_name: str,
        pr_number: int,
    ) -> list[dict]:
        """
        Page through the PR's changed-file list and return every entry.

        Stops at the first short (or empty) page, or after _MAX_FILE_PAGES
        pages, whichever comes first. Hitting the page cap is logged because
        the returned list may then be incomplete.
        """
        files_url = f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/files"
        changed_files: list[dict] = []
        for page in range(1, self._MAX_FILE_PAGES + 1):
            files_response = await http.get(
                files_url,
                headers=headers,
                params={"per_page": self._FILES_PER_PAGE, "page": page},
            )
            files_response.raise_for_status()
            batch = files_response.json()
            changed_files.extend(batch)
            # A page that is not full (including an empty one) is the last.
            if len(batch) < self._FILES_PER_PAGE:
                break
        else:
            # Every page up to the cap was full: make the truncation visible.
            logger.warning(
                "Changed file list reached page cap; results may be truncated",
                repo=repo_full_name,
                pr=pr_number,
                changed_files=len(changed_files),
            )
        return changed_files

    async def _fetch_all_file_contents(
        self,
        http: httpx.AsyncClient,
        headers: dict,
        repo_full_name: str,
        changed_files: list[dict],
        ref: str,
    ) -> dict[str, str]:
        """
        Download the full text of every changed file that still exists.

        Files with status "removed" are skipped. Files for which
        _fetch_file_content returns None (missing, non-file, too large, or
        not UTF-8 text) are left out of the result. Any exception raised for
        one file is logged and does not abort the remaining files.
        """
        file_contents: dict[str, str] = {}
        for file_info in changed_files:
            filename = file_info["filename"]
            status = file_info["status"]
            # Deleted files have no content at the PR head.
            if status == "removed":
                continue
            try:
                content = await self._fetch_file_content(
                    http, headers, repo_full_name, filename, ref
                )
            except Exception as e:
                # Deliberately broad: one unreadable file must not sink the
                # whole review, so log it and move on.
                logger.warning(
                    "Failed to fetch file content",
                    filename=filename,
                    error=str(e),
                )
            else:
                if content is not None:
                    file_contents[filename] = content
        return file_contents

    async def _fetch_file_content(
        self,
        http: httpx.AsyncClient,
        headers: dict,
        repo_full_name: str,
        filepath: str,
        ref: str,
    ) -> str | None:
        """
        Fetch the full content of a single file at a specific commit.

        The Contents API response carries the file body as a base64 string
        inside JSON; this method decodes it to text.

        Args:
            http: The httpx client to issue the request on
            headers: Auth headers
            repo_full_name: "owner/repo"
            filepath: Path to the file in the repo (escaped before use)
            ref: Git ref (commit SHA) to fetch the file at

        Returns:
            The file content as a string, or None if the file is missing
            (404), is not a regular file, exceeds _MAX_FILE_SIZE, is not
            base64-encoded, or is not valid UTF-8 text.

        Raises:
            httpx.HTTPStatusError: for any error status other than 404.
        """
        response = await http.get(
            f"{GITHUB_API}/repos/{repo_full_name}/contents/{self._quote_path(filepath)}",
            headers=headers,
            params={"ref": ref},
        )
        if response.status_code == 404:
            return None
        response.raise_for_status()
        data = response.json()

        # A directory path yields a JSON list rather than an object; only a
        # dict with type "file" describes a regular file.
        if not isinstance(data, dict) or data.get("type") != "file":
            return None

        # Skip oversized files; they are not fetched through another API.
        if data.get("size", 0) > self._MAX_FILE_SIZE:
            logger.info("Skipping large file", filepath=filepath, size=data["size"])
            return None

        # Only base64 payloads can be decoded here. A missing "encoding" key
        # is treated as base64, which matches the previous behaviour.
        if data.get("encoding", "base64") != "base64":
            return None

        return self._decode_base64_text(data.get("content", ""))

    async def post_review(
        self,
        repo_full_name: str,
        pr_number: int,
        commit_sha: str,
        body: str,
        comments: list[dict],
    ) -> dict:
        """
        Post a pull request review with inline comments.

        The review body and all inline comments are sent in one request to
        the pull request reviews endpoint, with event "COMMENT" so the
        review neither approves nor requests changes.

        Args:
            repo_full_name: "owner/repo"
            pr_number: PR number
            commit_sha: The exact commit SHA these comments reference
            body: The top-level review summary (shown above inline comments)
            comments: List of dicts with keys:
                - path: file path (e.g. "src/auth/login.py")
                - line: line number in the new version of the file
                - body: the comment text (Markdown supported)

        Returns:
            The GitHub API response as a dict

        Raises:
            httpx.HTTPStatusError: if GitHub answers with an error status.
        """
        headers = await self._get_headers()
        # "COMMENT" posts the review without approving or blocking the PR.
        review_payload = {
            "commit_id": commit_sha,
            "body": body,
            "event": "COMMENT",
            "comments": comments,
        }
        async with httpx.AsyncClient(timeout=self._REQUEST_TIMEOUT) as http:
            response = await http.post(
                f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/reviews",
                headers=headers,
                json=review_payload,
            )
            response.raise_for_status()
        logger.info(
            "Posted PR review",
            repo=repo_full_name,
            pr=pr_number,
            inline_comments=len(comments),
        )
        return response.json()

    async def post_comment(
        self, repo_full_name: str, pr_number: int, body: str
    ) -> dict:
        """
        Post a standalone comment on the PR conversation (not inline).

        This goes through the issues comments endpoint (the PR number is
        used as the issue number) rather than the pull request review
        endpoint.

        Args:
            repo_full_name: "owner/repo"
            pr_number: PR number
            body: Comment text (Markdown)

        Returns:
            The GitHub API response as a dict

        Raises:
            httpx.HTTPStatusError: if GitHub answers with an error status.
        """
        headers = await self._get_headers()
        async with httpx.AsyncClient(timeout=self._REQUEST_TIMEOUT) as http:
            response = await http.post(
                f"{GITHUB_API}/repos/{repo_full_name}/issues/{pr_number}/comments",
                headers=headers,
                json={"body": body},
            )
            response.raise_for_status()
        logger.info("Posted PR comment", repo=repo_full_name, pr=pr_number)
        return response.json()