Spaces:

NinjainPJs
/

ninja-code-guard

Running

App Files Files Community

ninja-code-guard / app /github /client.py

NinjainPJs

initial - commit

4b445f6 3 months ago

raw

history blame contribute delete

13.3 kB

	"""
	GitHub API Client
	==================

	This module handles all communication with GitHub's REST API. It provides
	methods to:

	1. Fetch PR diff (the raw unified diff showing what changed)
	2. Fetch file contents (full source code for context/RAG)
	3. Fetch changed file list (which files were modified)
	4. Post a PR review with inline comments (anchored to specific lines)
	5. Post a summary comment on the PR conversation

	GitHub API Authentication:
	- We authenticate using installation access tokens (from auth.py)
	- Every request includes the token in the Authorization header
	- The token is scoped to the specific repos where our app is installed

	GitHub API Versioning:
	- We pin to version "2022-11-28" via X-GitHub-Api-Version header
	- This ensures our code doesn't break when GitHub ships API changes
	- This is a best practice for any API integration in production

	Rate Limits:
	- GitHub Apps get 5,000 requests/hour per installation
	- That's plenty for our use case (~10-20 API calls per PR review)

	Reference: https://docs.github.com/en/rest
	"""

	from __future__ import annotations

	import base64
	from dataclasses import dataclass

	import httpx
	import structlog

	from app.github.auth import get_installation_token

	# Module-level structlog logger used by every method in this file.
	logger = structlog.get_logger()

	# Base URL that each REST endpoint path in this module is appended to.
	GITHUB_API = "https://api.github.com"


	@dataclass
	class PRData:
	    """
	    Snapshot of one pull request, as assembled by GitHubClient.fetch_pr_data.

	    Bundling the values in a dataclass (rather than a plain dict) gives
	    callers named, typed attributes instead of string keys.
	    """

	    # Repository slug in "owner/repo" form, e.g. "ninjacode911/myapp".
	    repo_full_name: str
	    # Pull request number within that repository.
	    pr_number: int
	    # SHA of the PR's HEAD commit.
	    commit_sha: str
	    # Pull request title.
	    title: str
	    # Raw unified diff of the PR (the actual code changes).
	    diff: str
	    # One dict per changed file: {filename, status, additions, deletions, patch}.
	    changed_files: list[dict]
	    # Mapping of file path -> full file content for the changed files.
	    file_contents: dict[str, str]


	class GitHubClient:
	    """
	    Async GitHub API client for a specific installation.

	    Usage:
	        client = GitHubClient(installation_id=12345)
	        pr_data = await client.fetch_pr_data("ninjacode911/myapp", 42)
	        await client.post_review(...)
	        await client.post_comment(...)

	    Why a class instead of standalone functions?
	    - The installation_id is shared across all API calls for one webhook event
	    - A class groups these related operations together with shared state
	    - Makes it easy to test by mocking one object
	    """

	    # Page size requested from the "list pull request files" endpoint.
	    _FILES_PER_PAGE = 100
	    # Hard stop for pagination: 30 pages * 100 = 3000 files, prevents runaway loops.
	    _MAX_FILE_PAGES = 30
	    # Per-request timeout (seconds) applied to every httpx client we open.
	    _TIMEOUT_SECONDS = 30.0
	    # Files larger than this are skipped instead of being fetched via the Blobs API.
	    _MAX_INLINE_FILE_BYTES = 1_000_000

	    def __init__(self, installation_id: int) -> None:
	        self.installation_id = installation_id

	    async def _get_headers(self) -> dict[str, str]:
	        """
	        Build the authorization headers for GitHub API requests.

	        Delegates to auth.py which handles token caching and refresh.
	        No client-level cache — auth.py's cache is the single source of truth.
	        """
	        token = await get_installation_token(self.installation_id)

	        return {
	            "Authorization": f"token {token}",
	            "Accept": "application/vnd.github+json",
	            "X-GitHub-Api-Version": "2022-11-28",
	        }

	    async def fetch_pr_data(self, repo_full_name: str, pr_number: int) -> PRData:
	        """
	        Fetch all data needed to review a PR in one method.

	        This issues four kinds of requests:
	        1. GET /repos/{owner}/{repo}/pulls/{pr_number} — PR metadata (JSON)
	        2. GET /repos/{owner}/{repo}/pulls/{pr_number} — raw diff (diff media type)
	        3. GET /repos/{owner}/{repo}/pulls/{pr_number}/files — changed files (paginated)
	        4. GET /repos/{owner}/{repo}/contents/{path} — full content per changed file

	        We fetch full file contents (not just the diff) because our agents need
	        surrounding context. The diff alone doesn't show imports, class definitions,
	        or the rest of the function — all critical for understanding security and
	        performance implications.

	        Args:
	            repo_full_name: "owner/repo" format (e.g. "ninjacode911/myapp")
	            pr_number: The PR number

	        Returns:
	            PRData with diff, changed files, and full file contents

	        Raises:
	            httpx.HTTPStatusError: if the metadata, diff, or file-list request
	                returns an error status. Failures for individual file contents
	                are logged and skipped instead.
	        """
	        headers = await self._get_headers()
	        pr_url = f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}"

	        async with httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS) as http:
	            # --- 1. Fetch PR metadata ---
	            pr_response = await http.get(pr_url, headers=headers)
	            pr_response.raise_for_status()
	            pr_json = pr_response.json()

	            commit_sha = pr_json["head"]["sha"]
	            title = pr_json["title"]

	            # --- 2. Fetch the raw diff ---
	            # By setting Accept to "application/vnd.github.diff", GitHub returns
	            # the raw unified diff instead of JSON. This is the same format you
	            # see with `git diff` — it's what our agents will analyze.
	            diff_response = await http.get(
	                pr_url,
	                headers={**headers, "Accept": "application/vnd.github.diff"},
	            )
	            diff_response.raise_for_status()
	            diff = diff_response.text

	            # --- 3. Fetch list of changed files ---
	            # This gives us structured data: filename, status (added/modified/removed),
	            # number of additions/deletions, and the patch (per-file diff).
	            # We paginate because large PRs can have 100+ files.
	            changed_files: list[dict] = []
	            for page in range(1, self._MAX_FILE_PAGES + 1):
	                files_response = await http.get(
	                    f"{pr_url}/files",
	                    headers=headers,
	                    params={"per_page": self._FILES_PER_PAGE, "page": page},
	                )
	                files_response.raise_for_status()
	                batch = files_response.json()
	                if not batch:
	                    break
	                changed_files.extend(batch)
	                # A short page means this was the last one.
	                if len(batch) < self._FILES_PER_PAGE:
	                    break

	            # --- 4. Fetch full file contents for each changed file ---
	            # We need the complete source code (not just the diff) for RAG context.
	            # The agents can then understand imports, class hierarchy, etc.
	            file_contents: dict[str, str] = {}
	            for file_info in changed_files:
	                filename = file_info["filename"]

	                # Deleted files have no content at the head commit. Binary and
	                # oversized files are filtered later, inside _fetch_file_content.
	                if file_info["status"] == "removed":
	                    continue

	                try:
	                    content = await self._fetch_file_content(
	                        http, headers, repo_full_name, filename, commit_sha
	                    )
	                except Exception as e:
	                    # Deliberately broad and non-fatal: if we can't fetch one
	                    # file, log it and continue with the rest.
	                    logger.warning(
	                        "Failed to fetch file content",
	                        filename=filename,
	                        error=str(e),
	                    )
	                else:
	                    if content is not None:
	                        file_contents[filename] = content

	        logger.info(
	            "Fetched PR data",
	            repo=repo_full_name,
	            pr=pr_number,
	            changed_files=len(changed_files),
	            files_with_content=len(file_contents),
	        )

	        return PRData(
	            repo_full_name=repo_full_name,
	            pr_number=pr_number,
	            commit_sha=commit_sha,
	            title=title,
	            diff=diff,
	            changed_files=changed_files,
	            file_contents=file_contents,
	        )

	    async def _fetch_file_content(
	        self,
	        http: httpx.AsyncClient,
	        headers: dict[str, str],
	        repo_full_name: str,
	        filepath: str,
	        ref: str,
	    ) -> str | None:
	        """
	        Fetch the full content of a single file at a specific commit.

	        GitHub's Contents API returns file content as base64-encoded string.
	        We decode it to get the actual source code text.

	        Why base64? Because GitHub's API is JSON-based, and JSON can't safely
	        contain arbitrary binary content. Base64 encodes binary as ASCII text.
	        This is the same encoding used in email attachments (MIME).

	        Args:
	            http: The httpx client (reused for connection pooling)
	            headers: Auth headers
	            repo_full_name: "owner/repo"
	            filepath: Path to the file in the repo
	            ref: Git ref (commit SHA) to fetch the file at

	        Returns:
	            The file content as a string, or None if the file is missing,
	            not a regular file, too large, or not valid UTF-8 text

	        Raises:
	            httpx.HTTPStatusError: for any error status other than 404
	        """
	        # Local import keeps this block self-contained (stdlib only).
	        from urllib.parse import quote

	        # Percent-encode the path so characters such as "#", "?", "%" or spaces
	        # in a filename are not misread as URL syntax. "/" stays a separator.
	        encoded_path = quote(filepath, safe="/")
	        response = await http.get(
	            f"{GITHUB_API}/repos/{repo_full_name}/contents/{encoded_path}",
	            headers=headers,
	            params={"ref": ref},
	        )

	        if response.status_code == 404:
	            return None

	        response.raise_for_status()
	        data = response.json()

	        # GitHub returns a JSON object with type "file" for regular files.
	        # Anything else (a list for a directory, or a symlink/submodule
	        # object) is skipped.
	        if not isinstance(data, dict) or data.get("type") != "file":
	            return None

	        # Files > 1MB use a different API (Blobs). Skip for now — these are
	        # usually auto-generated or binary files, not worth reviewing.
	        size = data.get("size", 0)
	        if size > self._MAX_INLINE_FILE_BYTES:
	            logger.info("Skipping large file", filepath=filepath, size=size)
	            return None

	        return self._decode_file_payload(data)

	    @staticmethod
	    def _decode_file_payload(data: dict) -> str | None:
	        """
	        Decode the base64 `content` field of a Contents API file object.

	        Args:
	            data: The JSON object returned for a regular file

	        Returns:
	            The decoded UTF-8 text, or None when the payload is not
	            base64-encoded or does not decode to valid UTF-8 (binary file)
	        """
	        # A payload that declares another encoding carries no inline base64
	        # content, so there is nothing to decode.
	        if data.get("encoding", "base64") != "base64":
	            return None

	        content_b64 = data.get("content") or ""
	        try:
	            return base64.b64decode(content_b64).decode("utf-8")
	        except (ValueError, TypeError):
	            # ValueError covers both malformed base64 (binascii.Error) and
	            # non-UTF-8 bytes (UnicodeDecodeError); TypeError covers a
	            # non-string content field. Either way: not reviewable text.
	            return None

	    async def post_review(
	        self,
	        repo_full_name: str,
	        pr_number: int,
	        commit_sha: str,
	        body: str,
	        comments: list[dict],
	    ) -> dict:
	        """
	        Post a pull request review with inline comments.

	        This is the core output mechanism of CodeProbe. A "review" in GitHub terms
	        is a batch of inline comments submitted together, optionally with a top-level
	        body and an event type (APPROVE, REQUEST_CHANGES, COMMENT).

	        Each inline comment is anchored to a specific file and line, so it appears
	        right next to the relevant code — just like a human reviewer would comment.

	        GitHub's review API is atomic: either all comments post successfully, or
	        none do. This prevents partial reviews that would confuse developers.

	        Args:
	            repo_full_name: "owner/repo"
	            pr_number: PR number
	            commit_sha: The exact commit SHA these comments reference
	            body: The top-level review summary (shown above inline comments)
	            comments: List of dicts with keys:
	                - path: file path (e.g. "src/auth/login.py")
	                - line: line number in the diff (the new file's line number)
	                - body: the comment text (Markdown supported)

	        Returns:
	            The GitHub API response as a dict

	        Raises:
	            httpx.HTTPStatusError: if GitHub rejects the review
	        """
	        headers = await self._get_headers()

	        # We use "COMMENT" event — this posts the review without approving or
	        # requesting changes. Our bot shouldn't block PRs at the GitHub level;
	        # instead, we indicate blocking via the Health Score in the summary.
	        review_payload = {
	            "commit_id": commit_sha,
	            "body": body,
	            "event": "COMMENT",
	            "comments": comments,
	        }

	        async with httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS) as http:
	            response = await http.post(
	                f"{GITHUB_API}/repos/{repo_full_name}/pulls/{pr_number}/reviews",
	                headers=headers,
	                json=review_payload,
	            )
	            response.raise_for_status()

	        logger.info(
	            "Posted PR review",
	            repo=repo_full_name,
	            pr=pr_number,
	            inline_comments=len(comments),
	        )

	        return response.json()

	    async def post_comment(
	        self, repo_full_name: str, pr_number: int, body: str
	    ) -> dict:
	        """
	        Post a standalone comment on the PR conversation (not inline).

	        Used for the summary comment (Health Score, finding counts, executive summary)
	        when we don't have inline comments, or as a fallback.

	        This uses the Issues API (PRs are issues in GitHub's data model) rather
	        than the Pull Request Review API.

	        Args:
	            repo_full_name: "owner/repo"
	            pr_number: PR number
	            body: Comment text (Markdown)

	        Returns:
	            The GitHub API response as a dict

	        Raises:
	            httpx.HTTPStatusError: if GitHub rejects the comment
	        """
	        headers = await self._get_headers()

	        async with httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS) as http:
	            response = await http.post(
	                f"{GITHUB_API}/repos/{repo_full_name}/issues/{pr_number}/comments",
	                headers=headers,
	                json={"body": body},
	            )
	            response.raise_for_status()

	        logger.info("Posted PR comment", repo=repo_full_name, pr=pr_number)

	        return response.json()