Spaces:

NinjainPJs
/

ninja-code-guard

Running

App Files Files Community

ninja-code-guard / app /context /retriever.py

NinjainPJs

Fix all ruff lint issues — 0 errors, 92 tests passing

b9da50c 3 months ago

raw

history blame contribute delete

3.83 kB

	"""
	RAG Context Retriever
	======================

	Retrieves relevant code context from ChromaDB based on the PR diff.
	This is the "R" in RAG (Retrieval-Augmented Generation).

	How retrieval works:
	1. Take the PR diff text as a query
	2. Embed the query using the same model used for indexing
	3. Search ChromaDB for the most similar code chunks
	4. Return up to top-k chunks (low-relevance matches are dropped) as additional context for the LLM

	Why RAG for code review?
	The PR diff only shows CHANGED lines. But understanding a change often
	requires seeing RELATED code:
	- If a function is called from 5 places, changing it affects all callers
	- If a variable is validated in another file, the validation matters here
	- If the same pattern exists elsewhere, inconsistency is a style issue

	RAG gives the agents "peripheral vision" — they see not just the change,
	but the surrounding codebase context that makes the change meaningful.
	"""

	from __future__ import annotations

	import structlog

	from app.context.embedder import embed_texts
	from app.context.indexer import _get_chroma_client

	# Module-level structlog logger; events below are emitted with key/value context.
	logger = structlog.get_logger()


	_MAX_QUERY_CHARS = 5000  # Cap on how much of the query text gets embedded
	_MIN_SIMILARITY = 0.3  # Chunks scoring below this are dropped as low-relevance


	async def retrieve_context(
	    collection_name: str,
	    query_text: str,
	    top_k: int = 5,
	) -> str:
	    """
	    Retrieve relevant code context from ChromaDB.

	    This is a best-effort lookup: any failure is logged and turned into an
	    empty result rather than raised to the caller.

	    Args:
	        collection_name: The ChromaDB collection to search
	        query_text: The PR diff or a specific query (only the first
	            5000 characters are embedded)
	        top_k: Maximum number of chunks to return (default: 5)

	    Returns:
	        A formatted Markdown string of relevant code chunks to include in the
	        LLM prompt. Returns an empty string if ``top_k`` is less than 1, the
	        query is blank, the collection is missing or empty, no chunk reaches
	        the relevance threshold, or retrieval fails.
	    """
	    # A non-positive top_k or a blank query cannot produce meaningful
	    # neighbours, so skip the client/embedding work entirely instead of
	    # sending an invalid n_results to the vector store.
	    if top_k < 1 or not query_text.strip():
	        return ""

	    # NOTE(review): the Chroma and embedding calls below are not awaited; if
	    # they block, they block the event loop for the duration of the call.
	    try:
	        client = _get_chroma_client()

	        # Check if collection exists
	        try:
	            collection = client.get_collection(name=collection_name)
	        except Exception:
	            logger.debug("Collection not found — no RAG context", collection=collection_name)
	            return ""

	        # Read the size once: it is needed for both the empty check and the
	        # n_results clamp, and a second call could disagree with the first.
	        chunk_count = collection.count()
	        if chunk_count == 0:
	            return ""

	        # Embed the (size-capped) query
	        query_embeddings = embed_texts([query_text[:_MAX_QUERY_CHARS]])
	        if not query_embeddings:
	            return ""

	        # Search for similar code chunks; never ask for more than exist
	        results = collection.query(
	            query_embeddings=query_embeddings,
	            n_results=min(top_k, chunk_count),
	            include=["documents", "metadatas", "distances"],
	        )

	        if not results or not results["documents"] or not results["documents"][0]:
	            return ""

	        # Format results as context for the LLM
	        context_parts = ["## Related Code Context (from repository)\n"]

	        for doc, metadata, distance in zip(
	            results["documents"][0],
	            results["metadatas"][0],
	            results["distances"][0],
	            strict=False,
	        ):
	            # A chunk stored without metadata may come back as None; fall back
	            # to placeholders instead of failing the whole retrieval.
	            metadata = metadata or {}
	            filepath = metadata.get("filepath", "unknown")
	            start = metadata.get("start_line", "?")
	            end = metadata.get("end_line", "?")

	            # Lower distance = more similar. Rough conversion to a 0-1 score.
	            # NOTE(review): presumably assumes an L2-style distance over
	            # normalised embeddings — confirm the collection's distance space.
	            similarity = max(0.0, 1 - distance / 2)

	            if similarity < _MIN_SIMILARITY:
	                continue  # Skip low-relevance results

	            context_parts.append(
	                f"### {filepath} (lines {start}-{end}, relevance: {similarity:.0%})\n"
	                f"```\n{doc}\n```\n"
	            )

	        if len(context_parts) == 1:  # Only the header, no results
	            return ""

	        context = "\n".join(context_parts)
	        logger.info(
	            "Retrieved RAG context",
	            collection=collection_name,
	            chunks_returned=len(context_parts) - 1,
	        )
	        return context

	    except Exception as e:
	        # Deliberate best-effort boundary: the caller proceeds without context.
	        logger.warning("RAG retrieval failed", error=str(e))
	        return ""