Spaces:

NinjainPJs
/

ninja-code-guard

Running

File size: 3,827 Bytes

"""
RAG Context Retriever
======================

Retrieves relevant code context from ChromaDB based on the PR diff.
This is the "R" in RAG (Retrieval-Augmented Generation).

How retrieval works:
1. Take the PR diff text as a query
2. Embed the query using the same model used for indexing
3. Search ChromaDB for the most similar code chunks
4. Return the top-k chunks as additional context for the LLM

Why RAG for code review?
The PR diff only shows CHANGED lines. But understanding a change often
requires seeing RELATED code:
- If a function is called from 5 places, changing it affects all callers
- If a variable is validated in another file, the validation matters here
- If the same pattern exists elsewhere, inconsistency is a style issue

RAG gives the agents "peripheral vision" — they see not just the change,
but the surrounding codebase context that makes the change meaningful.
"""

from __future__ import annotations

import structlog

from app.context.embedder import embed_texts
from app.context.indexer import _get_chroma_client

logger = structlog.get_logger()


# Upper bound on how many characters of the query are sent to the embedder.
_MAX_QUERY_CHARS = 5000
# Chunks whose estimated similarity falls below this value are dropped.
_MIN_SIMILARITY = 0.3


async def retrieve_context(
    collection_name: str,
    query_text: str,
    top_k: int = 5,
) -> str:
    """
    Retrieve relevant code context from ChromaDB.

    The query is truncated to ``_MAX_QUERY_CHARS`` characters, embedded with
    ``embed_texts``, and used to query the named collection. Each hit is
    rendered as a Markdown section (file path, line range, relevance, fenced
    code); hits scoring below ``_MIN_SIMILARITY`` are skipped.

    This function is best-effort: any exception raised during retrieval is
    logged as a warning and turned into an empty result rather than propagated.

    Args:
        collection_name: The ChromaDB collection to search
        query_text: The PR diff or a specific query
        top_k: Number of results to request (default: 5). Values <= 0 yield
            an empty result without querying.

    Returns:
        A formatted string of relevant code chunks to include in the LLM prompt.
        Returns empty string if the query is blank, ``top_k`` is not positive,
        the collection is missing or empty, retrieval fails, or no result
        passes the relevance threshold.
    """
    # Nothing meaningful to search for: avoid embedding an empty string or
    # asking the store for a non-positive number of results.
    if top_k <= 0 or not query_text or not query_text.strip():
        return ""

    # NOTE(review): the embedder and collection calls below are invoked
    # synchronously inside this coroutine; if they block, they block the event
    # loop. Consider offloading (e.g. asyncio.to_thread) — confirm thread-safety
    # of the embedder/client first.
    try:
        client = _get_chroma_client()

        # Check if collection exists
        try:
            collection = client.get_collection(name=collection_name)
        except Exception:
            logger.debug("Collection not found — no RAG context", collection=collection_name)
            return ""

        # Read the size once: it is used both for the empty check and to cap
        # n_results, and a second call could return a different value.
        chunk_count = collection.count()
        if chunk_count == 0:
            return ""

        # Embed the query (capped so very large diffs stay within a fixed size)
        query_embeddings = embed_texts([query_text[:_MAX_QUERY_CHARS]])
        if not query_embeddings:
            return ""

        # Search for similar code chunks; never request more than exist
        results = collection.query(
            query_embeddings=query_embeddings,
            n_results=min(top_k, chunk_count),
            include=["documents", "metadatas", "distances"],
        )

        if not results or not results["documents"] or not results["documents"][0]:
            return ""

        # Format results as context for the LLM
        context_parts = ["## Related Code Context (from repository)\n"]

        # Index [0] selects the hits for our single query embedding.
        for doc, metadata, distance in zip(
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
            strict=False,
        ):
            # A chunk's metadata entry may be None; treat it as empty so one
            # such chunk does not abort the whole retrieval.
            meta = metadata or {}
            filepath = meta.get("filepath", "unknown")
            start = meta.get("start_line", "?")
            end = meta.get("end_line", "?")

            # Lower distance = more similar. Rough conversion to a 0-1 score.
            # NOTE(review): 1 - d/2 presumably assumes unit-normalized
            # embeddings with a squared-L2 metric — confirm against the
            # collection's configured distance space.
            similarity = max(0, 1 - distance / 2)

            if similarity < _MIN_SIMILARITY:
                continue  # Skip low-relevance results

            context_parts.append(
                f"### {filepath} (lines {start}-{end}, relevance: {similarity:.0%})\n"
                f"```\n{doc}\n```\n"
            )

        if len(context_parts) == 1:  # Only the header, no results
            return ""

        context = "\n".join(context_parts)
        logger.info(
            "Retrieved RAG context",
            collection=collection_name,
            chunks_returned=len(context_parts) - 1,
        )
        return context

    except Exception as e:
        # Deliberate best-effort boundary: the caller gets an empty context
        # instead of an exception.
        logger.warning("RAG retrieval failed", collection=collection_name, error=str(e))
        return ""