# Source: Hugging Face Space "NinjainPJs/ninja-code-guard" (status: Running)
# File:   ninja-code-guard/app/agents/base_agent.py (11.2 kB)
# Commit: 4b445f6 "initial - commit" by NinjainPJs, 3 months ago

	"""
	Base Agent Interface
	=====================

	All domain agents (Security, Performance, Style) inherit from this base class.
	It provides shared infrastructure:

	1. Groq LLM client — ChatGroq configured with Llama-3.3-70B ("llama-3.3-70b-versatile")
	2. Structured output — LLM returns typed Finding objects, not raw text
	3. Error handling — graceful fallback if the LLM call fails
	4. Timing — measures how long each agent takes (for latency metrics)

	Design pattern: Template Method
	- The base class defines the algorithm skeleton (receive diff → run tools → call LLM → return findings)
	- Subclasses override specific steps (agent_name, system_prompt, run_static_analysis)
	- This prevents code duplication across 3 agents that follow the same flow

	Why LangChain?
	- Provides a unified interface across LLM providers (Groq, Gemini, OpenAI)
	- If Groq goes down, we swap to Gemini by changing one line
	- Structured output parsing is built in (with_structured_output)
	- Prompt templates with variable substitution
	"""

	from __future__ import annotations

	import time
	from abc import ABC, abstractmethod

	import structlog
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_groq import ChatGroq
	from pydantic import BaseModel, Field

	from app.config import settings
	from app.github.client import PRData
	from app.models.findings import Finding

	# Module-level structlog logger used by BaseAgent for review timing,
	# skipped-finding warnings and failure reports.
	logger = structlog.get_logger()


	class AgentFindings(BaseModel):
	    """Code-review findings for one pull request; empty when nothing is wrong."""

	    # Developer notes — deliberately NOT in the docstring: Pydantic publishes
	    # the class docstring as the JSON-schema `description`, and LangChain's
	    # with_structured_output() forwards that description to the model, so the
	    # docstring above is effectively prompt text and is kept short.
	    #
	    # - BaseAgent.review passes this class to `llm.with_structured_output()`,
	    #   so the chain yields a validated AgentFindings instance instead of raw
	    #   text that would need hand parsing.
	    # - `FindingOutput` is declared further down in this module; the
	    #   annotation below is a forward reference that is resolved by the
	    #   module-level `AgentFindings.model_rebuild()` call.

	    findings: list[FindingOutput] = Field(
	        default_factory=list,
	        description="List of security/performance/style findings",
	    )


	class FindingOutput(BaseModel):
	    """One code-review finding tied to a line range in a single changed file."""

	    # Developer notes — kept as comments because Pydantic publishes the class
	    # docstring as the schema `description` that the LLM receives.
	    #
	    # - This differs from the internal `Finding` model: there is no `agent`
	    #   field here; BaseAgent._convert_to_findings adds it after the LLM call.
	    # - `severity` and `confidence` are accepted loosely (plain str / float)
	    #   and are normalized and clamped during that conversion rather than
	    #   rejected at parse time.
	    # - This class is declared AFTER AgentFindings, which references it. The
	    #   forward reference is resolved by the `AgentFindings.model_rebuild()`
	    #   call that follows this class.

	    file_path: str = Field(description="Path to the file (e.g., 'app.py')")
	    line_start: int = Field(description="Starting line number of the issue")
	    line_end: int = Field(description="Ending line number of the issue")
	    severity: str = Field(description="One of: critical, high, medium, low")
	    category: str = Field(description="Issue category (e.g., 'sql_injection', 'hardcoded_secret')")
	    title: str = Field(description="Short one-line title of the finding")
	    description: str = Field(description="Detailed explanation of the issue and its impact")
	    suggested_fix: str = Field(default="", description="Corrected code snippet")
	    cwe_id: str | None = Field(default=None, description="CWE ID if applicable (e.g., 'CWE-89')")
	    confidence: float = Field(description="Confidence score from 0.0 to 1.0")


	# AgentFindings is declared before FindingOutput, so its `list[FindingOutput]`
	# annotation is a forward reference. Now that FindingOutput exists, rebuild
	# the model so Pydantic can resolve that reference.
	AgentFindings.model_rebuild()


	class BaseAgent(ABC):
	    """
	    Abstract base class for all domain agents.

	    Subclasses must implement:
	    - agent_name: which agent this is ("security", "performance", "style")
	    - system_prompt: the detailed system prompt for the LLM

	    Subclasses may override:
	    - run_static_analysis(): optional static tools (Bandit, Semgrep, etc.);
	      the default returns "" which means an LLM-only review
	    - MAX_DIFF_CHARS / MAX_FILE_LINES: caps applied to the prompt inputs
	    - VALID_SEVERITIES / DEFAULT_SEVERITY: severity normalization rules

	    Usage:
	        agent = SecurityAgent()
	        findings = await agent.review(pr_data)
	    """

	    # Severities accepted from the LLM; anything else becomes DEFAULT_SEVERITY.
	    VALID_SEVERITIES = ("critical", "high", "medium", "low")
	    DEFAULT_SEVERITY = "medium"

	    # Prompt-size caps: characters of diff and lines per file sent to the LLM.
	    MAX_DIFF_CHARS = 15000
	    MAX_FILE_LINES = 500

	    def __init__(self) -> None:
	        """
	        Initialize the LLM client.

	        ChatGroq talks to Groq's API using the "llama-3.3-70b-versatile"
	        model and the API key from `settings.groq_api_key`. Latency matters
	        here: the design target is 3-8 seconds per agent so the full review
	        stays under 15 seconds.

	        Temperature=0.1: We want nearly deterministic output. Code review
	        should be consistent — the same code should get the same findings.
	        A small temperature (not 0) allows slight variation to avoid
	        getting stuck in repetitive patterns.
	        """
	        self.llm = ChatGroq(
	            model="llama-3.3-70b-versatile",
	            api_key=settings.groq_api_key,
	            temperature=0.1,
	            max_tokens=4096,
	        )

	    @property
	    @abstractmethod
	    def agent_name(self) -> str:
	        """The agent identifier: 'security', 'performance', or 'style'."""
	        ...

	    @property
	    @abstractmethod
	    def system_prompt(self) -> str:
	        """
	        The full system prompt for this agent.

	        NOTE: _build_prompt() hands this string to ChatPromptTemplate as a
	        template, so literal curly braces in it (e.g. JSON examples) must be
	        doubled (`{{` / `}}`) or they are treated as template variables.
	        """
	        ...

	    async def run_static_analysis(self, pr_data: PRData) -> str:
	        """
	        Run static analysis tools on the PR files.

	        Override in subclasses to run agent-specific tools:
	        - SecurityAgent: Bandit + detect-secrets
	        - PerformanceAgent: radon + AST analysis
	        - StyleAgent: Ruff/pylint

	        Args:
	            pr_data: The PR under review.

	        Returns:
	            A string summary of tool findings to include in the LLM prompt.
	            Default: "" (no static analysis, LLM-only review).
	        """
	        return ""

	    def _build_prompt(self) -> ChatPromptTemplate:
	        """
	        Build the LangChain prompt template.

	        ChatPromptTemplate.from_messages() creates a two-message prompt:
	        - ("system", ...) → the system message (agent persona + instructions)
	        - ("human", ...) → the user message (the actual PR data to review)

	        The {diff}, {file_contents}, {static_analysis} and {rag_context}
	        placeholders are filled in when review() invokes the chain.
	        """
	        return ChatPromptTemplate.from_messages([
	            ("system", self.system_prompt),
	            ("human", (
	                "## PR Diff\n"
	                "```diff\n{diff}\n```\n\n"
	                "## Changed File Contents\n"
	                "{file_contents}\n\n"
	                "## Static Analysis Results\n"
	                "{static_analysis}\n\n"
	                "{rag_context}\n\n"
	                "Analyze this PR and return your findings as structured JSON."
	            )),
	        ])

	    @staticmethod
	    def _clamp_confidence(value: float) -> float:
	        """Clamp a confidence score into [0.0, 1.0]; NaN is treated as 0.0."""
	        # NaN is the only float that is not equal to itself. Without this
	        # guard, max(0.0, min(1.0, nan)) evaluates to 1.0, i.e. an undefined
	        # score would be reported as full confidence.
	        if value != value:
	            return 0.0
	        return max(0.0, min(1.0, value))

	    def _convert_to_findings(self, agent_output: AgentFindings | None) -> list[Finding]:
	        """
	        Convert the LLM's output to our internal Finding model.

	        This adds the agent_name field and validates/clamps values:
	        - Severity is lowercased, stripped and validated against
	          VALID_SEVERITIES (unknown values become DEFAULT_SEVERITY)
	        - Confidence is clamped to [0.0, 1.0]
	        - Invalid findings are skipped with a warning (not crashed on)

	        Args:
	            agent_output: The structured LLM result, or None when the chain
	                produced no structured output.

	        Returns:
	            The findings that could be converted; possibly an empty list.
	        """
	        # The structured-output parser can yield None (e.g. the model replied
	        # without a tool call). Treat that as "no findings" instead of letting
	        # an AttributeError be reported as a failed review.
	        if agent_output is None:
	            logger.warning(
	                "LLM returned no structured output",
	                agent=self.agent_name,
	            )
	            return []

	        findings: list[Finding] = []
	        for f in agent_output.findings:
	            try:
	                severity = f.severity.lower().strip()
	                if severity not in self.VALID_SEVERITIES:
	                    severity = self.DEFAULT_SEVERITY  # Default for ambiguous severity

	                finding = Finding(
	                    agent=self.agent_name,
	                    file_path=f.file_path,
	                    line_start=f.line_start,
	                    line_end=f.line_end,
	                    severity=severity,
	                    category=f.category,
	                    title=f.title,
	                    description=f.description,
	                    suggested_fix=f.suggested_fix,
	                    cwe_id=f.cwe_id,
	                    confidence=self._clamp_confidence(f.confidence),
	                )
	                findings.append(finding)
	            except Exception as e:
	                # Deliberate best-effort: one bad finding must not discard
	                # the rest of the agent's output.
	                logger.warning(
	                    "Skipping malformed finding",
	                    agent=self.agent_name,
	                    error=str(e),
	                )
	        return findings

	    def _format_file_contents(
	        self,
	        file_contents: dict[str, str] | None,
	        max_lines: int | None = None,
	    ) -> str:
	        """
	        Format file contents for the LLM prompt.

	        Each file is wrapped in a code block with its path as a header.
	        Files longer than the line cap are cut off and marked as truncated
	        so a single huge file cannot dominate the prompt.

	        Args:
	            file_contents: Mapping of file path → file text. None or empty
	                yields the "No file contents available." placeholder.
	            max_lines: Per-file line cap; defaults to MAX_FILE_LINES.

	        Returns:
	            The formatted sections joined by blank lines.
	        """
	        if not file_contents:
	            return "No file contents available."

	        limit = self.MAX_FILE_LINES if max_lines is None else max_lines
	        parts = []
	        for filepath, content in file_contents.items():
	            lines = content.split("\n")
	            if len(lines) > limit:
	                content = "\n".join(lines[:limit]) + "\n... (truncated)"
	            parts.append(f"### {filepath}\n```\n{content}\n```")
	        return "\n\n".join(parts)

	    async def review(self, pr_data: PRData, rag_context: str = "") -> list[Finding]:
	        """
	        Main entry point: review a PR and return findings.

	        This is the Template Method:
	        1. Run static analysis tools (subclass-specific)
	        2. Build the prompt with diff + files + tool output + RAG context
	        3. Call the LLM with structured output
	        4. Convert to Finding objects
	        5. Log timing and return

	        If any step fails, we log the error (with traceback) and return an
	        empty list rather than crashing the entire pipeline. The other
	        agents can still contribute findings.

	        Args:
	            pr_data: The PR diff, file contents, and metadata
	            rag_context: Optional RAG context from ChromaDB (related code chunks)

	        Returns:
	            The converted findings, or [] when the review failed.
	        """
	        # perf_counter() is monotonic, so the measured latency cannot be
	        # skewed by system clock adjustments the way time.time() can.
	        start_time = time.perf_counter()

	        try:
	            # Step 1: Run static analysis tools
	            static_results = await self.run_static_analysis(pr_data)

	            # Step 2: Build the prompt
	            prompt = self._build_prompt()

	            # Step 3: Create the structured output chain
	            structured_llm = self.llm.with_structured_output(AgentFindings)
	            chain = prompt | structured_llm

	            # Step 4: Call the LLM
	            result = await chain.ainvoke({
	                "diff": pr_data.diff[:self.MAX_DIFF_CHARS],  # Cap diff size for token limits
	                "file_contents": self._format_file_contents(pr_data.file_contents),
	                "static_analysis": static_results or "No static analysis results.",
	                "rag_context": rag_context or "",
	            })

	            # Step 5: Convert to Finding objects
	            findings = self._convert_to_findings(result)

	            elapsed_ms = int((time.perf_counter() - start_time) * 1000)
	            logger.info(
	                "Agent review completed",
	                agent=self.agent_name,
	                findings_count=len(findings),
	                elapsed_ms=elapsed_ms,
	            )

	            return findings

	        except Exception as e:
	            # Pipeline boundary: swallow the failure on purpose, but keep the
	            # exception type and traceback so it can actually be diagnosed.
	            elapsed_ms = int((time.perf_counter() - start_time) * 1000)
	            logger.error(
	                "Agent review failed",
	                agent=self.agent_name,
	                error=str(e),
	                error_type=type(e).__name__,
	                elapsed_ms=elapsed_ms,
	                exc_info=True,
	            )
	            return []  # Don't crash the pipeline — other agents can still work