Spaces:

NinjainPJs
/

ninja-code-guard

Sleeping

App Files Files Community

ninja-code-guard / tests /eval /run_eval.py

NinjainPJs

Fix all ruff lint issues — 0 errors, 92 tests passing

b9da50c 3 months ago

raw

history blame contribute delete

3.85 kB

	"""
	Evaluation Harness
	===================

	Runs the Ninja Code Guard pipeline against a set of test PRs with
	known issues (ground truth) and measures precision, recall, and latency.

	Usage:
	python -m tests.eval.run_eval

	Dataset format (JSON files in tests/eval/dataset/):
	{
	"pr_id": "sql_injection_basic",
	"diff": "...",
	"file_contents": {"app.py": "..."},
	"expected_findings": [
	{"file_path": "app.py", "line_start": 5, "category": "sql_injection"}
	]
	}
	"""

	from __future__ import annotations

	import asyncio
	import json
	import time
	from pathlib import Path

	from tests.eval.metrics import EvalResult, EvalSummary


	def _match_findings(
	    expected: list[dict],
	    actual: list,
	    line_tolerance: int = 3,
	) -> tuple[int, int, int]:
	    """
	    Greedily pair expected findings with reported findings.

	    Each expected finding claims the first not-yet-claimed reported finding
	    that has the same ``file_path``, a ``line_start`` within
	    ``line_tolerance`` lines of the expected one, and -- only when the
	    expected entry specifies one -- the same ``category``. A reported
	    finding can satisfy at most one expected finding.

	    Returns:
	        ``(true_positives, false_positives, false_negatives)``.
	    """
	    claimed: set[int] = set()
	    true_positives = 0

	    for exp in expected:
	        for idx, act in enumerate(actual):
	            if idx in claimed:
	                continue
	            if (
	                act.file_path == exp["file_path"]
	                and abs(act.line_start - exp["line_start"]) <= line_tolerance
	                # A missing expected category falls back to the finding's own
	                # category, so this clause then places no constraint.
	                and act.category == exp.get("category", act.category)
	            ):
	                claimed.add(idx)
	                true_positives += 1
	                break

	    false_positives = len(actual) - len(claimed)
	    false_negatives = len(expected) - true_positives
	    return true_positives, false_positives, false_negatives


	async def evaluate_single_pr(test_case: dict) -> EvalResult:
	    """
	    Run the pipeline on one test PR and compare against ground truth.

	    The security, performance and style agents review the PR concurrently
	    and their findings are merged by ``synthesize``; the reported latency
	    covers both steps.

	    A finding is considered a true positive if it matches an expected
	    finding on the same file_path, within 3 lines of the expected line, and
	    with the same category when the expected finding specifies one.

	    Args:
	        test_case: One dataset entry. ``diff`` is required; ``pr_id``,
	            ``file_contents`` and ``expected_findings`` are optional.

	    Returns:
	        EvalResult holding the true-positive, false-positive and
	        false-negative counts plus the latency in whole milliseconds.

	    Raises:
	        KeyError: If ``diff`` is missing, or if an expected finding that is
	            compared against a reported one lacks ``file_path`` or
	            ``line_start``.
	    """
	    # Imported inside the function rather than at module level; presumably so
	    # that importing this module does not load the agent stack -- confirm.
	    from app.agents.performance_agent import PerformanceAgent
	    from app.agents.security_agent import SecurityAgent
	    from app.agents.style_agent import StyleAgent
	    from app.agents.synthesizer import synthesize
	    from app.github.client import PRData

	    # Placeholder repository/PR/commit values: only the diff and the file
	    # contents come from the test case.
	    pr_data = PRData(
	        repo_full_name="eval/test",
	        pr_number=0,
	        commit_sha="eval",
	        title=test_case.get("pr_id", "eval"),
	        diff=test_case["diff"],
	        changed_files=[],
	        file_contents=test_case.get("file_contents", {}),
	    )

	    # perf_counter is monotonic, so the measured interval cannot be skewed
	    # (or go negative) when the system clock is adjusted mid-run.
	    start = time.perf_counter()

	    # Run all agents
	    security = SecurityAgent()
	    performance = PerformanceAgent()
	    style = StyleAgent()

	    sec_findings, perf_findings, style_findings = await asyncio.gather(
	        security.review(pr_data),
	        performance.review(pr_data),
	        style.review(pr_data),
	    )

	    review = synthesize(sec_findings, perf_findings, style_findings)
	    elapsed_ms = int((time.perf_counter() - start) * 1000)

	    # Compare against ground truth
	    expected = test_case.get("expected_findings", [])
	    tp, fp, fn = _match_findings(expected, review.findings)

	    return EvalResult(
	        pr_id=test_case.get("pr_id", "unknown"),
	        true_positives=tp,
	        false_positives=fp,
	        false_negatives=fn,
	        latency_ms=elapsed_ms,
	    )


	async def run_evaluation() -> None:
	    """
	    Run evaluation on all test cases in the dataset directory.

	    Every ``*.json`` file in the ``dataset`` directory next to this module
	    is evaluated in sorted filename order. Per-file precision, recall, F1
	    and latency are printed as each file finishes, followed by the overall
	    summary. If the directory is missing or holds no JSON files, a hint is
	    printed and nothing is evaluated.
	    """
	    dataset_dir = Path(__file__).parent / "dataset"

	    # Glob once and reuse the result for both the emptiness check and the
	    # loop; a missing directory is treated the same as an empty one.
	    test_files = sorted(dataset_dir.glob("*.json")) if dataset_dir.exists() else []

	    if not test_files:
	        print("No evaluation dataset found. Create JSON files in tests/eval/dataset/")
	        print("See tests/eval/run_eval.py docstring for format.")
	        return

	    summary = EvalSummary()

	    for test_file in test_files:
	        print(f"Evaluating: {test_file.name}...")
	        # Read as UTF-8 explicitly so fixtures containing non-ASCII source
	        # text do not depend on the platform's default encoding.
	        test_case = json.loads(test_file.read_text(encoding="utf-8"))
	        result = await evaluate_single_pr(test_case)
	        summary.results.append(result)
	        print(f" P={result.precision:.0%} R={result.recall:.0%} F1={result.f1:.0%} ({result.latency_ms}ms)")

	    print("\n" + summary.summary())


	if __name__ == "__main__":
	    # Script entry point (python -m tests.eval.run_eval): build the
	    # evaluation coroutine and drive it to completion on a new event loop.
	    evaluation = run_evaluation()
	    asyncio.run(evaluation)