Spaces:
Sleeping
Sleeping
File size: 3,851 Bytes
4b445f6 b9da50c 4b445f6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | """
Evaluation Harness
===================
Runs the Ninja Code Guard pipeline against a set of test PRs with
known issues (ground truth) and measures precision, recall, and latency.
Usage:
python -m tests.eval.run_eval
Dataset format (JSON files in tests/eval/dataset/):
{
"pr_id": "sql_injection_basic",
"diff": "...",
"file_contents": {"app.py": "..."},
"expected_findings": [
{"file_path": "app.py", "line_start": 5, "category": "sql_injection"}
]
}
"""
from __future__ import annotations
import asyncio
import json
import time
from pathlib import Path
from tests.eval.metrics import EvalResult, EvalSummary
def _match_findings(expected: list, actual: list, line_tolerance: int = 3) -> tuple:
    """Greedily pair expected findings with reported findings and count outcomes.

    Each expected finding (a dict) is paired with the first not-yet-used
    reported finding that has the same ``file_path``, a ``line_start`` within
    ``line_tolerance`` lines, and -- only when the expected entry carries a
    ``"category"`` key -- the same category. A reported finding can satisfy
    at most one expected finding.

    Args:
        expected: Ground-truth entries; each must provide ``"file_path"`` and
            ``"line_start"`` and may provide ``"category"``.
        actual: Reported findings exposing ``file_path``, ``line_start`` and
            ``category`` attributes.
        line_tolerance: Maximum absolute line distance that still counts as
            a match.

    Returns:
        A ``(true_positives, false_positives, false_negatives)`` tuple.
    """
    matched_expected = set()
    matched_actual = set()
    for i, exp in enumerate(expected):
        for j, act in enumerate(actual):
            if j in matched_actual:
                # Already consumed by an earlier expected finding.
                continue
            if (
                act.file_path == exp["file_path"]
                and abs(act.line_start - exp["line_start"]) <= line_tolerance
                # A missing expected category falls back to the finding's own
                # category, i.e. the category check is skipped.
                and act.category == exp.get("category", act.category)
            ):
                matched_expected.add(i)
                matched_actual.add(j)
                break
    tp = len(matched_expected)
    fp = len(actual) - len(matched_actual)
    fn = len(expected) - len(matched_expected)
    return tp, fp, fn


async def evaluate_single_pr(test_case: dict, *, line_tolerance: int = 3) -> EvalResult:
    """Run the pipeline on one test PR and compare against ground truth.

    The security, performance and style agents review the PR concurrently,
    their outputs are passed to ``synthesize``, and the resulting
    ``review.findings`` are compared with ``test_case["expected_findings"]``.

    A finding counts as a true positive when it matches an expected finding
    on the same ``file_path``, within ``line_tolerance`` lines of the expected
    ``line_start``, and with the same category whenever the expected entry
    specifies one.

    Args:
        test_case: Parsed dataset entry. ``"diff"`` is required; ``"pr_id"``,
            ``"file_contents"`` and ``"expected_findings"`` are optional.
        line_tolerance: Allowed line distance between an expected and a
            reported finding (default 3).

    Returns:
        An ``EvalResult`` holding the true/false positive and false negative
        counts plus the elapsed time in milliseconds.

    Raises:
        KeyError: If ``test_case`` has no ``"diff"`` entry.
    """
    # NOTE(review): project imports are function-local; presumably so that
    # importing this module does not load the agents -- confirm before moving.
    from app.agents.performance_agent import PerformanceAgent
    from app.agents.security_agent import SecurityAgent
    from app.agents.style_agent import StyleAgent
    from app.agents.synthesizer import synthesize
    from app.github.client import PRData

    # Placeholder repository/PR/commit values: only the diff and file
    # contents come from the test case.
    pr_data = PRData(
        repo_full_name="eval/test",
        pr_number=0,
        commit_sha="eval",
        title=test_case.get("pr_id", "eval"),
        diff=test_case["diff"],
        changed_files=[],
        file_contents=test_case.get("file_contents", {}),
    )

    # perf_counter() is monotonic, so the measured latency cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    # Run all agents concurrently.
    security = SecurityAgent()
    performance = PerformanceAgent()
    style = StyleAgent()
    sec_findings, perf_findings, style_findings = await asyncio.gather(
        security.review(pr_data),
        performance.review(pr_data),
        style.review(pr_data),
    )
    review = synthesize(sec_findings, perf_findings, style_findings)
    elapsed_ms = int((time.perf_counter() - start) * 1000)

    # Compare against ground truth.
    expected = test_case.get("expected_findings", [])
    tp, fp, fn = _match_findings(expected, review.findings, line_tolerance)

    return EvalResult(
        pr_id=test_case.get("pr_id", "unknown"),
        true_positives=tp,
        false_positives=fp,
        false_negatives=fn,
        latency_ms=elapsed_ms,
    )
async def run_evaluation() -> None:
    """Run evaluation on all test cases in the dataset directory.

    Looks for ``*.json`` files in the ``dataset`` directory next to this
    module, evaluates them one at a time in sorted filename order, prints
    per-case precision/recall/F1/latency, and finally prints the aggregated
    summary. If the directory is missing or holds no JSON files, a hint is
    printed and the function returns without evaluating anything.
    """
    dataset_dir = Path(__file__).parent / "dataset"

    # Glob once and reuse the sorted list for both the emptiness check and
    # the evaluation loop.
    test_files = sorted(dataset_dir.glob("*.json")) if dataset_dir.exists() else []
    if not test_files:
        print("No evaluation dataset found. Create JSON files in tests/eval/dataset/")
        print("See tests/eval/run_eval.py docstring for format.")
        return

    summary = EvalSummary()
    for test_file in test_files:
        print(f"Evaluating: {test_file.name}...")
        # Read explicitly as UTF-8 so parsing does not depend on the
        # platform's default encoding.
        test_case = json.loads(test_file.read_text(encoding="utf-8"))
        result = await evaluate_single_pr(test_case)
        summary.results.append(result)
        print(f" P={result.precision:.0%} R={result.recall:.0%} F1={result.f1:.0%} ({result.latency_ms}ms)")
    print("\n" + summary.summary())
# Script entry point: run the full evaluation only when this module is
# executed directly (e.g. `python -m tests.eval.run_eval`), not on import.
if __name__ == "__main__":
    asyncio.run(run_evaluation())
|