File size: 3,851 Bytes
4b445f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9da50c
4b445f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Evaluation Harness
===================

Runs the Ninja Code Guard pipeline against a set of test PRs with
known issues (ground truth) and measures precision, recall, and latency.

Usage:
    python -m tests.eval.run_eval

Dataset format (JSON files in tests/eval/dataset/):
    {
        "pr_id": "sql_injection_basic",
        "diff": "...",
        "file_contents": {"app.py": "..."},
        "expected_findings": [
            {"file_path": "app.py", "line_start": 5, "category": "sql_injection"}
        ]
    }
"""

from __future__ import annotations

import asyncio
import json
import time
from pathlib import Path

from tests.eval.metrics import EvalResult, EvalSummary


def _match_findings(
    expected: list[dict],
    actual: list,
    line_tolerance: int = 3,
) -> tuple[set[int], set[int]]:
    """Greedily pair expected findings with reported ones; return matched index sets.

    Expected findings are visited in order and each claims the first
    still-unmatched reported finding that agrees on file path, lies within
    ``line_tolerance`` lines, and has the same category. The pairing is
    first-fit, not an optimal assignment.

    Returns:
        ``(matched_expected, matched_actual)`` — indices into ``expected``
        and ``actual`` respectively.
    """
    matched_expected: set[int] = set()
    matched_actual: set[int] = set()

    for exp_idx, exp in enumerate(expected):
        for act_idx, act in enumerate(actual):
            if act_idx in matched_actual:
                # A reported finding may satisfy at most one expectation.
                continue
            if (
                act.file_path == exp["file_path"]
                and abs(act.line_start - exp["line_start"]) <= line_tolerance
                # An expectation without a "category" key accepts any category.
                and act.category == exp.get("category", act.category)
            ):
                matched_expected.add(exp_idx)
                matched_actual.add(act_idx)
                break

    return matched_expected, matched_actual


async def evaluate_single_pr(test_case: dict, *, line_tolerance: int = 3) -> EvalResult:
    """
    Run the pipeline on one test PR and compare against ground truth.

    A reported finding is a true positive when it matches an expected
    finding on the same file_path, within ``line_tolerance`` lines of the
    expected line_start, and with the same category (the category check is
    skipped for expected findings that do not specify one).

    Args:
        test_case: Dataset entry. ``"diff"`` is required; ``"pr_id"``,
            ``"file_contents"`` and ``"expected_findings"`` are optional.
        line_tolerance: Maximum allowed distance, in lines, between a
            reported and an expected ``line_start``. Defaults to 3.

    Returns:
        An EvalResult carrying the true/false positive and false negative
        counts plus the measured latency in milliseconds.

    Raises:
        KeyError: If ``test_case`` has no ``"diff"``, or an expected finding
            that gets compared lacks ``"file_path"`` or ``"line_start"``.
    """
    # Project imports are resolved at call time rather than at module import.
    from app.agents.performance_agent import PerformanceAgent
    from app.agents.security_agent import SecurityAgent
    from app.agents.style_agent import StyleAgent
    from app.agents.synthesizer import synthesize
    from app.github.client import PRData

    pr_data = PRData(
        repo_full_name="eval/test",
        pr_number=0,
        commit_sha="eval",
        title=test_case.get("pr_id", "eval"),
        diff=test_case["diff"],
        changed_files=[],
        file_contents=test_case.get("file_contents", {}),
    )

    # perf_counter is monotonic, so the interval is immune to wall-clock
    # adjustments that could skew (or even negate) a time.time() difference.
    start = time.perf_counter()

    # Run all agents concurrently.
    security = SecurityAgent()
    performance = PerformanceAgent()
    style = StyleAgent()

    sec_findings, perf_findings, style_findings = await asyncio.gather(
        security.review(pr_data),
        performance.review(pr_data),
        style.review(pr_data),
    )

    review = synthesize(sec_findings, perf_findings, style_findings)
    # Latency covers the agent reviews plus synthesis, not the scoring below.
    elapsed_ms = int((time.perf_counter() - start) * 1000)

    # Compare against ground truth.
    expected = test_case.get("expected_findings", [])
    actual = review.findings

    matched_expected, matched_actual = _match_findings(
        expected, actual, line_tolerance
    )

    tp = len(matched_expected)
    fp = len(actual) - len(matched_actual)
    fn = len(expected) - len(matched_expected)

    return EvalResult(
        pr_id=test_case.get("pr_id", "unknown"),
        true_positives=tp,
        false_positives=fp,
        false_negatives=fn,
        latency_ms=elapsed_ms,
    )


async def run_evaluation() -> None:
    """Run evaluation on all test cases in the dataset directory.

    Loads every ``*.json`` file from the ``dataset`` directory next to this
    module (in sorted filename order), evaluates each one sequentially,
    prints per-case precision/recall/F1/latency, and finally prints the
    aggregated summary. Prints a hint and returns early when the directory
    is missing or contains no JSON files.
    """
    dataset_dir = Path(__file__).parent / "dataset"

    # Glob once and reuse the sorted list for both the emptiness check and
    # the evaluation loop.
    test_files = sorted(dataset_dir.glob("*.json")) if dataset_dir.is_dir() else []

    if not test_files:
        print("No evaluation dataset found. Create JSON files in tests/eval/dataset/")
        print("See tests/eval/run_eval.py docstring for format.")
        return

    summary = EvalSummary()

    for test_file in test_files:
        print(f"Evaluating: {test_file.name}...")
        # JSON interchange is UTF-8; do not depend on the platform's default
        # locale encoding, which may differ (e.g. cp1252 on Windows).
        test_case = json.loads(test_file.read_text(encoding="utf-8"))
        result = await evaluate_single_pr(test_case)
        summary.results.append(result)
        print(f"  P={result.precision:.0%} R={result.recall:.0%} F1={result.f1:.0%} ({result.latency_ms}ms)")

    print("\n" + summary.summary())


# Script entry point: run the full evaluation when executed directly
# (e.g. `python -m tests.eval.run_eval`), but not when imported.
if __name__ == "__main__":
    asyncio.run(run_evaluation())