Spaces:
Sleeping
Sleeping
File size: 2,654 Bytes
4b445f6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | """
Evaluation Metrics
===================
Measures the quality of Ninja Code Guard's reviews against ground truth labels.
Metrics tracked:
- Precision: % of flagged findings that are genuine issues (not false positives)
- Recall: % of known issues that were detected
- F1 Score: Harmonic mean of precision and recall
- Latency: Time from webhook to review posted (p50, p95, p99)
"""
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class EvalResult:
    """Result of evaluating one PR against ground truth.

    Holds the raw confusion counts and the latency for a single PR, and
    derives precision, recall and F1 from those counts on access.

    Conventions for empty denominators:
      * ``precision`` is 1.0 when nothing was flagged (TP + FP == 0).
      * ``recall`` is 1.0 when there was nothing to find (TP + FN == 0).
      * ``f1`` is 0.0 when precision and recall are both zero.
    """

    pr_id: str
    true_positives: int = 0
    false_positives: int = 0
    false_negatives: int = 0
    latency_ms: int = 0

    @property
    def precision(self) -> float:
        """Return TP / (TP + FP), or 1.0 when no findings were flagged."""
        flagged = self.true_positives + self.false_positives
        if flagged > 0:
            return self.true_positives / flagged
        # No findings at all: treated as "no false positives".
        return 1.0

    @property
    def recall(self) -> float:
        """Return TP / (TP + FN), or 1.0 when there were no known issues."""
        known = self.true_positives + self.false_negatives
        if known > 0:
            return self.true_positives / known
        # Nothing to detect: treated as "nothing was missed".
        return 1.0

    @property
    def f1(self) -> float:
        """Return the harmonic mean of precision and recall (0.0 if both are 0)."""
        p, r = self.precision, self.recall
        if (p + r) > 0:
            return 2 * p * r / (p + r)
        return 0.0
@dataclass
class EvalSummary:
    """Aggregate metrics across all evaluated PRs.

    Precision, recall and F1 are unweighted means of the per-PR values, so
    every PR counts equally regardless of how many findings it has. Latency
    percentiles use the nearest-rank method over each result's
    ``latency_ms``. With no results, every metric is reported as zero.
    """

    results: list[EvalResult] = field(default_factory=list)

    def _latency_percentile(self, pct: int) -> int:
        """Return the nearest-rank ``pct``-th percentile of ``latency_ms``.

        Args:
            pct: Percentile as a whole number (e.g. 50, 95, 99).

        Returns:
            The smallest recorded latency such that at least ``pct`` percent
            of results are at or below it, or 0 when there are no results.
        """
        if not self.results:
            return 0
        ordered = sorted(r.latency_ms for r in self.results)
        # Ceiling division in integers: rank = ceil(pct * n / 100). Staying in
        # integers avoids float rounding pushing the rank one slot too far.
        rank = -(-pct * len(ordered) // 100)
        # rank is 1-based; clamp so an out-of-range pct still yields a valid index.
        return ordered[min(max(rank, 1), len(ordered)) - 1]

    @property
    def avg_precision(self) -> float:
        """Return the mean per-PR precision, or 0.0 when there are no results."""
        if not self.results:
            return 0.0
        return sum(r.precision for r in self.results) / len(self.results)

    @property
    def avg_recall(self) -> float:
        """Return the mean per-PR recall, or 0.0 when there are no results."""
        if not self.results:
            return 0.0
        return sum(r.recall for r in self.results) / len(self.results)

    @property
    def avg_f1(self) -> float:
        """Return the mean per-PR F1 score, or 0.0 when there are no results."""
        if not self.results:
            return 0.0
        return sum(r.f1 for r in self.results) / len(self.results)

    @property
    def latency_p50(self) -> int:
        """Return the median (50th percentile) latency in milliseconds."""
        return self._latency_percentile(50)

    @property
    def latency_p95(self) -> int:
        """Return the 95th percentile latency in milliseconds."""
        return self._latency_percentile(95)

    @property
    def latency_p99(self) -> int:
        """Return the 99th percentile latency in milliseconds."""
        return self._latency_percentile(99)

    def summary(self) -> str:
        """Return a multi-line, human-readable report of the aggregate metrics."""
        return (
            f"Evaluation Summary ({len(self.results)} PRs)\n"
            f" Precision: {self.avg_precision:.1%}\n"
            f" Recall: {self.avg_recall:.1%}\n"
            f" F1 Score: {self.avg_f1:.1%}\n"
            f" Latency: p50={self.latency_p50}ms, p95={self.latency_p95}ms\n"
        )
|