# Scoring helpers for submitted vulnerability findings: per-finding evaluation
# against a task's ground-truth vulnerabilities, plus the final aggregate grade.
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Iterable, Optional

from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec

# Bounds applied to the value returned by final_grade(); the result never
# reaches exactly 0.0 or 1.0.
MIN_STRICT_SCORE = 0.001
MAX_STRICT_SCORE = 0.999


@dataclass(frozen=True)
class FindingEvaluation:
    """Immutable result of evaluating one submitted finding."""

    # Blend of structural match and confidence calibration, within [0, 1].
    component_score: float
    # Id of the ground-truth vulnerability the finding was attributed to;
    # None when the finding is neither a confirmed match nor a duplicate.
    matched_vulnerability_id: Optional[str]
    # True only for a first-time, fully aligned match.
    is_confirmed_match: bool
    # Human-readable explanation of the verdict.
    feedback: str
    # 1 - |confidence - target confidence|, floored at 0.
    confidence_calibration: float


def _line_overlap_score(submitted_start: int, submitted_end: int, target_line: int) -> float:
    """Score how close a submitted line range is to the target line.

    Returns 1.0 when ``submitted_start <= target_line <= submitted_end``,
    0.6 when the nearest range endpoint is within 2 lines, 0.3 when it is
    within 5 lines, and 0.0 otherwise.

    Note: an inverted range (start > end) can never contain the target, so it
    is scored purely by endpoint distance.
    """
    if submitted_start <= target_line <= submitted_end:
        return 1.0

    min_distance = min(
        abs(target_line - submitted_start),
        abs(target_line - submitted_end),
    )
    if min_distance <= 2:
        return 0.6
    if min_distance <= 5:
        return 0.3
    return 0.0


def _best_candidate(
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
) -> tuple[Optional[VulnerabilitySpec], float, float, float, float]:
    """Find the ground-truth vulnerability that best matches a finding.

    Every vulnerability in ``task.vulnerabilities`` is scored as
    ``0.35*file + 0.30*type + 0.20*line + 0.15*severity`` where each term is a
    match value in [0, 1].

    Returns:
        ``(target, score, type_match, line_match, severity_match)`` for the
        highest-scoring vulnerability. On ties the earliest one wins. When the
        task has no vulnerabilities, ``target`` is None and all numbers are 0.0.
    """
    best_target = None
    best_score = -1.0
    best_type_match = 0.0
    best_line_match = 0.0
    best_severity_match = 0.0

    for target in task.vulnerabilities:
        file_match = 1.0 if target.filename == filename else 0.0
        type_match = 1.0 if target.vuln_type == vuln_type else 0.0
        severity_match = 1.0 if target.severity == severity else 0.0
        line_match = _line_overlap_score(line_start, line_end, target.line)

        candidate_score = (
            0.35 * file_match
            + 0.30 * type_match
            + 0.20 * line_match
            + 0.15 * severity_match
        )

        # Strict '>' keeps the first vulnerability among equal scores.
        if candidate_score > best_score:
            best_score = candidate_score
            best_target = target
            best_type_match = type_match
            best_line_match = line_match
            best_severity_match = severity_match

    # best_score is still the -1.0 sentinel when there were no candidates.
    return best_target, max(best_score, 0.0), best_type_match, best_line_match, best_severity_match


def evaluate_finding(
    *,
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
    confidence: float,
    matched_already: Iterable[str],
) -> FindingEvaluation:
    """Evaluate a single submitted finding against the task's ground truth.

    The finding is compared with its best-matching vulnerability. The component
    score is ``0.8 * structure_score + 0.2 * calibration`` clamped to [0, 1],
    where calibration measures how close ``confidence`` is to
    ``TARGET_CONFIDENCE`` for the matched vulnerability's severity.

    A finding is *confirmed* when file, type and severity match exactly and the
    line score is at least 0.6. A confirmed finding whose target id is already
    in ``matched_already`` is reported as a duplicate with a quarter of the
    component score.

    Args:
        task: Task whose ``vulnerabilities`` form the ground truth.
        filename: File the finding points at.
        vuln_type: Reported vulnerability type.
        severity: Reported severity.
        line_start: First line of the reported range.
        line_end: Last line of the reported range.
        confidence: Reported confidence.
        matched_already: Ids of vulnerabilities confirmed by earlier findings.

    Returns:
        A FindingEvaluation describing the verdict.
    """
    target, structure_score, type_match, line_match, severity_match = _best_candidate(
        task,
        filename,
        vuln_type,
        severity,
        line_start,
        line_end,
    )

    if target is None:
        return FindingEvaluation(
            component_score=0.0,
            matched_vulnerability_id=None,
            is_confirmed_match=False,
            feedback="No plausible vulnerability match for this finding.",
            confidence_calibration=0.0,
        )

    target_conf = TARGET_CONFIDENCE[target.severity]
    calibration = max(0.0, 1.0 - abs(confidence - target_conf))

    component_score = 0.8 * structure_score + 0.2 * calibration
    component_score = max(0.0, min(1.0, component_score))

    confirmed = (
        target.filename == filename
        and type_match == 1.0
        and line_match >= 0.6
        and severity_match == 1.0
    )

    previously_matched = set(matched_already)
    if target.id in previously_matched and confirmed:
        # Re-reporting a known vulnerability earns only partial credit.
        return FindingEvaluation(
            component_score=0.25 * component_score,
            matched_vulnerability_id=target.id,
            is_confirmed_match=False,
            feedback="Duplicate of a previously confirmed vulnerability.",
            confidence_calibration=calibration,
        )

    if confirmed:
        return FindingEvaluation(
            component_score=component_score,
            matched_vulnerability_id=target.id,
            is_confirmed_match=True,
            feedback="Confirmed vulnerability: file/type/line/severity align with ground truth.",
            confidence_calibration=calibration,
        )

    # Not confirmed: report the first attribute that failed to line up.
    if target.filename != filename:
        hint = "Wrong file."
    elif type_match == 0.0:
        hint = "Correct file, vulnerability type mismatch."
    elif line_match < 0.6:
        hint = "Correct file/type, but location is off."
    elif severity_match == 0.0:
        hint = "Severity mismatch."
    else:
        hint = "Partial match, refine details."

    return FindingEvaluation(
        component_score=component_score,
        matched_vulnerability_id=None,
        is_confirmed_match=False,
        feedback=hint,
        confidence_calibration=calibration,
    )


def final_grade(
    *,
    task: TaskSpec,
    confirmed_vulnerability_ids: Iterable[str],
    findings_count: int,
    false_positive_count: int,
    duplicate_count: int,
    avg_component_score: float,
    avg_confidence_calibration: float,
) -> float:
    """Compute the overall grade for a set of submitted findings.

    The base score is ``0.55 * weighted_recall + 0.20 * precision +
    0.15 * avg_component_score + 0.10 * avg_confidence_calibration``, where
    weighted recall uses ``SEVERITY_WEIGHTS`` over the task's vulnerabilities.
    Penalties for false positives (0.08 each, capped at 0.5), duplicates
    (0.05 each, capped at 0.2) and excess findings beyond
    ``len(task.vulnerabilities) + 1`` (0.03 each, capped at 0.2) are then
    subtracted.

    Args:
        task: Task whose ``vulnerabilities`` form the ground truth.
        confirmed_vulnerability_ids: Ids of vulnerabilities that were confirmed.
        findings_count: Total number of findings submitted.
        false_positive_count: Number of findings judged false positives.
        duplicate_count: Number of duplicate findings.
        avg_component_score: Mean per-finding component score.
        avg_confidence_calibration: Mean per-finding confidence calibration.

    Returns:
        The grade clamped to ``[MIN_STRICT_SCORE, MAX_STRICT_SCORE]``. A score
        that is not a number yields ``MIN_STRICT_SCORE``.
    """
    confirmed_ids = set(confirmed_vulnerability_ids)

    total_weight = sum(SEVERITY_WEIGHTS[v.severity] for v in task.vulnerabilities)
    covered_weight = sum(
        SEVERITY_WEIGHTS[v.severity]
        for v in task.vulnerabilities
        if v.id in confirmed_ids
    )
    weighted_recall = (covered_weight / total_weight) if total_weight > 0 else 0.0

    # Cap at 1.0 so inconsistent inputs (more confirmed ids than findings)
    # cannot inflate the precision term.
    precision = (
        min(1.0, len(confirmed_ids) / findings_count) if findings_count > 0 else 0.0
    )

    # Negative counts are treated as zero so a penalty can never become a bonus.
    fp_penalty = min(0.5, 0.08 * max(0, false_positive_count))
    dup_penalty = min(0.2, 0.05 * max(0, duplicate_count))

    volume_penalty = 0.0
    optimal_findings = len(task.vulnerabilities) + 1
    if findings_count > optimal_findings:
        volume_penalty = min(0.2, 0.03 * (findings_count - optimal_findings))

    score = (
        0.55 * weighted_recall
        + 0.20 * precision
        + 0.15 * avg_component_score
        + 0.10 * avg_confidence_calibration
    )
    score -= fp_penalty + dup_penalty + volume_penalty

    # min()/max() never select NaN as the second argument because every
    # comparison with NaN is false, so a NaN score would otherwise clamp to
    # MAX_STRICT_SCORE. Fail closed instead.
    if math.isnan(score):
        return MIN_STRICT_SCORE

    return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, score))