Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Iterable, Optional | |
| from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec | |
# Lower and upper bounds that final_grade() clamps its result to, so a
# returned grade is never below 0.001 nor above 0.999.
MIN_STRICT_SCORE = 0.001
MAX_STRICT_SCORE = 0.999
@dataclass
class FindingEvaluation:
    """Result of grading one submitted finding against the task's ground truth.

    Instances are built with keyword arguments by ``evaluate_finding``; the
    ``@dataclass`` decorator supplies the ``__init__``, ``__repr__`` and
    ``__eq__`` that this usage requires.
    """

    # Score for this finding; evaluate_finding() keeps it within [0.0, 1.0].
    component_score: float
    # Id of the ground-truth vulnerability tied to this finding, or None
    # when the finding was not confirmed against any vulnerability.
    matched_vulnerability_id: Optional[str]
    # True only when the finding is a confirmed match that is not a duplicate.
    is_confirmed_match: bool
    # Human-readable explanation of the verdict.
    feedback: str
    # How close the submitted confidence was to the target confidence
    # (1.0 means identical, floored at 0.0).
    confidence_calibration: float
| def _line_overlap_score(submitted_start: int, submitted_end: int, target_line: int) -> float: | |
| if submitted_start <= target_line <= submitted_end: | |
| return 1.0 | |
| min_distance = min(abs(target_line - submitted_start), abs(target_line - submitted_end)) | |
| if min_distance <= 2: | |
| return 0.6 | |
| if min_distance <= 5: | |
| return 0.3 | |
| return 0.0 | |
def _best_candidate(
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
) -> tuple[Optional[VulnerabilitySpec], float, float, float, float]:
    """Find the ground-truth vulnerability that best resembles a finding.

    Each vulnerability in ``task.vulnerabilities`` gets a weighted score:
    0.35 for the same file, 0.30 for the same type, 0.20 times the line
    proximity from ``_line_overlap_score``, and 0.15 for the same severity.

    Returns:
        ``(target, score, type_match, line_match, severity_match)`` for the
        highest-scoring vulnerability; on a tie the earliest one wins. When
        the task has no vulnerabilities the result is
        ``(None, 0.0, 0.0, 0.0, 0.0)``.
    """

    def _rate(candidate):
        # Build one (total, candidate, type, line, severity) row per target.
        same_file = 1.0 if candidate.filename == filename else 0.0
        same_type = 1.0 if candidate.vuln_type == vuln_type else 0.0
        same_severity = 1.0 if candidate.severity == severity else 0.0
        proximity = _line_overlap_score(line_start, line_end, candidate.line)
        total = (
            0.35 * same_file
            + 0.30 * same_type
            + 0.20 * proximity
            + 0.15 * same_severity
        )
        return total, candidate, same_type, proximity, same_severity

    ranked = [_rate(candidate) for candidate in task.vulnerabilities]
    if not ranked:
        return None, 0.0, 0.0, 0.0, 0.0

    # max() keeps the first of several equally scored rows, so earlier
    # vulnerabilities win ties.
    total, winner, same_type, proximity, same_severity = max(
        ranked, key=lambda row: row[0]
    )
    return winner, max(total, 0.0), same_type, proximity, same_severity
def evaluate_finding(
    *,
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
    confidence: float,
    matched_already: Iterable[str],
) -> FindingEvaluation:
    """Grade a single submitted finding against the task's vulnerabilities.

    The finding is paired with its best-scoring ground-truth vulnerability
    via ``_best_candidate``. Its component score blends that structural
    score (weight 0.8) with a confidence calibration term (weight 0.2) and
    is clamped to [0.0, 1.0]. Calibration is ``1 - |confidence - target|``,
    floored at 0.0, where the target comes from ``TARGET_CONFIDENCE`` for
    the matched vulnerability's severity.

    A finding is confirmed when file, type and severity match exactly and
    the line score is at least 0.6. A confirmed finding whose vulnerability
    id is already in ``matched_already`` is reported as a duplicate with a
    quarter of the component score. Unconfirmed findings keep their score
    and receive a hint naming the first mismatching aspect.

    Returns:
        A ``FindingEvaluation`` describing the verdict.
    """
    best, structure_score, type_match, line_match, severity_match = _best_candidate(
        task, filename, vuln_type, severity, line_start, line_end
    )
    if best is None:
        return FindingEvaluation(
            component_score=0.0,
            matched_vulnerability_id=None,
            is_confirmed_match=False,
            feedback="No plausible vulnerability match for this finding.",
            confidence_calibration=0.0,
        )

    expected_confidence = TARGET_CONFIDENCE[best.severity]
    calibration = max(0.0, 1.0 - abs(confidence - expected_confidence))
    blended = 0.8 * structure_score + 0.2 * calibration
    component_score = max(0.0, min(1.0, blended))

    confirmed = (
        best.filename == filename
        and type_match == 1.0
        and line_match >= 0.6
        and severity_match == 1.0
    )
    seen_before = best.id in set(matched_already)

    if confirmed:
        if seen_before:
            return FindingEvaluation(
                component_score=0.25 * component_score,
                matched_vulnerability_id=best.id,
                is_confirmed_match=False,
                feedback="Duplicate of a previously confirmed vulnerability.",
                confidence_calibration=calibration,
            )
        return FindingEvaluation(
            component_score=component_score,
            matched_vulnerability_id=best.id,
            is_confirmed_match=True,
            feedback="Confirmed vulnerability: file/type/line/severity align with ground truth.",
            confidence_calibration=calibration,
        )

    # Ordered checks: the first one that failed decides the hint.
    checks = (
        (best.filename != filename, "Wrong file."),
        (type_match == 0.0, "Correct file, vulnerability type mismatch."),
        (line_match < 0.6, "Correct file/type, but location is off."),
        (severity_match == 0.0, "Severity mismatch."),
    )
    hint = next(
        (message for failed, message in checks if failed),
        "Partial match, refine details.",
    )
    return FindingEvaluation(
        component_score=component_score,
        matched_vulnerability_id=None,
        is_confirmed_match=False,
        feedback=hint,
        confidence_calibration=calibration,
    )
def final_grade(
    *,
    task: TaskSpec,
    confirmed_vulnerability_ids: Iterable[str],
    findings_count: int,
    false_positive_count: int,
    duplicate_count: int,
    avg_component_score: float,
    avg_confidence_calibration: float,
) -> float:
    """Combine episode statistics into one grade.

    The base score is a weighted sum of severity-weighted recall (0.55),
    precision (0.20), the average component score (0.15) and the average
    confidence calibration (0.10). Penalties are then subtracted:

    * 0.08 per false positive, capped at 0.5;
    * 0.05 per duplicate, capped at 0.2;
    * 0.03 per finding beyond ``len(task.vulnerabilities) + 1``, capped
      at 0.2.

    Recall is 0.0 when the task's total severity weight is not positive,
    and precision is 0.0 when ``findings_count`` is not positive.

    Returns:
        The grade clamped to ``[MIN_STRICT_SCORE, MAX_STRICT_SCORE]``.
    """
    confirmed_ids = set(confirmed_vulnerability_ids)

    # Pair each vulnerability's severity weight with whether it was confirmed.
    weighted = [
        (SEVERITY_WEIGHTS[vuln.severity], vuln.id in confirmed_ids)
        for vuln in task.vulnerabilities
    ]
    total_weight = sum(weight for weight, _ in weighted)
    covered_weight = sum(weight for weight, hit in weighted if hit)

    if total_weight > 0:
        weighted_recall = covered_weight / total_weight
    else:
        weighted_recall = 0.0

    if findings_count > 0:
        precision = len(confirmed_ids) / findings_count
    else:
        precision = 0.0

    fp_penalty = min(0.5, 0.08 * false_positive_count)
    dup_penalty = min(0.2, 0.05 * duplicate_count)

    # One finding more than the number of vulnerabilities is tolerated
    # before the volume penalty starts.
    optimal_findings = len(task.vulnerabilities) + 1
    if findings_count > optimal_findings:
        volume_penalty = min(0.2, 0.03 * (findings_count - optimal_findings))
    else:
        volume_penalty = 0.0

    base = (
        0.55 * weighted_recall
        + 0.20 * precision
        + 0.15 * avg_component_score
        + 0.10 * avg_confidence_calibration
    )
    deductions = fp_penalty + dup_penalty + volume_penalty
    score = base - deductions
    return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, score))