# CodeSecure / server/grader.py
# Hassan Shaikh
# fix: enforce strict open-interval task scores
# 2916eb9
from __future__ import annotations
from dataclasses import dataclass
from typing import Iterable, Optional
from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec
# Inclusive clamp bounds applied to the value returned by final_grade, so a
# task score never reaches exactly 0.0 or 1.0.
MIN_STRICT_SCORE = 0.001
MAX_STRICT_SCORE = 0.999
@dataclass(frozen=True)
class FindingEvaluation:
    """Immutable result of grading one submitted finding.

    Instances are produced by ``evaluate_finding``; the field comments below
    describe the values that function assigns.
    """

    # Blend of structural match and confidence calibration, clamped to
    # [0, 1]; scaled by 0.25 when the finding is a duplicate.
    component_score: float

    # Id of the ground-truth vulnerability for confirmed and duplicate
    # findings; None for anything that was not confirmed.
    matched_vulnerability_id: Optional[str]

    # True only for a confirmed match that is not a duplicate.
    is_confirmed_match: bool

    # Human-readable verdict or hint about what did not line up.
    feedback: str

    # 1 - |confidence - target confidence|, floored at 0.0.
    confidence_calibration: float
def _line_overlap_score(submitted_start: int, submitted_end: int, target_line: int) -> float:
if submitted_start <= target_line <= submitted_end:
return 1.0
min_distance = min(abs(target_line - submitted_start), abs(target_line - submitted_end))
if min_distance <= 2:
return 0.6
if min_distance <= 5:
return 0.3
return 0.0
def _best_candidate(
task: TaskSpec,
filename: str,
vuln_type: str,
severity: str,
line_start: int,
line_end: int,
) -> tuple[Optional[VulnerabilitySpec], float, float, float, float]:
best_target = None
best_score = -1.0
best_type_match = 0.0
best_line_match = 0.0
best_severity_match = 0.0
for target in task.vulnerabilities:
file_match = 1.0 if target.filename == filename else 0.0
type_match = 1.0 if target.vuln_type == vuln_type else 0.0
severity_match = 1.0 if target.severity == severity else 0.0
line_match = _line_overlap_score(line_start, line_end, target.line)
candidate_score = (
0.35 * file_match
+ 0.30 * type_match
+ 0.20 * line_match
+ 0.15 * severity_match
)
if candidate_score > best_score:
best_score = candidate_score
best_target = target
best_type_match = type_match
best_line_match = line_match
best_severity_match = severity_match
return best_target, max(best_score, 0.0), best_type_match, best_line_match, best_severity_match
def evaluate_finding(
    *,
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
    confidence: float,
    matched_already: Iterable[str],
) -> FindingEvaluation:
    """Grade a single submitted finding against the task's ground truth.

    The finding is compared with its best-matching vulnerability (see
    ``_best_candidate``). Its component score is
    ``0.8 * structure score + 0.2 * confidence calibration``, clamped to
    [0, 1], where calibration is ``1 - |confidence - TARGET_CONFIDENCE|``
    for the matched vulnerability's severity, floored at 0.

    A finding is *confirmed* when file, type and severity match exactly and
    the line score is at least 0.6. A confirmed finding whose target id is
    already in ``matched_already`` is reported as a duplicate with a quarter
    of the component score. Unconfirmed findings get a hint naming the first
    attribute that failed to match.

    Raises:
        KeyError: if the matched vulnerability's severity is missing from
            ``TARGET_CONFIDENCE``.
    """
    target, structure_score, type_match, line_match, severity_match = _best_candidate(
        task, filename, vuln_type, severity, line_start, line_end
    )
    if target is None:
        # No candidate at all, e.g. the task lists no vulnerabilities.
        return FindingEvaluation(
            component_score=0.0,
            matched_vulnerability_id=None,
            is_confirmed_match=False,
            feedback="No plausible vulnerability match for this finding.",
            confidence_calibration=0.0,
        )

    expected_confidence = TARGET_CONFIDENCE[target.severity]
    calibration = max(0.0, 1.0 - abs(confidence - expected_confidence))
    blended = 0.8 * structure_score + 0.2 * calibration
    component_score = max(0.0, min(1.0, blended))

    confirmed = (
        target.filename == filename
        and type_match == 1.0
        and line_match >= 0.6
        and severity_match == 1.0
    )
    previously_matched = target.id in set(matched_already)

    if not confirmed:
        # Ordered from most to least fundamental mismatch; the first failing
        # rule supplies the hint.
        hint_rules = (
            (target.filename != filename, "Wrong file."),
            (type_match == 0.0, "Correct file, vulnerability type mismatch."),
            (line_match < 0.6, "Correct file/type, but location is off."),
            (severity_match == 0.0, "Severity mismatch."),
        )
        hint = next(
            (message for failed, message in hint_rules if failed),
            "Partial match, refine details.",
        )
        return FindingEvaluation(
            component_score=component_score,
            matched_vulnerability_id=None,
            is_confirmed_match=False,
            feedback=hint,
            confidence_calibration=calibration,
        )

    if previously_matched:
        return FindingEvaluation(
            component_score=0.25 * component_score,
            matched_vulnerability_id=target.id,
            is_confirmed_match=False,
            feedback="Duplicate of a previously confirmed vulnerability.",
            confidence_calibration=calibration,
        )

    return FindingEvaluation(
        component_score=component_score,
        matched_vulnerability_id=target.id,
        is_confirmed_match=True,
        feedback="Confirmed vulnerability: file/type/line/severity align with ground truth.",
        confidence_calibration=calibration,
    )
def final_grade(
    *,
    task: TaskSpec,
    confirmed_vulnerability_ids: Iterable[str],
    findings_count: int,
    false_positive_count: int,
    duplicate_count: int,
    avg_component_score: float,
    avg_confidence_calibration: float,
) -> float:
    """Aggregate episode statistics into a task score inside (0, 1).

    The base score is::

        0.55 * weighted_recall + 0.20 * precision
        + 0.15 * avg_component_score + 0.10 * avg_confidence_calibration

    * ``weighted_recall`` is the ``SEVERITY_WEIGHTS`` mass of the task's
      vulnerabilities whose id was confirmed, divided by the total mass
      (0.0 when the total is not positive).
    * ``precision`` is the number of confirmed ids that belong to the task
      divided by ``findings_count``, capped at 1.0 (0.0 when there are no
      findings).

    Penalties are then subtracted: 0.08 per false positive (at most 0.5),
    0.05 per duplicate (at most 0.2), and 0.03 per finding beyond
    ``len(task.vulnerabilities) + 1`` (at most 0.2).

    Returns:
        The score clamped to ``[MIN_STRICT_SCORE, MAX_STRICT_SCORE]``. A NaN
        score (e.g. from a NaN average) yields ``MIN_STRICT_SCORE``.

    Raises:
        KeyError: if a vulnerability's severity is missing from
            ``SEVERITY_WEIGHTS``.
    """
    confirmed_ids = set(confirmed_vulnerability_ids)

    total_weight = sum(SEVERITY_WEIGHTS[v.severity] for v in task.vulnerabilities)
    covered_weight = sum(
        SEVERITY_WEIGHTS[v.severity] for v in task.vulnerabilities if v.id in confirmed_ids
    )
    weighted_recall = (covered_weight / total_weight) if total_weight > 0 else 0.0

    # Count only ids that really belong to this task (recall already ignores
    # unknown ids) and cap at 1.0 so inconsistent caller counts cannot push
    # precision above a perfect score.
    known_ids = {v.id for v in task.vulnerabilities}
    true_positives = len(confirmed_ids & known_ids)
    precision = min(1.0, true_positives / findings_count) if findings_count > 0 else 0.0

    fp_penalty = min(0.5, 0.08 * false_positive_count)
    dup_penalty = min(0.2, 0.05 * duplicate_count)

    # One finding beyond the number of real vulnerabilities is tolerated
    # before the volume penalty starts.
    volume_penalty = 0.0
    optimal_findings = len(task.vulnerabilities) + 1
    if findings_count > optimal_findings:
        volume_penalty = min(0.2, 0.03 * (findings_count - optimal_findings))

    score = (
        0.55 * weighted_recall
        + 0.20 * precision
        + 0.15 * avg_component_score
        + 0.10 * avg_confidence_calibration
    )
    score -= fp_penalty + dup_penalty + volume_penalty

    # Every comparison with NaN is false, so max(MIN, min(MAX, nan)) would
    # return MAX_STRICT_SCORE. Testing "not greater than MIN" first makes a
    # NaN score fail closed to the minimum instead.
    if not score > MIN_STRICT_SCORE:
        return MIN_STRICT_SCORE
    return min(score, MAX_STRICT_SCORE)