Spaces:

Drac0528
/

CodeSecure

Sleeping

File size: 5,669 Bytes

from __future__ import annotations

from dataclasses import dataclass
from typing import Iterable, Optional

from .tasks import SEVERITY_WEIGHTS, TARGET_CONFIDENCE, TaskSpec, VulnerabilitySpec

# Lower and upper bounds used by final_grade to clamp the value it returns,
# so a final grade is never exactly 0.0 or 1.0.
MIN_STRICT_SCORE = 0.001
MAX_STRICT_SCORE = 0.999


@dataclass(frozen=True)
class FindingEvaluation:
    """Immutable result of scoring a single submitted finding (see evaluate_finding)."""
    # Score for this finding. evaluate_finding clamps it to [0.0, 1.0] and
    # scales it by 0.25 when the finding duplicates an earlier confirmed match.
    component_score: float
    # Id of the ground-truth vulnerability for confirmed or duplicate findings;
    # None when the finding was not confirmed.
    matched_vulnerability_id: Optional[str]
    # True only for a confirmed match that is not a duplicate.
    is_confirmed_match: bool
    # Short human-readable explanation of the verdict.
    feedback: str
    # Closeness of the submitted confidence to the target confidence for the
    # matched severity; 0.0 when no candidate vulnerability exists.
    confidence_calibration: float


def _line_overlap_score(submitted_start: int, submitted_end: int, target_line: int) -> float:
    if submitted_start <= target_line <= submitted_end:
        return 1.0
    min_distance = min(abs(target_line - submitted_start), abs(target_line - submitted_end))
    if min_distance <= 2:
        return 0.6
    if min_distance <= 5:
        return 0.3
    return 0.0


def _best_candidate(
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
) -> tuple[Optional[VulnerabilitySpec], float, float, float, float]:
    """Pick the ground-truth vulnerability that best matches a submission.

    Every entry of ``task.vulnerabilities`` is scored with a weighted sum of
    four signals: same filename (0.35), same vulnerability type (0.30),
    line proximity from ``_line_overlap_score`` (0.20) and same severity
    (0.15). The highest-scoring entry wins; on a tie the earliest entry is
    kept.

    Returns:
        A 5-tuple ``(target, score, type_match, line_match, severity_match)``
        describing the winner. When the task has no vulnerabilities the
        tuple is ``(None, 0.0, 0.0, 0.0, 0.0)``.
    """
    chosen = None
    top_score = -1.0
    # (type_match, line_match, severity_match) recorded for the current leader.
    top_parts = (0.0, 0.0, 0.0)

    for spec in task.vulnerabilities:
        same_file = float(spec.filename == filename)
        same_type = float(spec.vuln_type == vuln_type)
        same_severity = float(spec.severity == severity)
        proximity = _line_overlap_score(line_start, line_end, spec.line)

        weighted = (
            0.35 * same_file
            + 0.30 * same_type
            + 0.20 * proximity
            + 0.15 * same_severity
        )

        # Strict comparison: an equal score never displaces the earlier leader.
        if weighted > top_score:
            chosen, top_score = spec, weighted
            top_parts = (same_type, proximity, same_severity)

    type_part, line_part, severity_part = top_parts
    # top_score is still -1.0 when nothing was scored; report that as 0.0.
    return chosen, max(top_score, 0.0), type_part, line_part, severity_part


def evaluate_finding(
    *,
    task: TaskSpec,
    filename: str,
    vuln_type: str,
    severity: str,
    line_start: int,
    line_end: int,
    confidence: float,
    matched_already: Iterable[str],
) -> FindingEvaluation:
    """Score one submitted finding against the task's ground truth.

    The finding is compared with its best candidate from ``_best_candidate``.
    Its component score blends the structural match (80%) with how close
    ``confidence`` is to the target confidence for the candidate's severity
    (20%), clamped to ``[0.0, 1.0]``.

    A finding is confirmed when the filename, vulnerability type and severity
    match exactly and the line score is at least 0.6. A confirmed finding
    whose target id is already in ``matched_already`` is reported as a
    duplicate: it keeps the target id, is not marked confirmed, and earns a
    quarter of the component score. Unconfirmed findings keep their component
    score and receive a hint naming the first attribute that failed.

    Args:
        task: Task whose ``vulnerabilities`` are the ground truth.
        filename: File named by the finding.
        vuln_type: Vulnerability type named by the finding.
        severity: Severity named by the finding.
        line_start: First line of the reported range.
        line_end: Last line of the reported range.
        confidence: Confidence reported with the finding.
        matched_already: Ids of vulnerabilities confirmed earlier.

    Returns:
        A ``FindingEvaluation`` describing the verdict.
    """
    target, structure_score, type_match, line_match, severity_match = _best_candidate(
        task, filename, vuln_type, severity, line_start, line_end
    )

    # No candidate at all (the task lists no vulnerabilities).
    if target is None:
        return FindingEvaluation(
            component_score=0.0,
            matched_vulnerability_id=None,
            is_confirmed_match=False,
            feedback="No plausible vulnerability match for this finding.",
            confidence_calibration=0.0,
        )

    expected_confidence = TARGET_CONFIDENCE[target.severity]
    calibration = max(0.0, 1.0 - abs(confidence - expected_confidence))
    component_score = max(0.0, min(1.0, 0.8 * structure_score + 0.2 * calibration))

    same_file = target.filename == filename
    confirmed = (
        same_file
        and type_match == 1.0
        and line_match >= 0.6
        and severity_match == 1.0
    )
    is_repeat = target.id in set(matched_already)

    if confirmed:
        if is_repeat:
            return FindingEvaluation(
                component_score=0.25 * component_score,
                matched_vulnerability_id=target.id,
                is_confirmed_match=False,
                feedback="Duplicate of a previously confirmed vulnerability.",
                confidence_calibration=calibration,
            )
        return FindingEvaluation(
            component_score=component_score,
            matched_vulnerability_id=target.id,
            is_confirmed_match=True,
            feedback="Confirmed vulnerability: file/type/line/severity align with ground truth.",
            confidence_calibration=calibration,
        )

    # Ordered checks: the first failing attribute determines the hint.
    diagnoses = (
        (not same_file, "Wrong file."),
        (type_match == 0.0, "Correct file, vulnerability type mismatch."),
        (line_match < 0.6, "Correct file/type, but location is off."),
        (severity_match == 0.0, "Severity mismatch."),
    )
    hint = next(
        (message for failed, message in diagnoses if failed),
        "Partial match, refine details.",
    )

    return FindingEvaluation(
        component_score=component_score,
        matched_vulnerability_id=None,
        is_confirmed_match=False,
        feedback=hint,
        confidence_calibration=calibration,
    )


def final_grade(
    *,
    task: TaskSpec,
    confirmed_vulnerability_ids: Iterable[str],
    findings_count: int,
    false_positive_count: int,
    duplicate_count: int,
    avg_component_score: float,
    avg_confidence_calibration: float,
) -> float:
    """Combine episode statistics into one final grade.

    The base score is a weighted blend of severity-weighted recall (0.55),
    precision (0.20), the average per-finding component score (0.15) and the
    average confidence calibration (0.10). Penalties are then subtracted:
    0.08 per false positive (capped at 0.5), 0.05 per duplicate (capped at
    0.2), and 0.03 per finding beyond ``len(task.vulnerabilities) + 1``
    (capped at 0.2).

    Args:
        task: Task whose ``vulnerabilities`` define the ground truth.
        confirmed_vulnerability_ids: Ids of vulnerabilities that were confirmed.
        findings_count: Total number of findings submitted.
        false_positive_count: Number of findings counted as false positives.
        duplicate_count: Number of findings counted as duplicates.
        avg_component_score: Mean component score over the findings.
        avg_confidence_calibration: Mean confidence calibration over the findings.

    Returns:
        The grade clamped into ``[MIN_STRICT_SCORE, MAX_STRICT_SCORE]``. A
        score that is not a number yields ``MIN_STRICT_SCORE``.
    """
    confirmed_ids = set(confirmed_vulnerability_ids)

    total_weight = sum(SEVERITY_WEIGHTS[v.severity] for v in task.vulnerabilities)
    covered_weight = sum(
        SEVERITY_WEIGHTS[v.severity] for v in task.vulnerabilities if v.id in confirmed_ids
    )
    weighted_recall = (covered_weight / total_weight) if total_weight > 0 else 0.0

    # Precision is a ratio in [0, 1]; cap it so inconsistent caller counts
    # (more confirmed ids than findings) cannot inflate the grade.
    precision = min(1.0, len(confirmed_ids) / findings_count) if findings_count > 0 else 0.0

    fp_penalty = min(0.5, 0.08 * false_positive_count)
    dup_penalty = min(0.2, 0.05 * duplicate_count)
    volume_penalty = 0.0
    # One finding more than the number of real vulnerabilities is tolerated.
    optimal_findings = len(task.vulnerabilities) + 1
    if findings_count > optimal_findings:
        volume_penalty = min(0.2, 0.03 * (findings_count - optimal_findings))

    score = (
        0.55 * weighted_recall
        + 0.20 * precision
        + 0.15 * avg_component_score
        + 0.10 * avg_confidence_calibration
    )
    score -= fp_penalty + dup_penalty + volume_penalty

    # NaN-safe clamp. max(MIN, min(MAX, score)) returns MAX for a NaN score
    # because every comparison with NaN is False; with explicit comparisons a
    # NaN falls through to the minimum instead of earning the top grade.
    if score >= MAX_STRICT_SCORE:
        return MAX_STRICT_SCORE
    if score > MIN_STRICT_SCORE:
        return score
    return MIN_STRICT_SCORE