| from typing import List |
| from codelens_env.models import Scenario, ActionRecord, Category, Severity, ActionType |
|
|
def grade_bug_detection(scenario: Scenario, history: List[ActionRecord]) -> float:
    """Score how well the flagged issues in *history* cover the scenario's bugs.

    Only ``ActionType.FLAG_ISSUE`` records are considered.  Each ground-truth
    issue of category ``Category.BUG`` is paired with the first not-yet-used
    flag that:

    * names the same file,
    * has a line number within 3 lines of the ground-truth line,
    * is itself categorised as ``Category.BUG``, and
    * mentions at least one ground-truth keyword in its body
      (case-insensitive substring test).

    A paired issue earns ``0.5 * keyword_score + 0.5 * severity_score`` where
    ``keyword_score`` is the fraction of keywords present in the flag body and
    ``severity_score`` drops by 0.3 per step of ``Severity.ordinal`` distance
    (floored at 0).

    The final score is
    ``0.4 * coverage + 0.6 * mean_issue_score - 0.1 * precision_penalty``,
    clamped to ``[0.0, 1.0]``; ``precision_penalty`` is the share of flags
    that were not paired with any issue.

    Args:
        scenario: Scenario whose ``ground_truth_issues`` are graded against.
        history: Recorded actions of the episode.

    Returns:
        A float in ``[0.0, 1.0]``.  ``0.0`` is returned immediately when the
        history is empty, contains no flag actions, or the scenario has no
        ground-truth issues.
    """
    if not history:
        return 0.0

    flags = [rec for rec in history if rec.action_type == ActionType.FLAG_ISSUE]
    if not flags:
        return 0.0

    # NOTE(review): the coverage denominator counts *all* ground-truth issues,
    # while only Category.BUG issues can ever be matched below -- confirm that
    # scenarios graded here are expected to contain bug issues only.
    issue_count = len(scenario.ground_truth_issues)
    if issue_count == 0:
        return 0.0

    consumed = set()  # positions in `flags` already paired with an issue
    per_issue_scores = []  # one entry per successfully paired issue

    for issue in scenario.ground_truth_issues:
        if issue.category != Category.BUG:
            continue

        for pos, flag in enumerate(flags):
            if pos in consumed:
                continue

            same_location = (
                flag.filename == issue.filename
                and flag.line_number is not None
                and abs(flag.line_number - issue.line_number) <= 3
            )
            if not (same_location and flag.category == Category.BUG):
                continue

            text = (flag.body or "").lower()
            hits = sum(1 for kw in issue.keywords if kw.lower() in text)
            if hits == 0:
                # Right place and category, but no keyword mentioned.
                continue

            # First acceptable flag wins; it cannot be reused for another issue.
            consumed.add(pos)

            severity_gap = abs(
                Severity.ordinal(issue.severity) - Severity.ordinal(flag.severity)
            )
            severity_score = max(0.0, 1.0 - severity_gap * 0.3)
            keyword_score = hits / len(issue.keywords)

            per_issue_scores.append(0.5 * keyword_score + 0.5 * severity_score)
            break

    # Every pairing consumes exactly one flag and one issue, so the number of
    # recorded scores equals both the matched-issue and the used-flag count.
    matched = len(per_issue_scores)

    coverage = matched / issue_count
    mean_issue_score = sum(per_issue_scores) / matched if matched else 0.0

    flag_count = len(flags)
    precision_penalty = (flag_count - matched) / max(1, flag_count)

    raw_score = 0.4 * coverage + 0.6 * mean_issue_score - 0.1 * precision_penalty
    return float(max(0.0, min(1.0, raw_score)))
|
|
|
|