from typing import List

from codelens_env.models import Scenario, ActionRecord, Category, Severity, ActionType


def _flag_matches_truth(flag: ActionRecord, truth) -> bool:
    """Return True when *flag* points at *truth* closely enough to be paired.

    A flag pairs with a ground-truth issue when it names the same file, has a
    line number within three lines of the issue, is categorised as a bug, and
    its body mentions at least one of the issue's keywords (case-insensitive
    substring test).
    """
    if flag.filename != truth.filename:
        return False
    if flag.line_number is None:
        return False
    if abs(flag.line_number - truth.line_number) > 3:
        return False
    if flag.category != Category.BUG:
        return False

    text = (flag.body or "").lower()
    return any(keyword.lower() in text for keyword in truth.keywords)


def grade_bug_detection(scenario: Scenario, history: List[ActionRecord]) -> float:
    """Grade the FLAG_ISSUE actions in *history* against the scenario's bugs.

    Each ground-truth issue of category BUG is paired, in order, with the first
    not-yet-used flag that satisfies the match criteria (same file, line within
    +-3, category BUG, at least one keyword in the body). Every pairing gets a
    quality score that is the mean of:

    * keyword score  -- fraction of the issue's keywords found in the flag body
    * severity score -- 1.0 minus 0.3 per step of difference between the
      ``Severity.ordinal`` values of the issue and the flag, floored at 0.0

    The final grade is
    ``0.4 * coverage + 0.6 * mean_quality - 0.1 * unmatched_flag_ratio``
    clamped to ``[0.0, 1.0]``.

    Args:
        scenario: Scenario whose ``ground_truth_issues`` are graded against.
        history: Recorded actions; only FLAG_ISSUE actions are considered.

    Returns:
        A score in ``[0.0, 1.0]``. ``0.0`` is returned immediately when the
        history is empty, contains no flags, or the scenario lists no issues.
    """
    if not history:
        return 0.0

    flags = [record for record in history if record.action_type == ActionType.FLAG_ISSUE]
    if not flags:
        return 0.0

    # NOTE: the coverage denominator counts every ground-truth issue, not only
    # those of category BUG, even though only BUG issues can be matched below.
    issue_total = len(scenario.ground_truth_issues)
    if issue_total == 0:
        return 0.0

    claimed = set()  # positions in `flags` already paired with an issue
    quality_scores = []  # one entry per successfully paired issue

    for truth in scenario.ground_truth_issues:
        if truth.category != Category.BUG:
            continue

        # First-fit pairing: take the earliest unused flag that qualifies.
        position = next(
            (
                idx
                for idx, candidate in enumerate(flags)
                if idx not in claimed and _flag_matches_truth(candidate, truth)
            ),
            None,
        )
        if position is None:
            continue

        claimed.add(position)
        flag = flags[position]

        severity_gap = abs(
            Severity.ordinal(truth.severity) - Severity.ordinal(flag.severity)
        )
        severity_score = max(0.0, 1.0 - severity_gap * 0.3)

        text = (flag.body or "").lower()
        keyword_hits = sum(1 for keyword in truth.keywords if keyword.lower() in text)
        # A pairing requires at least one keyword hit, so keywords is non-empty.
        keyword_score = keyword_hits / len(truth.keywords)

        quality_scores.append(0.5 * keyword_score + 0.5 * severity_score)

    # Each paired issue contributes exactly one quality score, so the number of
    # scores is also the number of distinct matched issues.
    coverage = len(quality_scores) / issue_total

    if quality_scores:
        mean_quality = sum(quality_scores) / len(quality_scores)
    else:
        mean_quality = 0.0

    # `flags` is non-empty here, so the ratio is always well defined.
    flag_total = len(flags)
    stray_flags = flag_total - len(claimed)
    precision_penalty = stray_flags / max(1, flag_total)

    raw_score = 0.4 * coverage + 0.6 * mean_quality - 0.1 * precision_penalty
    return float(max(0.0, min(1.0, raw_score)))