File size: 1,466 Bytes
6e7ce30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from typing import List
from environment.models import Issue

def compute_f1(agent_issues: List[Issue], ground_truth: List[Issue]) -> float:
    """
    Score reported issues against reference issues using the F1 measure.

    Two issues count as the same finding only when both their ``line`` and
    ``category`` attributes are equal. Duplicates on either side collapse
    into a single finding because the comparison is done on sets.

    Args:
        agent_issues: Issues reported by the agent.
        ground_truth: Reference issues to compare against.

    Returns:
        The F1 score rounded to three decimals, or 0.0 when no reported
        issue matches a reference issue (which includes either list
        being empty).
    """
    expected = {(item.line, item.category) for item in ground_truth}
    reported = {(item.line, item.category) for item in agent_issues}

    hits = len(expected.intersection(reported))
    if hits == 0:
        # Without a single match, precision and recall are both zero.
        return 0.0

    # hits > 0 means both sets are non-empty, so the divisions are safe.
    # len(reported) == matches + spurious reports; len(expected) == matches + misses.
    precision = hits / len(reported)
    recall = hits / len(expected)

    harmonic_mean = 2 * (precision * recall) / (precision + recall)
    return round(harmonic_mean, 3)

def grade_easy(agent_issues: List[Issue]) -> float:
    """
    Grade ``agent_issues`` against the ground truth of the "easy" task.

    Returns the value of ``compute_f1`` for the agent's issues and the
    ``TASKS["easy"]["ground_truth"]`` entry.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import -- confirm).
    from environment.tasks import TASKS

    reference = TASKS["easy"]["ground_truth"]
    return compute_f1(agent_issues, reference)

def grade_medium(agent_issues: List[Issue]) -> float:
    """
    Grade ``agent_issues`` against the ground truth of the "medium" task.

    Returns the value of ``compute_f1`` for the agent's issues and the
    ``TASKS["medium"]["ground_truth"]`` entry.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import -- confirm).
    from environment.tasks import TASKS

    reference = TASKS["medium"]["ground_truth"]
    return compute_f1(agent_issues, reference)

def grade_hard(agent_issues: List[Issue]) -> float:
    """
    Grade ``agent_issues`` against the ground truth of the "hard" task.

    Returns the value of ``compute_f1`` for the agent's issues and the
    ``TASKS["hard"]["ground_truth"]`` entry.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import -- confirm).
    from environment.tasks import TASKS

    reference = TASKS["hard"]["ground_truth"]
    return compute_f1(agent_issues, reference)