File size: 4,122 Bytes
3d44779
 
 
 
 
989722c
3d44779
 
989722c
3d44779
 
989722c
 
 
 
 
 
 
 
 
 
3d44779
 
989722c
3d44779
 
 
 
 
 
989722c
 
 
 
 
 
 
 
 
 
3d44779
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
 
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d44779
 
 
 
989722c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4451363
3d44779
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""Bug-fix task grader."""

from __future__ import annotations

try:
    from ..models import TaskGrade
    from ..tasks.catalog import ReviewTask
except ImportError:
    from models import TaskGrade
    from tasks.catalog import ReviewTask

from .shared import (
    base_grade,
    compile_code,
    composite_grade_score,
    component_score,
    execute_cases,
    quality_metrics,
    similarity_score,
    summarize_results,
)


def grade_bug_fix_task(
    task: ReviewTask,
    code: str,
    *,
    include_hidden: bool,
    timeout_s: float = 2.0,
) -> TaskGrade:
    """Score a submitted bug fix by compiling it and running the task's cases.

    Args:
        task: Task definition; its ``function_name``, ``reference_code``,
            ``public_cases`` and ``hidden_cases`` attributes are read here.
        code: Submitted source text to grade.
        include_hidden: When true, the hidden cases are appended to the
            public ones and ``details["visibility"]`` is ``"full"``;
            otherwise only the public cases are used and it is ``"public"``.
        timeout_s: Forwarded unchanged to ``execute_cases``.

    Returns:
        The object built by ``base_grade`` for whichever outcome applied:
        a compile failure, an execution timeout or error, or a completed
        test run.
    """
    compiled, compile_error = compile_code(code)
    quality = quality_metrics(code, task.function_name)
    similarity = similarity_score(code, task.reference_code)

    if include_hidden:
        visibility = "full"
    else:
        visibility = "public"

    # Key insertion order is kept stable; the test fields are appended later.
    report = {
        "compile_error": compile_error,
        "quality_notes": quality["quality_notes"],
        "style_score": quality["style_score"],
        "visibility": visibility,
    }

    # Outcome 1: the submission does not compile, so nothing is executed.
    if not compiled:
        report["test_results"] = []
        report["test_summary"] = "Code does not compile."
        return base_grade(
            score=composite_grade_score(
                correctness=0.0,
                quality=0.05,
                runtime=0.05,
                syntax=0.0,
                similarity=similarity,
                baseline=0.04,
                penalty=0.05,
            ),
            syntax_score=component_score(0.01),
            tests_passed=0,
            # Count the cases that would have run without building the list.
            tests_total=len(task.public_cases)
            + (len(task.hidden_cases) if include_hidden else 0),
            quality_score=component_score(0.01),
            runtime_score=component_score(0.01),
            timed_out=False,
            details=report,
        )

    cases = task.public_cases + (task.hidden_cases if include_hidden else [])
    outcome = execute_cases(code, task.function_name, cases, timeout_s=timeout_s)

    # Outcome 2: execution timed out or reported an error. Both paths share
    # the same grade shape and differ only in correctness credit, penalty
    # and the timed_out flag; the timeout check takes precedence.
    hit_timeout = bool(outcome.get("timed_out"))
    if hit_timeout or "error" in outcome:
        if hit_timeout:
            correctness_credit, penalty = 0.10, 0.12
        else:
            correctness_credit, penalty = 0.12, 0.08
        report["test_results"] = []
        report["test_summary"] = outcome["error"]
        return base_grade(
            score=composite_grade_score(
                correctness=correctness_credit,
                quality=quality["score"],
                runtime=0.0,
                syntax=0.95,
                similarity=similarity,
                baseline=0.06,
                penalty=penalty,
            ),
            syntax_score=component_score(0.95),
            tests_passed=0,
            tests_total=len(cases),
            quality_score=quality["score"],
            runtime_score=component_score(0.01),
            timed_out=hit_timeout,
            details=report,
        )

    # Outcome 3: the cases ran to completion; correctness is the pass rate.
    data = outcome["data"]
    denominator = max(data["total"], 1)  # guards against an empty case list
    pass_rate = data["passed"] / denominator
    report["test_results"] = data["results"]
    report["test_summary"] = summarize_results("Test results", data["results"])
    return base_grade(
        score=composite_grade_score(
            correctness=pass_rate,
            quality=quality["score"],
            runtime=0.05,
            syntax=0.95,
            similarity=similarity,
            baseline=0.08,
        ),
        syntax_score=component_score(0.95),
        tests_passed=data["passed"],
        tests_total=data["total"],
        quality_score=quality["score"],
        runtime_score=component_score(0.01),
        timed_out=False,
        details=report,
    )