"""Bug-fix task grader."""

from __future__ import annotations

try:
    from ..models import TaskGrade
    from ..tasks.catalog import ReviewTask
except ImportError:
    from models import TaskGrade
    from tasks.catalog import ReviewTask
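
# Shared grading helpers: compilation checks, sandboxed test execution, score
# composition, and code-quality / similarity heuristics.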
from .shared import (
    base_grade,
    compile_code,
    composite_grade_score,
    component_score,
    execute_cases,
    quality_metrics,
    similarity_score,
    summarize_results,
)


def grade_bug_fix_task(
    task: ReviewTask,
    code: str,
    *,
    include_hidden: bool,
    timeout_s: float = 2.0,
) -> TaskGrade:
    """Grade a bug-fix task against public or full test suites."""
    compiled, compile_error = compile_code(code)
    quality = quality_metrics(code, task.function_name)
    similarity = similarity_score(code, task.reference_code)

    details = {
        "compile_error": compile_error,
        "quality_notes": quality["quality_notes"],
        "style_score": quality["style_score"],
        "visibility": "full" if include_hidden else "public",
    }
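
    # Non-compiling code short-circuits: no tests run and only a small baseline
    # score is awarded.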
    if not compiled:
        details["test_results"] = []
        details["test_summary"] = "Code does not compile."
        return base_grade(
            score=composite_grade_score(
                correctness=0.0,
                quality=0.05,
                runtime=0.05,
                syntax=0.0,
                similarity=similarity,
                baseline=0.04,
                penalty=0.05,
            ),
            syntax_score=component_score(0.01),
            tests_passed=0,
            tests_total=len(task.public_cases) + (len(task.hidden_cases) if include_hidden else 0),
            quality_score=component_score(0.01),
            runtime_score=component_score(0.01),
            timed_out=False,
            details=details,
        )
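
    # Public cases always run; hidden cases are included only when grading with
    # full visibility.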
    cases = task.public_cases + (task.hidden_cases if include_hidden else [])
    result = execute_cases(code, task.function_name, cases, timeout_s=timeout_s)
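
    # Timeout: syntax still earns credit, but runtime credit is withheld and a
    # heavier penalty applies than for an ordinary harness error.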
if result.get("timed_out"):
details["test_results"] = []
details["test_summary"] = result["error"]
return base_grade(
score=composite_grade_score(
correctness=0.10,
quality=quality["score"],
runtime=0.0,
syntax=0.95,
similarity=similarity,
baseline=0.06,
penalty=0.12,
),
syntax_score=component_score(0.95),
tests_passed=0,
tests_total=len(cases),
quality_score=quality["score"],
runtime_score=component_score(0.01),
timed_out=True,
details=details,
)
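
    # A non-timeout error from the execution harness: no cases count as passed,
    # with a milder penalty than a timeout.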
if "error" in result:
details["test_results"] = []
details["test_summary"] = result["error"]
return base_grade(
score=composite_grade_score(
correctness=0.12,
quality=quality["score"],
runtime=0.0,
syntax=0.95,
similarity=similarity,
baseline=0.06,
penalty=0.08,
),
syntax_score=component_score(0.95),
tests_passed=0,
tests_total=len(cases),
quality_score=quality["score"],
runtime_score=component_score(0.01),
timed_out=False,
details=details,
)
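
    # Normal path: correctness is the fraction of passing cases, combined with
    # quality, syntax, and similarity signals.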
data = result["data"]
pass_rate = data["passed"] / max(data["total"], 1)
details["test_results"] = data["results"]
details["test_summary"] = summarize_results("Test results", data["results"])
return base_grade(
score=composite_grade_score(
correctness=pass_rate,
quality=quality["score"],
runtime=0.05,
syntax=0.95,
similarity=similarity,
baseline=0.08,
),
syntax_score=component_score(0.95),
tests_passed=data["passed"],
tests_total=data["total"],
quality_score=quality["score"],
runtime_score=component_score(0.01),
timed_out=False,
details=details,
)
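

# Illustrative usage sketch (an assumption, not part of the module): the
# ReviewTask keyword arguments below are inferred from the attributes this
# grader reads and may not match the real constructor or test-case format.
#
#     task = ReviewTask(
#         function_name="add",
#         reference_code="def add(a, b):\n    return a + b\n",
#         public_cases=[...],
#         hidden_cases=[...],
#     )
#     grade = grade_bug_fix_task(task, submitted_code, include_hidden=False)
#     print(grade)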