"""Deterministic grader for trajectory scoring.
Scoring weights (difficulty-aware):
    base score       5%  (participation floor — guarantees score > 0)
    partial fixes   35%  (proportional to fix ratio)
    complete bonus  25%  (all issues fixed)
    efficiency      25%  (decays with extra steps — slower decay for harder tasks)
    hint penalty    -4% each (reduced to -3% for hard tasks)
    failed edit     -2% each
    difficulty      +3% bonus for hard tasks when fully solved
The final score is rounded to 4 places and clamped to [0.01, 0.99].
"""
from typing import Any, Dict, List
from server.models import GraderResult, TaskDifficulty
from server.tasks.task_registry import TASK_REGISTRY
# ── Base weights ──────────────────────────────────────────────
BASE_SCORE: float = 0.05            # participation floor; awarded unconditionally
PARTIAL_FIX_WEIGHT: float = 0.35    # scaled by issues_fixed / issues_total
COMPLETE_BONUS: float = 0.25        # flat bonus when every issue is fixed
EFFICIENCY_MAX: float = 0.25        # maximum efficiency bonus
EFFICIENCY_DECAY: float = 0.03      # per extra step beyond optimal
HINT_PENALTY: float = 0.04          # per hint requested (before difficulty scaling)
FAILED_ACTION_PENALTY: float = 0.02 # per edit action that carried no usable edit
# ── Difficulty modifiers ──────────────────────────────────────
# Maps difficulty → (complete_bonus_extra, efficiency_decay_mult, hint_penalty_mult)
#   complete_bonus_extra:  added to COMPLETE_BONUS when all issues fixed
#   efficiency_decay_mult: multiplier on decay (lower = more forgiving)
#   hint_penalty_mult:     multiplier on hint cost (lower = cheaper hints)
# NOTE(review): only EASY/MEDIUM/HARD have entries; any other difficulty
# (e.g. an EXPERT member, if TaskDifficulty defines one) falls back to the
# default (0.00, 1.0, 1.0) via DIFFICULTY_MODIFIERS.get in run_grader —
# confirm whether that fallback is intended.
DIFFICULTY_MODIFIERS = {
    TaskDifficulty.EASY: (0.00, 1.0, 1.0),
    TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0),
    TaskDifficulty.HARD: (0.03, 0.7, 0.75),
}
SCORE_FLOOR = 0.01
SCORE_CEIL = 0.99
EDIT_ACTION_TYPES = frozenset({
"edit_file", "replace_line", "add_line",
"delete_line", "add_block", "delete_block",
})
def _clamp(value: float) -> float:
"""Clamp score to [0, 1]."""
return max(SCORE_FLOOR, min(SCORE_CEIL, round(value, 4)))
def _get_difficulty(task_id: str) -> TaskDifficulty:
    """Return the difficulty registered for *task_id*, defaulting to MEDIUM.

    Unknown task ids (no registry entry) are treated as MEDIUM rather
    than raising, so grading can proceed with neutral modifiers.
    """
    registered = TASK_REGISTRY.get(task_id)
    return TaskDifficulty.MEDIUM if registered is None else registered.DIFFICULTY
def _efficiency(issues_fixed: int, issues_total: int,
                steps_taken: int, decay_mult: float) -> float:
    """Efficiency bonus: full credit at <= issues_total steps, linear decay after.

    Returns 0.0 when nothing was fixed (no efficiency credit for flailing).
    Harder tasks pass decay_mult < 1, making the decay more forgiving.
    """
    if issues_fixed == 0:
        return 0.0
    if steps_taken <= issues_total:
        return EFFICIENCY_MAX
    extra = steps_taken - issues_total
    return max(0.0, EFFICIENCY_MAX - EFFICIENCY_DECAY * decay_mult * extra)


def _count_failed_edits(trajectory: List[Dict[str, Any]]) -> int:
    """Count edit-type actions whose edits carry no file_path (i.e. no-ops)."""
    failed = 0
    for step in trajectory:
        # `or {}` / `or []` guard against keys present with a None value.
        action = step.get("action") or {}
        if action.get("action_type") in EDIT_ACTION_TYPES:
            edits = action.get("edits") or []
            if not any(e.get("file_path") for e in edits):
                failed += 1
    return failed


def _feedback_for(score: float) -> str:
    """Map a final score to a one-line human-readable feedback string."""
    # NOTE(review): the "β" in these messages looks mis-encoded (likely an
    # em dash) — confirm intended text before changing; reproduced verbatim
    # here because these strings are runtime output.
    if score >= 0.85:
        return "Excellent β all issues fixed efficiently."
    if score >= 0.65:
        return "Good job β most issues fixed."
    if score >= 0.45:
        return "Partial success β some issues remain."
    if score >= 0.25:
        return "Limited progress β review the error messages carefully."
    return "Needs improvement β try analyzing the error phase first."


def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
    """Score *trajectory* for *task_id* and return a GraderResult.

    The score combines: a base participation credit, proportional credit
    for fixed issues, a complete-solution bonus, a difficulty bonus,
    an efficiency bonus, minus hint and failed-edit penalties. The total
    is rounded and clamped by _clamp (to [0.01, 0.99]).

    Progress is read from the LAST step's ``info`` dict
    (``issues_fixed`` / ``issues_total``).

    Raises:
        ValueError: if *task_id* is not present in TASK_REGISTRY.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    difficulty = _get_difficulty(task_id)
    bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get(
        difficulty, (0.00, 1.0, 1.0)
    )

    if not trajectory:
        # Empty trajectory: only the participation base applies.
        return GraderResult(
            task_id=task_id,
            score=_clamp(BASE_SCORE),
            breakdown={
                "base": BASE_SCORE,
                "partial_fixes": 0.0,
                "complete_solution": 0.0,
                "efficiency": 0.0,
                "difficulty_bonus": 0.0,
                "hint_penalty": 0.0,
                "failed_action_penalty": 0.0,
            },
            feedback="No actions taken.",
            steps_taken=0,
            hints_used=0,
        )

    final_step = trajectory[-1]
    steps_taken = len(trajectory)
    hints_used = sum(
        1 for s in trajectory
        # `or {}` guards against an "action" key holding None.
        if (s.get("action") or {}).get("action_type") == "request_hint"
    )

    info = final_step.get("info") or {}
    issues_fixed = int(info.get("issues_fixed", 0))
    issues_total = max(1, int(info.get("issues_total", 1)))  # avoid div-by-zero
    fix_ratio = issues_fixed / issues_total
    solved = issues_fixed == issues_total

    # Component 1: partial fix credit (proportional to fix ratio).
    partial_score = PARTIAL_FIX_WEIGHT * fix_ratio
    # Component 2: flat bonus for a complete solution.
    complete_bonus = COMPLETE_BONUS if solved else 0.0
    # Component 3: extra reward for fully solving harder tasks.
    diff_bonus = bonus_extra if solved else 0.0
    # Component 4: efficiency bonus (harder tasks decay more slowly).
    efficiency_score = _efficiency(issues_fixed, issues_total,
                                   steps_taken, decay_mult)
    # Component 5: hint penalty (cheaper on harder tasks via hint_mult).
    hint_pen = HINT_PENALTY * hint_mult * hints_used
    # Component 6: penalty for edit actions that carried no usable edit.
    failed_pen = FAILED_ACTION_PENALTY * _count_failed_edits(trajectory)

    raw = (
        BASE_SCORE
        + partial_score
        + complete_bonus
        + diff_bonus
        + efficiency_score
        - hint_pen
        - failed_pen
    )
    score = _clamp(raw)

    return GraderResult(
        task_id=task_id,
        score=score,
        breakdown={
            "base": BASE_SCORE,
            "partial_fixes": round(partial_score, 4),
            "complete_solution": round(complete_bonus, 4),
            "difficulty_bonus": round(diff_bonus, 4),
            "efficiency": round(efficiency_score, 4),
            "hint_penalty": round(-hint_pen, 4),
            "failed_action_penalty": round(-failed_pen, 4),
        },
        feedback=_feedback_for(score),
        steps_taken=steps_taken,
        hints_used=hints_used,
    )