"""Deterministic grader for trajectory scoring. Scoring weights (difficulty-aware): base score 5% (participation — guarantees score > 0) partial fixes 35% (proportional to fix ratio) complete bonus 25% (all issues fixed — scales with difficulty) efficiency 25% (decays with extra steps — slower decay for harder tasks) hint penalty -4% each (reduced to -3% for hard/expert) failed edit -2% each difficulty +5% bonus for hard/expert tasks when fully solved Score is clamped to [0.0, 1.0]. """ from typing import Any, Dict, List from server.models import GraderResult, TaskDifficulty from server.tasks.task_registry import TASK_REGISTRY # ── Base weights ────────────────────────────────────────────── BASE_SCORE = 0.05 PARTIAL_FIX_WEIGHT = 0.35 COMPLETE_BONUS = 0.25 EFFICIENCY_MAX = 0.25 EFFICIENCY_DECAY = 0.03 # per extra step beyond optimal HINT_PENALTY = 0.04 FAILED_ACTION_PENALTY = 0.02 # ── Difficulty modifiers ────────────────────────────────────── # Maps difficulty → (complete_bonus_extra, efficiency_decay_mult, hint_penalty_mult) # complete_bonus_extra: added to COMPLETE_BONUS when all issues fixed # efficiency_decay_mult: multiplier on decay (lower = more forgiving) # hint_penalty_mult: multiplier on hint cost (lower = cheaper hints) DIFFICULTY_MODIFIERS = { TaskDifficulty.EASY: (0.00, 1.0, 1.0), TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0), TaskDifficulty.HARD: (0.03, 0.7, 0.75), } SCORE_FLOOR = 0.01 SCORE_CEIL = 0.99 EDIT_ACTION_TYPES = frozenset({ "edit_file", "replace_line", "add_line", "delete_line", "add_block", "delete_block", }) def _clamp(value: float) -> float: """Clamp score to [0, 1].""" return max(SCORE_FLOOR, min(SCORE_CEIL, round(value, 4))) def _get_difficulty(task_id: str) -> TaskDifficulty: """Look up a task's difficulty from the registry.""" task_cls = TASK_REGISTRY.get(task_id) if task_cls is None: return TaskDifficulty.MEDIUM return task_cls.DIFFICULTY def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult: if task_id not in TASK_REGISTRY: raise ValueError(f"Unknown task: {task_id}") difficulty = _get_difficulty(task_id) bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get( difficulty, (0.00, 1.0, 1.0) ) if not trajectory: return GraderResult( task_id=task_id, score=_clamp(BASE_SCORE), breakdown={ "base": BASE_SCORE, "partial_fixes": 0.0, "complete_solution": 0.0, "efficiency": 0.0, "difficulty_bonus": 0.0, "hint_penalty": 0.0, "failed_action_penalty": 0.0, }, feedback="No actions taken.", steps_taken=0, hints_used=0, ) final_step = trajectory[-1] steps_taken = len(trajectory) hints_used = sum( 1 for s in trajectory if s.get("action", {}).get("action_type") == "request_hint" ) issues_fixed = int(final_step.get("info", {}).get("issues_fixed", 0)) issues_total = max(1, int(final_step.get("info", {}).get("issues_total", 1))) fix_ratio = issues_fixed / issues_total # ── Component 1: Partial fix credit (proportional) ──────── partial_score = PARTIAL_FIX_WEIGHT * fix_ratio # ── Component 2: Full-solution bonus ────────────────────── complete_bonus = COMPLETE_BONUS if issues_fixed == issues_total else 0.0 # ── Component 3: Difficulty bonus ───────────────────────── # Extra reward for fully solving harder tasks diff_bonus = bonus_extra if issues_fixed == issues_total else 0.0 # ── Component 4: Efficiency bonus ───────────────────────── # Harder tasks get slower decay (more forgiving on step count) if issues_fixed == 0: efficiency_score = 0.0 elif steps_taken <= issues_total: efficiency_score = EFFICIENCY_MAX else: extra = steps_taken - issues_total effective_decay = EFFICIENCY_DECAY * decay_mult efficiency_score = max(0.0, EFFICIENCY_MAX - effective_decay * extra) # ── Component 5: Hint penalty ───────────────────────────── # Harder tasks get reduced hint penalty (hints are more reasonable) hint_pen = HINT_PENALTY * hint_mult * hints_used # ── Component 6: Failed action penalty ──────────────────── failed_edits = 0 for step in trajectory: action = step.get("action", {}) if action.get("action_type") in EDIT_ACTION_TYPES: edits = action.get("edits") or [] if not any(e.get("file_path") for e in edits): failed_edits += 1 failed_pen = FAILED_ACTION_PENALTY * failed_edits raw = ( BASE_SCORE + partial_score + complete_bonus + diff_bonus + efficiency_score - hint_pen - failed_pen ) score = _clamp(raw) # ── Feedback ────────────────────────────────────────────── if score >= 0.85: feedback = "Excellent — all issues fixed efficiently." elif score >= 0.65: feedback = "Good job — most issues fixed." elif score >= 0.45: feedback = "Partial success — some issues remain." elif score >= 0.25: feedback = "Limited progress — review the error messages carefully." else: feedback = "Needs improvement — try analyzing the error phase first." return GraderResult( task_id=task_id, score=score, breakdown={ "base": BASE_SCORE, "partial_fixes": round(partial_score, 4), "complete_solution": round(complete_bonus, 4), "difficulty_bonus": round(diff_bonus, 4), "efficiency": round(efficiency_score, 4), "hint_penalty": round(-hint_pen, 4), "failed_action_penalty": round(-failed_pen, 4), }, feedback=feedback, steps_taken=steps_taken, hints_used=hints_used, )