| """Deterministic grader for trajectory scoring. |
| |
Scoring weights (difficulty-aware):
  base score      5%        (participation — guarantees score > 0)
  partial fixes   35%       (proportional to fix ratio)
  complete bonus  25%       (all issues fixed — the difficulty bonus below is added on top)
  efficiency      25%       (decays with extra steps — slower decay for harder tasks)
  hint penalty    -4% each  (reduced to -3% for hard tasks)
  failed edit     -2% each
  difficulty      +3% bonus for hard tasks when fully solved

Difficulties without an entry in DIFFICULTY_MODIFIERS use the unmodified weights.

Score is clamped to [0.01, 0.99] (SCORE_FLOOR / SCORE_CEIL).
| """ |
|
|
| from typing import Any, Dict, List |
|
|
| from server.models import GraderResult, TaskDifficulty |
| from server.tasks.task_registry import TASK_REGISTRY |
|
|
| |
# Score components, each expressed as a fraction of the final score.
BASE_SCORE: float = 0.05  # awarded to every graded trajectory, even an empty one
PARTIAL_FIX_WEIGHT: float = 0.35  # multiplied by issues_fixed / issues_total
COMPLETE_BONUS: float = 0.25  # added only when every issue is fixed
EFFICIENCY_MAX: float = 0.25  # efficiency credit before any decay is applied
EFFICIENCY_DECAY: float = 0.03  # efficiency lost per step beyond issues_total
HINT_PENALTY: float = 0.04  # subtracted per "request_hint" action
FAILED_ACTION_PENALTY: float = 0.02  # subtracted per edit action with no file_path
|
|
| |
| |
| |
| |
| |
# Per-difficulty tuning. run_grader unpacks each value as
# (bonus_extra, decay_mult, hint_mult):
#   bonus_extra - extra score added when every issue is fixed
#   decay_mult  - multiplier applied to EFFICIENCY_DECAY
#   hint_mult   - multiplier applied to HINT_PENALTY
# A difficulty with no entry here falls back to (0.00, 1.0, 1.0) in run_grader.
DIFFICULTY_MODIFIERS: Dict[TaskDifficulty, tuple] = {
    TaskDifficulty.EASY: (0.00, 1.0, 1.0),
    TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0),
    TaskDifficulty.HARD: (0.03, 0.7, 0.75),
}
|
|
# Bounds applied by _clamp: a reported score never reaches exactly 0 or 1.
SCORE_FLOOR: float = 0.01
SCORE_CEIL: float = 0.99
|
|
# Action types that run_grader inspects when counting failed edits.
EDIT_ACTION_TYPES = frozenset(
    (
        "edit_file",
        "replace_line",
        "add_line",
        "delete_line",
        "add_block",
        "delete_block",
    )
)
|
|
|
|
def _clamp(value: float) -> float:
    """Round ``value`` to 4 decimals and bound it to [SCORE_FLOOR, SCORE_CEIL]."""
    rounded = round(value, 4)
    capped = min(SCORE_CEIL, rounded)
    return max(SCORE_FLOOR, capped)
|
|
|
|
def _get_difficulty(task_id: str) -> TaskDifficulty:
    """Return the ``DIFFICULTY`` of the task registered under ``task_id``.

    An id that is missing from ``TASK_REGISTRY`` is treated as
    ``TaskDifficulty.MEDIUM`` rather than raising.
    """
    registered = TASK_REGISTRY.get(task_id)
    return TaskDifficulty.MEDIUM if registered is None else registered.DIFFICULTY
|
|
|
|
def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
    """Score a recorded trajectory for ``task_id``.

    The raw score is the sum of the base award, partial-fix credit, the
    completion bonus, the difficulty bonus and the efficiency award, minus
    the hint and failed-edit penalties. It is then passed through ``_clamp``.

    Args:
        task_id: Key into ``TASK_REGISTRY``.
        trajectory: Ordered step records. Each step may carry an ``"action"``
            mapping (``"action_type"`` and, for edit actions, ``"edits"``).
            The last step's ``"info"`` mapping supplies ``"issues_fixed"``
            and ``"issues_total"``. Missing or ``None`` values are treated
            as empty / zero.

    Returns:
        A ``GraderResult`` holding the clamped score, the per-component
        breakdown (penalties reported as negative numbers), a feedback
        message, the number of steps and the number of hint requests.

    Raises:
        ValueError: If ``task_id`` is not present in ``TASK_REGISTRY``.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    difficulty = _get_difficulty(task_id)
    bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get(
        difficulty, (0.00, 1.0, 1.0)
    )

    if not trajectory:
        # Nothing was attempted: only the participation award applies.
        return GraderResult(
            task_id=task_id,
            score=_clamp(BASE_SCORE),
            breakdown={
                "base": BASE_SCORE,
                "partial_fixes": 0.0,
                "complete_solution": 0.0,
                "efficiency": 0.0,
                "difficulty_bonus": 0.0,
                "hint_penalty": 0.0,
                "failed_action_penalty": 0.0,
            },
            feedback="No actions taken.",
            steps_taken=0,
            hints_used=0,
        )

    # ``dict.get(key, {})`` only covers a *missing* key; a step recorded with
    # ``"action": None`` or ``"info": None`` would otherwise crash on ``.get``.
    actions = [step.get("action") or {} for step in trajectory]
    final_info = trajectory[-1].get("info") or {}

    steps_taken = len(trajectory)
    hints_used = sum(
        1 for action in actions if action.get("action_type") == "request_hint"
    )

    # Progress is read from the final step only. The total is at least 1 so the
    # ratio is always defined, and the fixed count is bounded to [0, total] so
    # inconsistent counters cannot yield negative credit or a ratio above 1.
    issues_total = max(1, int(final_info.get("issues_total") or 1))
    issues_fixed = int(final_info.get("issues_fixed") or 0)
    issues_fixed = min(max(issues_fixed, 0), issues_total)
    fix_ratio = issues_fixed / issues_total
    all_fixed = issues_fixed == issues_total

    partial_score = PARTIAL_FIX_WEIGHT * fix_ratio
    complete_bonus = COMPLETE_BONUS if all_fixed else 0.0
    diff_bonus = bonus_extra if all_fixed else 0.0

    # Efficiency: nothing without progress, full credit when the step count
    # does not exceed the issue count, then a linear decay per extra step.
    if issues_fixed == 0:
        efficiency_score = 0.0
    elif steps_taken <= issues_total:
        efficiency_score = EFFICIENCY_MAX
    else:
        extra = steps_taken - issues_total
        effective_decay = EFFICIENCY_DECAY * decay_mult
        efficiency_score = max(0.0, EFFICIENCY_MAX - effective_decay * extra)

    hint_pen = HINT_PENALTY * hint_mult * hints_used

    # An edit action counts as failed when none of its edits names a file.
    failed_edits = 0
    for action in actions:
        if action.get("action_type") not in EDIT_ACTION_TYPES:
            continue
        edits = action.get("edits") or []
        if not any(isinstance(e, dict) and e.get("file_path") for e in edits):
            failed_edits += 1
    failed_pen = FAILED_ACTION_PENALTY * failed_edits

    raw = (
        BASE_SCORE
        + partial_score
        + complete_bonus
        + diff_bonus
        + efficiency_score
        - hint_pen
        - failed_pen
    )
    score = _clamp(raw)

    # Feedback is chosen purely from the final clamped score.
    if score >= 0.85:
        feedback = "Excellent — all issues fixed efficiently."
    elif score >= 0.65:
        feedback = "Good job — most issues fixed."
    elif score >= 0.45:
        feedback = "Partial success — some issues remain."
    elif score >= 0.25:
        feedback = "Limited progress — review the error messages carefully."
    else:
        feedback = "Needs improvement — try analyzing the error phase first."

    return GraderResult(
        task_id=task_id,
        score=score,
        breakdown={
            "base": BASE_SCORE,
            "partial_fixes": round(partial_score, 4),
            "complete_solution": round(complete_bonus, 4),
            "difficulty_bonus": round(diff_bonus, 4),
            "efficiency": round(efficiency_score, 4),
            "hint_penalty": round(-hint_pen, 4),
            "failed_action_penalty": round(-failed_pen, 4),
        },
        feedback=feedback,
        steps_taken=steps_taken,
        hints_used=hints_used,
    )
|
|