File size: 6,611 Bytes
4b07aaf
 
4de7d31
6a5922c
 
4de7d31
 
 
6a5922c
4de7d31
6a5922c
4de7d31
4b07aaf
85b7ac8
 
 
4de7d31
85b7ac8
 
4de7d31
6a5922c
 
 
 
4b07aaf
6a5922c
4b07aaf
 
4de7d31
 
 
 
 
 
 
 
 
 
 
2794920
 
6a5922c
4b07aaf
 
 
 
 
85b7ac8
6a5922c
4de7d31
6a5922c
 
 
4de7d31
 
 
 
 
 
 
 
85b7ac8
 
 
 
4de7d31
 
 
 
 
85b7ac8
 
 
6a5922c
 
 
 
 
 
4de7d31
6a5922c
 
 
 
85b7ac8
 
 
 
 
 
6a5922c
 
 
 
85b7ac8
 
 
 
 
4de7d31
4b07aaf
85b7ac8
4de7d31
4b07aaf
85b7ac8
4de7d31
 
 
 
 
 
4b07aaf
 
 
 
85b7ac8
4b07aaf
4de7d31
 
4b07aaf
4de7d31
 
 
85b7ac8
4de7d31
4b07aaf
 
 
 
 
 
 
 
85b7ac8
4de7d31
 
 
 
 
 
 
 
 
6a5922c
 
4de7d31
6a5922c
 
 
 
 
 
 
 
85b7ac8
6a5922c
85b7ac8
 
 
4b07aaf
85b7ac8
6a5922c
 
 
4de7d31
6a5922c
 
 
85b7ac8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""Deterministic grader for trajectory scoring.

Scoring weights (difficulty-aware):
  base score      5%   (participation — guarantees score > 0)
  partial fixes  35%   (proportional to fix ratio)
  complete bonus 25%   (all issues fixed — scales with difficulty)
  efficiency     25%   (decays with extra steps — slower decay for harder tasks)
  hint penalty   -4%   each (reduced to -3% for hard tasks)
  failed edit    -2%   each
  difficulty     +3%   bonus for hard tasks when fully solved

Score is rounded to 4 decimals and clamped to [0.01, 0.99].
"""

from typing import Any, Dict, List

from server.models import GraderResult, TaskDifficulty
from server.tasks.task_registry import TASK_REGISTRY

# ── Score components ──────────────────────────────────────────
# Each value is a fraction of the final score.
BASE_SCORE = 0.05             # always added to the raw score
PARTIAL_FIX_WEIGHT = 0.35     # multiplied by the fixed/total issue ratio
COMPLETE_BONUS = 0.25         # added only when every issue is fixed
EFFICIENCY_MAX = 0.25         # efficiency credit before any decay
EFFICIENCY_DECAY = 0.03       # lost per extra step beyond optimal
HINT_PENALTY = 0.04           # subtracted per requested hint
FAILED_ACTION_PENALTY = 0.02  # subtracted per failed edit action

# ── Per-difficulty adjustments ────────────────────────────────
# difficulty → (complete_bonus_extra, efficiency_decay_mult, hint_penalty_mult)
#   complete_bonus_extra   extra credit added when all issues are fixed
#   efficiency_decay_mult  scales EFFICIENCY_DECAY (lower = more forgiving)
#   hint_penalty_mult      scales HINT_PENALTY (lower = cheaper hints)
# Difficulties missing from this table fall back to (0.00, 1.0, 1.0) at the
# lookup site in run_grader.
DIFFICULTY_MODIFIERS = {
    TaskDifficulty.EASY: (0.00, 1.0, 1.0),
    TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0),
    TaskDifficulty.HARD: (0.03, 0.7, 0.75),
}

# ── Final score bounds applied by _clamp ──────────────────────
SCORE_FLOOR = 0.01
SCORE_CEIL = 0.99

# ── Action types that count as edit attempts ──────────────────
EDIT_ACTION_TYPES = frozenset((
    "edit_file",
    "replace_line",
    "add_line",
    "delete_line",
    "add_block",
    "delete_block",
))


def _clamp(value: float) -> float:
    """Round *value* to 4 decimals and bound it to [SCORE_FLOOR, SCORE_CEIL]."""
    rounded = round(value, 4)
    capped = min(SCORE_CEIL, rounded)
    return max(SCORE_FLOOR, capped)


def _get_difficulty(task_id: str) -> TaskDifficulty:
    """Return the registered difficulty of *task_id*.

    Falls back to ``TaskDifficulty.MEDIUM`` when the id is not present in
    ``TASK_REGISTRY``.
    """
    registered = TASK_REGISTRY.get(task_id)
    return TaskDifficulty.MEDIUM if registered is None else registered.DIFFICULTY


def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
    """Compute a deterministic score for a recorded trajectory.

    The raw score is the sum of the base score, proportional partial-fix
    credit, the full-solution and difficulty bonuses, and the efficiency
    credit, minus hint and failed-edit penalties. It is then passed through
    ``_clamp``.

    Args:
        task_id: Key of the task in ``TASK_REGISTRY``.
        trajectory: Recorded steps, in order. Each step is a dict that may
            carry an ``"action"`` dict (``"action_type"``, ``"edits"``) and an
            ``"info"`` dict (``"issues_fixed"``, ``"issues_total"``). Only the
            last step's ``"info"`` is used for the issue counts.

    Returns:
        A ``GraderResult`` with the clamped score, a per-component breakdown,
        a feedback message, and the step / hint counts.

    Raises:
        ValueError: If ``task_id`` is not in ``TASK_REGISTRY``.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    difficulty = _get_difficulty(task_id)
    bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get(
        difficulty, (0.00, 1.0, 1.0)
    )

    if not trajectory:
        return GraderResult(
            task_id=task_id,
            score=_clamp(BASE_SCORE),
            breakdown={
                "base": BASE_SCORE,
                "partial_fixes": 0.0,
                "complete_solution": 0.0,
                "efficiency": 0.0,
                "difficulty_bonus": 0.0,
                "hint_penalty": 0.0,
                "failed_action_penalty": 0.0,
            },
            feedback="No actions taken.",
            steps_taken=0,
            hints_used=0,
        )

    steps_taken = len(trajectory)

    # `or {}` also covers steps whose "action"/"info" key is present but None,
    # which a plain `.get(key, {})` default would not.
    actions = [step.get("action") or {} for step in trajectory]
    final_info = trajectory[-1].get("info") or {}

    hints_used = sum(
        1 for action in actions
        if action.get("action_type") == "request_hint"
    )

    issues_total = max(1, int(final_info.get("issues_total") or 1))
    issues_fixed = int(final_info.get("issues_fixed") or 0)
    # Keep the fixed count inside [0, issues_total] so inconsistent step info
    # cannot yield a fix ratio above 1 (while missing the completion bonus)
    # or negative partial credit.
    issues_fixed = min(max(issues_fixed, 0), issues_total)
    fix_ratio = issues_fixed / issues_total
    all_fixed = issues_fixed == issues_total

    # ── Component 1: Partial fix credit (proportional) ────────
    partial_score = PARTIAL_FIX_WEIGHT * fix_ratio

    # ── Component 2: Full-solution bonus ──────────────────────
    complete_bonus = COMPLETE_BONUS if all_fixed else 0.0

    # ── Component 3: Difficulty bonus ─────────────────────────
    # Extra reward for fully solving harder tasks
    diff_bonus = bonus_extra if all_fixed else 0.0

    # ── Component 4: Efficiency bonus ─────────────────────────
    # One step per issue is treated as optimal; each step beyond that costs
    # EFFICIENCY_DECAY scaled by the difficulty's decay multiplier.
    if issues_fixed == 0:
        efficiency_score = 0.0
    elif steps_taken <= issues_total:
        efficiency_score = EFFICIENCY_MAX
    else:
        extra = steps_taken - issues_total
        effective_decay = EFFICIENCY_DECAY * decay_mult
        efficiency_score = max(0.0, EFFICIENCY_MAX - effective_decay * extra)

    # ── Component 5: Hint penalty ─────────────────────────────
    # Scaled by the difficulty's hint multiplier
    hint_pen = HINT_PENALTY * hint_mult * hints_used

    # ── Component 6: Failed action penalty ────────────────────
    # An edit-type action counts as failed when none of its edits names a
    # file path (including when it has no edits at all).
    failed_edits = sum(
        1
        for action in actions
        if action.get("action_type") in EDIT_ACTION_TYPES
        and not any(e.get("file_path") for e in (action.get("edits") or []))
    )
    failed_pen = FAILED_ACTION_PENALTY * failed_edits

    raw = (
        BASE_SCORE
        + partial_score
        + complete_bonus
        + diff_bonus
        + efficiency_score
        - hint_pen
        - failed_pen
    )
    score = _clamp(raw)

    # ── Feedback ──────────────────────────────────────────────
    # Chosen from the clamped score alone.
    if score >= 0.85:
        feedback = "Excellent — all issues fixed efficiently."
    elif score >= 0.65:
        feedback = "Good job — most issues fixed."
    elif score >= 0.45:
        feedback = "Partial success — some issues remain."
    elif score >= 0.25:
        feedback = "Limited progress — review the error messages carefully."
    else:
        feedback = "Needs improvement — try analyzing the error phase first."

    return GraderResult(
        task_id=task_id,
        score=score,
        breakdown={
            "base": BASE_SCORE,
            "partial_fixes": round(partial_score, 4),
            "complete_solution": round(complete_bonus, 4),
            "difficulty_bonus": round(diff_bonus, 4),
            "efficiency": round(efficiency_score, 4),
            "hint_penalty": round(-hint_pen, 4),
            "failed_action_penalty": round(-failed_pen, 4),
        },
        feedback=feedback,
        steps_taken=steps_taken,
        hints_used=hints_used,
    )