Krishna1107's picture
fixed inference
2794920
"""Deterministic grader for trajectory scoring.
Scoring weights (difficulty-aware):
base score 5% (participation — guarantees score > 0)
partial fixes 35% (proportional to fix ratio)
complete bonus 25% (all issues fixed — scales with difficulty)
efficiency 25% (decays with extra steps — slower decay for harder tasks)
hint penalty -4% each (reduced to -3% for hard/expert)
failed edit -2% each
difficulty +3% bonus for hard/expert tasks when fully solved
Score is clamped to [0.01, 0.99].
"""
from typing import Any, Dict, List
from server.models import GraderResult, TaskDifficulty
from server.tasks.task_registry import TASK_REGISTRY
# ── Base weights ──────────────────────────────────────────────
# Each value is a fraction of the final score that a component adds
# (or, for the penalties, subtracts).
BASE_SCORE: float = 0.05  # always included in the raw score
PARTIAL_FIX_WEIGHT: float = 0.35  # scaled by the fraction of issues fixed
COMPLETE_BONUS: float = 0.25  # granted only when every issue is fixed
EFFICIENCY_MAX: float = 0.25  # efficiency credit before any decay
EFFICIENCY_DECAY: float = 0.03  # lost per extra step beyond optimal
HINT_PENALTY: float = 0.04  # deducted per hint requested
FAILED_ACTION_PENALTY: float = 0.02  # deducted per failed edit action
# ── Difficulty modifiers ──────────────────────────────────────
# Maps difficulty → (complete_bonus_extra, efficiency_decay_mult, hint_penalty_mult)
#   complete_bonus_extra:  extra credit added when all issues are fixed
#   efficiency_decay_mult: multiplier on EFFICIENCY_DECAY (lower = more forgiving)
#   hint_penalty_mult:     multiplier on HINT_PENALTY (lower = cheaper hints)
# NOTE(review): the module docstring mentions an "expert" tier, but there is
# no entry for it here; any difficulty missing from this table falls back to
# the neutral default used by run_grader. Confirm whether that is intended.
DIFFICULTY_MODIFIERS: Dict[TaskDifficulty, tuple] = {
    TaskDifficulty.EASY: (0.00, 1.0, 1.0),
    TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0),
    TaskDifficulty.HARD: (0.03, 0.7, 0.75),
}
# Lower and upper bounds applied to every final score.
SCORE_FLOOR: float = 0.01
SCORE_CEIL: float = 0.99

# Action types that count as file edits when tallying failed actions.
EDIT_ACTION_TYPES = frozenset((
    "edit_file",
    "replace_line",
    "add_line",
    "delete_line",
    "add_block",
    "delete_block",
))
def _clamp(value: float) -> float:
    """Round *value* to 4 decimals and clamp it to [SCORE_FLOOR, SCORE_CEIL]."""
    rounded = round(value, 4)
    # Apply the ceiling first, then the floor.
    capped = rounded if rounded < SCORE_CEIL else SCORE_CEIL
    return capped if capped > SCORE_FLOOR else SCORE_FLOOR
def _get_difficulty(task_id: str) -> TaskDifficulty:
    """Return the registered difficulty for *task_id*.

    Falls back to ``TaskDifficulty.MEDIUM`` when the task is not present
    in ``TASK_REGISTRY``.
    """
    registered = TASK_REGISTRY.get(task_id)
    return TaskDifficulty.MEDIUM if registered is None else registered.DIFFICULTY
def _as_mapping(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _count_failed_edits(trajectory: List[Dict[str, Any]]) -> int:
    """Count edit actions whose ``edits`` list names no ``file_path``."""
    failed = 0
    for step in trajectory:
        action = _as_mapping(_as_mapping(step).get("action"))
        if action.get("action_type") not in EDIT_ACTION_TYPES:
            continue
        edits = action.get("edits") or []
        if not any(_as_mapping(edit).get("file_path") for edit in edits):
            failed += 1
    return failed


def _feedback_for(score: float) -> str:
    """Pick the feedback sentence for a final (already clamped) score."""
    if score >= 0.85:
        return "Excellent — all issues fixed efficiently."
    if score >= 0.65:
        return "Good job — most issues fixed."
    if score >= 0.45:
        return "Partial success — some issues remain."
    if score >= 0.25:
        return "Limited progress — review the error messages carefully."
    return "Needs improvement — try analyzing the error phase first."


def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
    """Score a trajectory for *task_id* and return the graded result.

    Args:
        task_id: Key of the task in ``TASK_REGISTRY``.
        trajectory: Ordered list of step dicts. For each step the grader reads
            ``step["action"]["action_type"]`` and, for edit actions,
            ``step["action"]["edits"][i]["file_path"]``. From the final step
            it reads ``step["info"]["issues_fixed"]`` and
            ``step["info"]["issues_total"]``. Missing, ``None`` or non-dict
            values are treated as absent.

    Returns:
        A ``GraderResult`` carrying the clamped score, a per-component
        breakdown, a feedback sentence, the step count and the hint count.

    Raises:
        ValueError: If *task_id* is not in ``TASK_REGISTRY``.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}")

    difficulty = _get_difficulty(task_id)
    # Difficulties without an entry get neutral modifiers.
    bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get(
        difficulty, (0.00, 1.0, 1.0)
    )

    if not trajectory:
        return GraderResult(
            task_id=task_id,
            score=_clamp(BASE_SCORE),
            breakdown={
                "base": BASE_SCORE,
                "partial_fixes": 0.0,
                "complete_solution": 0.0,
                "efficiency": 0.0,
                "difficulty_bonus": 0.0,
                "hint_penalty": 0.0,
                "failed_action_penalty": 0.0,
            },
            feedback="No actions taken.",
            steps_taken=0,
            hints_used=0,
        )

    steps_taken = len(trajectory)
    hints_used = sum(
        1
        for step in trajectory
        if _as_mapping(_as_mapping(step).get("action")).get("action_type")
        == "request_hint"
    )

    # Only the final step's info is used for the fix counts.
    info = _as_mapping(_as_mapping(trajectory[-1]).get("info"))
    # `or` also covers keys that are present but None; total is at least 1
    # so the ratio below never divides by zero.
    issues_total = max(1, int(info.get("issues_total") or 1))
    # Keep the fixed count within [0, total] so an out-of-range value cannot
    # drop the completion bonus or earn efficiency credit with no real fix.
    issues_fixed = min(issues_total, max(0, int(info.get("issues_fixed") or 0)))
    all_fixed = issues_fixed == issues_total
    fix_ratio = issues_fixed / issues_total

    # ── Component 1: Partial fix credit (proportional) ────────
    partial_score = PARTIAL_FIX_WEIGHT * fix_ratio

    # ── Component 2: Full-solution bonus ──────────────────────
    complete_bonus = COMPLETE_BONUS if all_fixed else 0.0

    # ── Component 3: Difficulty bonus ─────────────────────────
    diff_bonus = bonus_extra if all_fixed else 0.0

    # ── Component 4: Efficiency bonus ─────────────────────────
    # issues_total doubles as the optimal step count; each step beyond it
    # costs EFFICIENCY_DECAY scaled by the difficulty's decay multiplier.
    if issues_fixed == 0:
        efficiency_score = 0.0
    elif steps_taken <= issues_total:
        efficiency_score = EFFICIENCY_MAX
    else:
        extra_steps = steps_taken - issues_total
        effective_decay = EFFICIENCY_DECAY * decay_mult
        efficiency_score = max(0.0, EFFICIENCY_MAX - effective_decay * extra_steps)

    # ── Component 5: Hint penalty ─────────────────────────────
    hint_pen = HINT_PENALTY * hint_mult * hints_used

    # ── Component 6: Failed action penalty ────────────────────
    failed_pen = FAILED_ACTION_PENALTY * _count_failed_edits(trajectory)

    raw = (
        BASE_SCORE
        + partial_score
        + complete_bonus
        + diff_bonus
        + efficiency_score
        - hint_pen
        - failed_pen
    )
    score = _clamp(raw)

    return GraderResult(
        task_id=task_id,
        score=score,
        breakdown={
            "base": BASE_SCORE,
            "partial_fixes": round(partial_score, 4),
            "complete_solution": round(complete_bonus, 4),
            "difficulty_bonus": round(diff_bonus, 4),
            "efficiency": round(efficiency_score, 4),
            "hint_penalty": round(-hint_pen, 4),
            "failed_action_penalty": round(-failed_pen, 4),
        },
        feedback=_feedback_for(score),
        steps_taken=steps_taken,
        hints_used=hints_used,
    )