Spaces:

jester1177
/

cloudnative-devops-debug-env

Sleeping

App Files Files Community

cloudnative-devops-debug-env / server /graders /__init__.py

Krishna1107

fixed inference

2794920 about 1 month ago

raw

history blame contribute delete

6.61 kB

	"""Deterministic grader for trajectory scoring.

	Scoring weights (difficulty-aware):
	base score 5% (participation — guarantees score > 0)
	partial fixes 35% (proportional to fix ratio)
	complete bonus 25% (all issues fixed — scales with difficulty)
	efficiency 25% (decays with extra steps — slower decay for harder tasks)
	hint penalty -4% each (reduced to -3% for hard tasks)
	failed edit -2% each
	difficulty +3% bonus for hard tasks when fully solved

	Score is clamped to [0.01, 0.99] (SCORE_FLOOR / SCORE_CEIL).
	"""

	from typing import Any, Dict, List

	from server.models import GraderResult, TaskDifficulty
	from server.tasks.task_registry import TASK_REGISTRY

	# ── Base weights ──────────────────────────────────────────────
	# Score components that run_grader sums (penalties subtracted) before clamping.
	BASE_SCORE = 0.05  # always included, even for an empty trajectory
	PARTIAL_FIX_WEIGHT = 0.35  # multiplied by issues_fixed / issues_total
	COMPLETE_BONUS = 0.25  # added only when issues_fixed == issues_total
	EFFICIENCY_MAX = 0.25  # granted when at least one issue is fixed and steps_taken <= issues_total
	EFFICIENCY_DECAY = 0.03  # subtracted per step beyond issues_total, scaled by the difficulty's decay multiplier
	HINT_PENALTY = 0.04  # per "request_hint" action, scaled by the difficulty's hint multiplier
	FAILED_ACTION_PENALTY = 0.02  # per edit-type action whose "edits" contain no entry with a file_path

	# ── Difficulty modifiers ──────────────────────────────────────
	# Maps difficulty → (complete_bonus_extra, efficiency_decay_mult, hint_penalty_mult)
	# complete_bonus_extra: extra score (reported as "difficulty_bonus") added when all issues are fixed
	# efficiency_decay_mult: multiplier on EFFICIENCY_DECAY (lower = more forgiving)
	# hint_penalty_mult: multiplier on HINT_PENALTY (lower = cheaper hints)
	# run_grader falls back to (0.00, 1.0, 1.0) for any difficulty without an entry here.
	DIFFICULTY_MODIFIERS = {
	TaskDifficulty.EASY: (0.00, 1.0, 1.0),
	TaskDifficulty.MEDIUM: (0.00, 0.9, 1.0),
	TaskDifficulty.HARD: (0.03, 0.7, 0.75),
	}

	# Lower and upper bounds that _clamp applies to every score it returns.
	SCORE_FLOOR = 0.01
	SCORE_CEIL = 0.99

	# Action types that run_grader treats as edit attempts when counting failed edits.
	EDIT_ACTION_TYPES = frozenset(
	    (
	        "edit_file",
	        "replace_line",
	        "add_line",
	        "delete_line",
	        "add_block",
	        "delete_block",
	    )
	)


	def _clamp(value: float) -> float:
	    """Round *value* to 4 decimals and bound it to [SCORE_FLOOR, SCORE_CEIL]."""
	    rounded = round(value, 4)
	    # Cap at the ceiling first, then lift to the floor.
	    capped = rounded if rounded < SCORE_CEIL else SCORE_CEIL
	    return capped if capped > SCORE_FLOOR else SCORE_FLOOR


	def _get_difficulty(task_id: str) -> TaskDifficulty:
	    """Return the registered task's ``DIFFICULTY``, or MEDIUM when *task_id* is unregistered."""
	    registered = TASK_REGISTRY.get(task_id)
	    return TaskDifficulty.MEDIUM if registered is None else registered.DIFFICULTY


	def run_grader(task_id: str, trajectory: List[Dict[str, Any]]) -> GraderResult:
	    """Score a recorded trajectory for *task_id*.

	    The raw score is the base score plus partial-fix credit, the
	    full-solution bonus, the difficulty bonus and the efficiency bonus,
	    minus hint and failed-edit penalties. It is then passed through
	    ``_clamp``.

	    Args:
	        task_id: Key of the task in ``TASK_REGISTRY``.
	        trajectory: Recorded steps, in order. Each step is read as a dict
	            with an optional ``"action"`` dict (``"action_type"``,
	            ``"edits"``) and an optional ``"info"`` dict
	            (``"issues_fixed"``, ``"issues_total"``). Fix counts are taken
	            from the last step only. Keys that are missing or explicitly
	            ``None`` are treated as empty / default values.

	    Returns:
	        A ``GraderResult`` with the clamped score, the per-component
	        breakdown, a feedback message, the number of steps and the number
	        of hints requested.

	    Raises:
	        ValueError: If *task_id* is not present in ``TASK_REGISTRY``.
	    """
	    if task_id not in TASK_REGISTRY:
	        raise ValueError(f"Unknown task: {task_id}")

	    difficulty = _get_difficulty(task_id)
	    bonus_extra, decay_mult, hint_mult = DIFFICULTY_MODIFIERS.get(
	        difficulty, (0.00, 1.0, 1.0)
	    )

	    if not trajectory:
	        return GraderResult(
	            task_id=task_id,
	            score=_clamp(BASE_SCORE),
	            breakdown={
	                "base": BASE_SCORE,
	                "partial_fixes": 0.0,
	                "complete_solution": 0.0,
	                "efficiency": 0.0,
	                "difficulty_bonus": 0.0,
	                "hint_penalty": 0.0,
	                "failed_action_penalty": 0.0,
	            },
	            feedback="No actions taken.",
	            steps_taken=0,
	            hints_used=0,
	        )

	    steps_taken = len(trajectory)

	    # ``dict.get(key, {})`` only covers a missing key; ``or {}`` also covers
	    # an explicit None value (e.g. a step recorded with a null action).
	    hints_used = sum(
	        1 for step in trajectory
	        if (step.get("action") or {}).get("action_type") == "request_hint"
	    )

	    final_info = trajectory[-1].get("info") or {}
	    issues_total = max(1, int(final_info.get("issues_total") or 1))
	    # Bound the fixed count to [0, issues_total] so the ratio stays in [0, 1]
	    # and an over-reported count still registers as a complete solution.
	    issues_fixed = int(final_info.get("issues_fixed") or 0)
	    issues_fixed = min(max(issues_fixed, 0), issues_total)
	    fix_ratio = issues_fixed / issues_total
	    all_fixed = issues_fixed == issues_total

	    # ── Component 1: Partial fix credit (proportional) ────────
	    partial_score = PARTIAL_FIX_WEIGHT * fix_ratio

	    # ── Component 2: Full-solution bonus ──────────────────────
	    complete_bonus = COMPLETE_BONUS if all_fixed else 0.0

	    # ── Component 3: Difficulty bonus ─────────────────────────
	    # Extra reward for fully solving harder tasks.
	    diff_bonus = bonus_extra if all_fixed else 0.0

	    # ── Component 4: Efficiency bonus ─────────────────────────
	    # issues_total is the step budget: one step per issue earns the full
	    # bonus, each extra step costs EFFICIENCY_DECAY scaled by difficulty.
	    if issues_fixed == 0:
	        efficiency_score = 0.0
	    elif steps_taken <= issues_total:
	        efficiency_score = EFFICIENCY_MAX
	    else:
	        extra_steps = steps_taken - issues_total
	        effective_decay = EFFICIENCY_DECAY * decay_mult
	        efficiency_score = max(0.0, EFFICIENCY_MAX - effective_decay * extra_steps)

	    # ── Component 5: Hint penalty ─────────────────────────────
	    hint_pen = HINT_PENALTY * hint_mult * hints_used

	    # ── Component 6: Failed action penalty ────────────────────
	    # An edit-type action counts as failed when none of its edits names a file.
	    failed_edits = 0
	    for step in trajectory:
	        action = step.get("action") or {}
	        if action.get("action_type") not in EDIT_ACTION_TYPES:
	            continue
	        edits = action.get("edits") or []
	        if not any(edit.get("file_path") for edit in edits):
	            failed_edits += 1
	    failed_pen = FAILED_ACTION_PENALTY * failed_edits

	    raw = (
	        BASE_SCORE
	        + partial_score
	        + complete_bonus
	        + diff_bonus
	        + efficiency_score
	        - hint_pen
	        - failed_pen
	    )
	    score = _clamp(raw)

	    # ── Feedback ──────────────────────────────────────────────
	    if score >= 0.85:
	        feedback = "Excellent — all issues fixed efficiently."
	    elif score >= 0.65:
	        feedback = "Good job — most issues fixed."
	    elif score >= 0.45:
	        feedback = "Partial success — some issues remain."
	    elif score >= 0.25:
	        feedback = "Limited progress — review the error messages carefully."
	    else:
	        feedback = "Needs improvement — try analyzing the error phase first."

	    return GraderResult(
	        task_id=task_id,
	        score=score,
	        breakdown={
	            "base": BASE_SCORE,
	            "partial_fixes": round(partial_score, 4),
	            "complete_solution": round(complete_bonus, 4),
	            "difficulty_bonus": round(diff_bonus, 4),
	            "efficiency": round(efficiency_score, 4),
	            "hint_penalty": round(-hint_pen, 4),
	            "failed_action_penalty": round(-failed_pen, 4),
	        },
	        feedback=feedback,
	        steps_taken=steps_taken,
	        hints_used=hints_used,
	    )