""" Grader for the CI/CD Doctor environment. Reward shape: fixes_applied_fraction * 0.35 proportional credit for each answer_key fix that is present in the filesystem (emitted incrementally as each fix lands, not all-or- nothing — on a 2-fix task, each fix is worth +0.175) pipeline_passed +0.50 pipeline_status == "passed" (terminal) Total positive: 0.85 from grade() + shaped bonuses from balance_score(). Investigation milestones (investigated, logs_read, correct_file_located) are still tracked in state.milestones for the balance_score() logic but give no reward — reading a file is not progress, fixing it is. balance_score() applies per-step shaped adjustments on top of the tier delta: +0.05 First read of each answer-key file (exploration bonus, max 2 files) -0.05 cat on a file already read this episode (redundant read penalty) -0.10 pipeline run with no filesystem changes since last run (idle run) -0.01 * overage each step taken beyond the task's ideal step count (efficiency penalty scales linearly with how far past ideal — at ideal+1 it's -0.01, at ideal+5 it's -0.05; cumulative cost on a 9-step overage tops out around -0.45) -0.08 agent has read the correct file but runs pipeline again with no edit (exploitation trap — knows the problem, not acting on it) """ from dataclasses import dataclass, field from models import PipelineState CORRECT_FILE_EDITED_TOTAL = 0.2 TIER_REWARDS: dict[str, float] = { "investigated": 0.0, "logs_read": 0.0, "correct_file_located": 0.01, "pipeline_passed": 0.50, "optimal_step":0.05 } PENALTIES: dict[str, float] = { "idle_pipeline_run": -0.10, "redundant_read": -0.05, "over_ideal_step": -0.01, "exploitation_trap": -0.08, } @dataclass class StepContext: cmd_type: str filename: str | None = None files_read: set[str] = field(default_factory=set) fs_changed_since_last_run: bool = True step_count: int = 0 max_steps: int = 15 ideal_steps: int = 6 pipeline_runs_since_last_edit: int = 0 def _fixes_applied_fraction(state: PipelineState) -> float: """ Fraction of answer_key fixes that are currently present in the filesystem. Returns a value in [0.0, 1.0]. Each fix contributes incrementally the moment its fragment appears in the target file, so a 2-fix task rewards each correct edit as it happens rather than only when both are done. """ fixes = state.answer_key.get("fixes", {}) if not fixes: return 0.0 applied = sum( 1 for filename, fragment in fixes.items() if fragment in state.filesystem.get(filename, "") ) return applied / len(fixes) def grade(state: PipelineState) -> float: """ Compute the total earned grade from state. Fractional credit for fixes in the filesystem, plus the terminal bonus on pipeline pass. Investigation milestones contribute 0 — reading a file is not progress, fixing it is. """ score = CORRECT_FILE_EDITED_TOTAL * _fixes_applied_fraction(state) unlocked = set(state.milestones) if state.pipeline_status == "passed": unlocked.add("pipeline_passed") score += sum(TIER_REWARDS[tier] for tier in unlocked if tier in TIER_REWARDS) return round(score, 2) def balance_score(state: PipelineState, ctx: StepContext) -> float: """ Per-step shaped reward adjustment on top of the raw grade delta. Returns a float (may be negative). The caller adds this to the grade delta to produce the final step reward. The two goals: - Encourage exploration: small bonus the first time the agent reads a file that needs fixing (up to 2 files per episode). - Discourage waste: penalties for re-reading, idle pipeline runs, burning the step budget, and knowing the fix but not applying it. """ adjustment = 0.0 fix_files = set(state.answer_key.get("fixes", {}).keys()) if ctx.cmd_type == "cat" and ctx.filename: if ctx.filename in fix_files and ctx.filename not in ctx.files_read: # First read of a file that needs fixing — exploration bonus. # Cap at 2 files total to avoid rewarding excessive exploration. already_explored = sum(1 for f in ctx.files_read if f in fix_files) if already_explored < 2: adjustment += 0.05 elif ctx.filename in ctx.files_read: # Already read this file — wasted step. adjustment += PENALTIES["redundant_read"] if ctx.cmd_type == "pipeline_run": if not ctx.fs_changed_since_last_run: # Nothing changed since the last run — this reveals no new info. adjustment += PENALTIES["idle_pipeline_run"] if ( "correct_file_located" in state.milestones and ctx.pipeline_runs_since_last_edit >= 1 ): # Agent has already read the right file and run the pipeline at # least once since its last edit — it knows what to fix but is # stalling instead of applying the fix. adjustment += PENALTIES["exploitation_trap"] if ctx.step_count > ctx.ideal_steps: overage = ctx.step_count - ctx.ideal_steps adjustment += PENALTIES["over_ideal_step"] * overage else: adjustment += TIER_REWARDS["optimal_step"] return round(adjustment, 2)