# Provenance: uploaded by samrat-rm via huggingface_hub (commit 45c94ac, verified).
"""
Grader for the CI/CD Doctor environment.

Reward shape (see the constants below):
    fixes_applied_fraction * 0.20   proportional credit for each answer_key fix
                                    that is present in the filesystem (emitted
                                    incrementally as each fix lands, not
                                    all-or-nothing -- on a 2-fix task, each fix
                                    is worth +0.10)
    correct_file_located    +0.01   token credit for locating the right file
    pipeline_passed         +0.50   pipeline_status == "passed" (terminal)

Total positive: 0.71 from grade() + shaped bonuses from balance_score().

The remaining investigation milestones (investigated, logs_read) are still
tracked in state.milestones for the balance_score() logic but give no
reward -- reading a file is not progress, fixing it is.

balance_score() applies per-step shaped adjustments on top of the tier delta:
    +0.05  first read of each answer-key file (exploration bonus, max 2 files)
    +0.05  each step taken at or under the task's ideal step count
           ("optimal_step" bonus)
    -0.05  cat on a file already read this episode (redundant read penalty)
    -0.10  pipeline run with no filesystem changes since last run (idle run)
    -0.01 * overage  each step taken beyond the task's ideal step count
           (efficiency penalty scales linearly with how far past ideal -- at
           ideal+1 it's -0.01, at ideal+5 it's -0.05; cumulative cost on a
           9-step overage tops out around -0.45)
    -0.08  agent has read the correct file but runs pipeline again with no edit
           (exploitation trap -- knows the problem, not acting on it)
"""
from dataclasses import dataclass, field
from models import PipelineState
# Maximum credit available from the fix-fraction term in grade(); each
# answer-key fix contributes an equal share of this total as it lands.
CORRECT_FILE_EDITED_TOTAL = 0.2

# Per-milestone rewards. The investigation milestones are tracked but pay
# nothing; only locating the correct file (token credit) and passing the
# pipeline are rewarded. "optimal_step" is the per-step bonus granted by
# balance_score() while the agent stays within the ideal step budget.
TIER_REWARDS: dict[str, float] = {
    "investigated": 0.0,
    "logs_read": 0.0,
    "correct_file_located": 0.01,
    "pipeline_passed": 0.50,
    "optimal_step": 0.05,
}

# Shaped per-step penalties applied by balance_score().
PENALTIES: dict[str, float] = {
    "idle_pipeline_run": -0.10,
    "redundant_read": -0.05,
    "over_ideal_step": -0.01,
    "exploitation_trap": -0.08,
}
@dataclass
class StepContext:
    """Per-step snapshot of the agent's command plus episode bookkeeping,
    consumed by balance_score() to compute shaped reward adjustments."""

    # Command the agent issued this step (balance_score branches on
    # "cat" and "pipeline_run").
    cmd_type: str
    # Target of the command when it takes one (e.g. the file passed to cat).
    filename: str | None = None
    # Files already read earlier this episode (redundant-read detection).
    files_read: set[str] = field(default_factory=set)
    # Whether the filesystem changed since the last pipeline run; True by
    # default so the very first run is never flagged as idle.
    fs_changed_since_last_run: bool = True
    # Steps taken so far this episode.
    step_count: int = 0
    # Episode step cap — not read by balance_score(); presumably enforced
    # by the environment loop (TODO confirm).
    max_steps: int = 15
    # Ideal step count for the task; exceeding it draws efficiency penalties.
    ideal_steps: int = 6
    # Pipeline runs since the agent last edited a file (exploitation-trap check).
    pipeline_runs_since_last_edit: int = 0
def _fixes_applied_fraction(state: PipelineState) -> float:
    """
    Fraction of answer_key fixes currently present in the filesystem.

    Returns a value in [0.0, 1.0]. A fix counts as applied the moment its
    fragment appears anywhere in its target file, so a multi-fix task earns
    credit for each correct edit as it lands rather than only when all of
    them are done. An empty answer key yields 0.0.
    """
    required = state.answer_key.get("fixes", {})
    if not required:
        return 0.0
    applied = 0
    for target, fragment in required.items():
        # Missing files read as "" so an absent target simply doesn't count.
        if fragment in state.filesystem.get(target, ""):
            applied += 1
    return applied / len(required)
def grade(state: PipelineState) -> float:
    """
    Compute the total earned grade from state.

    Fractional credit for answer-key fixes present in the filesystem, plus
    per-tier rewards for unlocked milestones; the terminal pipeline_passed
    bonus is granted straight from pipeline_status. Investigation-only
    milestones carry 0 reward — reading a file is not progress, fixing it is.
    """
    earned = CORRECT_FILE_EDITED_TOTAL * _fixes_applied_fraction(state)
    achieved = set(state.milestones)
    if state.pipeline_status == "passed":
        achieved.add("pipeline_passed")
    for tier in achieved:
        # Milestones without an entry in TIER_REWARDS contribute nothing.
        earned += TIER_REWARDS.get(tier, 0.0)
    return round(earned, 2)
def balance_score(state: PipelineState, ctx: StepContext) -> float:
    """
    Per-step shaped reward adjustment on top of the raw grade delta.

    Returns a float (may be negative), rounded to 2 decimals. The caller
    adds this to the grade delta to produce the final step reward.

    The two goals:
      - Encourage exploration: small bonus the first time the agent reads a
        file that needs fixing (up to 2 files per episode).
      - Discourage waste: penalties for re-reading, idle pipeline runs,
        burning the step budget, and knowing the fix but not applying it.
    """
    adjustment = 0.0
    # Files named in the answer key — the ones worth reading and fixing.
    fix_files = set(state.answer_key.get("fixes", {}).keys())
    if ctx.cmd_type == "cat" and ctx.filename:
        if ctx.filename in fix_files and ctx.filename not in ctx.files_read:
            # First read of a file that needs fixing — exploration bonus.
            # Cap at 2 files total to avoid rewarding excessive exploration.
            already_explored = sum(1 for f in ctx.files_read if f in fix_files)
            if already_explored < 2:
                adjustment += 0.05
        elif ctx.filename in ctx.files_read:
            # Already read this file — wasted step.
            adjustment += PENALTIES["redundant_read"]
    if ctx.cmd_type == "pipeline_run":
        if not ctx.fs_changed_since_last_run:
            # Nothing changed since the last run — this reveals no new info.
            adjustment += PENALTIES["idle_pipeline_run"]
        if (
            "correct_file_located" in state.milestones
            and ctx.pipeline_runs_since_last_edit >= 1
        ):
            # Agent has already read the right file and run the pipeline at
            # least once since its last edit — it knows what to fix but is
            # stalling instead of applying the fix.
            adjustment += PENALTIES["exploitation_trap"]
    # Step-budget shaping: linear penalty past the ideal step count, small
    # "optimal_step" bonus while at or under it.
    # NOTE(review): this branch applies on every step regardless of command
    # type — confirm the within-budget bonus is meant to accrue per step.
    if ctx.step_count > ctx.ideal_steps:
        overage = ctx.step_count - ctx.ideal_steps
        adjustment += PENALTIES["over_ideal_step"] * overage
    else:
        adjustment += TIER_REWARDS["optimal_step"]
    return round(adjustment, 2)