Spaces:
Sleeping
Sleeping
| """ | |
| Grader for the CI/CD Doctor environment. | |
| Reward shape: | |
| fixes_applied_fraction * 0.35 proportional credit for each answer_key fix | |
| that is present in the filesystem (emitted | |
| incrementally as each fix lands, not all-or- | |
| nothing β on a 2-fix task, each fix is worth | |
| +0.175) | |
| pipeline_passed +0.50 pipeline_status == "passed" (terminal) | |
| Total positive: 0.85 from grade() + shaped bonuses from balance_score(). | |
| Investigation milestones (investigated, logs_read, correct_file_located) are | |
| still tracked in state.milestones for the balance_score() logic but give no | |
| reward β reading a file is not progress, fixing it is. | |
| balance_score() applies per-step shaped adjustments on top of the tier delta: | |
| +0.05 First read of each answer-key file (exploration bonus, max 2 files) | |
| -0.05 cat on a file already read this episode (redundant read penalty) | |
| -0.10 pipeline run with no filesystem changes since last run (idle run) | |
| -0.01 * overage each step taken beyond the task's ideal step count | |
| (efficiency penalty scales linearly with how far past ideal β at | |
| ideal+1 it's -0.01, at ideal+5 it's -0.05; cumulative cost on a | |
| 9-step overage tops out around -0.45) | |
| -0.08 agent has read the correct file but runs pipeline again with no edit | |
| (exploitation trap β knows the problem, not acting on it) | |
| """ | |
| from dataclasses import dataclass, field | |
| from models import PipelineState | |
# Total credit available for applying answer-key fixes; grade() scales this
# by the fraction of fixes currently present in the filesystem.
# NOTE(review): the module docstring quotes 0.35 for fix credit but this
# constant is 0.2 — one of the two is stale; confirm the intended budget.
CORRECT_FILE_EDITED_TOTAL = 0.2

# One-time milestone rewards summed by grade() for each unlocked tier.
# NOTE(review): the docstring says investigation milestones give no reward,
# yet "correct_file_located" carries +0.01 here — confirm which is intended.
# "optimal_step" is not a milestone; balance_score() adds it per step at or
# under the ideal step count.
TIER_REWARDS: dict[str, float] = {
    "investigated": 0.0,
    "logs_read": 0.0,
    "correct_file_located": 0.01,
    "pipeline_passed": 0.50,
    "optimal_step": 0.05,
}

# Per-event shaped penalties applied by balance_score(); all values are
# negative and are *added* to the adjustment.
PENALTIES: dict[str, float] = {
    "idle_pipeline_run": -0.10,
    "redundant_read": -0.05,
    "over_ideal_step": -0.01,
    "exploitation_trap": -0.08,
}
| class StepContext: | |
| cmd_type: str | |
| filename: str | None = None | |
| files_read: set[str] = field(default_factory=set) | |
| fs_changed_since_last_run: bool = True | |
| step_count: int = 0 | |
| max_steps: int = 15 | |
| ideal_steps: int = 6 | |
| pipeline_runs_since_last_edit: int = 0 | |
| def _fixes_applied_fraction(state: PipelineState) -> float: | |
| """ | |
| Fraction of answer_key fixes that are currently present in the filesystem. | |
| Returns a value in [0.0, 1.0]. Each fix contributes incrementally the | |
| moment its fragment appears in the target file, so a 2-fix task rewards | |
| each correct edit as it happens rather than only when both are done. | |
| """ | |
| fixes = state.answer_key.get("fixes", {}) | |
| if not fixes: | |
| return 0.0 | |
| applied = sum( | |
| 1 for filename, fragment in fixes.items() | |
| if fragment in state.filesystem.get(filename, "") | |
| ) | |
| return applied / len(fixes) | |
def grade(state: PipelineState) -> float:
    """Compute the total earned grade from state.

    Fractional credit (CORRECT_FILE_EDITED_TOTAL * fraction of fixes in
    the filesystem) plus the one-time TIER_REWARDS for every unlocked
    milestone, treating a passed pipeline as the terminal milestone.
    Investigation milestones carry 0.0 in TIER_REWARDS — reading a file
    is not progress, fixing it is.
    """
    milestones = set(state.milestones)
    if state.pipeline_status == "passed":
        milestones.add("pipeline_passed")
    tier_total = sum(
        reward for tier, reward in TIER_REWARDS.items() if tier in milestones
    )
    fix_credit = CORRECT_FILE_EDITED_TOTAL * _fixes_applied_fraction(state)
    return round(fix_credit + tier_total, 2)
def balance_score(state: PipelineState, ctx: StepContext) -> float:
    """Per-step shaped reward adjustment layered on the raw grade delta.

    Returns a float (possibly negative) that the caller adds to the grade
    delta to form the final step reward. Two goals:

    - Encourage exploration: small bonus the first time the agent reads a
      file that needs fixing (capped at 2 files per episode).
    - Discourage waste: penalties for re-reading, idle pipeline runs,
      burning the step budget, and knowing the fix but not applying it.

    NOTE(review): every step at or under ideal_steps also earns the
    +0.05 "optimal_step" bonus, which the module docstring does not list
    and which can offset e.g. the redundant-read penalty — confirm this
    is intentional.
    """
    delta = 0.0
    answer_files = set(state.answer_key.get("fixes", {}))

    if ctx.cmd_type == "cat" and ctx.filename:
        target = ctx.filename
        first_read = target not in ctx.files_read
        if first_read and target in answer_files:
            # First look at a file that needs fixing — exploration bonus,
            # capped at 2 distinct answer-key files per episode.
            explored_so_far = len(ctx.files_read & answer_files)
            if explored_so_far < 2:
                delta += 0.05
        elif not first_read:
            # Re-reading a file already seen this episode — wasted step.
            delta += PENALTIES["redundant_read"]

    if ctx.cmd_type == "pipeline_run":
        if not ctx.fs_changed_since_last_run:
            # Re-running with nothing changed reveals no new information.
            delta += PENALTIES["idle_pipeline_run"]
        stalling = (
            "correct_file_located" in state.milestones
            and ctx.pipeline_runs_since_last_edit >= 1
        )
        if stalling:
            # Agent knows which file is broken and has already re-run the
            # pipeline since its last edit — it is stalling, not fixing.
            delta += PENALTIES["exploitation_trap"]

    overage = ctx.step_count - ctx.ideal_steps
    if overage > 0:
        # Efficiency penalty scales linearly with distance past ideal.
        delta += PENALTIES["over_ideal_step"] * overage
    else:
        delta += TIER_REWARDS["optimal_step"]

    return round(delta, 2)