"""
Grader for the CI/CD Doctor environment.

Reward shape:
  fixes_applied_fraction * 0.20   proportional credit for each answer_key fix
                                  that is present in the filesystem (emitted
                                  incrementally as each fix lands, not all-or-
                                  nothing — on a 2-fix task, each fix is worth
                                  +0.10)
  pipeline_passed       +0.50     pipeline_status == "passed" (terminal)
  correct_file_located  +0.01     token credit once the milestone is recorded

Total positive: 0.71 from grade() + shaped bonuses from balance_score().

Investigation milestones (investigated, logs_read, correct_file_located) are
still tracked in state.milestones for the balance_score() logic. The first two
give no reward and correct_file_located gives only a token +0.01 — reading a
file is not progress, fixing it is.

balance_score() applies per-step shaped adjustments on top of the tier delta:
  +0.05  First read of each answer-key file (exploration bonus, max 2 files)
  -0.05  cat on a file already read this episode (redundant read penalty)
  -0.10  pipeline run with no filesystem changes since last run (idle run)
  +0.05  each step taken at or under the task's ideal step count
  -0.01 * overage  each step taken beyond the task's ideal step count
         (efficiency penalty scales linearly with how far past ideal — at
         ideal+1 it's -0.01, at ideal+5 it's -0.05; cumulative cost on a
         9-step overage tops out around -0.45)
  -0.08  agent has read the correct file but runs pipeline again with no edit
         (exploitation trap — knows the problem, not acting on it)
"""

from dataclasses import dataclass, field

from models import PipelineState

# Credit grade() awards when every answer_key fix is present in the
# filesystem; partial progress earns the matching fraction of this amount.
CORRECT_FILE_EDITED_TOTAL = 0.2

# Reward per tier name. grade() adds the entry for every milestone recorded on
# the state (plus "pipeline_passed" once the pipeline status is "passed");
# balance_score() reads "optimal_step" directly as its per-step bonus for a
# step taken within the ideal step count.
TIER_REWARDS: dict[str, float] = {
    "investigated": 0.0,
    "logs_read": 0.0,
    "correct_file_located": 0.01,
    "pipeline_passed": 0.50,
    "optimal_step": 0.05,
}

# Per-step deductions applied by balance_score(). All values are negative so
# they can be added straight onto the running adjustment.
PENALTIES: dict[str, float] = {
    "idle_pipeline_run": -0.10,
    "redundant_read": -0.05,
    "over_ideal_step": -0.01,
    "exploitation_trap": -0.08,
}

@dataclass
class StepContext:
    """
    Per-step inputs consumed by balance_score() alongside the pipeline state.

    Only cmd_type is required; every other field has a default. The caller is
    responsible for keeping the counters and the read-set up to date between
    steps — nothing in this module mutates them.
    """
    # Kind of command issued this step. balance_score() reacts to "cat" and
    # "pipeline_run"; any other value only receives the step-count adjustment.
    cmd_type: str
    # Target of a "cat" command; the read bonus/penalty is skipped when falsy.
    filename: str | None = None
    # Files considered already read. balance_score() treats a cat target found
    # here as a repeat read, so this presumably holds the files read *before*
    # the current step — TODO confirm against the caller.
    files_read: set[str] = field(default_factory=set)
    # False triggers the idle-run penalty on a "pipeline_run" command.
    fs_changed_since_last_run: bool = True
    # Steps taken so far; compared against ideal_steps by balance_score().
    step_count: int = 0
    # Not read by the grading functions in this module; presumably the
    # episode's hard step cap enforced elsewhere — verify against the caller.
    max_steps: int = 15
    # Step count up to which each step earns the "optimal_step" bonus; beyond
    # it the over-ideal penalty applies.
    ideal_steps: int = 6
    # A value >= 1 on a "pipeline_run", combined with the
    # "correct_file_located" milestone, triggers the exploitation-trap penalty.
    pipeline_runs_since_last_edit: int = 0


def _fixes_applied_fraction(state: PipelineState) -> float:
    """
    Report how much of the answer key is already reflected in the filesystem.

    state.answer_key["fixes"] maps a filename to the text fragment that must
    appear in that file. A fix counts as applied when its fragment is a
    substring of the file's current contents in state.filesystem; a file
    that is missing from the filesystem is treated as empty.

    Returns the applied count divided by the number of fixes, a value in
    [0.0, 1.0], or 0.0 when the answer key lists no fixes. Because each fix
    is counted on its own, a multi-fix task earns credit edit by edit.
    """
    expected = state.answer_key.get("fixes", {})
    if not expected:
        return 0.0

    landed = 0
    for path, snippet in expected.items():
        contents = state.filesystem.get(path, "")
        if snippet in contents:
            landed += 1
    return landed / len(expected)


def grade(state: PipelineState) -> float:
    """
    Return the cumulative grade implied by the current state.

    Two parts are added together and rounded to two decimals:
      - fix credit: CORRECT_FILE_EDITED_TOTAL scaled by the fraction of
        answer-key fixes currently present in the filesystem;
      - tier credit: the TIER_REWARDS entry for every milestone recorded in
        state.milestones, plus "pipeline_passed" once state.pipeline_status
        is "passed". Milestone names absent from TIER_REWARDS add nothing.

    Investigation milestones are priced at (or very near) zero in
    TIER_REWARDS, so the grade is driven by applied fixes and a passing run.
    """
    fix_credit = CORRECT_FILE_EDITED_TOTAL * _fixes_applied_fraction(state)

    earned_tiers = set(state.milestones)
    if state.pipeline_status == "passed":
        # The terminal bonus is derived from status, not stored as a milestone.
        earned_tiers.add("pipeline_passed")

    tier_credit = sum(
        TIER_REWARDS[name] for name in earned_tiers if name in TIER_REWARDS
    )
    return round(fix_credit + tier_credit, 2)


def balance_score(state: PipelineState, ctx: StepContext) -> float:
    """
    Work out the shaping term for a single step.

    The caller adds the returned value to the step's change in grade() to get
    the final step reward. The result may be negative and is rounded to two
    decimals.

    Adjustments, accumulated in this order:
      - "cat" with a filename: PENALTIES["redundant_read"] when the file is
        already in ctx.files_read; otherwise +0.05 when it is an answer-key
        file and fewer than 2 answer-key files are in ctx.files_read.
      - "pipeline_run": PENALTIES["idle_pipeline_run"] when
        ctx.fs_changed_since_last_run is false, and additionally
        PENALTIES["exploitation_trap"] when "correct_file_located" is in
        state.milestones and ctx.pipeline_runs_since_last_edit >= 1.
      - Any command: PENALTIES["over_ideal_step"] times the number of steps
        past ctx.ideal_steps, or TIER_REWARDS["optimal_step"] while
        ctx.step_count is still within ctx.ideal_steps.
    """
    delta = 0.0
    targets = set(state.answer_key.get("fixes", {}).keys())

    if ctx.cmd_type == "cat" and ctx.filename:
        if ctx.filename in ctx.files_read:
            # Repeat read of any file, answer-key or not, is a wasted step.
            delta += PENALTIES["redundant_read"]
        elif ctx.filename in targets:
            # First look at a file that needs fixing. Only the first two such
            # files earn the bonus so exploration is not farmed for reward.
            explored = len([seen for seen in ctx.files_read if seen in targets])
            if explored < 2:
                delta += 0.05

    if ctx.cmd_type == "pipeline_run":
        if not ctx.fs_changed_since_last_run:
            # Re-running against an unchanged filesystem yields no new info.
            delta += PENALTIES["idle_pipeline_run"]

        located = "correct_file_located" in state.milestones
        if located and ctx.pipeline_runs_since_last_edit >= 1:
            # The right file has been found and the pipeline has already been
            # run since the last edit: the agent is stalling instead of fixing.
            delta += PENALTIES["exploitation_trap"]

    overshoot = ctx.step_count - ctx.ideal_steps
    if overshoot > 0:
        # Linear in the overshoot, so each extra step costs more than the last.
        delta += PENALTIES["over_ideal_step"] * overshoot
    else:
        delta += TIER_REWARDS["optimal_step"]

    return round(delta, 2)