"""Test the deterministic grading system.

Verifies that the grader produces correct, reproducible scores across
different trajectories and decision qualities. All tests use easy_001.

Grading formula:
  score = 0.35 * evidence_coverage
        + 0.25 * risk_signal_discovery
        + 0.30 * decision_correctness
        + 0.10 * efficiency
        - forbidden_penalty (0.3 if any forbidden action taken)
  clamped to [0.0, 1.0]

risk_signal_discovery measures which required_risk_signals were actually
triggered (emitted by the environment) during the episode — objective,
not dependent on what strings the agent typed in reason_codes.
"""

import pytest
from server.releaseops_environment import ReleaseOpsEnvironment
from releaseops_env.models import ReleaseAction


# ── Fixtures ─────────────────────────────────────────────────────────


@pytest.fixture
def env():
    """Provide a newly constructed ``ReleaseOpsEnvironment`` to each test."""
    environment = ReleaseOpsEnvironment()
    return environment


def _gather_all_required_evidence(env):
    """Step through the three evidence actions easy_001 requires, in order.

    The sequence is: inspect the diff, inspect the tests, then check policy.
    """
    evidence_steps = (
        {"action_type": "inspect_change", "section": "diff"},
        {"action_type": "inspect_change", "section": "tests"},
        {"action_type": "check_policy"},
    )
    # Build each action immediately before stepping so ordering is unchanged.
    for step_kwargs in evidence_steps:
        env.step(ReleaseAction(**step_kwargs))


def _submit(env, decision, reason_codes):
    """Send a ``submit_decision`` action and hand back what ``env.step`` returns."""
    final_action = ReleaseAction(
        action_type="submit_decision",
        final_decision=decision,
        reason_codes=reason_codes,
    )
    return env.step(final_action)


# ── Score range tests ────────────────────────────────────────────────


def test_perfect_play_scores_above_085(env):
    """Full evidence, the optimal decision and all three codes must reach 0.85."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", codes)
    # Include the breakdown on failure so the weak component is visible.
    assert (
        result.final_score >= 0.85
    ), f"Got {result.final_score}, breakdown: {result.grader_breakdown}"


def test_wrong_decision_no_evidence_scores_below_020(env):
    """No evidence + wrong decision + no codes → very low."""
    env.reset(task_id="easy_001")
    obs = _submit(env, "approve", [])
    # Expected component values, following the module-level grading formula:
    #   evidence_coverage     = 0  (no investigation steps were taken)
    #   risk_signal_discovery = 0  (nothing was inspected, so no signal fired)
    #   decision_correctness  = 0  ("approve" is the wrong call for easy_001)
    #   efficiency: 1 step / 12 = 0.083 → 0.083/0.3 = 0.278
    # score = 0.35*0 + 0.25*0 + 0.30*0 + 0.10*0.278 = 0.028
    assert (
        obs.final_score < 0.20
    ), f"Got {obs.final_score}, breakdown: {obs.grader_breakdown}"


def test_acceptable_decision_gets_partial_credit(env):
    """Submitting 'block' (acceptable, not optimal for easy_001) earns 0.5."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "block", codes)
    decision_component = result.grader_breakdown["decision_correctness"]
    assert decision_component == 0.5
    # Evidence was fully gathered, so the total should still land mid-range.
    assert 0.55 <= result.final_score <= 0.85, f"Got {result.final_score}"


def test_completely_wrong_decision_zero_decision_score(env):
    """An 'escalate' decision is expected to earn no decision credit at all."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "escalate", codes)
    decision_component = result.grader_breakdown["decision_correctness"]
    assert decision_component == 0.0


# ── Evidence coverage component ──────────────────────────────────────


def test_full_evidence_coverage(env):
    """All three required evidence steps taken → evidence_coverage of 1.0."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]
    assert coverage == 1.0


def test_partial_evidence_coverage(env):
    """One of the three required steps taken → evidence_coverage near 1/3."""
    env.reset(task_id="easy_001")
    diff_only = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(diff_only)
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]
    one_third = 1.0 / 3.0
    assert abs(coverage - one_third) < 0.01, f"Expected ~0.333, got {coverage}"


def test_zero_evidence_coverage(env):
    """Submitting straight after reset → evidence_coverage of 0.0."""
    env.reset(task_id="easy_001")
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]
    assert coverage == 0.0


def test_extra_evidence_doesnt_hurt(env):
    """Two extra steps on top of the required ones leave coverage at 1.0."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    # Two further actions that are not among the three required steps.
    surplus_steps = (
        {"action_type": "inspect_change", "section": "approvals"},
        {"action_type": "inspect_dependencies"},
    )
    for step_kwargs in surplus_steps:
        env.step(ReleaseAction(**step_kwargs))
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", codes)
    coverage = result.grader_breakdown["evidence_coverage"]
    assert coverage == 1.0


# ── Risk signal discovery component ─────────────────────────────────
# required_risk_signals for easy_001: hot_path_sync_io, missing_load_test,
# integration_test_failure, policy_peak_traffic
# Action that triggers each signal:
#   hot_path_sync_io                            → inspect_change(diff)
#   missing_load_test, integration_test_failure → inspect_change(tests)
#   policy_peak_traffic                         → check_policy


def test_full_signal_discovery_after_complete_investigation(env):
    """The full evidence sequence should surface every required risk signal."""
    env.reset(task_id="easy_001")
    # diff + tests + check_policy
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", [])
    discovery = result.grader_breakdown["risk_signal_discovery"]
    assert discovery == 1.0


def test_partial_signal_discovery_diff_only(env):
    """Inspecting just the diff should yield one of four signals (0.25)."""
    env.reset(task_id="easy_001")
    diff_only = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(diff_only)
    result = _submit(env, "request_changes", [])
    discovery = result.grader_breakdown["risk_signal_discovery"]
    # Expectation: hot_path_sync_io fires, while missing_load_test,
    # integration_test_failure and policy_peak_traffic stay undiscovered,
    # giving 1/4 = 0.25.
    assert abs(discovery - 0.25) < 0.01, f"Expected ~0.25, got {discovery}"


def test_reason_codes_do_not_affect_signal_discovery(env):
    """risk_signal_discovery must be the same whatever reason_codes are sent."""
    submitted_code_sets = (
        ["hot_path_sync_io"],
        ["BOGUS_1", "BOGUS_2", "BOGUS_3"],
    )
    observations = []
    for codes in submitted_code_sets:
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        observations.append(_submit(env, "request_changes", codes))
    with_real_code, with_bogus_codes = observations

    # Identical investigation in both episodes, so the component must match:
    # it is derived on the environment side, not from the submitted strings.
    assert (
        with_real_code.grader_breakdown["risk_signal_discovery"]
        == with_bogus_codes.grader_breakdown["risk_signal_discovery"]
    )


def test_zero_signal_discovery_no_investigation(env):
    """Submitting with no investigation at all → risk_signal_discovery of 0.0."""
    env.reset(task_id="easy_001")
    result = _submit(env, "request_changes", [])
    discovery = result.grader_breakdown["risk_signal_discovery"]
    assert discovery == 0.0


# ── Efficiency component ─────────────────────────────────────────────


def test_efficiency_sweet_spot(env):
    """Four steps out of twelve should earn the full efficiency score."""
    env.reset(task_id="easy_001")
    # Three evidence steps plus the submit: 4/12 = 0.333 of the budget.
    _gather_all_required_evidence(env)
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", codes)
    efficiency = result.grader_breakdown["efficiency"]
    assert efficiency == 1.0


def test_efficiency_too_fast(env):
    """A single-step episode (1/12 of the budget) is penalised as hasty."""
    env.reset(task_id="easy_001")
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    efficiency = result.grader_breakdown["efficiency"]
    # Expected: usage = 1/12 = 0.083, so efficiency = 0.083/0.3 = 0.278.
    assert efficiency < 0.5, f"Expected < 0.5, got {efficiency}"


def test_efficiency_too_slow(env):
    """Spending 11 of 12 steps (91.7%) should drag efficiency below 0.5."""
    env.reset(task_id="easy_001")
    diff_step = {"action_type": "inspect_change", "section": "diff"}
    # Ten steps before the submit: the three required evidence actions, then
    # seven repeat inspections of the diff that only burn budget.
    pre_submit_steps = [
        diff_step,
        {"action_type": "inspect_change", "section": "tests"},
        {"action_type": "check_policy"},
        *([diff_step] * 7),
    ]
    for step_kwargs in pre_submit_steps:
        env.step(ReleaseAction(**step_kwargs))
    # The submit below is step 11.
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", codes)
    efficiency = result.grader_breakdown["efficiency"]
    # Expected: usage = 11/12 = 0.917, eff = max(0, 1-(0.917-0.7)/0.3) ≈ 0.277.
    assert efficiency < 0.5, f"Expected < 0.5, got {efficiency}"


# ── Forbidden actions ────────────────────────────────────────────────


def test_forbidden_action_penalty_applied(env):
    """Taking a forbidden action deducts 0.3 from score."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    # "approve_with_missing_approval" is in easy_001's forbidden_actions.
    # We fake it by directly injecting into actions_taken.
    env._state.actions_taken.append("approve_with_missing_approval")

    obs = _submit(
        env,
        "request_changes",
        [
            "HOT_PATH_SYNC_IO",
            "MISSING_LOAD_TEST",
            "INTEGRATION_TEST_FAILURE",
        ],
    )
    assert obs.grader_breakdown["forbidden_penalty"] == 0.3
    # Without penalty this would be ~1.00, with penalty = 0.70. The expected
    # value sits exactly on the bound, so allow a tiny tolerance: summing the
    # weighted components in floating point can land a hair above 0.70.
    assert (
        obs.final_score <= 0.70 + 1e-9
    ), f"Got {obs.final_score}, breakdown: {obs.grader_breakdown}"


def test_no_forbidden_penalty_by_default(env):
    """An ordinary episode without forbidden actions carries a 0.0 penalty."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", codes)
    penalty = result.grader_breakdown["forbidden_penalty"]
    assert penalty == 0.0


# ── Score clamping ───────────────────────────────────────────────────


def test_score_never_exceeds_one(env):
    """The final score of a full-marks episode stays inside [0.0, 1.0]."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", codes)
    score = result.final_score
    assert score >= 0.0 and score <= 1.0


def test_score_never_below_zero(env):
    """A wrong decision plus a forbidden action must not go negative."""
    env.reset(task_id="easy_001")
    # Inject the forbidden action straight into the recorded action history.
    env._state.actions_taken.append("approve_with_missing_approval")
    # Wrong decision on top of the forbidden action.
    result = _submit(env, "escalate", [])
    score = result.final_score
    assert score >= 0.0


# ── Determinism ──────────────────────────────────────────────────────


def test_grader_deterministic_across_runs(env):
    """Replaying one trajectory five times must always give the same score."""

    def play_once():
        """Run the full-evidence, optimal-decision episode; return its score."""
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        codes = [
            "HOT_PATH_SYNC_IO",
            "MISSING_LOAD_TEST",
            "INTEGRATION_TEST_FAILURE",
        ]
        return _submit(env, "request_changes", codes).final_score

    scores = [play_once() for _ in range(5)]
    distinct_scores = set(scores)
    assert len(distinct_scores) == 1, f"Scores vary: {scores}"


def test_grader_breakdown_deterministic(env):
    """Every breakdown component is identical across runs."""
    breakdowns = []
    for _ in range(3):
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        obs = _submit(
            env,
            "request_changes",
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
        # Snapshot a copy per run. Storing the reference would make the
        # comparison vacuous if the environment reuses or mutates one
        # breakdown mapping across episodes.
        breakdowns.append(dict(obs.grader_breakdown))
    assert (
        breakdowns[0] == breakdowns[1] == breakdowns[2]
    ), f"Breakdowns vary: {breakdowns}"


# ── Score ordering (difficulty spread) ───────────────────────────────


def test_optimal_beats_partial_beats_wrong(env):
    """With equal evidence, scores must rank optimal > acceptable > wrong."""
    codes = (
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    )
    # request_changes is the optimal call, block is merely acceptable and
    # approve is wrong; each is played in its own freshly reset episode.
    by_decision = {}
    for decision in ("request_changes", "block", "approve"):
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        by_decision[decision] = _submit(env, decision, list(codes))

    obs_opt = by_decision["request_changes"]
    obs_partial = by_decision["block"]
    obs_wrong = by_decision["approve"]

    assert (
        obs_opt.final_score > obs_partial.final_score > obs_wrong.final_score
    ), f"Expected optimal({obs_opt.final_score}) > partial({obs_partial.final_score}) > wrong({obs_wrong.final_score})"


def test_more_evidence_higher_score(env):
    """With the decision fixed, more required evidence must score higher."""

    def run_episode(prepare):
        """Reset, apply ``prepare`` to the env, then submit request_changes."""
        env.reset(task_id="easy_001")
        prepare(env)
        codes = [
            "HOT_PATH_SYNC_IO",
            "MISSING_LOAD_TEST",
            "INTEGRATION_TEST_FAILURE",
        ]
        return _submit(env, "request_changes", codes)

    def inspect_diff_only(target_env):
        """Take just one of the three required evidence steps."""
        target_env.step(
            ReleaseAction(action_type="inspect_change", section="diff")
        )

    # No evidence, then one required item, then all three.
    obs_0 = run_episode(lambda target_env: None)
    obs_1 = run_episode(inspect_diff_only)
    obs_3 = run_episode(_gather_all_required_evidence)

    assert (
        obs_3.final_score > obs_1.final_score > obs_0.final_score
    ), f"Expected 3ev({obs_3.final_score}) > 1ev({obs_1.final_score}) > 0ev({obs_0.final_score})"