releaseops-env / tests /test_grader.py
eastbrick's picture
first commit
408d02c
"""Test the deterministic grading system.
Verifies that the grader produces correct, reproducible scores across
different trajectories and decision qualities. All tests use easy_001.
Grading formula:
    score = 0.35 * evidence_coverage
          + 0.25 * risk_signal_discovery
          + 0.30 * decision_correctness
          + 0.10 * efficiency
          - forbidden_penalty (0.3 if any forbidden action taken)
    clamped to [0.0, 1.0]
risk_signal_discovery measures which required_risk_signals were actually
triggered (emitted by the environment) during the episode — objective,
not dependent on what strings the agent typed in reason_codes.
"""
import pytest
from server.releaseops_environment import ReleaseOpsEnvironment
from releaseops_env.models import ReleaseAction
# ── Fixtures ─────────────────────────────────────────────────────────
@pytest.fixture
def env():
    """Provide a newly constructed ``ReleaseOpsEnvironment`` to the requesting test."""
    environment = ReleaseOpsEnvironment()
    return environment
def _gather_all_required_evidence(env):
    """Replay the three evidence-gathering steps that easy_001 requires.

    The steps run in a fixed order: inspect the change's "diff" section,
    inspect its "tests" section, then check policy.
    """
    required_steps = (
        {"action_type": "inspect_change", "section": "diff"},
        {"action_type": "inspect_change", "section": "tests"},
        {"action_type": "check_policy"},
    )
    for step_kwargs in required_steps:
        env.step(ReleaseAction(**step_kwargs))
def _submit(env, decision, reason_codes):
    """Send a ``submit_decision`` action and return whatever ``env.step`` yields.

    ``decision`` is forwarded as ``final_decision`` and ``reason_codes`` is
    forwarded unchanged.
    """
    final_action = ReleaseAction(
        action_type="submit_decision",
        final_decision=decision,
        reason_codes=reason_codes,
    )
    return env.step(final_action)
# ── Score range tests ────────────────────────────────────────────────
def test_perfect_play_scores_above_085(env):
    """All evidence + optimal decision + all reason codes → final score ≥ 0.85."""
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", all_codes)
    assert (
        result.final_score >= 0.85
    ), f"Got {result.final_score}, breakdown: {result.grader_breakdown}"
def test_wrong_decision_no_evidence_scores_below_020(env):
    """No evidence + wrong decision + no codes → very low score."""
    env.reset(task_id="easy_001")
    obs = _submit(env, "approve", [])
    # Expected components under the grading formula in the module docstring
    # (reason codes are not a scored component):
    #   evidence_coverage     = 0   (nothing inspected)
    #   risk_signal_discovery = 0   (no signals triggered)
    #   decision_correctness  = 0   ("approve" is the wrong call here)
    #   efficiency: 1 step / 12 = 0.083 → 0.083 / 0.3 = 0.278
    #   score = 0.35*0 + 0.25*0 + 0.30*0 + 0.10*0.278 ≈ 0.028
    assert obs.final_score < 0.20, f"Got {obs.final_score}"
def test_acceptable_decision_gets_partial_credit(env):
    """'block' is acceptable but not optimal for easy_001 → decision_score = 0.5."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    obs = _submit(
        env,
        "block",
        [
            "HOT_PATH_SYNC_IO",
            "MISSING_LOAD_TEST",
            "INTEGRATION_TEST_FAILURE",
        ],
    )
    bd = obs.grader_breakdown
    assert bd["decision_correctness"] == 0.5
    # With full evidence, full signal discovery and sweet-spot efficiency the
    # expected score is 0.35 + 0.25 + 0.30*0.5 + 0.10 = 0.85, i.e. exactly the
    # upper bound. A small tolerance keeps float rounding in the grader's
    # weighted sum from flipping the comparison.
    tolerance = 1e-9
    assert 0.55 <= obs.final_score <= 0.85 + tolerance, f"Got {obs.final_score}"
def test_completely_wrong_decision_zero_decision_score(env):
    """'escalate' is not in acceptable_decisions → decision_score = 0."""
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    result = _submit(env, "escalate", all_codes)
    breakdown = result.grader_breakdown
    assert breakdown["decision_correctness"] == 0.0
# ── Evidence coverage component ──────────────────────────────────────
def test_full_evidence_coverage(env):
    """Gathering all 3 required evidence items → evidence_coverage = 1.0."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]
    assert coverage == 1.0
def test_partial_evidence_coverage(env):
    """Gathering 1 of 3 required items → evidence_coverage ≈ 0.333."""
    env.reset(task_id="easy_001")
    inspect_diff = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(inspect_diff)
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    ec = result.grader_breakdown["evidence_coverage"]
    one_third = 1.0 / 3.0
    assert abs(ec - one_third) < 0.01, f"Expected ~0.333, got {ec}"
def test_zero_evidence_coverage(env):
    """No evidence gathered → evidence_coverage = 0.0."""
    env.reset(task_id="easy_001")
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]
    assert coverage == 0.0
def test_extra_evidence_doesnt_hurt(env):
    """Gathering more than the required evidence still gives 1.0 coverage."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    # Two further steps on top of the 3 required ones.
    extra_steps = (
        {"action_type": "inspect_change", "section": "approvals"},
        {"action_type": "inspect_dependencies"},
    )
    for step_kwargs in extra_steps:
        env.step(ReleaseAction(**step_kwargs))
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", all_codes)
    assert result.grader_breakdown["evidence_coverage"] == 1.0
# ── Risk signal discovery component ─────────────────────────────────
# required_risk_signals for easy_001: hot_path_sync_io, integration_test_failure,
# missing_load_test, policy_peak_traffic
# hot_path_sync_io + missing_load_test → inspect_change(diff) + inspect_change(tests)
# integration_test_failure → inspect_change(tests)
# policy_peak_traffic → check_policy
def test_full_signal_discovery_after_complete_investigation(env):
    """Gathering all required evidence triggers all 4 required risk signals → 1.0."""
    env.reset(task_id="easy_001")
    # diff + tests + check_policy
    _gather_all_required_evidence(env)
    no_codes = []
    result = _submit(env, "request_changes", no_codes)
    discovery = result.grader_breakdown["risk_signal_discovery"]
    assert discovery == 1.0
def test_partial_signal_discovery_diff_only(env):
    """Only inspecting diff discovers hot_path_sync_io → 1/4 = 0.25."""
    env.reset(task_id="easy_001")
    inspect_diff = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(inspect_diff)
    result = _submit(env, "request_changes", [])
    rsd = result.grader_breakdown["risk_signal_discovery"]
    # Of the four required signals only hot_path_sync_io has been triggered;
    # missing_load_test, integration_test_failure and policy_peak_traffic
    # remain undiscovered → 1/4 = 0.25.
    expected = 0.25
    assert abs(rsd - expected) < 0.01, f"Expected ~0.25, got {rsd}"
def test_reason_codes_do_not_affect_signal_discovery(env):
    """Submitting wrong/extra reason_codes has no effect on risk_signal_discovery."""
    submissions = (
        ["hot_path_sync_io"],
        ["BOGUS_1", "BOGUS_2", "BOGUS_3"],
    )
    observations = []
    for codes in submissions:
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        observations.append(_submit(env, "request_changes", codes))
    with_real_code, with_bogus_codes = observations
    # The component is computed on the environment side, not by matching the
    # submitted strings, so both episodes must report the same value.
    real_value = with_real_code.grader_breakdown["risk_signal_discovery"]
    bogus_value = with_bogus_codes.grader_breakdown["risk_signal_discovery"]
    assert real_value == bogus_value
def test_zero_signal_discovery_no_investigation(env):
    """Skipping all investigation → no signals discovered → 0.0."""
    env.reset(task_id="easy_001")
    result = _submit(env, "request_changes", [])
    discovery = result.grader_breakdown["risk_signal_discovery"]
    assert discovery == 0.0
# ── Efficiency component ─────────────────────────────────────────────
def test_efficiency_sweet_spot(env):
    """Using 4-8 of 12 steps (33-67%) → efficiency = 1.0."""
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    env.reset(task_id="easy_001")
    # 3 evidence steps + 1 submit = 4 steps; 4/12 = 0.333, just inside the sweet spot.
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", all_codes)
    efficiency = result.grader_breakdown["efficiency"]
    assert efficiency == 1.0
def test_efficiency_too_fast(env):
    """1 step out of 12 = 8.3% → efficiency < 1.0 (too hasty)."""
    env.reset(task_id="easy_001")
    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    breakdown = result.grader_breakdown
    eff = breakdown["efficiency"]
    # usage = 1/12 = 0.083, so efficiency = 0.083 / 0.3 = 0.278
    assert eff < 0.5, f"Expected < 0.5, got {eff}"
def test_efficiency_too_slow(env):
    """Using 11 of 12 steps = 91.7% → efficiency tapers."""
    env.reset(task_id="easy_001")
    # Burn 10 steps: the 3 required ones (diff, tests, check_policy) followed
    # by 7 redundant re-inspections of the diff.
    _gather_all_required_evidence(env)
    redundant_inspections = 7
    for _ in range(redundant_inspections):
        env.step(ReleaseAction(action_type="inspect_change", section="diff"))
    # 10 steps used so far; the submission below is step 11.
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", all_codes)
    eff = result.grader_breakdown["efficiency"]
    # usage = 11/12 = 0.917, eff = max(0, 1 - (0.917 - 0.7) / 0.3) = 1 - 0.722 = 0.278
    assert eff < 0.5, f"Expected < 0.5, got {eff}"
# ── Forbidden actions ────────────────────────────────────────────────
def test_forbidden_action_penalty_applied(env):
    """Taking a forbidden action deducts 0.3 from the score."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    # "approve_with_missing_approval" is in easy_001's forbidden_actions.
    # We fake it by directly injecting into actions_taken (private state).
    env._state.actions_taken.append("approve_with_missing_approval")
    obs = _submit(
        env,
        "request_changes",
        [
            "HOT_PATH_SYNC_IO",
            "MISSING_LOAD_TEST",
            "INTEGRATION_TEST_FAILURE",
        ],
    )
    # Compare floats with a tolerance instead of exact equality.
    assert obs.grader_breakdown["forbidden_penalty"] == pytest.approx(0.3)
    # Without the penalty this would be ~1.00; with it the expected score is
    # 0.70, i.e. exactly the bound, so allow for float rounding in the
    # grader's weighted sum.
    tolerance = 1e-9
    assert (
        obs.final_score <= 0.70 + tolerance
    ), f"Got {obs.final_score}, breakdown: {obs.grader_breakdown}"
def test_no_forbidden_penalty_by_default(env):
    """A normal trajectory carries no forbidden penalty."""
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", all_codes)
    penalty = result.grader_breakdown["forbidden_penalty"]
    assert penalty == 0.0
# ── Score clamping ───────────────────────────────────────────────────
def test_score_never_exceeds_one(env):
    """The final score stays within [0.0, 1.0]."""
    all_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    result = _submit(env, "request_changes", all_codes)
    assert 0.0 <= result.final_score <= 1.0
def test_score_never_below_zero(env):
    """Even with the forbidden penalty applied, the score does not drop below 0.0."""
    env.reset(task_id="easy_001")
    recorded_actions = env._state.actions_taken
    recorded_actions.append("approve_with_missing_approval")
    # Wrong decision combined with a forbidden action.
    result = _submit(env, "escalate", [])
    assert result.final_score >= 0.0
# ── Determinism ──────────────────────────────────────────────────────
def test_grader_deterministic_across_runs(env):
    """An identical trajectory produces an identical score every time."""

    def play_episode():
        # One full perfect-play episode; returns its final score.
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        outcome = _submit(
            env,
            "request_changes",
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
        return outcome.final_score

    scores = [play_episode() for _ in range(5)]
    distinct_scores = set(scores)
    assert len(distinct_scores) == 1, f"Scores vary: {scores}"
def test_grader_breakdown_deterministic(env):
    """Every breakdown component is identical across runs."""
    import copy

    breakdowns = []
    for _ in range(3):
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        obs = _submit(
            env,
            "request_changes",
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
        # Store a snapshot rather than a reference: if the environment were to
        # reuse or mutate one breakdown object across episodes, comparing
        # references would pass trivially and hide any nondeterminism.
        breakdowns.append(copy.deepcopy(obs.grader_breakdown))
    baseline = breakdowns[0]
    assert all(
        later == baseline for later in breakdowns[1:]
    ), f"Breakdowns vary: {breakdowns}"
# ── Score ordering (difficulty spread) ───────────────────────────────
def test_optimal_beats_partial_beats_wrong(env):
    """Optimal decision > acceptable decision > wrong decision."""
    all_codes = (
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    )
    # Same evidence and reason codes each time; only the decision changes.
    # Order: optimal (request_changes), acceptable (block), wrong (approve).
    observations = []
    for decision in ("request_changes", "block", "approve"):
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        observations.append(_submit(env, decision, list(all_codes)))
    best, middle, worst = observations
    assert (
        best.final_score > middle.final_score > worst.final_score
    ), f"Expected optimal({best.final_score}) > partial({middle.final_score}) > wrong({worst.final_score})"
def test_more_evidence_higher_score(env):
    """Gathering more required evidence → higher score (same decision)."""
    all_codes = (
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    )

    def gather_nothing():
        # 0 evidence: submit straight after reset.
        return None

    def gather_diff_only():
        # 1 evidence: only the diff section is inspected.
        env.step(ReleaseAction(action_type="inspect_change", section="diff"))

    def gather_everything():
        # 3 evidence: every required item.
        _gather_all_required_evidence(env)

    observations = []
    for prepare in (gather_nothing, gather_diff_only, gather_everything):
        env.reset(task_id="easy_001")
        prepare()
        observations.append(_submit(env, "request_changes", list(all_codes)))
    none_gathered, one_gathered, all_gathered = observations
    assert (
        all_gathered.final_score > one_gathered.final_score > none_gathered.final_score
    ), f"Expected 3ev({all_gathered.final_score}) > 1ev({one_gathered.final_score}) > 0ev({none_gathered.final_score})"