Spaces:
Sleeping
Sleeping
File size: 8,357 Bytes
408d02c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | """Core contract tests for ReleaseOps-Env."""
import pytest
from server.releaseops_environment import ReleaseOpsEnvironment
from releaseops_env.models import ReleaseAction
# ── RESET ────────────────────────────────────────────────────────────────────
def test_reset_returns_observation(env):
    """Reset on ``easy_001`` must hand back a fresh, non-terminal observation."""
    initial = env.reset(task_id="easy_001")

    # Episode has not ended and a change summary is present.
    assert initial.done is False
    assert initial.change_summary != ""

    # Starting phase, step budget and accumulators for this task.
    assert initial.rollout_phase == "precheck"
    assert initial.time_remaining == 12
    assert initial.cumulative_reward == 0.0
    assert initial.known_risk_signals == []
def test_reset_sets_state(env):
    """After reset, ``env.state`` describes ``easy_001`` at step zero with no evidence."""
    env.reset(task_id="easy_001")
    snapshot = env.state

    assert snapshot.task_id == "easy_001"
    assert snapshot.step_count == 0
    assert snapshot.rollout_phase == "precheck"
    assert snapshot.evidence_gathered == []
def test_reset_is_idempotent(env):
    """A second reset discards the step count and evidence from the first episode."""
    env.reset(task_id="easy_001")

    # Take one step so there is something for the second reset to clear.
    diff_inspection = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(diff_inspection)

    env.reset(task_id="easy_001")  # reset again

    assert env.state.step_count == 0
    assert env.state.evidence_gathered == []
# ── INSPECT CHANGE ───────────────────────────────────────────────────────────
def test_inspect_change_diff_returns_content(easy_env):
    """Inspecting the diff yields a successful, non-empty tool result and a positive reward."""
    diff_inspection = ReleaseAction(action_type="inspect_change", section="diff")
    step_obs = easy_env.step(diff_inspection)

    assert step_obs.last_tool_result is not None
    assert step_obs.last_tool_result.success is True
    assert step_obs.last_tool_result.content != ""
    # First look at the diff counts as new required evidence, so it is rewarded.
    assert step_obs.reward > 0
def test_inspect_change_tests_discovers_risks(easy_env):
    """Inspecting the tests section surfaces at least one of the expected risk signals."""
    tests_inspection = ReleaseAction(action_type="inspect_change", section="tests")
    step_obs = easy_env.step(tests_inspection)

    assert step_obs.last_tool_result.success is True

    discovered = [signal.signal_id for signal in step_obs.known_risk_signals]
    # easy_001 has a failed integration test and no load test
    assert "integration_test_failure" in discovered or "missing_load_test" in discovered
def test_redundant_inspection_penalized(easy_env):
    """Inspecting the same section a second time earns a negative reward."""
    first_look = ReleaseAction(action_type="inspect_change", section="diff")
    easy_env.step(first_look)

    second_look = ReleaseAction(action_type="inspect_change", section="diff")
    repeat_obs = easy_env.step(second_look)

    # The repeated inspection is redundant and must be penalised.
    assert repeat_obs.reward < 0
def test_inspect_approvals_discovers_pending(easy_env):
    """Inspecting approvals surfaces a risk signal whose id mentions ``approval``."""
    approvals_inspection = ReleaseAction(action_type="inspect_change", section="approvals")
    step_obs = easy_env.step(approvals_inspection)

    signal_ids = [signal.signal_id for signal in step_obs.known_risk_signals]
    assert any("approval" in signal_id for signal_id in signal_ids)
# ── CHECK POLICY ─────────────────────────────────────────────────────────────
def test_check_policy_returns_rules(easy_env):
    """``check_policy`` succeeds, returns text containing a ``POL-`` rule id, and is rewarded."""
    policy_check = ReleaseAction(action_type="check_policy")
    step_obs = easy_env.step(policy_check)

    assert step_obs.last_tool_result.success is True
    assert "POL-" in step_obs.last_tool_result.content
    assert step_obs.reward > 0
def test_check_policy_discovers_violations(easy_env):
    """``check_policy`` must surface at least one risk signal.

    Only the presence of signals is asserted; specific signal ids are not
    checked here.
    """
    obs = easy_env.step(ReleaseAction(action_type="check_policy"))
    # easy_001 has peak traffic window + missing owner approval
    assert len(obs.known_risk_signals) > 0
# ── SUBMIT DECISION ──────────────────────────────────────────────────────────
def test_submit_decision_ends_episode(easy_env):
    """Submitting a complete decision terminates the episode and produces a graded score."""
    block_decision = ReleaseAction(
        action_type="submit_decision",
        final_decision="block",
        reason_codes=["MISSING_LOAD_TEST"],
    )
    final_obs = easy_env.step(block_decision)

    assert final_obs.done is True
    # A score must be present and lie within the closed unit interval.
    assert final_obs.final_score is not None
    assert 0.0 <= final_obs.final_score <= 1.0
    assert final_obs.grader_breakdown is not None
    assert final_obs.rollout_phase == "terminal"
def test_optimal_trajectory_high_score(env):
    """Agent that gathers required evidence + correct decision → score ≥ 0.80.

    The trajectory inspects the diff and the tests, checks policy, and then
    submits ``request_changes`` with three reason codes. The 0.80 threshold
    named here is the one the final assertion enforces.
    """
    env.reset(task_id="easy_001")

    # Evidence-gathering steps before the decision.
    env.step(ReleaseAction(action_type="inspect_change", section="diff"))
    env.step(ReleaseAction(action_type="inspect_change", section="tests"))
    env.step(ReleaseAction(action_type="check_policy"))

    obs = env.step(
        ReleaseAction(
            action_type="submit_decision",
            final_decision="request_changes",
            reason_codes=["HOT_PATH_SYNC_IO", "MISSING_LOAD_TEST", "INTEGRATION_TEST_FAILURE"],
        )
    )
    assert obs.final_score >= 0.80, f"Expected ≥0.80, got {obs.final_score}. Breakdown: {obs.grader_breakdown}"
def test_wrong_decision_low_score(env):
    """Approving immediately, with no evidence and no reason codes, scores below 0.45."""
    env.reset(task_id="easy_001")

    blind_approval = ReleaseAction(
        action_type="submit_decision",
        final_decision="approve",
        reason_codes=[],
    )
    obs = env.step(blind_approval)

    assert obs.final_score < 0.45, f"Expected <0.45, got {obs.final_score}"
def test_submit_without_final_decision_doesnt_end(easy_env):
    """A ``submit_decision`` lacking ``final_decision`` is penalised and leaves the episode open."""
    incomplete_submission = ReleaseAction(action_type="submit_decision", reason_codes=[])
    step_obs = easy_env.step(incomplete_submission)

    assert step_obs.done is False
    assert step_obs.reward < 0
def test_grader_is_deterministic(env):
    """Replaying one fixed trajectory three times must always give the same final score."""

    def _play_episode():
        # One full episode: reset, gather evidence, submit, return the score.
        env.reset(task_id="easy_001")
        env.step(ReleaseAction(action_type="inspect_change", section="diff"))
        env.step(ReleaseAction(action_type="inspect_change", section="tests"))
        env.step(ReleaseAction(action_type="check_policy"))
        submission = ReleaseAction(
            action_type="submit_decision",
            final_decision="request_changes",
            reason_codes=["HOT_PATH_SYNC_IO", "MISSING_LOAD_TEST", "INTEGRATION_TEST_FAILURE"],
        )
        return env.step(submission).final_score

    scores = [_play_episode() for _ in range(3)]

    # All runs collapse to one distinct value when grading is deterministic.
    assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
# ── PHASE TRANSITIONS ────────────────────────────────────────────────────────
def test_invalid_phase_transition_penalized(easy_env):
    """Promoting straight from ``precheck`` is rejected: negative reward, phase unchanged."""
    # Can't promote from precheck
    premature_promote = ReleaseAction(action_type="control_rollout", decision="promote")
    step_obs = easy_env.step(premature_promote)

    assert step_obs.reward < 0
    assert step_obs.rollout_phase == "precheck"  # phase unchanged
def test_start_canary_transitions_phase(easy_env):
    """``start_canary`` moves the rollout into the ``canary`` phase without a penalty."""
    canary_start = ReleaseAction(action_type="control_rollout", decision="start_canary")
    step_obs = easy_env.step(canary_start)

    assert step_obs.rollout_phase == "canary"
    assert step_obs.reward >= 0
def test_max_steps_terminates(env):
    """After twelve consecutive steps the last observation must report the episode as done."""
    env.reset(task_id="easy_001")

    # NOTE(review): 12 presumably matches the step budget for easy_001
    # (time_remaining == 12 at reset) - confirm against the task definition.
    remaining = 12
    while remaining > 0:
        obs = env.step(ReleaseAction(action_type="inspect_change", section="diff"))
        remaining -= 1

    # Only the observation from the final step is checked.
    assert obs.done is True
# ── STATE ────────────────────────────────────────────────────────────────────
def test_state_tracks_evidence(easy_env):
    """Each inspected section is recorded in ``state.evidence_gathered``."""
    for section_name in ("diff", "tests"):
        easy_env.step(ReleaseAction(action_type="inspect_change", section=section_name))

    snapshot = easy_env.state
    assert "inspected_diff" in snapshot.evidence_gathered
    assert "inspected_tests" in snapshot.evidence_gathered
def test_state_tracks_step_count(easy_env):
    """``state.step_count`` equals the number of steps taken (two here)."""
    diff_inspection = ReleaseAction(action_type="inspect_change", section="diff")
    easy_env.step(diff_inspection)

    policy_check = ReleaseAction(action_type="check_policy")
    easy_env.step(policy_check)

    assert easy_env.state.step_count == 2
def test_allowed_actions_in_observation(easy_env):
    """A mid-episode observation still offers both inspection and decision submission."""
    diff_inspection = ReleaseAction(action_type="inspect_change", section="diff")
    step_obs = easy_env.step(diff_inspection)

    assert "inspect_change" in step_obs.allowed_actions
    assert "submit_decision" in step_obs.allowed_actions
def test_allowed_actions_empty_after_terminal(env):
    """Once a decision has been submitted, the observation offers no further actions."""
    env.reset(task_id="easy_001")

    block_decision = ReleaseAction(
        action_type="submit_decision",
        final_decision="block",
        reason_codes=["MISSING_LOAD_TEST"],
    )
    final_obs = env.step(block_decision)

    assert final_obs.allowed_actions == []
|