# releaseops-env/tests/test_environment.py
# eastbrick's picture
# first commit
# 408d02c
"""Core contract tests for ReleaseOps-Env."""
import pytest
from server.releaseops_environment import ReleaseOpsEnvironment
from releaseops_env.models import ReleaseAction
# ── RESET ──────────────────────────────────────────────────────────────────
def test_reset_returns_observation(env):
    """Check the first observation that reset() returns for task easy_001."""
    initial = env.reset(task_id="easy_001")

    # The episode has only just begun and carries a non-empty summary.
    assert initial.done is False
    assert initial.change_summary != ""

    # Remaining fields must hold their expected starting values.
    expected_start = (
        ("rollout_phase", "precheck"),
        ("time_remaining", 12),
        ("cumulative_reward", 0.0),
        ("known_risk_signals", []),
    )
    for field_name, wanted in expected_start:
        assert getattr(initial, field_name) == wanted, field_name
def test_reset_sets_state(env):
    """After reset(), env.state reflects a brand-new easy_001 episode."""
    env.reset(task_id="easy_001")
    snapshot = env.state

    # Task identity and progress counters.
    assert snapshot.task_id == "easy_001"
    assert snapshot.step_count == 0
    # Phase and evidence log start from scratch.
    assert snapshot.rollout_phase == "precheck"
    assert snapshot.evidence_gathered == []
def test_reset_is_idempotent(env):
    """A second reset() wipes the progress made since the first one."""
    task = "easy_001"
    env.reset(task_id=task)

    # Take one step so there is something for the second reset to clear.
    look_at_diff = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(look_at_diff)

    env.reset(task_id=task)  # reset again
    assert env.state.step_count == 0
    assert env.state.evidence_gathered == []
# ── INSPECT CHANGE ────────────────────────────────────────────────────────
def test_inspect_change_diff_returns_content(easy_env):
    """Inspecting the diff yields a successful, non-empty tool result.

    ``easy_env`` is a fixture defined outside this file (presumably an
    environment already reset to easy_001 -- confirm in conftest).
    """
    inspect_diff = ReleaseAction(action_type="inspect_change", section="diff")
    after_step = easy_env.step(inspect_diff)

    assert after_step.last_tool_result is not None
    assert after_step.last_tool_result.success is True
    assert after_step.last_tool_result.content != ""
    # The diff counts as new required evidence, so the step is rewarded.
    assert after_step.reward > 0
def test_inspect_change_tests_discovers_risks(easy_env):
    """Inspecting the tests section surfaces at least one test-related risk."""
    after_step = easy_env.step(
        ReleaseAction(action_type="inspect_change", section="tests")
    )
    assert after_step.last_tool_result.success is True

    discovered = [signal.signal_id for signal in after_step.known_risk_signals]
    # easy_001 has a failed integration test and no load test; either signal
    # being reported is enough for this check.
    accepted = ("integration_test_failure", "missing_load_test")
    assert any(signal_id in discovered for signal_id in accepted)
def test_redundant_inspection_penalized(easy_env):
    """Inspecting the same section twice earns a negative reward the second time."""
    # Issue the identical inspection twice; only the second result is checked.
    for _ in range(2):
        latest = easy_env.step(
            ReleaseAction(action_type="inspect_change", section="diff")
        )
    assert latest.reward < 0  # penalty for redundancy
def test_inspect_approvals_discovers_pending(easy_env):
    """Inspecting approvals reports a risk signal whose id mentions 'approval'."""
    check_approvals = ReleaseAction(action_type="inspect_change", section="approvals")
    after_step = easy_env.step(check_approvals)

    signal_ids = [signal.signal_id for signal in after_step.known_risk_signals]
    approval_related = [sid for sid in signal_ids if "approval" in sid]
    assert len(approval_related) > 0
# ── CHECK POLICY ─────────────────────────────────────────────────────────
def test_check_policy_returns_rules(easy_env):
    """check_policy succeeds, returns rule text containing 'POL-', and is rewarded."""
    policy_action = ReleaseAction(action_type="check_policy")
    after_step = easy_env.step(policy_action)

    outcome = after_step.last_tool_result
    assert outcome.success is True
    assert "POL-" in outcome.content
    assert after_step.reward > 0
def test_check_policy_discovers_violations(easy_env):
    """check_policy surfaces at least one risk signal on the easy task.

    The previous version built a ``risk_ids`` list that was never used; it
    has been dropped so the test contains only what it actually verifies.
    """
    obs = easy_env.step(ReleaseAction(action_type="check_policy"))
    # easy_001 has peak traffic window + missing owner approval, so the
    # policy check is expected to report something.
    assert len(obs.known_risk_signals) > 0, "check_policy reported no risk signals"
# ── SUBMIT DECISION ───────────────────────────────────────────────────────
def test_submit_decision_ends_episode(easy_env):
    """Submitting a complete decision terminates the episode and yields a score."""
    verdict = ReleaseAction(
        action_type="submit_decision",
        final_decision="block",
        reason_codes=["MISSING_LOAD_TEST"],
    )
    final_obs = easy_env.step(verdict)

    assert final_obs.done is True
    # A score must be present and lie inside the unit interval.
    assert final_obs.final_score is not None
    assert 0.0 <= final_obs.final_score <= 1.0
    assert final_obs.grader_breakdown is not None
    assert final_obs.rollout_phase == "terminal"
def test_optimal_trajectory_high_score(env):
    """Agent that gathers required evidence + correct decision -> score >= 0.80.

    The trajectory inspects the diff and tests, checks policy, then submits
    ``request_changes`` with three reason codes. The documented threshold now
    matches the asserted one (0.80), and the failure message uses a proper
    ">=" sign instead of mis-encoded bytes.
    """
    env.reset(task_id="easy_001")

    # Evidence-gathering steps; their individual observations are not checked.
    env.step(ReleaseAction(action_type="inspect_change", section="diff"))
    env.step(ReleaseAction(action_type="inspect_change", section="tests"))
    env.step(ReleaseAction(action_type="check_policy"))

    obs = env.step(
        ReleaseAction(
            action_type="submit_decision",
            final_decision="request_changes",
            reason_codes=[
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
    )
    assert obs.final_score >= 0.80, (
        f"Expected ≥0.80, got {obs.final_score}. Breakdown: {obs.grader_breakdown}"
    )
def test_wrong_decision_low_score(env):
    """Approving immediately, with no evidence or reasons, scores below 0.45."""
    env.reset(task_id="easy_001")

    blind_approval = ReleaseAction(
        action_type="submit_decision",
        final_decision="approve",
        reason_codes=[],
    )
    graded = env.step(blind_approval)

    assert graded.final_score < 0.45, f"Expected <0.45, got {graded.final_score}"
def test_submit_without_final_decision_doesnt_end(easy_env):
    """A submit_decision lacking final_decision is penalized and does not end the episode."""
    incomplete_submit = ReleaseAction(action_type="submit_decision", reason_codes=[])
    after_step = easy_env.step(incomplete_submit)

    assert after_step.done is False
    assert after_step.reward < 0
def test_grader_is_deterministic(env):
    """Replaying one fixed trajectory three times must give one unique score."""
    reasons = ["HOT_PATH_SYNC_IO", "MISSING_LOAD_TEST", "INTEGRATION_TEST_FAILURE"]
    observed_scores = []

    for _ in range(3):
        env.reset(task_id="easy_001")

        # Same evidence-gathering prefix on every run.
        env.step(ReleaseAction(action_type="inspect_change", section="diff"))
        env.step(ReleaseAction(action_type="inspect_change", section="tests"))
        env.step(ReleaseAction(action_type="check_policy"))

        submission = ReleaseAction(
            action_type="submit_decision",
            final_decision="request_changes",
            reason_codes=list(reasons),
        )
        graded = env.step(submission)
        observed_scores.append(graded.final_score)

    distinct = set(observed_scores)
    assert len(distinct) == 1, f"Non-deterministic scores: {observed_scores}"
# ── PHASE TRANSITIONS ─────────────────────────────────────────────────────
def test_invalid_phase_transition_penalized(easy_env):
    """Promoting straight from precheck is penalized and leaves the phase alone."""
    # Can't promote from precheck
    premature_promote = ReleaseAction(action_type="control_rollout", decision="promote")
    after_step = easy_env.step(premature_promote)

    assert after_step.reward < 0
    assert after_step.rollout_phase == "precheck"  # phase unchanged
def test_start_canary_transitions_phase(easy_env):
    """start_canary moves the rollout into the canary phase without a penalty."""
    begin_canary = ReleaseAction(action_type="control_rollout", decision="start_canary")
    after_step = easy_env.step(begin_canary)

    assert after_step.rollout_phase == "canary"
    assert after_step.reward >= 0
def test_max_steps_terminates(env):
    """Taking twelve steps on easy_001 leaves the episode marked done."""
    env.reset(task_id="easy_001")

    steps_to_take = 12
    taken = 0
    while taken < steps_to_take:
        # The action itself is irrelevant here; only the step count matters.
        latest = env.step(ReleaseAction(action_type="inspect_change", section="diff"))
        taken += 1

    assert latest.done is True
# ── STATE ─────────────────────────────────────────────────────────────────
def test_state_tracks_evidence(easy_env):
    """Each inspected section is recorded in state.evidence_gathered."""
    for section_name in ("diff", "tests"):
        easy_env.step(ReleaseAction(action_type="inspect_change", section=section_name))

    snapshot = easy_env.state
    assert "inspected_diff" in snapshot.evidence_gathered
    assert "inspected_tests" in snapshot.evidence_gathered
def test_state_tracks_step_count(easy_env):
    """state.step_count equals the number of actions taken so far."""
    first_action = ReleaseAction(action_type="inspect_change", section="diff")
    easy_env.step(first_action)
    second_action = ReleaseAction(action_type="check_policy")
    easy_env.step(second_action)

    assert easy_env.state.step_count == 2
def test_allowed_actions_in_observation(easy_env):
    """After a non-terminal step the observation still offers inspect and submit."""
    after_step = easy_env.step(
        ReleaseAction(action_type="inspect_change", section="diff")
    )

    assert "inspect_change" in after_step.allowed_actions
    assert "submit_decision" in after_step.allowed_actions
def test_allowed_actions_empty_after_terminal(env):
    """Once a decision has been submitted, no further actions are offered."""
    env.reset(task_id="easy_001")

    closing_action = ReleaseAction(
        action_type="submit_decision",
        final_decision="block",
        reason_codes=["MISSING_LOAD_TEST"],
    )
    terminal_obs = env.step(closing_action)

    assert terminal_obs.allowed_actions == []