Spaces:
Sleeping
Sleeping
| """Core contract tests for ReleaseOps-Env.""" | |
| import pytest | |
| from server.releaseops_environment import ReleaseOpsEnvironment | |
| from releaseops_env.models import ReleaseAction | |
# ── RESET ──────────────────────────────────────────────────────────────────
def test_reset_returns_observation(env):
    """Resetting to ``easy_001`` yields a fresh, non-terminal observation."""
    initial = env.reset(task_id="easy_001")

    # The episode has only just begun: not done, in precheck, nothing accrued.
    assert initial.done is False
    assert initial.change_summary != ""
    assert initial.rollout_phase == "precheck"
    assert initial.time_remaining == 12
    assert initial.cumulative_reward == 0.0
    assert initial.known_risk_signals == []
def test_reset_sets_state(env):
    """After a reset, ``env.state`` describes an untouched ``easy_001`` episode."""
    env.reset(task_id="easy_001")

    snapshot = env.state
    assert snapshot.task_id == "easy_001"
    assert snapshot.step_count == 0
    assert snapshot.rollout_phase == "precheck"
    assert snapshot.evidence_gathered == []
def test_reset_is_idempotent(env):
    """A second reset wipes the step count and evidence from the first run."""
    task = "easy_001"
    env.reset(task_id=task)

    # Make some progress so there is something for the second reset to clear.
    env.step(ReleaseAction(action_type="inspect_change", section="diff"))
    env.reset(task_id=task)  # reset again

    assert env.state.step_count == 0
    assert env.state.evidence_gathered == []
# ── INSPECT CHANGE ─────────────────────────────────────────────────────────
def test_inspect_change_diff_returns_content(easy_env):
    """Inspecting the diff succeeds, returns content and is rewarded."""
    inspect_diff = ReleaseAction(action_type="inspect_change", section="diff")
    observation = easy_env.step(inspect_diff)

    result = observation.last_tool_result
    assert result is not None
    assert result.success is True
    assert result.content != ""
    assert observation.reward > 0  # new required evidence
def test_inspect_change_tests_discovers_risks(easy_env):
    """Inspecting the tests section surfaces a test-related risk signal."""
    inspect_tests = ReleaseAction(action_type="inspect_change", section="tests")
    observation = easy_env.step(inspect_tests)
    assert observation.last_tool_result.success is True

    discovered = [signal.signal_id for signal in observation.known_risk_signals]
    # easy_001 has a failed integration test and no load test
    candidates = ("integration_test_failure", "missing_load_test")
    assert any(signal_id in discovered for signal_id in candidates)
def test_redundant_inspection_penalized(easy_env):
    """Repeating an identical inspection earns a negative reward."""
    first_look = ReleaseAction(action_type="inspect_change", section="diff")
    easy_env.step(first_look)

    second_look = ReleaseAction(action_type="inspect_change", section="diff")
    repeated = easy_env.step(second_look)
    assert repeated.reward < 0  # penalty for redundancy
def test_inspect_approvals_discovers_pending(easy_env):
    """Inspecting approvals surfaces a risk signal whose id mentions 'approval'."""
    inspect_approvals = ReleaseAction(
        action_type="inspect_change",
        section="approvals",
    )
    observation = easy_env.step(inspect_approvals)

    approval_ids = [
        signal.signal_id
        for signal in observation.known_risk_signals
        if "approval" in signal.signal_id
    ]
    assert approval_ids
# ── CHECK POLICY ───────────────────────────────────────────────────────────
def test_check_policy_returns_rules(easy_env):
    """A policy check succeeds, mentions a ``POL-`` rule and is rewarded."""
    observation = easy_env.step(ReleaseAction(action_type="check_policy"))

    result = observation.last_tool_result
    assert result.success is True
    assert "POL-" in result.content
    assert observation.reward > 0
def test_check_policy_discovers_violations(easy_env):
    """A policy check on the easy task surfaces at least one risk signal.

    Only the presence of a signal is asserted; the specific signal ids are
    not pinned here.
    """
    obs = easy_env.step(ReleaseAction(action_type="check_policy"))
    # easy_001 has peak traffic window + missing owner approval
    assert obs.known_risk_signals, "check_policy surfaced no risk signals"
# ── SUBMIT DECISION ────────────────────────────────────────────────────────
def test_submit_decision_ends_episode(easy_env):
    """Submitting a final decision ends the episode with a graded result."""
    verdict = ReleaseAction(
        action_type="submit_decision",
        final_decision="block",
        reason_codes=["MISSING_LOAD_TEST"],
    )
    outcome = easy_env.step(verdict)

    assert outcome.done is True
    # The score must exist and lie in the closed unit interval.
    assert outcome.final_score is not None
    assert 0.0 <= outcome.final_score <= 1.0
    assert outcome.grader_breakdown is not None
    assert outcome.rollout_phase == "terminal"
def test_optimal_trajectory_high_score(env):
    """Agent that gathers required evidence + correct decision scores >= 0.80.

    The trajectory inspects the diff and the tests, checks policy, then
    submits ``request_changes`` with three reason codes.
    """
    env.reset(task_id="easy_001")
    env.step(ReleaseAction(action_type="inspect_change", section="diff"))
    env.step(ReleaseAction(action_type="inspect_change", section="tests"))
    env.step(ReleaseAction(action_type="check_policy"))
    obs = env.step(
        ReleaseAction(
            action_type="submit_decision",
            final_decision="request_changes",
            reason_codes=[
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
    )
    # Fail with a clear message instead of a TypeError if no score was produced.
    assert obs.final_score is not None, "submit_decision produced no final_score"
    assert obs.final_score >= 0.80, (
        f"Expected >=0.80, got {obs.final_score}. "
        f"Breakdown: {obs.grader_breakdown}"
    )
def test_wrong_decision_low_score(env):
    """Approving straight away, with no evidence, must score below 0.45."""
    env.reset(task_id="easy_001")

    blind_approval = ReleaseAction(
        action_type="submit_decision",
        final_decision="approve",
        reason_codes=[],
    )
    outcome = env.step(blind_approval)

    score = outcome.final_score
    assert score < 0.45, f"Expected <0.45, got {score}"
def test_submit_without_final_decision_doesnt_end(easy_env):
    """A submit with no ``final_decision`` is penalised and does not end the episode."""
    incomplete = ReleaseAction(action_type="submit_decision", reason_codes=[])
    outcome = easy_env.step(incomplete)

    assert outcome.done is False
    assert outcome.reward < 0
def test_grader_is_deterministic(env):
    """Replaying one fixed trajectory three times must give one distinct score."""

    def play_episode():
        # One full run: reset, gather evidence, check policy, submit.
        env.reset(task_id="easy_001")
        for section in ("diff", "tests"):
            env.step(ReleaseAction(action_type="inspect_change", section=section))
        env.step(ReleaseAction(action_type="check_policy"))
        final = env.step(
            ReleaseAction(
                action_type="submit_decision",
                final_decision="request_changes",
                reason_codes=[
                    "HOT_PATH_SYNC_IO",
                    "MISSING_LOAD_TEST",
                    "INTEGRATION_TEST_FAILURE",
                ],
            )
        )
        return final.final_score

    scores = [play_episode() for _ in range(3)]
    assert len(set(scores)) == 1, f"Non-deterministic scores: {scores}"
# ── PHASE TRANSITIONS ──────────────────────────────────────────────────────
def test_invalid_phase_transition_penalized(easy_env):
    """Promoting while still in precheck is penalised and leaves the phase alone."""
    # Can't promote from precheck
    premature_promote = ReleaseAction(
        action_type="control_rollout",
        decision="promote",
    )
    outcome = easy_env.step(premature_promote)

    assert outcome.reward < 0
    assert outcome.rollout_phase == "precheck"  # phase unchanged
def test_start_canary_transitions_phase(easy_env):
    """Starting a canary moves the rollout to ``canary`` without a penalty."""
    begin_canary = ReleaseAction(
        action_type="control_rollout",
        decision="start_canary",
    )
    outcome = easy_env.step(begin_canary)

    assert outcome.rollout_phase == "canary"
    assert outcome.reward >= 0
def test_max_steps_terminates(env):
    """After twelve consecutive steps the episode reports ``done``."""
    env.reset(task_id="easy_001")

    # NOTE(review): 12 presumably mirrors the episode's step budget — confirm.
    steps_left = 12
    while steps_left:
        latest = env.step(ReleaseAction(action_type="inspect_change", section="diff"))
        steps_left -= 1

    assert latest.done is True
# ── STATE ──────────────────────────────────────────────────────────────────
def test_state_tracks_evidence(easy_env):
    """Each inspected section is recorded in ``state.evidence_gathered``."""
    for section in ("diff", "tests"):
        easy_env.step(ReleaseAction(action_type="inspect_change", section=section))

    gathered = easy_env.state.evidence_gathered
    assert "inspected_diff" in gathered
    assert "inspected_tests" in gathered
def test_state_tracks_step_count(easy_env):
    """Two actions taken means ``state.step_count`` reads 2."""
    actions = (
        ReleaseAction(action_type="inspect_change", section="diff"),
        ReleaseAction(action_type="check_policy"),
    )
    for action in actions:
        easy_env.step(action)

    assert easy_env.state.step_count == 2
def test_allowed_actions_in_observation(easy_env):
    """Mid-episode observations still offer inspecting and submitting."""
    inspect_diff = ReleaseAction(action_type="inspect_change", section="diff")
    observation = easy_env.step(inspect_diff)

    offered = observation.allowed_actions
    for action_name in ("inspect_change", "submit_decision"):
        assert action_name in offered
def test_allowed_actions_empty_after_terminal(env):
    """Once a decision has been submitted, no further actions are offered."""
    env.reset(task_id="easy_001")

    verdict = ReleaseAction(
        action_type="submit_decision",
        final_decision="block",
        reason_codes=["MISSING_LOAD_TEST"],
    )
    outcome = env.step(verdict)

    assert outcome.allowed_actions == []