Spaces:
Sleeping
Sleeping
"""Test the deterministic grading system.

Verifies that the grader produces correct, reproducible scores across
different trajectories and decision qualities. All tests use easy_001.

Grading formula:
    score = 0.35 * evidence_coverage
          + 0.25 * risk_signal_discovery
          + 0.30 * decision_correctness
          + 0.10 * efficiency
          - forbidden_penalty (0.3 if any forbidden action taken)
    clamped to [0.0, 1.0]

risk_signal_discovery measures which required_risk_signals were actually
triggered (emitted by the environment) during the episode — objective,
not dependent on what strings the agent typed in reason_codes.
"""
import pytest

from releaseops_env.models import ReleaseAction
from server.releaseops_environment import ReleaseOpsEnvironment
# ── Fixtures ─────────────────────────────────────────────────────────
@pytest.fixture
def env():
    """Provide a freshly constructed ReleaseOpsEnvironment to a test.

    Registered as a pytest fixture so that every test function declaring an
    ``env`` parameter receives its own environment instance. Without the
    decorator pytest cannot resolve the ``env`` argument and the tests
    error out during setup.

    Returns:
        A new ``ReleaseOpsEnvironment``; each test calls ``reset`` on it
        itself before playing a trajectory.
    """
    return ReleaseOpsEnvironment()
def _gather_all_required_evidence(env):
    """Run the three evidence-gathering actions that easy_001 requires.

    The actions are issued in a fixed order: inspect the diff, inspect the
    tests, then check the policy. Observations returned by ``env.step`` are
    discarded.
    """
    required_actions = (
        {"action_type": "inspect_change", "section": "diff"},
        {"action_type": "inspect_change", "section": "tests"},
        {"action_type": "check_policy"},
    )
    for action_kwargs in required_actions:
        env.step(ReleaseAction(**action_kwargs))
def _submit(env, decision, reason_codes):
    """Send a ``submit_decision`` action and hand back the observation.

    Args:
        env: Environment to step.
        decision: Value passed as ``final_decision``.
        reason_codes: Value passed as ``reason_codes``.

    Returns:
        Whatever ``env.step`` returns for the submission.
    """
    submission = ReleaseAction(
        action_type="submit_decision",
        final_decision=decision,
        reason_codes=reason_codes,
    )
    return env.step(submission)
# ── Score range tests ────────────────────────────────────────────────
def test_perfect_play_scores_above_085(env):
    """All evidence + optimal decision + all reason codes -> score >= 0.85."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)

    # Include the breakdown in the failure message to show which component fell short.
    assert (
        result.final_score >= 0.85
    ), f"Got {result.final_score}, breakdown: {result.grader_breakdown}"
def test_wrong_decision_no_evidence_scores_below_020(env):
    """No evidence + wrong decision + no reason codes -> score below 0.20."""
    env.reset(task_id="easy_001")

    # Submitting immediately means nothing was investigated. Per the formula in
    # the module docstring, evidence_coverage, risk_signal_discovery and
    # decision_correctness should all be 0, leaving only the efficiency term.
    # Author's estimate: 1 step of 12 -> 0.083 / 0.3 = 0.278, so the score is
    # about 0.10 * 0.278 = 0.028.
    result = _submit(env, "approve", [])

    assert result.final_score < 0.20, f"Got {result.final_score}"
def test_acceptable_decision_gets_partial_credit(env):
    """'block' is acceptable but not optimal for easy_001 -> decision score 0.5."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "block", reason_codes)

    breakdown = result.grader_breakdown
    assert breakdown["decision_correctness"] == 0.5
    # The investigation was complete, so only the decision component is
    # reduced; the overall score should stay in a middling band.
    assert 0.55 <= result.final_score <= 0.85, f"Got {result.final_score}"
def test_completely_wrong_decision_zero_decision_score(env):
    """'escalate' is not an acceptable decision -> decision_correctness = 0."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "escalate", reason_codes)

    assert result.grader_breakdown["decision_correctness"] == 0.0
# ── Evidence coverage component ──────────────────────────────────────
def test_full_evidence_coverage(env):
    """Gathering all 3 required evidence items -> evidence_coverage = 1.0."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]

    assert coverage == 1.0
def test_partial_evidence_coverage(env):
    """Gathering 1 of 3 required items -> evidence_coverage is about 0.333."""
    env.reset(task_id="easy_001")
    diff_inspection = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(diff_inspection)

    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    ec = result.grader_breakdown["evidence_coverage"]

    one_third = 1.0 / 3.0
    assert abs(ec - one_third) < 0.01, f"Expected ~0.333, got {ec}"
def test_zero_evidence_coverage(env):
    """No evidence gathered -> evidence_coverage = 0.0."""
    env.reset(task_id="easy_001")

    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    coverage = result.grader_breakdown["evidence_coverage"]

    assert coverage == 0.0
def test_extra_evidence_doesnt_hurt(env):
    """Gathering more than the required evidence still yields 1.0 coverage."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    # Two additional investigation steps on top of the three required ones.
    approvals_inspection = ReleaseAction(
        action_type="inspect_change", section="approvals"
    )
    env.step(approvals_inspection)
    dependency_inspection = ReleaseAction(action_type="inspect_dependencies")
    env.step(dependency_inspection)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)

    assert result.grader_breakdown["evidence_coverage"] == 1.0
# ── Risk signal discovery component ──────────────────────────────────
# required_risk_signals for easy_001: hot_path_sync_io, integration_test_failure,
# missing_load_test, policy_peak_traffic
# hot_path_sync_io + missing_load_test ← inspect_change(diff) + inspect_change(tests)
# integration_test_failure ← inspect_change(tests)
# policy_peak_traffic ← check_policy
def test_full_signal_discovery_after_complete_investigation(env):
    """All required evidence gathered -> all 4 required risk signals -> 1.0."""
    env.reset(task_id="easy_001")
    # Plays diff + tests + check_policy.
    _gather_all_required_evidence(env)

    result = _submit(env, "request_changes", [])
    discovery = result.grader_breakdown["risk_signal_discovery"]

    assert discovery == 1.0
def test_partial_signal_discovery_diff_only(env):
    """Inspecting only the diff discovers hot_path_sync_io -> 1/4 = 0.25."""
    env.reset(task_id="easy_001")
    diff_inspection = ReleaseAction(action_type="inspect_change", section="diff")
    env.step(diff_inspection)

    result = _submit(env, "request_changes", [])
    rsd = result.grader_breakdown["risk_signal_discovery"]

    # Only one of the four required signals is expected to have fired;
    # missing_load_test, integration_test_failure and policy_peak_traffic
    # remain undiscovered.
    expected = 0.25
    assert abs(rsd - expected) < 0.01, f"Expected ~0.25, got {rsd}"
def test_reason_codes_do_not_affect_signal_discovery(env):
    """Wrong or extra reason_codes leave risk_signal_discovery unchanged."""
    # First episode: a plausible reason code.
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    with_real_code = _submit(env, "request_changes", ["hot_path_sync_io"])

    # Second episode: identical investigation, nonsense reason codes.
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)
    with_bogus_codes = _submit(
        env, "request_changes", ["BOGUS_1", "BOGUS_2", "BOGUS_3"]
    )

    # The component is driven by what the environment emitted, not by the
    # submitted strings, so both episodes must agree.
    key = "risk_signal_discovery"
    assert (
        with_real_code.grader_breakdown[key]
        == with_bogus_codes.grader_breakdown[key]
    )
def test_zero_signal_discovery_no_investigation(env):
    """Skipping all investigation -> no signals discovered -> 0.0."""
    env.reset(task_id="easy_001")

    result = _submit(env, "request_changes", [])
    discovery = result.grader_breakdown["risk_signal_discovery"]

    assert discovery == 0.0
# ── Efficiency component ─────────────────────────────────────────────
def test_efficiency_sweet_spot(env):
    """Using 4-8 of 12 steps (33-67%) -> efficiency = 1.0."""
    env.reset(task_id="easy_001")

    # Three evidence steps plus the submission make 4 steps: 4/12 = 0.333,
    # which sits right at the lower edge of the sweet spot.
    _gather_all_required_evidence(env)
    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)

    assert result.grader_breakdown["efficiency"] == 1.0
def test_efficiency_too_fast(env):
    """1 step out of 12 = 8.3% -> efficiency < 1.0 (too hasty)."""
    env.reset(task_id="easy_001")

    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
    eff = result.grader_breakdown["efficiency"]

    # Author's estimate: usage = 1/12 = 0.083, efficiency = 0.083/0.3 = 0.278.
    assert eff < 0.5, f"Expected < 0.5, got {eff}"
def test_efficiency_too_slow(env):
    """Using 11 of 12 steps = 91.7% -> efficiency tapers off."""
    env.reset(task_id="easy_001")

    # Burn 10 steps: the three required inspections (diff, tests, policy)
    # followed by seven redundant diff inspections.
    _gather_all_required_evidence(env)
    redundant_inspections = 7
    for _ in range(redundant_inspections):
        env.step(ReleaseAction(action_type="inspect_change", section="diff"))

    # Ten steps taken so far; the submission is step 11.
    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)
    eff = result.grader_breakdown["efficiency"]

    # Author's estimate: usage = 11/12 = 0.917,
    # eff = max(0, 1 - (0.917 - 0.7) / 0.3) = 0.277.
    assert eff < 0.5, f"Expected < 0.5, got {eff}"
# ── Forbidden actions ────────────────────────────────────────────────
def test_forbidden_action_penalty_applied(env):
    """Taking a forbidden action deducts 0.3 from the score."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    # "approve_with_missing_approval" is listed in easy_001's forbidden_actions.
    # The test simulates it by writing straight into the recorded action history.
    forbidden_action = "approve_with_missing_approval"
    env._state.actions_taken.append(forbidden_action)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)

    assert result.grader_breakdown["forbidden_penalty"] == 0.3
    # An otherwise near-perfect run (~1.00) drops to at most 0.70.
    assert result.final_score <= 0.70
def test_no_forbidden_penalty_by_default(env):
    """A normal trajectory carries no forbidden penalty."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)
    penalty = result.grader_breakdown["forbidden_penalty"]

    assert penalty == 0.0
# ── Score clamping ───────────────────────────────────────────────────
def test_score_never_exceeds_one(env):
    """The final score of a best-case run stays within [0.0, 1.0]."""
    env.reset(task_id="easy_001")
    _gather_all_required_evidence(env)

    reason_codes = [
        "HOT_PATH_SYNC_IO",
        "MISSING_LOAD_TEST",
        "INTEGRATION_TEST_FAILURE",
    ]
    result = _submit(env, "request_changes", reason_codes)

    assert 0.0 <= result.final_score <= 1.0
def test_score_never_below_zero(env):
    """Even with the forbidden penalty the score does not go negative."""
    env.reset(task_id="easy_001")

    # Worst case: a forbidden action on record plus a wrong decision.
    forbidden_action = "approve_with_missing_approval"
    env._state.actions_taken.append(forbidden_action)
    result = _submit(env, "escalate", [])

    assert result.final_score >= 0.0
# ── Determinism ──────────────────────────────────────────────────────
def test_grader_deterministic_across_runs(env):
    """An identical trajectory yields an identical score on every run."""

    def _play_once():
        # One full episode: reset, gather all evidence, submit the same answer.
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        result = _submit(
            env,
            "request_changes",
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
        return result.final_score

    scores = [_play_once() for _ in range(5)]

    assert len(set(scores)) == 1, f"Scores vary: {scores}"
def test_grader_breakdown_deterministic(env):
    """Every breakdown component is identical across repeated runs."""

    def _play_once():
        # One full episode: reset, gather all evidence, submit the same answer.
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        result = _submit(
            env,
            "request_changes",
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )
        return result.grader_breakdown

    first, second, third = [_play_once() for _ in range(3)]

    assert first == second == third
# ── Score ordering (difficulty spread) ───────────────────────────────
def test_optimal_beats_partial_beats_wrong(env):
    """Optimal decision > acceptable decision > wrong decision."""

    def _play(decision):
        # Same full investigation and reason codes each time; only the
        # decision varies. A fresh list is built for every submission.
        env.reset(task_id="easy_001")
        _gather_all_required_evidence(env)
        return _submit(
            env,
            decision,
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )

    best = _play("request_changes")  # optimal
    middle = _play("block")  # acceptable
    worst = _play("approve")  # wrong

    assert (
        best.final_score > middle.final_score > worst.final_score
    ), f"Expected optimal({best.final_score}) > partial({middle.final_score}) > wrong({worst.final_score})"
def test_more_evidence_higher_score(env):
    """Gathering more required evidence -> higher score (same decision)."""

    def _play(investigate):
        # Reset, run the given investigation, then submit the same decision
        # and reason codes. A fresh list is built for every submission.
        env.reset(task_id="easy_001")
        investigate()
        return _submit(
            env,
            "request_changes",
            [
                "HOT_PATH_SYNC_IO",
                "MISSING_LOAD_TEST",
                "INTEGRATION_TEST_FAILURE",
            ],
        )

    def _nothing():
        # Zero evidence: go straight to the submission.
        pass

    def _diff_only():
        # One of the three required evidence items.
        env.step(ReleaseAction(action_type="inspect_change", section="diff"))

    def _everything():
        # All three required evidence items.
        _gather_all_required_evidence(env)

    none_gathered = _play(_nothing)
    one_gathered = _play(_diff_only)
    all_gathered = _play(_everything)

    assert (
        all_gathered.final_score > one_gathered.final_score > none_gathered.final_score
    ), f"Expected 3ev({all_gathered.final_score}) > 1ev({one_gathered.final_score}) > 0ev({none_gathered.final_score})"