Spaces:

eastbrick
/

releaseops-env

Sleeping

App Files Files Community

releaseops-env / tests /test_grader.py

eastbrick

first commit

408d02c about 2 months ago

raw

history blame contribute delete

16 kB

	"""Test the deterministic grading system.

	Verifies that the grader produces correct, reproducible scores across
	different trajectories and decision qualities. All tests use easy_001.

	Grading formula:
	score = 0.35 * evidence_coverage
	+ 0.25 * risk_signal_discovery
	+ 0.30 * decision_correctness
	+ 0.10 * efficiency
	- forbidden_penalty (0.3 if any forbidden action taken)
	clamped to [0.0, 1.0]

	risk_signal_discovery measures which required_risk_signals were actually
	triggered (emitted by the environment) during the episode — objective,
	not dependent on what strings the agent typed in reason_codes.
	"""

	import pytest
	from server.releaseops_environment import ReleaseOpsEnvironment
	from releaseops_env.models import ReleaseAction


	# ── Fixtures ─────────────────────────────────────────────────────────


	@pytest.fixture
	def env():
	    """Provide a newly constructed ReleaseOpsEnvironment to the requesting test."""
	    environment = ReleaseOpsEnvironment()
	    return environment


	def _gather_all_required_evidence(env):
	    """Run the three evidence-gathering steps that easy_001 requires.

	    The steps are issued in a fixed order: inspect the change diff, inspect
	    the change tests, then check policy. Nothing is returned.
	    """
	    required_steps = (
	        {"action_type": "inspect_change", "section": "diff"},
	        {"action_type": "inspect_change", "section": "tests"},
	        {"action_type": "check_policy"},
	    )
	    for step_kwargs in required_steps:
	        env.step(ReleaseAction(**step_kwargs))


	def _submit(env, decision, reason_codes):
	    """Step the environment with a ``submit_decision`` action.

	    ``decision`` is passed as ``final_decision`` and ``reason_codes`` is
	    forwarded unchanged. Returns whatever ``env.step`` returns.
	    """
	    final_action = ReleaseAction(
	        action_type="submit_decision",
	        final_decision=decision,
	        reason_codes=reason_codes,
	    )
	    return env.step(final_action)


	# ── Score range tests ────────────────────────────────────────────────


	def test_perfect_play_scores_above_085(env):
	    """Full required evidence plus 'request_changes' must score at least 0.85."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    reason_codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    result = _submit(env, "request_changes", reason_codes)
	    # Include the per-component breakdown so a failure is easy to diagnose.
	    assert (
	        result.final_score >= 0.85
	    ), f"Got {result.final_score}, breakdown: {result.grader_breakdown}"


	def test_wrong_decision_no_evidence_scores_below_020(env):
	    """Submitting 'approve' immediately, with no evidence, must score < 0.20."""
	    env.reset(task_id="easy_001")
	    result = _submit(env, "approve", [])
	    # Expected breakdown, using the grading formula from the module docstring:
	    #   evidence_coverage = 0, risk_signal_discovery = 0, decision_correctness = 0
	    #   efficiency: 1 step / 12 = 0.083 → 0.083 / 0.3 = 0.278
	    #   score = 0.35*0 + 0.25*0 + 0.30*0 + 0.10*0.278 ≈ 0.028
	    low_score = result.final_score
	    assert low_score < 0.20, f"Got {result.final_score}"


	def test_acceptable_decision_gets_partial_credit(env):
	    """'block' is acceptable but not optimal for easy_001 → decision_score = 0.5.

	    Assuming the evidence, risk-signal and efficiency components are all 1.0
	    for this trajectory, the grading formula gives
	    0.35 + 0.25 + 0.30 * 0.5 + 0.10 = 0.85, which is exactly the upper bound
	    checked below. The bound is therefore compared with a small tolerance so
	    floating-point rounding in the grader cannot cause a spurious failure.
	    """
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    obs = _submit(
	        env,
	        "block",
	        [
	            "HOT_PATH_SYNC_IO",
	            "MISSING_LOAD_TEST",
	            "INTEGRATION_TEST_FAILURE",
	        ],
	    )
	    bd = obs.grader_breakdown
	    assert bd["decision_correctness"] == 0.5
	    # Only the decision component is halved, so the total should stay decent.
	    tolerance = 1e-9
	    assert 0.55 <= obs.final_score <= 0.85 + tolerance, f"Got {obs.final_score}"


	def test_completely_wrong_decision_zero_decision_score(env):
	    """An 'escalate' decision earns no decision_correctness credit on easy_001."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    breakdown = _submit(env, "escalate", codes).grader_breakdown
	    assert breakdown["decision_correctness"] == 0.0


	# ── Evidence coverage component ──────────────────────────────────────


	def test_full_evidence_coverage(env):
	    """All three required evidence steps yield evidence_coverage == 1.0."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    breakdown = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"]).grader_breakdown
	    assert breakdown["evidence_coverage"] == 1.0


	def test_partial_evidence_coverage(env):
	    """One of the three required evidence steps yields coverage of about 1/3."""
	    env.reset(task_id="easy_001")
	    diff_only = ReleaseAction(action_type="inspect_change", section="diff")
	    env.step(diff_only)
	    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
	    coverage = result.grader_breakdown["evidence_coverage"]
	    one_third = 1.0 / 3.0
	    assert abs(coverage - one_third) < 0.01, f"Expected ~0.333, got {coverage}"


	def test_zero_evidence_coverage(env):
	    """Submitting without any evidence step yields evidence_coverage == 0.0."""
	    env.reset(task_id="easy_001")
	    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
	    coverage = result.grader_breakdown["evidence_coverage"]
	    assert coverage == 0.0


	def test_extra_evidence_doesnt_hurt(env):
	    """Steps beyond the required evidence leave evidence_coverage at 1.0."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    # Two additional investigation steps on top of the three required ones.
	    extra_steps = (
	        {"action_type": "inspect_change", "section": "approvals"},
	        {"action_type": "inspect_dependencies"},
	    )
	    for step_kwargs in extra_steps:
	        env.step(ReleaseAction(**step_kwargs))
	    codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    result = _submit(env, "request_changes", codes)
	    assert result.grader_breakdown["evidence_coverage"] == 1.0


	# ── Risk signal discovery component ─────────────────────────────────
	# required_risk_signals for easy_001: hot_path_sync_io, integration_test_failure,
	# missing_load_test, policy_peak_traffic
	# hot_path_sync_io + missing_load_test → inspect_change(diff) + inspect_change(tests)
	# integration_test_failure → inspect_change(tests)
	# policy_peak_traffic → check_policy


	def test_full_signal_discovery_after_complete_investigation(env):
	    """After diff + tests + check_policy, risk_signal_discovery must be 1.0."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    # No reason codes are submitted; the component is asserted purely on
	    # what was investigated before the decision.
	    breakdown = _submit(env, "request_changes", []).grader_breakdown
	    assert breakdown["risk_signal_discovery"] == 1.0


	def test_partial_signal_discovery_diff_only(env):
	    """Inspecting only the diff should discover 1 of 4 required signals (0.25)."""
	    env.reset(task_id="easy_001")
	    diff_step = ReleaseAction(action_type="inspect_change", section="diff")
	    env.step(diff_step)
	    result = _submit(env, "request_changes", [])
	    discovered = result.grader_breakdown["risk_signal_discovery"]
	    # Expectation: hot_path_sync_io is the only signal triggered by the diff;
	    # missing_load_test, integration_test_failure and policy_peak_traffic
	    # remain undiscovered → 1/4 = 0.25.
	    assert abs(discovered - 0.25) < 0.01, f"Expected ~0.25, got {discovered}"


	def test_reason_codes_do_not_affect_signal_discovery(env):
	    """risk_signal_discovery must not depend on the submitted reason_codes."""
	    observations = []
	    # Same investigation both times; only the reason codes differ.
	    for codes in (["hot_path_sync_io"], ["BOGUS_1", "BOGUS_2", "BOGUS_3"]):
	        env.reset(task_id="easy_001")
	        _gather_all_required_evidence(env)
	        observations.append(_submit(env, "request_changes", codes))

	    with_real_code, with_bogus_codes = observations
	    real_rsd = with_real_code.grader_breakdown["risk_signal_discovery"]
	    bogus_rsd = with_bogus_codes.grader_breakdown["risk_signal_discovery"]
	    # The component is expected to be identical in both episodes.
	    assert real_rsd == bogus_rsd


	def test_zero_signal_discovery_no_investigation(env):
	    """Submitting with no prior investigation yields risk_signal_discovery == 0.0."""
	    env.reset(task_id="easy_001")
	    result = _submit(env, "request_changes", [])
	    discovered = result.grader_breakdown["risk_signal_discovery"]
	    assert discovered == 0.0


	# ── Efficiency component ─────────────────────────────────────────────


	def test_efficiency_sweet_spot(env):
	    """Four steps out of 12 (3 evidence + 1 submit) should give efficiency 1.0."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    # The submit below is the 4th step: 4/12 = 0.333, the low edge of the
	    # range this test expects to earn full efficiency.
	    codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    breakdown = _submit(env, "request_changes", codes).grader_breakdown
	    assert breakdown["efficiency"] == 1.0


	def test_efficiency_too_fast(env):
	    """A single-step episode (1 of 12 steps) must score efficiency below 0.5."""
	    env.reset(task_id="easy_001")
	    result = _submit(env, "request_changes", ["HOT_PATH_SYNC_IO"])
	    efficiency = result.grader_breakdown["efficiency"]
	    # Expected: usage = 1/12 = 0.083 → efficiency = 0.083 / 0.3 = 0.278
	    assert efficiency < 0.5, f"Expected < 0.5, got {efficiency}"


	def test_efficiency_too_slow(env):
	    """An 11-step episode (11 of 12 steps) must score efficiency below 0.5."""
	    env.reset(task_id="easy_001")
	    # Steps 1-3 are the required evidence; steps 4-10 re-inspect the diff
	    # purely to consume step budget.
	    step_plan = [
	        {"action_type": "inspect_change", "section": "diff"},
	        {"action_type": "inspect_change", "section": "tests"},
	        {"action_type": "check_policy"},
	    ]
	    step_plan += [{"action_type": "inspect_change", "section": "diff"}] * 7
	    for step_kwargs in step_plan:
	        env.step(ReleaseAction(**step_kwargs))
	    # Ten steps taken so far; the submit below is step 11.
	    codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    result = _submit(env, "request_changes", codes)
	    efficiency = result.grader_breakdown["efficiency"]
	    # Expected: usage = 11/12 = 0.917 → max(0, 1 - (0.917 - 0.7) / 0.3)
	    #           = 1 - 0.722 = 0.278
	    assert efficiency < 0.5, f"Expected < 0.5, got {efficiency}"


	# ── Forbidden actions ────────────────────────────────────────────────


	def test_forbidden_action_penalty_applied(env):
	    """Taking a forbidden action deducts 0.3 from score.

	    The otherwise-perfect trajectory is expected to score about 1.00, so the
	    penalised score lands right on the 0.70 bound. Both float comparisons
	    use a small tolerance so rounding in the grader's arithmetic cannot
	    cause a spurious failure.
	    """
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)

	    # "approve_with_missing_approval" is in easy_001's forbidden_actions.
	    # We fake it by directly injecting into actions_taken.
	    env._state.actions_taken.append("approve_with_missing_approval")

	    obs = _submit(
	        env,
	        "request_changes",
	        [
	            "HOT_PATH_SYNC_IO",
	            "MISSING_LOAD_TEST",
	            "INTEGRATION_TEST_FAILURE",
	        ],
	    )
	    tolerance = 1e-9
	    penalty = obs.grader_breakdown["forbidden_penalty"]
	    assert abs(penalty - 0.3) < tolerance, f"Expected penalty 0.3, got {penalty}"
	    # Without penalty this would be ~1.00, with penalty = 0.70
	    assert obs.final_score <= 0.70 + tolerance, f"Got {obs.final_score}"


	def test_no_forbidden_penalty_by_default(env):
	    """A trajectory with no injected forbidden action has forbidden_penalty 0.0."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    breakdown = _submit(env, "request_changes", codes).grader_breakdown
	    assert breakdown["forbidden_penalty"] == 0.0


	# ── Score clamping ───────────────────────────────────────────────────


	def test_score_never_exceeds_one(env):
	    """The final score of a full-evidence, optimal-decision run is within [0.0, 1.0]."""
	    env.reset(task_id="easy_001")
	    _gather_all_required_evidence(env)
	    codes = [
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    ]
	    result = _submit(env, "request_changes", codes)
	    score = result.final_score
	    assert score >= 0.0 and score <= 1.0


	def test_score_never_below_zero(env):
	    """A forbidden action plus a wrong decision must not push the score below 0.0."""
	    env.reset(task_id="easy_001")
	    # Inject the forbidden action directly into the recorded trajectory.
	    taken = env._state.actions_taken
	    taken.append("approve_with_missing_approval")
	    result = _submit(env, "escalate", [])
	    assert result.final_score >= 0.0


	# ── Determinism ──────────────────────────────────────────────────────


	def test_grader_deterministic_across_runs(env):
	    """Replaying the same trajectory five times must give one distinct score."""

	    def play_once():
	        # One full episode: reset, gather required evidence, submit.
	        env.reset(task_id="easy_001")
	        _gather_all_required_evidence(env)
	        result = _submit(
	            env,
	            "request_changes",
	            [
	                "HOT_PATH_SYNC_IO",
	                "MISSING_LOAD_TEST",
	                "INTEGRATION_TEST_FAILURE",
	            ],
	        )
	        return result.final_score

	    scores = [play_once() for _ in range(5)]
	    distinct = set(scores)
	    assert len(distinct) == 1, f"Scores vary: {scores}"


	def test_grader_breakdown_deterministic(env):
	    """Every breakdown component is identical across runs.

	    Each run's breakdown is copied before the next episode starts. Without
	    the copy, an environment that reuses one breakdown dict across episodes
	    would make every stored entry alias the same object and the comparison
	    would pass vacuously.
	    """
	    breakdowns = []
	    for _ in range(3):
	        env.reset(task_id="easy_001")
	        _gather_all_required_evidence(env)
	        obs = _submit(
	            env,
	            "request_changes",
	            [
	                "HOT_PATH_SYNC_IO",
	                "MISSING_LOAD_TEST",
	                "INTEGRATION_TEST_FAILURE",
	            ],
	        )
	        # Snapshot now; later episodes must not be able to alter this entry.
	        breakdowns.append(dict(obs.grader_breakdown))
	    first = breakdowns[0]
	    assert all(
	        later == first for later in breakdowns[1:]
	    ), f"Breakdowns vary: {breakdowns}"


	# ── Score ordering (difficulty spread) ───────────────────────────────


	def test_optimal_beats_partial_beats_wrong(env):
	    """Scores must strictly order as request_changes > block > approve.

	    All three episodes gather the same evidence and submit the same reason
	    codes; only the final decision differs.
	    """
	    codes = (
	        "HOT_PATH_SYNC_IO",
	        "MISSING_LOAD_TEST",
	        "INTEGRATION_TEST_FAILURE",
	    )
	    by_decision = {}
	    # Order matters only for readability: optimal, acceptable, wrong.
	    for decision in ("request_changes", "block", "approve"):
	        env.reset(task_id="easy_001")
	        _gather_all_required_evidence(env)
	        # Hand each episode its own fresh list of reason codes.
	        by_decision[decision] = _submit(env, decision, list(codes))

	    best = by_decision["request_changes"]
	    middle = by_decision["block"]
	    worst = by_decision["approve"]

	    assert (
	        best.final_score > middle.final_score > worst.final_score
	    ), f"Expected optimal({best.final_score}) > partial({middle.final_score}) > wrong({worst.final_score})"


	def test_more_evidence_higher_score(env):
	    """With the decision fixed, scores must strictly rise with evidence gathered.

	    Three episodes are compared: no evidence, the diff only, and all three
	    required evidence steps.
	    """

	    def run_episode(prepare):
	        # Reset, perform the given investigation, then submit the same decision.
	        env.reset(task_id="easy_001")
	        prepare(env)
	        return _submit(
	            env,
	            "request_changes",
	            [
	                "HOT_PATH_SYNC_IO",
	                "MISSING_LOAD_TEST",
	                "INTEGRATION_TEST_FAILURE",
	            ],
	        )

	    def nothing(_env):
	        return None

	    def diff_only(target):
	        target.step(ReleaseAction(action_type="inspect_change", section="diff"))

	    none_gathered = run_episode(nothing)
	    one_gathered = run_episode(diff_only)
	    all_gathered = run_episode(_gather_all_required_evidence)

	    assert (
	        all_gathered.final_score > one_gathered.final_score > none_gathered.final_score
	    ), f"Expected 3ev({all_gathered.final_score}) > 1ev({one_gathered.final_score}) > 0ev({none_gathered.final_score})"