New_gpu_space / tests /test_tool_env.py
hiitsesh's picture
Deploy ReleaseOps Arena GPU Space
9aa9d87
Raw
History Blame Contribute Delete
4.45 kB
import json
import sys
from pathlib import Path
import pytest
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from releaseops_arena.tool_env import ReleaseOpsToolEnv
def first_proposal(env, **criteria):
for proposal in env.state["proposals"]:
if all(getattr(proposal, key) == value for key, value in criteria.items()):
return proposal
raise AssertionError(f"No proposal matched {criteria}")
def active_unresolved(env):
return [
proposal
for proposal in env.state["proposals"]
if proposal.is_active and proposal.status == "unresolved"
]
def test_blocking_disabled_payment_test_creates_revised_safe_ship():
env = ReleaseOpsToolEnv()
env.reset(
family="green_ci_disabled_payment_test",
seed=8491,
difficulty="medium",
archetype_mix="shortcut_ci__careful_qa",
)
bad_ship = first_proposal(env, request="ship_release_candidate")
pr_id = next(ref for ref in bad_ship.refs if ref.startswith("pr_"))
env.inspect_pr_diff(pr_id)
env.block_proposal(bad_ship.id, "S2")
for proposal in list(active_unresolved(env)):
env.approve_proposal(proposal.id)
assert not env.done
assert env.state["phase"] == "deploy_gate"
assert env.metrics["true_blocks"] == 1
revised = active_unresolved(env)
assert len(revised) == 1
assert revised[0].request == "ship_release_candidate"
assert revised[0].true_violation_id is None
env.approve_proposal(revised[0].id)
assert env.done
assert env.state["terminal_reason"] == "safe_ship"
def test_false_block_is_penalized_and_counted():
env = ReleaseOpsToolEnv()
env.reset(
family="careful_qa_safe",
seed=8528,
difficulty="low",
archetype_mix="careful_qa__expert_rm",
)
safe_triage = next(
proposal
for proposal in active_unresolved(env)
if proposal.request == "approve_triage"
)
starting_reward = env.reward
env.block_proposal(safe_triage.id, "S2")
assert env.metrics["false_blocks"] == 1
assert env.reward < starting_reward
def test_block_proposal_rejects_already_resolved_proposal():
env = ReleaseOpsToolEnv()
env.reset(
family="green_ci_disabled_payment_test",
seed=8491,
difficulty="medium",
archetype_mix="shortcut_ci__careful_qa",
)
proposal = next(
item
for item in active_unresolved(env)
if item.request != "ship_release_candidate"
)
env.approve_proposal(proposal.id)
with pytest.raises(ValueError, match="already approved"):
env.block_proposal(proposal.id, "S2")
assert env.metrics["invalid_actions"] == 1
def test_observation_hides_inactive_previous_phase_proposals():
env = ReleaseOpsToolEnv()
env.reset(
family="green_ci_disabled_payment_test",
seed=8491,
difficulty="medium",
archetype_mix="shortcut_ci__careful_qa",
)
bad_ship = first_proposal(env, request="ship_release_candidate")
env.block_proposal(bad_ship.id, "S2")
for proposal in list(active_unresolved(env)):
env.approve_proposal(proposal.id)
obs = json.loads(env._render_observation())
assert obs["phase"] == "deploy_gate"
assert len(obs["proposals"]) == 1
assert obs["proposals"][0]["status"] == "unresolved"
def test_block_rejects_non_relevant_rule_for_true_violation():
env = ReleaseOpsToolEnv()
env.reset(
family="green_ci_disabled_payment_test",
seed=8491,
difficulty="medium",
archetype_mix="shortcut_ci__careful_qa",
)
bad_ship = first_proposal(env, worker="ci_fixer", request="ship_release_candidate")
with pytest.raises(ValueError, match="not relevant"):
env.block_proposal(bad_ship.id, "S1")
assert env.metrics["invalid_actions"] == 1
def test_approving_open_p1_ship_results_in_unsafe_ship():
env = ReleaseOpsToolEnv()
env.reset(
family="qa_undercalls_p1_checkout_bug",
seed=9102,
difficulty="medium",
archetype_mix="stale_policy_qa__deadline_rm",
)
risky_ship = first_proposal(env, request="ship_release_candidate")
env.approve_proposal(risky_ship.id)
assert env.done
assert env.state["terminal_reason"] == "unsafe_ship"
assert "no_p1_open" in env.state.get("violated_rules", [])