Spaces:
Sleeping
Sleeping
File size: 2,178 Bytes
#!/usr/bin/env python3
"""Validator-parity checks for score bounds and output contract."""
from __future__ import annotations
import json
from pathlib import Path
from releaseops_env.models import ReleaseAction
from releaseops_env.scoring import is_strict_score
from server.releaseops_environment import ReleaseOpsEnvironment
# Task fixtures directory: the "tasks" folder next to the directory that
# contains this script (parents[1] of the resolved file path). Each
# sub-directory is treated as one task and is expected to hold a
# ground_truth.json file.
TASKS_DIR = Path(__file__).resolve().parents[1] / "tasks"
def run_reference_episode(task_id: str) -> float:
    """Replay a fixed reference episode for one task and return its score.

    The environment is reset for ``task_id``, a fixed list of
    evidence-gathering actions is stepped through (stopping early once an
    observation reports ``done``), and, if the episode is still open, a
    ``submit_decision`` action built from the task's ``ground_truth.json``
    is submitted.

    Args:
        task_id: Name of the task directory under ``TASKS_DIR``.

    Returns:
        The ``final_score`` carried by the last observation.

    Raises:
        SystemExit: If ``final_score`` is ``None`` or is rejected by
            ``is_strict_score``.
    """
    env = ReleaseOpsEnvironment()
    obs = env.reset(task_id=task_id)

    # JSON interchange text is UTF-8; pin the encoding so parsing does not
    # depend on the platform's locale default.
    with open(TASKS_DIR / task_id / "ground_truth.json", encoding="utf-8") as f:
        gt = json.load(f)

    evidence_actions = [
        ReleaseAction(action_type="inspect_change", section="diff"),
        ReleaseAction(action_type="inspect_change", section="tests"),
        ReleaseAction(action_type="inspect_change", section="approvals"),
        ReleaseAction(action_type="inspect_dependencies"),
        ReleaseAction(
            action_type="search_incidents",
            keywords=["retry", "timeout", "latency"],
        ),
        ReleaseAction(action_type="check_policy"),
    ]

    for action in evidence_actions:
        obs = env.step(action)
        if obs.done:
            # The environment ended the episode before all evidence was
            # gathered; no further steps are issued.
            break

    if not obs.done:
        # Fall back to "block" with no reason codes when the ground truth
        # file omits the corresponding keys.
        obs = env.step(
            ReleaseAction(
                action_type="submit_decision",
                final_decision=gt.get("optimal_decision", "block"),
                reason_codes=gt.get("required_reason_codes", []),
            )
        )

    score = obs.final_score
    if score is None or not is_strict_score(score):
        raise SystemExit(f"[FAIL] {task_id}: out-of-range final_score={score}")

    print(f"[OK] {task_id}: final_score={score:.3f}")
    return score
def main() -> None:
    """Run the reference episode for every task and validate the mean score.

    Task ids are the names of the sub-directories of ``TASKS_DIR``, processed
    in sorted order. Each per-task score and the overall average are printed.

    Raises:
        SystemExit: If no task directory exists, if the average score is
            rejected by ``is_strict_score``, or (propagated) if any single
            episode fails its own score check.
    """
    task_names = [entry.name for entry in TASKS_DIR.iterdir() if entry.is_dir()]
    if len(task_names) == 0:
        raise SystemExit("[FAIL] No tasks found")
    task_names.sort()

    # Episodes run one after another in sorted task order.
    per_task_scores = list(map(run_reference_episode, task_names))

    mean_score = sum(per_task_scores) / len(per_task_scores)
    if is_strict_score(mean_score):
        print(f"[OK] average_score={mean_score:.3f}")
        print("[OK] validator parity checks passed")
        return

    raise SystemExit(f"[FAIL] Average score out-of-range: {mean_score}")
# Run the parity checks only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|