""" ReleaseOps rubrics — composable grading components. Inspired by the REPL env's rubric pattern. Each rubric is an isolated, testable unit that grades one dimension of agent behavior. The composite ReleaseOpsRubric combines them into the final [0, 1] score. Grading dimensions: EvidenceRubric 0.35 — did the agent inspect the right information sources? RiskDiscoveryRubric 0.25 — did the agent trigger discovery of key risk signals? DecisionRubric 0.30 — was the final release decision correct? EfficiencyRubric 0.10 — did the agent avoid wasted / redundant steps? """ from __future__ import annotations from dataclasses import dataclass from typing import Protocol from releaseops_env.scoring import normalize_score # ── Data types ──────────────────────────────────────────────────────────────── @dataclass class EpisodeTrace: """Snapshot of relevant episode state passed to rubrics.""" evidence_gathered: list[str] # keys accumulated via inspect_* actions risk_signals_found: list[str] # signal_ids emitted by the environment final_decision: str # "approve" | "request_changes" | "block" | "escalate" step_count: int max_steps: int actions_taken: list[str] # action_type per step @dataclass class RubricResult: name: str score: float # [0.0, 1.0] weight: float details: dict # ── Rubric protocol ─────────────────────────────────────────────────────────── class Rubric(Protocol): name: str weight: float def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult: ... # ── Individual rubrics ──────────────────────────────────────────────────────── class EvidenceRubric: """ Measures information-gathering breadth. Reward = (required evidence keys gathered) / (total required). A thorough investigator inspects the diff, tests, approvals, policy, dependencies, and incidents before deciding. """ name = "evidence_coverage" weight = 0.35 def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult: required = set(ground_truth.get("required_evidence", [])) gathered = set(trace.evidence_gathered) if not required: value = 1.0 matched = set() else: matched = required & gathered value = len(matched) / len(required) missing = sorted(required - gathered) return RubricResult( name=self.name, score=round(value, 3), weight=self.weight, details={ "required": sorted(required), "gathered": sorted(gathered & required), "missing": missing, }, ) class RiskDiscoveryRubric: """ Objective measure of signal discovery. Checks state.risk_signals_found (signal_ids the *environment* emitted during the episode) against required_risk_signals in ground_truth. This is analogous to the Calendar env's SQL verifiers: the environment measures what the agent actually observed, not what strings the agent typed. An agent that skips inspect_tests will never trigger 'missing_load_test', even if it guesses the string correctly in reason_codes. """ name = "risk_signal_discovery" weight = 0.25 def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult: required = set(ground_truth.get("required_risk_signals", [])) discovered = set(trace.risk_signals_found) if not required: # No required signals (e.g. trivial approve) — full credit value = 1.0 matched = set() else: matched = required & discovered value = len(matched) / len(required) missing = sorted(required - discovered) return RubricResult( name=self.name, score=round(value, 3), weight=self.weight, details={ "required": sorted(required), "discovered": sorted(discovered & required), "missing": missing, "extra_discovered": sorted(discovered - required), }, ) class DecisionRubric: """ Scores the final release decision. optimal → 1.0 (exactly right) acceptable but not optimal → 0.5 (e.g. block when request_changes was best) wrong → 0.0 """ name = "decision_correctness" weight = 0.30 def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult: optimal = ground_truth.get("optimal_decision", "") acceptable = set(ground_truth.get("acceptable_decisions", [optimal])) decision = trace.final_decision if decision == optimal: value = 1.0 label = "optimal" elif decision in acceptable: value = 0.5 label = "acceptable" else: value = 0.0 label = "wrong" return RubricResult( name=self.name, score=round(value, 3), weight=self.weight, details={ "submitted": decision, "optimal": optimal, "acceptable": sorted(acceptable), "verdict": label, }, ) class EfficiencyRubric: """ Rewards investigators who complete the task without wasted steps. The efficiency band [0.3, 0.7] of max_steps scores 1.0. Outside that window the score degrades linearly. Agents that decide after only 1-2 steps are penalized (they skipped evidence) as are agents that thrash up to the step budget. """ name = "efficiency" weight = 0.10 def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult: usage = trace.step_count / max(trace.max_steps, 1) if 0.3 <= usage <= 0.7: value = 1.0 elif usage < 0.3: value = usage / 0.3 else: value = max(0.0, 1.0 - (usage - 0.7) / 0.3) return RubricResult( name=self.name, score=round(value, 3), weight=self.weight, details={ "steps_taken": trace.step_count, "max_steps": trace.max_steps, "usage_fraction": round(usage, 3), }, ) # ── Composite rubric ────────────────────────────────────────────────────────── class ReleaseOpsRubric: """ Composite rubric combining all four dimensions. Usage: rubric = ReleaseOpsRubric() result = rubric.score(trace, ground_truth) print(result["score"], result["breakdown"]) """ def __init__(self): self._rubrics: list[Rubric] = [ EvidenceRubric(), RiskDiscoveryRubric(), DecisionRubric(), EfficiencyRubric(), ] def score(self, trace: EpisodeTrace, ground_truth: dict) -> dict: results = [r.score(trace, ground_truth) for r in self._rubrics] # Forbidden action penalty forbidden = ground_truth.get("forbidden_actions", []) took_forbidden = any(fa in trace.actions_taken for fa in forbidden) forbidden_penalty = 0.3 if took_forbidden else 0.0 raw = sum(r.score * r.weight for r in results) final_score = normalize_score(raw - forbidden_penalty) return { "score": round(final_score, 3), "breakdown": {r.name: round(r.score, 3) for r in results} | {"forbidden_penalty": round(forbidden_penalty, 3)}, "details": {r.name: r.details for r in results}, }