releaseops-env / server /rubrics.py
eastbrick's picture
Unify score normalization and add validator parity checks
140d024
"""
ReleaseOps rubrics — composable grading components.
Inspired by the REPL env's rubric pattern. Each rubric is an isolated,
testable unit that grades one dimension of agent behavior. The composite
ReleaseOpsRubric combines them into the final [0, 1] score.
Grading dimensions:
    EvidenceRubric       0.35 — did the agent inspect the right information sources?
    RiskDiscoveryRubric  0.25 — did the agent trigger discovery of key risk signals?
    DecisionRubric       0.30 — was the final release decision correct?
    EfficiencyRubric     0.10 — did the agent avoid wasted / redundant steps?
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Protocol
from releaseops_env.scoring import normalize_score
# ── Data types ────────────────────────────────────────────────────────────────
@dataclass
class EpisodeTrace:
    """
    Episode state handed to each rubric for grading.

    Attributes:
        evidence_gathered: Evidence keys accumulated via inspect_* actions.
        risk_signals_found: signal_ids emitted by the environment.
        final_decision: One of "approve", "request_changes", "block",
            or "escalate".
        step_count: Number of steps the agent took.
        max_steps: Step budget for the episode.
        actions_taken: The action_type recorded for each step.
    """

    evidence_gathered: list[str]
    risk_signals_found: list[str]
    final_decision: str
    step_count: int
    max_steps: int
    actions_taken: list[str]
@dataclass
class RubricResult:
    """
    Outcome of grading a single dimension.

    Attributes:
        name: Identifier of the rubric that produced this result.
        score: Grade for the dimension, intended to lie in [0.0, 1.0].
        weight: Multiplier applied to ``score`` when results are combined.
        details: Free-form diagnostic data explaining the grade.
    """

    name: str
    score: float
    weight: float
    details: dict
# ── Rubric protocol ───────────────────────────────────────────────────────────
class Rubric(Protocol):
    """
    Structural interface shared by the grading components.

    A conforming object exposes a ``name``, a ``weight``, and a ``score``
    method that grades one episode trace against its ground truth.
    """

    name: str
    weight: float

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """Grade ``trace`` against ``ground_truth`` and return the result."""
# ── Individual rubrics ────────────────────────────────────────────────────────
class EvidenceRubric:
    """
    Grades the breadth of the agent's information gathering.

    The score is the share of ``required_evidence`` keys that also appear in
    the trace's gathered evidence. A thorough investigator inspects the diff,
    tests, approvals, policy, dependencies, and incidents before deciding.
    """

    name = "evidence_coverage"
    weight = 0.35

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """
        Return the fraction of required evidence keys the agent gathered.

        Args:
            trace: Episode snapshot; only ``evidence_gathered`` is read.
            ground_truth: Mapping whose optional ``required_evidence`` entry
                lists the keys that count. When it is absent or empty the
                rubric awards full credit.

        Returns:
            A RubricResult whose details list the required keys, the required
            keys that were gathered, and the required keys still missing.
        """
        needed = set(ground_truth.get("required_evidence", []))
        seen = set(trace.evidence_gathered)
        hits = needed & seen

        # Nothing required means nothing can be missed: full credit.
        coverage = len(hits) / len(needed) if needed else 1.0

        return RubricResult(
            name=self.name,
            score=round(coverage, 3),
            weight=self.weight,
            details={
                "required": sorted(needed),
                "gathered": sorted(hits),
                "missing": sorted(needed - seen),
            },
        )
class RiskDiscoveryRubric:
    """
    Objective measure of risk-signal discovery.

    Compares ``risk_signals_found`` on the trace (signal_ids the *environment*
    emitted during the episode) with ``required_risk_signals`` in the ground
    truth. This is analogous to the Calendar env's SQL verifiers: the grade
    reflects what the agent actually observed, not what strings it typed. An
    agent that skips inspect_tests will never trigger 'missing_load_test',
    even if it guesses the string correctly in reason_codes.
    """

    name = "risk_signal_discovery"
    weight = 0.25

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """
        Return the fraction of required risk signals that were discovered.

        Args:
            trace: Episode snapshot; only ``risk_signals_found`` is read.
            ground_truth: Mapping whose optional ``required_risk_signals``
                entry lists the signal_ids that count. When it is absent or
                empty (e.g. a trivial approve) the rubric awards full credit.

        Returns:
            A RubricResult whose details list the required signals, the
            required signals discovered, the required signals missed, and any
            discovered signals that were not required.
        """
        wanted = set(ground_truth.get("required_risk_signals", []))
        found = set(trace.risk_signals_found)
        hits = wanted & found

        # No required signals — full credit.
        ratio = len(hits) / len(wanted) if wanted else 1.0

        return RubricResult(
            name=self.name,
            score=round(ratio, 3),
            weight=self.weight,
            details={
                "required": sorted(wanted),
                "discovered": sorted(hits),
                "missing": sorted(wanted - found),
                "extra_discovered": sorted(found - wanted),
            },
        )
class DecisionRubric:
    """
    Grades the final release decision.

        optimal                     → 1.0  (exactly right)
        acceptable but not optimal  → 0.5  (e.g. block when request_changes was best)
        wrong                       → 0.0
    """

    name = "decision_correctness"
    weight = 0.30

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """
        Classify the submitted decision as optimal, acceptable, or wrong.

        Args:
            trace: Episode snapshot; only ``final_decision`` is read.
            ground_truth: Mapping with ``optimal_decision`` (defaults to the
                empty string) and ``acceptable_decisions`` (defaults to a
                list holding just the optimal decision).

        Returns:
            A RubricResult whose details echo the submitted decision, the
            optimal one, the sorted acceptable set, and the verdict label.
        """
        best = ground_truth.get("optimal_decision", "")
        allowed = set(ground_truth.get("acceptable_decisions", [best]))
        submitted = trace.final_decision

        # The optimal check comes first so that an optimal decision earns
        # full marks even when it is also listed as acceptable.
        if submitted == best:
            points, verdict = 1.0, "optimal"
        elif submitted in allowed:
            points, verdict = 0.5, "acceptable"
        else:
            points, verdict = 0.0, "wrong"

        return RubricResult(
            name=self.name,
            score=round(points, 3),
            weight=self.weight,
            details={
                "submitted": submitted,
                "optimal": best,
                "acceptable": sorted(allowed),
                "verdict": verdict,
            },
        )
class EfficiencyRubric:
    """
    Rewards investigators who finish the task without wasted steps.

    Using between 30% and 70% of ``max_steps`` scores 1.0; outside that band
    the score falls off linearly. Agents that decide after only 1-2 steps are
    penalized (they skipped evidence), as are agents that thrash up to the
    step budget.
    """

    name = "efficiency"
    weight = 0.10

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """
        Grade how much of the step budget the agent consumed.

        Args:
            trace: Episode snapshot; ``step_count`` and ``max_steps`` are read.
            ground_truth: Unused by this rubric; accepted so the signature
                matches the other rubrics.

        Returns:
            A RubricResult whose details report the steps taken, the budget,
            and the rounded usage fraction.
        """
        # A budget below 1 is treated as 1 so the division is always defined.
        fraction = trace.step_count / max(trace.max_steps, 1)

        if fraction < 0.3:
            # Too few steps: ramp up from 0 at no usage to 1 at the band edge.
            grade = fraction / 0.3
        elif fraction <= 0.7:
            grade = 1.0
        else:
            # Too many steps: ramp down to 0 at full usage, never below 0.
            grade = max(0.0, 1.0 - (fraction - 0.7) / 0.3)

        return RubricResult(
            name=self.name,
            score=round(grade, 3),
            weight=self.weight,
            details={
                "steps_taken": trace.step_count,
                "max_steps": trace.max_steps,
                "usage_fraction": round(fraction, 3),
            },
        )
# ── Composite rubric ──────────────────────────────────────────────────────────
class ReleaseOpsRubric:
    """
    Composite rubric that combines all four grading dimensions.

    Usage:
        rubric = ReleaseOpsRubric()
        result = rubric.score(trace, ground_truth)
        print(result["score"], result["breakdown"])
    """

    def __init__(self):
        """Instantiate one rubric per grading dimension."""
        self._rubrics: list[Rubric] = [
            EvidenceRubric(),
            RiskDiscoveryRubric(),
            DecisionRubric(),
            EfficiencyRubric(),
        ]

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> dict:
        """
        Grade an episode across every dimension and combine the results.

        Args:
            trace: Episode snapshot passed unchanged to each rubric.
            ground_truth: Scenario answer key passed to each rubric; its
                optional ``forbidden_actions`` entry lists action types that
                incur a flat penalty if any of them appears in the trace.

        Returns:
            A dict with ``score`` (the final rounded score), ``breakdown``
            (per-rubric scores plus the forbidden-action penalty), and
            ``details`` (per-rubric diagnostic dicts).
        """
        graded = [rubric.score(trace, ground_truth) for rubric in self._rubrics]

        # Forbidden action penalty: a flat 0.3, applied at most once no
        # matter how many forbidden actions were taken.
        penalty = 0.0
        for banned in ground_truth.get("forbidden_actions", []):
            if banned in trace.actions_taken:
                penalty = 0.3
                break

        weighted_total = sum(item.score * item.weight for item in graded)
        overall = normalize_score(weighted_total - penalty)

        breakdown = {item.name: round(item.score, 3) for item in graded}
        breakdown["forbidden_penalty"] = round(penalty, 3)

        return {
            "score": round(overall, 3),
            "breakdown": breakdown,
            "details": {item.name: item.details for item in graded},
        }