Spaces:
Sleeping
Sleeping
| """ | |
| ReleaseOps rubrics — composable grading components. | |
| Inspired by the REPL env's rubric pattern. Each rubric is an isolated, | |
| testable unit that grades one dimension of agent behavior. The composite | |
| ReleaseOpsRubric combines them into the final [0, 1] score. | |
| Grading dimensions: | |
| EvidenceRubric 0.35 — did the agent inspect the right information sources? | |
| RiskDiscoveryRubric 0.25 — did the agent trigger discovery of key risk signals? | |
| DecisionRubric 0.30 — was the final release decision correct? | |
| EfficiencyRubric 0.10 — did the agent avoid wasted / redundant steps? | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Protocol | |
| from releaseops_env.scoring import normalize_score | |
| # ── Data types ──────────────────────────────────────────────────────────────── | |
@dataclass
class EpisodeTrace:
    """Snapshot of relevant episode state passed to rubrics.

    Declared as a dataclass so a trace can be built from its field values;
    without the decorator the annotations below define no constructor
    parameters and no instance attributes.
    """

    evidence_gathered: list[str]  # keys accumulated via inspect_* actions
    risk_signals_found: list[str]  # signal_ids emitted by the environment
    final_decision: str  # "approve" | "request_changes" | "block" | "escalate"
    step_count: int  # steps taken in the episode
    max_steps: int  # step budget for the episode
    actions_taken: list[str]  # action_type per step
@dataclass
class RubricResult:
    """Outcome of grading one dimension of an episode.

    Declared as a dataclass because the rubrics construct it with keyword
    arguments (``name=``, ``score=``, ``weight=``, ``details=``); a plain
    class with bare annotations would reject those arguments.
    """

    name: str  # rubric identifier, used as the key in score breakdowns
    score: float  # [0.0, 1.0]
    weight: float  # multiplier applied to ``score`` in the composite total
    details: dict  # rubric-specific diagnostic payload
| # ── Rubric protocol ─────────────────────────────────────────────────────────── | |
class Rubric(Protocol):
    """Structural interface shared by the individual grading components.

    Attributes:
        name: Identifier under which the rubric's result is reported.
        weight: Factor applied to the rubric's score in the composite total.
    """

    name: str
    weight: float

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """Grade ``trace`` against ``ground_truth`` on a single dimension."""
| # ── Individual rubrics ──────────────────────────────────────────────────────── | |
class EvidenceRubric:
    """
    Grades how much of the required evidence the agent collected.

    The score is the fraction of ``required_evidence`` keys from the ground
    truth that also appear in ``trace.evidence_gathered``. When the ground
    truth lists no required evidence, full credit (1.0) is awarded.
    """

    name = "evidence_coverage"
    weight = 0.35

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """Return the evidence-coverage result for one episode.

        The ``details`` payload lists the required keys, the required keys
        that were gathered, and the required keys that are still missing,
        each in sorted order.
        """
        needed = set(ground_truth.get("required_evidence", []))
        seen = set(trace.evidence_gathered)
        hits = needed & seen

        # Nothing required means nothing can be missed: full credit.
        coverage = len(hits) / len(needed) if needed else 1.0

        return RubricResult(
            name=self.name,
            score=round(coverage, 3),
            weight=self.weight,
            details={
                "required": sorted(needed),
                "gathered": sorted(hits),
                "missing": sorted(needed - seen),
            },
        )
class RiskDiscoveryRubric:
    """
    Grades which required risk signals surfaced during the episode.

    Compares ``trace.risk_signals_found`` (signal_ids the *environment*
    emitted during the episode) with ``required_risk_signals`` in the ground
    truth. The intent is to measure what the agent actually observed rather
    than what strings it typed: an agent that skips inspect_tests never
    triggers 'missing_load_test', even if it guesses that string correctly
    in reason_codes.
    """

    name = "risk_signal_discovery"
    weight = 0.25

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """Return the signal-discovery result for one episode.

        The ``details`` payload lists the required signals, the required
        signals that were discovered, the ones still missing, and any
        discovered signals that were not required, each in sorted order.
        """
        needed = set(ground_truth.get("required_risk_signals", []))
        surfaced = set(trace.risk_signals_found)
        hits = needed & surfaced

        # No required signals (e.g. trivial approve) earns full credit.
        fraction = len(hits) / len(needed) if needed else 1.0

        return RubricResult(
            name=self.name,
            score=round(fraction, 3),
            weight=self.weight,
            details={
                "required": sorted(needed),
                "discovered": sorted(hits),
                "missing": sorted(needed - surfaced),
                "extra_discovered": sorted(surfaced - needed),
            },
        )
class DecisionRubric:
    """
    Grades the final release decision.

        optimal                    → 1.0 (exactly right)
        acceptable but not optimal → 0.5 (e.g. block when request_changes was best)
        wrong                      → 0.0
    """

    name = "decision_correctness"
    weight = 0.30

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """Return the decision-correctness result for one episode.

        ``optimal_decision`` defaults to an empty string and
        ``acceptable_decisions`` defaults to just the optimal decision when
        the ground truth omits them.
        """
        best = ground_truth.get("optimal_decision", "")
        tolerated = set(ground_truth.get("acceptable_decisions", [best]))
        submitted = trace.final_decision

        # The optimal check comes first so an optimal decision that is also
        # listed as acceptable still earns full marks.
        if submitted == best:
            verdict, points = "optimal", 1.0
        elif submitted in tolerated:
            verdict, points = "acceptable", 0.5
        else:
            verdict, points = "wrong", 0.0

        return RubricResult(
            name=self.name,
            score=round(points, 3),
            weight=self.weight,
            details={
                "submitted": submitted,
                "optimal": best,
                "acceptable": sorted(tolerated),
                "verdict": verdict,
            },
        )
class EfficiencyRubric:
    """
    Grades how much of the step budget the agent consumed.

    Using between 30% and 70% of ``max_steps`` (inclusive) scores 1.0.
    Below 30% the score rises linearly from 0 towards 1; above 70% it falls
    linearly and is floored at 0.0. Deciding after only a step or two is
    penalized (evidence was skipped), as is thrashing up to the step budget.
    """

    name = "efficiency"
    weight = 0.10

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
        """Return the efficiency result for one episode.

        ``ground_truth`` is accepted for interface parity with the other
        rubrics but is not consulted.
        """
        # Guard the division against a zero (or negative) step budget.
        budget = max(trace.max_steps, 1)
        used = trace.step_count / budget

        if used < 0.3:
            points = used / 0.3
        elif used <= 0.7:
            points = 1.0
        else:
            points = max(0.0, 1.0 - (used - 0.7) / 0.3)

        return RubricResult(
            name=self.name,
            score=round(points, 3),
            weight=self.weight,
            details={
                "steps_taken": trace.step_count,
                "max_steps": trace.max_steps,
                "usage_fraction": round(used, 3),
            },
        )
| # ── Composite rubric ────────────────────────────────────────────────────────── | |
class ReleaseOpsRubric:
    """
    Composite rubric that folds the four grading dimensions into one score.

    Usage:
        rubric = ReleaseOpsRubric()
        result = rubric.score(trace, ground_truth)
        print(result["score"], result["breakdown"])
    """

    def __init__(self):
        """Instantiate one of each component rubric, in reporting order."""
        components = (
            EvidenceRubric,
            RiskDiscoveryRubric,
            DecisionRubric,
            EfficiencyRubric,
        )
        self._rubrics: list[Rubric] = [component() for component in components]

    def score(self, trace: EpisodeTrace, ground_truth: dict) -> dict:
        """Grade an episode and return the score, breakdown and details.

        The weighted sum of the component scores is reduced by a flat 0.3
        penalty when any action listed under ``forbidden_actions`` in the
        ground truth appears in ``trace.actions_taken``. The result is then
        passed through ``normalize_score``.
        NOTE(review): ``normalize_score`` is imported from
        ``releaseops_env.scoring``; presumably it maps the value into
        [0, 1] — confirm against that module.

        Returns:
            A dict with ``"score"`` (rounded to 3 places), ``"breakdown"``
            (per-rubric scores plus ``"forbidden_penalty"``) and
            ``"details"`` (per-rubric detail payloads).
        """
        graded = [rubric.score(trace, ground_truth) for rubric in self._rubrics]

        # Forbidden action penalty: one hit is enough to trigger it.
        penalty = 0.0
        for banned in ground_truth.get("forbidden_actions", []):
            if banned in trace.actions_taken:
                penalty = 0.3
                break

        weighted_total = sum(item.score * item.weight for item in graded)
        overall = normalize_score(weighted_total - penalty)

        breakdown = {item.name: round(item.score, 3) for item in graded}
        breakdown["forbidden_penalty"] = round(penalty, 3)

        return {
            "score": round(overall, 3),
            "breakdown": breakdown,
            "details": {item.name: item.details for item in graded},
        }