Spaces:

eastbrick
/

releaseops-env

Sleeping

App Files Files Community

releaseops-env / server /rubrics.py

eastbrick

Unify score normalization and add validator parity checks

140d024 about 2 months ago

raw

history blame contribute delete

8.2 kB

	"""
	ReleaseOps rubrics — composable grading components.

	Inspired by the REPL env's rubric pattern. Each rubric is an isolated,
	testable unit that grades one dimension of agent behavior. The composite
	ReleaseOpsRubric combines them into the final [0, 1] score.

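	Grading dimensions (rubric, weight, question it answers):
	  EvidenceRubric       0.35 — did the agent inspect the right information sources?
	  RiskDiscoveryRubric  0.25 — did the agent trigger discovery of key risk signals?
	  DecisionRubric       0.30 — was the final release decision correct?
	  EfficiencyRubric     0.10 — did the agent avoid wasted / redundant steps?

	In addition, the composite subtracts a flat 0.3 penalty from the weighted sum
	when the agent took any action listed in ground_truth["forbidden_actions"].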
	"""

	from __future__ import annotations

	from dataclasses import dataclass
	from typing import Protocol
	from releaseops_env.scoring import normalize_score


	# ── Data types ────────────────────────────────────────────────────────────────

	@dataclass
	class EpisodeTrace:
	    """
	    Per-episode state that the rubrics grade.

	    One instance is passed to every rubric's ``score`` method. Field order
	    is part of the generated constructor signature and must not change.
	    """

	    # Evidence keys accumulated via inspect_* actions.
	    evidence_gathered: list[str]
	    # signal_ids emitted by the environment during the episode.
	    risk_signals_found: list[str]
	    # One of "approve" | "request_changes" | "block" | "escalate".
	    final_decision: str
	    # Steps taken so far, and the step budget they are measured against.
	    step_count: int
	    max_steps: int
	    # action_type of each step.
	    actions_taken: list[str]


	@dataclass
	class RubricResult:
	    """
	    Outcome of grading one dimension of an episode.

	    Each rubric returns one of these; the composite rubric multiplies
	    ``score`` by ``weight`` and reports ``details`` under the rubric's name.
	    """

	    # Identifier of the rubric that produced this result.
	    name: str
	    # Grade for the dimension, intended to lie in [0.0, 1.0].
	    score: float
	    # Contribution of this dimension to the composite score.
	    weight: float
	    # Rubric-specific diagnostic payload.
	    details: dict


	# ── Rubric protocol ───────────────────────────────────────────────────────────

	class Rubric(Protocol):
	    """
	    Structural interface shared by the grading components.

	    A class conforms by exposing ``name``, ``weight`` and a ``score`` method
	    with the signature below; the concrete rubrics in this module do so
	    without subclassing this protocol.
	    """

	    name: str
	    weight: float

	    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
	        """Grade ``trace`` against ``ground_truth`` and return the result."""


	# ── Individual rubrics ────────────────────────────────────────────────────────

	class EvidenceRubric:
	    """
	    Measures information-gathering breadth.

	    Reward = (required evidence keys gathered) / (total required).
	    A thorough investigator inspects the diff, tests, approvals, policy,
	    dependencies, and incidents before deciding.

	    When the ground truth lists no required evidence the rubric awards full
	    credit.
	    """

	    name = "evidence_coverage"
	    weight = 0.35

	    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
	        """
	        Grade how much of the required evidence the agent gathered.

	        Args:
	            trace: episode snapshot; only ``evidence_gathered`` is read.
	            ground_truth: scenario answer key; only ``required_evidence`` is
	                read, and it is treated as empty when the key is absent.

	        Returns:
	            RubricResult whose score is the covered fraction rounded to three
	            decimals. ``details`` holds the sorted ``required`` keys, the
	            required keys that were ``gathered``, and the ``missing`` ones.
	        """
	        required = set(ground_truth.get("required_evidence", []))
	        gathered = set(trace.evidence_gathered)

	        # Computed once and reused for both the ratio and the report.
	        matched = required & gathered
	        # An empty requirement set earns full credit and must not divide by zero.
	        value = len(matched) / len(required) if required else 1.0

	        return RubricResult(
	            name=self.name,
	            score=round(value, 3),
	            weight=self.weight,
	            details={
	                "required": sorted(required),
	                "gathered": sorted(matched),
	                "missing": sorted(required - gathered),
	            },
	        )


	class RiskDiscoveryRubric:
	    """
	    Objective measure of signal discovery.

	    Checks trace.risk_signals_found (signal_ids the environment emitted
	    during the episode) against required_risk_signals in ground_truth.

	    This is analogous to the Calendar env's SQL verifiers: the environment
	    measures what the agent actually observed, not what strings the agent typed.
	    An agent that skips inspect_tests will never trigger 'missing_load_test',
	    even if it guesses the string correctly in reason_codes.
	    """

	    name = "risk_signal_discovery"
	    weight = 0.25

	    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
	        """
	        Grade the fraction of required risk signals the episode surfaced.

	        Args:
	            trace: episode snapshot; only ``risk_signals_found`` is read.
	            ground_truth: scenario answer key; only ``required_risk_signals``
	                is read, and it is treated as empty when the key is absent.

	        Returns:
	            RubricResult whose score is the discovered fraction rounded to
	            three decimals. ``details`` holds the sorted ``required``
	            signals, the required ones that were ``discovered``, the
	            ``missing`` ones, and any ``extra_discovered`` signals that were
	            not required (reported only; they do not affect the score).
	        """
	        required = set(ground_truth.get("required_risk_signals", []))
	        discovered = set(trace.risk_signals_found)

	        # Computed once and reused for both the ratio and the report.
	        matched = required & discovered
	        # No required signals (e.g. trivial approve) — full credit, and no
	        # division by zero.
	        value = len(matched) / len(required) if required else 1.0

	        return RubricResult(
	            name=self.name,
	            score=round(value, 3),
	            weight=self.weight,
	            details={
	                "required": sorted(required),
	                "discovered": sorted(matched),
	                "missing": sorted(required - discovered),
	                "extra_discovered": sorted(discovered - required),
	            },
	        )


	class DecisionRubric:
	    """
	    Grades the release decision the agent finally submitted.

	    Verdicts and their scores:
	      "optimal"    → 1.0  (matches ground_truth["optimal_decision"])
	      "acceptable" → 0.5  (listed in ground_truth["acceptable_decisions"],
	                           e.g. block when request_changes was best)
	      "wrong"      → 0.0

	    The optimal check runs first, so the optimal decision earns 1.0 whether
	    or not it also appears in the acceptable list.
	    """

	    name = "decision_correctness"
	    weight = 0.30

	    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
	        """
	        Classify ``trace.final_decision`` and convert the verdict to a score.

	        Args:
	            trace: episode snapshot; only ``final_decision`` is read.
	            ground_truth: scenario answer key. ``optimal_decision`` defaults
	                to "" and ``acceptable_decisions`` defaults to a list holding
	                just the optimal decision.

	        Returns:
	            RubricResult with the score and, in ``details``, the submitted
	            and optimal decisions, the sorted acceptable set and the verdict.
	        """
	        best = ground_truth.get("optimal_decision", "")
	        tolerated = set(ground_truth.get("acceptable_decisions", [best]))
	        submitted = trace.final_decision

	        # Evaluated left to right: membership is only tested when the
	        # submission is not the optimal decision.
	        verdict = (
	            "optimal"
	            if submitted == best
	            else "acceptable"
	            if submitted in tolerated
	            else "wrong"
	        )
	        points = {"optimal": 1.0, "acceptable": 0.5, "wrong": 0.0}[verdict]

	        report = {
	            "submitted": submitted,
	            "optimal": best,
	            "acceptable": sorted(tolerated),
	            "verdict": verdict,
	        }
	        return RubricResult(
	            name=self.name,
	            score=round(points, 3),
	            weight=self.weight,
	            details=report,
	        )


	class EfficiencyRubric:
	    """
	    Scores how sensibly the agent spent its step budget.

	    Let usage = step_count / max_steps. Usage inside the band [0.3, 0.7]
	    scores 1.0. Below the band the score rises linearly from 0.0 at zero
	    usage; above it the score falls linearly and is floored at 0.0, which it
	    reaches once the whole budget is spent. Agents that decide after only
	    1-2 steps are penalized (they skipped evidence), as are agents that
	    thrash up to the step budget.
	    """

	    name = "efficiency"
	    weight = 0.10

	    def score(self, trace: EpisodeTrace, ground_truth: dict) -> RubricResult:
	        """
	        Grade step usage relative to the budget.

	        Args:
	            trace: episode snapshot; ``step_count`` and ``max_steps`` are read.
	            ground_truth: accepted for interface parity with the other
	                rubrics; not consulted here.

	        Returns:
	            RubricResult with the score rounded to three decimals and, in
	            ``details``, the raw step counts plus the rounded usage fraction.
	        """
	        # A budget below 1 is treated as 1 so the division is always defined.
	        budget = max(trace.max_steps, 1)
	        usage = trace.step_count / budget

	        if usage < 0.3:
	            # Ramp up towards the lower edge of the band.
	            value = usage / 0.3
	        elif usage <= 0.7:
	            value = 1.0
	        else:
	            # Ramp down past the upper edge, never below zero.
	            value = max(0.0, 1.0 - (usage - 0.7) / 0.3)

	        report = {
	            "steps_taken": trace.step_count,
	            "max_steps": trace.max_steps,
	            "usage_fraction": round(usage, 3),
	        }
	        return RubricResult(
	            name=self.name,
	            score=round(value, 3),
	            weight=self.weight,
	            details=report,
	        )


	# ── Composite rubric ──────────────────────────────────────────────────────────

	class ReleaseOpsRubric:
	    """
	    Composite grader that runs all four dimension rubrics and merges them.

	    Usage:
	        rubric = ReleaseOpsRubric()
	        result = rubric.score(trace, ground_truth)
	        print(result["score"], result["breakdown"])
	    """

	    def __init__(self):
	        """Instantiate the four component rubrics in their reporting order."""
	        components: list[Rubric] = [
	            EvidenceRubric(),
	            RiskDiscoveryRubric(),
	            DecisionRubric(),
	            EfficiencyRubric(),
	        ]
	        self._rubrics: list[Rubric] = components

	    def score(self, trace: EpisodeTrace, ground_truth: dict) -> dict:
	        """
	        Grade an episode across every dimension and combine the results.

	        Args:
	            trace: episode snapshot handed to each component rubric.
	            ground_truth: scenario answer key; besides the keys the
	                components read, ``forbidden_actions`` (default empty) lists
	                action types that trigger the penalty.

	        Returns:
	            dict with
	              "score"     — weighted sum minus the penalty, passed through
	                            ``normalize_score`` and rounded to three decimals,
	              "breakdown" — per-rubric scores plus "forbidden_penalty",
	              "details"   — per-rubric diagnostic payloads.
	        """
	        outcomes = [component.score(trace, ground_truth) for component in self._rubrics]

	        # Forbidden action penalty: a flat deduction if any forbidden action
	        # type appears among the actions the agent took.
	        banned = ground_truth.get("forbidden_actions", [])
	        if any(action in trace.actions_taken for action in banned):
	            forbidden_penalty = 0.3
	        else:
	            forbidden_penalty = 0.0

	        weighted_total = sum(o.score * o.weight for o in outcomes)
	        # normalize_score comes from releaseops_env.scoring; its exact mapping
	        # is not visible in this module — presumably it bounds the result to
	        # the valid score range (TODO confirm).
	        final_score = normalize_score(weighted_total - forbidden_penalty)

	        breakdown = {o.name: round(o.score, 3) for o in outcomes}
	        breakdown["forbidden_penalty"] = round(forbidden_penalty, 3)

	        return {
	            "score": round(final_score, 3),
	            "breakdown": breakdown,
	            "details": {o.name: o.details for o in outcomes},
	        }