""" Impact Oracle - scores whether an agent action produced measurable marginal value. Rule-based to prevent reward hacking from neural reward models. """ import math import random from dataclasses import dataclass, field from typing import Any, Dict, List, Optional @dataclass class OracleResult: raw_score: float cost_adjusted_score: float confidence: float evidence: Dict[str, Any] reason: str failure_tags: List[str] = field(default_factory=list) reward_value: float = 0.0 class ImpactOracle: """ The Impact Oracle scores agent actions on verified impact. It supports: code tasks, retrieval QA, multi-agent debate. """ def __init__( self, code_weights: Optional[Dict[str, float]] = None, qa_weights: Optional[Dict[str, float]] = None, debate_weights: Optional[Dict[str, float]] = None, compute_penalty_rate: float = 0.0001, calibration_weight: float = 0.2, abstention_bonus: float = 1.0, hallucination_penalty: float = 2.0, confident_wrong_penalty: float = 3.0, gaming_penalty: float = 2.0, ): self.code_weights = code_weights or { "correctness": 1.0, "pass_at_k": 0.3, "regression": -0.5, "compute_penalty": 0.001, } self.qa_weights = qa_weights or { "correctness": 1.0, "evidence_support": 0.5, "calibration": 0.2, "abstention_utility": 1.0, "hallucination_penalty": 2.0, "confident_wrong_penalty": 3.0, } self.debate_weights = debate_weights or { "decision_quality": 1.0, "influence_efficiency": 0.5, "throughput": 0.3, "marginal_contribution": 0.5, } self.compute_penalty_rate = compute_penalty_rate self.calibration_weight = calibration_weight self.abstention_bonus = abstention_bonus self.hallucination_penalty = hallucination_penalty self.confident_wrong_penalty = confident_wrong_penalty self.gaming_penalty = gaming_penalty def score( self, mode: str, action: Dict[str, Any], context: Dict[str, Any], result: Dict[str, Any], agent_id: str = "", ) -> OracleResult: if mode == "code": return self._score_code(action, context, result, agent_id) elif mode == "retrieval_qa": return self._score_retrieval_qa(action, context, result, agent_id) elif mode == "debate": return self._score_debate(action, context, result, agent_id) else: return OracleResult( raw_score=0.0, cost_adjusted_score=0.0, confidence=0.0, evidence={}, reason=f"Unknown mode: {mode}", failure_tags=["unknown_mode"], reward_value=0.0, ) def _score_code( self, action: Dict[str, Any], context: Dict[str, Any], result: Dict[str, Any], agent_id: str, ) -> OracleResult: correctness = result.get("correctness", 0.0) pass_at_k = result.get("pass_at_k", 0.0) regression = result.get("regression", False) compute_cost = result.get("compute_cost", 0.0) hidden_tests_pass = result.get("hidden_tests_pass", correctness) public_pass = result.get("public_pass", correctness) failure_tags = [] if public_pass and not hidden_tests_pass: failure_tags.append("gaming_hidden_tests") raw = ( correctness * self.code_weights["correctness"] + pass_at_k * self.code_weights["pass_at_k"] + (self.code_weights["regression"] if regression else 0.0) - compute_cost * self.code_weights.get("compute_penalty", self.compute_penalty_rate) ) if "gaming_hidden_tests" in failure_tags: raw -= self.gaming_penalty cost_adj = raw - compute_cost * self.compute_penalty_rate confidence = result.get("confidence", correctness) reason = f"correctness={correctness:.2f}, pass@k={pass_at_k:.2f}, cost={compute_cost}" if failure_tags: reason += f", failures={failure_tags}" return OracleResult( raw_score=raw, cost_adjusted_score=cost_adj, confidence=confidence, evidence={"correctness": correctness, "pass_at_k": pass_at_k, "regression": regression}, reason=reason, failure_tags=failure_tags, reward_value=cost_adj, ) def _score_retrieval_qa( self, action: Dict[str, Any], context: Dict[str, Any], result: Dict[str, Any], agent_id: str, ) -> OracleResult: gold_answer = context.get("gold_answer", "") answer = result.get("answer", "") confidence = result.get("confidence", 0.5) evidence = result.get("evidence", {}) compute_cost = result.get("compute_cost", 0.0) abstained = action.get("abstained", False) failure_tags = [] if abstained: is_unanswerable = context.get("is_unanswerable", False) correct_abstention = is_unanswerable raw = self.abstention_bonus if correct_abstention else -self.abstention_bonus if not correct_abstention: failure_tags.append("wrong_abstention") else: failure_tags.append("correct_abstention") reason = f"abstained, correct={correct_abstention}" else: correctness = self._answer_correctness(answer, gold_answer) entailment = evidence.get("entailment_score", 0.0) contradiction = evidence.get("contradiction_score", 0.0) hallucination = evidence.get("hallucination", False) or contradiction > 0.5 confident_wrong = (confidence > 0.8) and (correctness < 0.5) compute_waste = compute_cost > 500 and correctness < 0.5 if hallucination: failure_tags.append("hallucination") if confident_wrong: failure_tags.append("confident_wrong") if compute_waste: failure_tags.append("compute_waste") if compute_cost > 2000: failure_tags.append("excessive_compute") raw = ( correctness * self.qa_weights["correctness"] + entailment * self.qa_weights.get("evidence_support", 0.5) - (self.hallucination_penalty if hallucination else 0.0) - (self.confident_wrong_penalty if confident_wrong else 0.0) - compute_cost * self.compute_penalty_rate ) brier = (confidence - correctness) ** 2 calibration_bonus = (1 - brier) * self.calibration_weight raw += calibration_bonus reason = f"correctness={correctness:.2f}, entailment={entailment:.2f}, conf={confidence:.2f}" cost_adj = raw - compute_cost * self.compute_penalty_rate if compute_cost > 100 and raw < 0.5: cost_adj -= self.gaming_penalty * 0.5 return OracleResult( raw_score=raw, cost_adjusted_score=cost_adj, confidence=confidence, evidence=evidence, reason=reason, failure_tags=failure_tags, reward_value=cost_adj, ) def _score_debate( self, action: Dict[str, Any], context: Dict[str, Any], result: Dict[str, Any], agent_id: str, ) -> OracleResult: decision_quality = result.get("decision_quality", 0.0) marginal = result.get("marginal_contribution", 0.0) tokens = result.get("tokens", 0) n_agents = context.get("n_agents", 1) compute_cost = result.get("compute_cost", tokens) spam = result.get("spam", False) collusion = result.get("collusion", False) failure_tags = [] if spam: failure_tags.append("spam") if collusion: failure_tags.append("collusion") if tokens > 5000: failure_tags.append("verbose_waste") raw = ( decision_quality * self.debate_weights["decision_quality"] + marginal * self.debate_weights["marginal_contribution"] + (1.0 / max(tokens, 1)) * self.debate_weights["influence_efficiency"] - compute_cost * self.compute_penalty_rate ) if spam: raw -= self.gaming_penalty if collusion: raw -= self.gaming_penalty * 2 cost_adj = raw - compute_cost * self.compute_penalty_rate return OracleResult( raw_score=raw, cost_adjusted_score=cost_adj, confidence=result.get("confidence", 0.5), evidence={"marginal": marginal, "tokens": tokens, "n_agents": n_agents}, reason=f"decision_quality={decision_quality:.2f}, marginal={marginal:.2f}, tokens={tokens}", failure_tags=failure_tags, reward_value=cost_adj, ) def _answer_correctness(self, answer: str, gold: str) -> float: if not answer or not gold: return 0.0 ans = answer.strip().lower() gld = gold.strip().lower() if ans == gld: return 1.0 if gld in ans or ans in gld: return 0.5 return 0.0 def proper_score(self, prediction: float, outcome: float) -> float: return -((prediction - outcome) ** 2) def abstention_score( self, answer: Optional[str], confidence: float, evidence: Dict[str, Any], outcome: float, ) -> float: if answer is None: return self.abstention_bonus if outcome < 0.5 else -self.abstention_bonus return 0.0 def marginal_impact(self, before: OracleResult, after: OracleResult) -> float: return after.cost_adjusted_score - before.cost_adjusted_score def cost_adjusted_score(self, raw_score: float, compute_cost: float) -> float: return raw_score - compute_cost * self.compute_penalty_rate