# File size: 10,274 Bytes

"""
Impact Oracle - scores whether an agent action produced measurable marginal value.
Rule-based to prevent reward hacking from neural reward models.
"""
import math
import random
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class OracleResult:
    """
    Outcome of scoring one agent action.

    The scorers in this file populate every field; ``failure_tags`` and
    ``reward_value`` have defaults so a result can also be built from the
    first five fields alone.
    """

    # Weighted score as computed by a scorer, before the final cost adjustment.
    raw_score: float
    # ``raw_score`` after the compute-cost adjustment (and any further
    # deductions a scorer applies on top).
    cost_adjusted_score: float
    # Confidence value carried alongside the score.
    confidence: float
    # Supporting data for the score; contents depend on the scoring mode.
    evidence: Dict[str, Any]
    # Human-readable summary of how the score was derived.
    reason: str
    # Labels for detected conditions (e.g. "hallucination", "spam"); a fresh
    # list per instance.
    failure_tags: List[str] = field(default_factory=list)
    # Value intended to be used as the reward; the scorers in this file set
    # it equal to ``cost_adjusted_score``.
    reward_value: float = 0.0


class ImpactOracle:
    """
    Rule-based scorer for agent actions.

    Three modes are available through :meth:`score`: ``"code"``,
    ``"retrieval_qa"`` and ``"debate"``. Every scorer returns an
    ``OracleResult`` whose ``reward_value`` equals its
    ``cost_adjusted_score``.

    Weight dictionaries handed to the constructor may be partial: when a
    scorer needs a key that is missing, it falls back to the built-in default
    for that key instead of raising ``KeyError``.
    """

    # Built-in weight tables. They are copied per instance when the caller
    # supplies no dictionary, and double as the per-key fallback for partial
    # caller-supplied dictionaries.
    _DEFAULT_CODE_WEIGHTS = {
        "correctness": 1.0,
        "pass_at_k": 0.3,
        "regression": -0.5,
        "compute_penalty": 0.001,
    }
    # NOTE: the QA scorer only reads "correctness" and "evidence_support";
    # calibration, abstention, hallucination and confident-wrong magnitudes
    # come from the scalar constructor arguments instead.
    _DEFAULT_QA_WEIGHTS = {
        "correctness": 1.0,
        "evidence_support": 0.5,
        "calibration": 0.2,
        "abstention_utility": 1.0,
        "hallucination_penalty": 2.0,
        "confident_wrong_penalty": 3.0,
    }
    # NOTE: "throughput" is stored but not read by the debate scorer.
    _DEFAULT_DEBATE_WEIGHTS = {
        "decision_quality": 1.0,
        "influence_efficiency": 0.5,
        "throughput": 0.3,
        "marginal_contribution": 0.5,
    }

    def __init__(
        self,
        code_weights: Optional[Dict[str, float]] = None,
        qa_weights: Optional[Dict[str, float]] = None,
        debate_weights: Optional[Dict[str, float]] = None,
        compute_penalty_rate: float = 0.0001,
        calibration_weight: float = 0.2,
        abstention_bonus: float = 1.0,
        hallucination_penalty: float = 2.0,
        confident_wrong_penalty: float = 3.0,
        gaming_penalty: float = 2.0,
    ):
        """
        Store the weights and penalty magnitudes used by the scorers.

        Args:
            code_weights: Weights for code mode. ``None`` or an empty dict
                selects the built-in defaults.
            qa_weights: Weights for retrieval-QA mode; same fallback rule.
            debate_weights: Weights for debate mode; same fallback rule.
            compute_penalty_rate: Per-unit compute charge applied by
                :meth:`cost_adjusted_score` (and inside some raw scores).
            calibration_weight: Scale of the calibration bonus in QA mode.
            abstention_bonus: Reward for a correct abstention; its negation
                is the score for an incorrect one.
            hallucination_penalty: Deduction for a hallucinated QA answer.
            confident_wrong_penalty: Deduction for a confident, wrong QA
                answer.
            gaming_penalty: Base deduction for gaming behaviours (hidden-test
                gaming, spam, collusion, wasteful low-value QA answers).
        """
        # A caller-supplied dict is stored as-is; defaults are copied so that
        # instances never share a mutable table.
        self.code_weights = code_weights or dict(self._DEFAULT_CODE_WEIGHTS)
        self.qa_weights = qa_weights or dict(self._DEFAULT_QA_WEIGHTS)
        self.debate_weights = debate_weights or dict(self._DEFAULT_DEBATE_WEIGHTS)
        self.compute_penalty_rate = compute_penalty_rate
        self.calibration_weight = calibration_weight
        self.abstention_bonus = abstention_bonus
        self.hallucination_penalty = hallucination_penalty
        self.confident_wrong_penalty = confident_wrong_penalty
        self.gaming_penalty = gaming_penalty

    def score(
        self,
        mode: str,
        action: Dict[str, Any],
        context: Dict[str, Any],
        result: Dict[str, Any],
        agent_id: str = "",
    ) -> OracleResult:
        """
        Score one agent action with the scorer selected by ``mode``.

        Args:
            mode: ``"code"``, ``"retrieval_qa"`` or ``"debate"``.
            action: Action description; only the QA scorer reads it
                (``"abstained"``).
            context: Task context (e.g. ``"gold_answer"``, ``"n_agents"``).
            result: Observed outcome of the action; all keys are optional.
            agent_id: Forwarded to the scorers, which currently do not use it.

        Returns:
            The scorer's ``OracleResult``. For an unrecognised mode, a
            zero-valued result tagged ``"unknown_mode"``.
        """
        if mode == "code":
            return self._score_code(action, context, result, agent_id)
        if mode == "retrieval_qa":
            return self._score_retrieval_qa(action, context, result, agent_id)
        if mode == "debate":
            return self._score_debate(action, context, result, agent_id)
        return OracleResult(
            raw_score=0.0,
            cost_adjusted_score=0.0,
            confidence=0.0,
            evidence={},
            reason=f"Unknown mode: {mode}",
            failure_tags=["unknown_mode"],
            reward_value=0.0,
        )

    def _score_code(
        self,
        action: Dict[str, Any],
        context: Dict[str, Any],
        result: Dict[str, Any],
        agent_id: str,
    ) -> OracleResult:
        """
        Score a code task from ``result``.

        Reads ``correctness``, ``pass_at_k``, ``regression``,
        ``compute_cost``, ``hidden_tests_pass``, ``public_pass`` and
        ``confidence``. The last three default to ``correctness`` when
        absent. Passing public tests while failing hidden ones is tagged
        ``"gaming_hidden_tests"`` and costs ``gaming_penalty``.
        """
        correctness = result.get("correctness", 0.0)
        pass_at_k = result.get("pass_at_k", 0.0)
        regression = result.get("regression", False)
        compute_cost = result.get("compute_cost", 0.0)
        hidden_tests_pass = result.get("hidden_tests_pass", correctness)
        public_pass = result.get("public_pass", correctness)

        gamed = bool(public_pass and not hidden_tests_pass)
        failure_tags = ["gaming_hidden_tests"] if gamed else []

        weights = self.code_weights
        defaults = self._DEFAULT_CODE_WEIGHTS
        regression_term = (
            weights.get("regression", defaults["regression"]) if regression else 0.0
        )
        # A weights dict without "compute_penalty" charges compute at
        # `compute_penalty_rate` rather than the built-in table value.
        raw = (
            correctness * weights.get("correctness", defaults["correctness"])
            + pass_at_k * weights.get("pass_at_k", defaults["pass_at_k"])
            + regression_term
            - compute_cost * weights.get("compute_penalty", self.compute_penalty_rate)
        )
        if gamed:
            raw -= self.gaming_penalty

        # NOTE(review): `raw` already carries a compute penalty, so compute is
        # charged a second time here. Confirm this is intended.
        cost_adj = self.cost_adjusted_score(raw, compute_cost)
        confidence = result.get("confidence", correctness)
        reason = f"correctness={correctness:.2f}, pass@k={pass_at_k:.2f}, cost={compute_cost}"
        if failure_tags:
            reason += f", failures={failure_tags}"

        return OracleResult(
            raw_score=raw,
            cost_adjusted_score=cost_adj,
            confidence=confidence,
            evidence={"correctness": correctness, "pass_at_k": pass_at_k, "regression": regression},
            reason=reason,
            failure_tags=failure_tags,
            reward_value=cost_adj,
        )

    def _score_retrieval_qa(
        self,
        action: Dict[str, Any],
        context: Dict[str, Any],
        result: Dict[str, Any],
        agent_id: str,
    ) -> OracleResult:
        """
        Score a retrieval-QA answer or an abstention.

        An abstention (``action["abstained"]``) scores ``+abstention_bonus``
        when ``context["is_unanswerable"]`` is truthy and
        ``-abstention_bonus`` otherwise. An answer is scored from its match
        against ``context["gold_answer"]``, the evidence entailment score,
        hallucination and confident-wrong penalties, compute cost, and a
        calibration bonus of ``(1 - (confidence - correctness) ** 2) *
        calibration_weight``.

        In both cases an extra ``gaming_penalty * 0.5`` is deducted from the
        cost-adjusted score when ``compute_cost > 100`` and the raw score is
        below 0.5.
        """
        gold_answer = context.get("gold_answer", "")
        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.5)
        evidence = result.get("evidence", {})
        compute_cost = result.get("compute_cost", 0.0)
        abstained = action.get("abstained", False)

        failure_tags = []

        if abstained:
            correct_abstention = context.get("is_unanswerable", False)
            if correct_abstention:
                raw = self.abstention_bonus
                failure_tags.append("correct_abstention")
            else:
                raw = -self.abstention_bonus
                failure_tags.append("wrong_abstention")
            reason = f"abstained, correct={correct_abstention}"
        else:
            correctness = self._answer_correctness(answer, gold_answer)
            entailment = evidence.get("entailment_score", 0.0)
            contradiction = evidence.get("contradiction_score", 0.0)
            hallucination = evidence.get("hallucination", False) or contradiction > 0.5
            confident_wrong = (confidence > 0.8) and (correctness < 0.5)
            compute_waste = compute_cost > 500 and correctness < 0.5

            if hallucination:
                failure_tags.append("hallucination")
            if confident_wrong:
                failure_tags.append("confident_wrong")
            if compute_waste:
                failure_tags.append("compute_waste")
            if compute_cost > 2000:
                failure_tags.append("excessive_compute")

            weights = self.qa_weights
            defaults = self._DEFAULT_QA_WEIGHTS
            raw = (
                correctness * weights.get("correctness", defaults["correctness"])
                + entailment * weights.get("evidence_support", defaults["evidence_support"])
                - (self.hallucination_penalty if hallucination else 0.0)
                - (self.confident_wrong_penalty if confident_wrong else 0.0)
                - compute_cost * self.compute_penalty_rate
            )

            # Brier-style calibration: largest when confidence equals the
            # correctness score.
            brier = (confidence - correctness) ** 2
            raw += (1 - brier) * self.calibration_weight
            reason = f"correctness={correctness:.2f}, entailment={entailment:.2f}, conf={confidence:.2f}"

        # NOTE(review): on the answer path `raw` already includes the same
        # compute charge, so it is applied twice. Confirm this is intended.
        cost_adj = self.cost_adjusted_score(raw, compute_cost)

        # Spending noticeable compute for a low-value outcome is treated as
        # (half-strength) gaming.
        if compute_cost > 100 and raw < 0.5:
            cost_adj -= self.gaming_penalty * 0.5

        return OracleResult(
            raw_score=raw,
            cost_adjusted_score=cost_adj,
            confidence=confidence,
            evidence=evidence,
            reason=reason,
            failure_tags=failure_tags,
            reward_value=cost_adj,
        )

    def _score_debate(
        self,
        action: Dict[str, Any],
        context: Dict[str, Any],
        result: Dict[str, Any],
        agent_id: str,
    ) -> OracleResult:
        """
        Score a contribution to a multi-agent debate.

        Reads ``decision_quality``, ``marginal_contribution``, ``tokens``,
        ``compute_cost`` (defaults to ``tokens``), ``spam``, ``collusion``
        and ``confidence`` from ``result``. Spam costs ``gaming_penalty`` and
        collusion twice that. More than 5000 tokens adds the
        ``"verbose_waste"`` tag without a direct deduction.
        ``context["n_agents"]`` is only echoed into the evidence.
        """
        decision_quality = result.get("decision_quality", 0.0)
        marginal = result.get("marginal_contribution", 0.0)
        tokens = result.get("tokens", 0)
        n_agents = context.get("n_agents", 1)
        compute_cost = result.get("compute_cost", tokens)
        spam = result.get("spam", False)
        collusion = result.get("collusion", False)

        failure_tags = []
        if spam:
            failure_tags.append("spam")
        if collusion:
            failure_tags.append("collusion")
        if tokens > 5000:
            failure_tags.append("verbose_waste")

        weights = self.debate_weights
        defaults = self._DEFAULT_DEBATE_WEIGHTS
        # max(tokens, 1) keeps the efficiency term finite for empty turns.
        efficiency = 1.0 / max(tokens, 1)
        raw = (
            decision_quality * weights.get("decision_quality", defaults["decision_quality"])
            + marginal * weights.get("marginal_contribution", defaults["marginal_contribution"])
            + efficiency * weights.get("influence_efficiency", defaults["influence_efficiency"])
            - compute_cost * self.compute_penalty_rate
        )
        if spam:
            raw -= self.gaming_penalty
        if collusion:
            raw -= self.gaming_penalty * 2

        # NOTE(review): `raw` already includes the same compute charge, so it
        # is applied twice. Confirm this is intended.
        cost_adj = self.cost_adjusted_score(raw, compute_cost)

        return OracleResult(
            raw_score=raw,
            cost_adjusted_score=cost_adj,
            confidence=result.get("confidence", 0.5),
            evidence={"marginal": marginal, "tokens": tokens, "n_agents": n_agents},
            reason=f"decision_quality={decision_quality:.2f}, marginal={marginal:.2f}, tokens={tokens}",
            failure_tags=failure_tags,
            reward_value=cost_adj,
        )

    def _answer_correctness(self, answer: str, gold: str) -> float:
        """
        Compare an answer with the gold answer, ignoring case and outer
        whitespace.

        Returns:
            1.0 for an exact match, 0.5 when one normalised string contains
            the other, otherwise 0.0. An answer or gold string that is empty
            after stripping always scores 0.0.
        """
        # Normalise before the emptiness check: the empty string is a
        # substring of every string, so a whitespace-only input would
        # otherwise earn partial credit from the containment test below.
        ans = answer.strip().lower() if answer else ""
        gld = gold.strip().lower() if gold else ""
        if not ans or not gld:
            return 0.0
        if ans == gld:
            return 1.0
        if gld in ans or ans in gld:
            return 0.5
        return 0.0

    def proper_score(self, prediction: float, outcome: float) -> float:
        """Return the negated squared error between prediction and outcome."""
        return -((prediction - outcome) ** 2)

    def abstention_score(
        self,
        answer: Optional[str],
        confidence: float,
        evidence: Dict[str, Any],
        outcome: float,
    ) -> float:
        """
        Score an abstention, signalled by ``answer is None``.

        Returns ``abstention_bonus`` when ``outcome < 0.5``, its negation
        otherwise, and 0.0 when an answer was given. ``confidence`` and
        ``evidence`` are accepted but not used.
        """
        if answer is None:
            return self.abstention_bonus if outcome < 0.5 else -self.abstention_bonus
        return 0.0

    def marginal_impact(self, before: OracleResult, after: OracleResult) -> float:
        """Return the change in cost-adjusted score from ``before`` to ``after``."""
        return after.cost_adjusted_score - before.cost_adjusted_score

    def cost_adjusted_score(self, raw_score: float, compute_cost: float) -> float:
        """Return ``raw_score`` minus ``compute_cost * compute_penalty_rate``."""
        return raw_score - compute_cost * self.compute_penalty_rate