narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 27 days ago

Commit

b8754a6

verified ·

1 Parent(s): df7e938

Upload oracle/oracle.py

Browse files

Files changed (1) hide show

oracle/oracle.py +398 -0

oracle/oracle.py ADDED Viewed

	@@ -0,0 +1,398 @@

+"""
+Impact Oracle: scores whether an agent action produced measurable marginal value.
+Supports code tasks, retrieval QA tasks, and multi-agent debate tasks.
+"""
+import json
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
@dataclass
class OracleResult:
    """Structured verdict returned by :class:`ImpactOracle` for one action."""

    # Task-level quality before any cost or gaming adjustment.
    raw_score: float = 0.0
    # ``raw_score`` after the compute-cost penalty has been applied.
    cost_adjusted_score: float = 0.0
    # Confidence attached to the verdict (reported by the agent or set by
    # the mode scorer).
    confidence: float = 0.0
    # Free-form supporting data; its keys depend on the scoring mode.
    evidence: Dict[str, Any] = field(default_factory=dict)
    # Human-readable one-line summary of how the score was derived.
    reason: str = ""
    # Machine-readable labels for detected problems.
    failure_tags: List[str] = field(default_factory=list)
    # Final reward: mode-specific reward, minus cost and gaming penalties.
    reward_value: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict (containers are shallow-copied)."""
        return {
            "raw_score": self.raw_score,
            "cost_adjusted_score": self.cost_adjusted_score,
            "confidence": self.confidence,
            "evidence": dict(self.evidence),
            "reason": self.reason,
            "failure_tags": list(self.failure_tags),
            "reward_value": self.reward_value,
        }

    def to_json(self, **dumps_kwargs: Any) -> str:
        """Serialize the result with ``json.dumps``.

        Extra keyword arguments are forwarded to ``json.dumps``. Raises
        ``TypeError`` if ``evidence`` holds values that are not JSON
        serializable.
        """
        return json.dumps(self.to_dict(), **dumps_kwargs)


class ImpactOracle:
    """
    Multi-mode impact oracle with structured JSON output.

    Supported modes are ``"code"``, ``"retrieval_qa"`` and ``"debate"``.
    The oracle keeps a bounded per-agent history so that repeated
    low-value behavior can be penalized across calls.
    """

    def __init__(
        self,
        compute_budget: float = 1e6,  # tokens or FLOPs budget reference
        decay_lambda: float = 0.1,
        calibration_weight: float = 0.2,
        hallucination_weight: float = 0.5,
        confident_wrong_weight: float = 0.3,
        compute_cost_weight: float = 0.2,
        gaming_weight: float = 0.4,
    ):
        """Store the scoring weights.

        Args:
            compute_budget: Reference budget; compute cost is expressed as a
                fraction of it when computing the cost penalty.
            decay_lambda: Stored on the instance; not read by any method of
                this class.
            calibration_weight: Weight of the Brier-based calibration bonus
                in retrieval QA.
            hallucination_weight: Weight applied to the contradiction score
                in retrieval QA.
            confident_wrong_weight: Weight of the penalty for confident but
                wrong answers in retrieval QA.
            compute_cost_weight: Maximum score deduction for compute usage.
            gaming_weight: Multiplier applied to the summed gaming penalties.
        """
        self.compute_budget = compute_budget
        self.decay_lambda = decay_lambda
        self.calibration_weight = calibration_weight
        self.hallucination_weight = hallucination_weight
        self.confident_wrong_weight = confident_wrong_weight
        self.compute_cost_weight = compute_cost_weight
        self.gaming_weight = gaming_weight
        # Gaming detection state (lightweight per-agent history)
        self._agent_history: Dict[str, List[Dict]] = {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def score(
        self,
        mode: str,
        action: Dict[str, Any],
        context: Dict[str, Any],
        result: Dict[str, Any],
        agent_id: Optional[str] = None,
    ) -> OracleResult:
        """Score an action based on mode and context.

        The mode scorer produces ``raw_score`` and a mode-specific reward.
        Both are then adjusted for ``result["compute_cost"]``; when
        ``agent_id`` is given, the gaming penalty is subtracted from the
        reward. An unknown mode yields a zero-score result whose ``reason``
        names the mode.
        """
        scorers = {
            "code": self._score_code,
            "retrieval_qa": self._score_retrieval_qa,
            "debate": self._score_debate,
        }
        scorer = scorers.get(mode)
        if scorer is None:
            outcome = OracleResult(raw_score=0.0, reason=f"Unknown mode: {mode}")
        else:
            outcome = scorer(action, context, result)

        compute_cost = result.get("compute_cost", 0.0)
        outcome.cost_adjusted_score = self.cost_adjusted_score(
            outcome.raw_score, compute_cost
        )
        # The reward starts from the mode-specific reward (which already
        # folds in calibration, abstention, efficiency, ...) rather than
        # from raw_score, so those terms are not thrown away.
        outcome.reward_value = self.cost_adjusted_score(
            outcome.reward_value, compute_cost
        )

        # Gaming detection runs after the reward is known so that the
        # "earned" value stored in the agent history is meaningful.
        gaming_penalty = 0.0
        if agent_id is not None:
            gaming_penalty = self._detect_gaming(agent_id, action, outcome)
        outcome.reward_value -= gaming_penalty
        if gaming_penalty > 0:
            outcome.failure_tags.append("gaming_detected")
        return outcome

    def marginal_impact(self, before: OracleResult, after: OracleResult) -> float:
        """Compute marginal impact between two oracle results."""
        return after.reward_value - before.reward_value

    def proper_score(self, prediction: float, outcome: int) -> float:
        """
        Brier score (proper scoring rule).
        prediction: probability of outcome=1
        outcome: 0 or 1
        Returns 1 - Brier (higher is better).
        """
        brier = (prediction - outcome) ** 2
        return 1.0 - brier

    def abstention_score(
        self,
        answer: Optional[str],
        confidence: float,
        evidence: Dict[str, Any],
        outcome: Optional[str],
    ) -> float:
        """
        Reward correct abstention, penalize incorrect abstention.
        answer=None (or blank) means abstained.
        outcome=None (or blank) means question is unanswerable.
        ``evidence`` is accepted for interface compatibility and not read.

        Returns +1.0 for a correct abstention, -1.0 for abstaining on an
        answerable question, -0.5 for answering an unanswerable one, and
        otherwise the proper score of ``confidence`` against correctness.
        """
        abstained = answer is None or answer.strip() == ""
        unanswerable = outcome is None or outcome.strip() == ""
        if abstained:
            return -1.0 if not unanswerable else +1.0
        if unanswerable:
            return -0.5  # should have abstained
        # Both answered: use proper score on confidence
        correct = self._fuzzy_match(answer, outcome)
        return self.proper_score(confidence, int(correct))

    def cost_adjusted_score(self, raw_score: float, compute_cost: float) -> float:
        """Penalize excessive compute usage.

        The penalty is the used fraction of ``compute_budget`` (clamped to
        [0, 1]) times ``compute_cost_weight``. It can reduce a non-negative
        score down to 0 but never below; a negative score is returned
        unchanged so that penalties assigned by the scorers are not erased.
        """
        if self.compute_budget > 0:
            usage = compute_cost / self.compute_budget
        else:
            # No budget configured: any spend counts as fully over budget.
            usage = 1.0 if compute_cost > 0 else 0.0
        penalty = min(1.0, max(0.0, usage)) * self.compute_cost_weight
        return min(raw_score, max(0.0, raw_score - penalty))

    # ------------------------------------------------------------------
    # Mode-specific scorers
    # ------------------------------------------------------------------
    def _score_code(
        self, action: Dict, context: Dict, result: Dict
    ) -> OracleResult:
        """
        Code scoring:
        - unit test pass/fail (public and hidden)
        - pass@k estimate (plain ratio ``passes / k``)
        - repeated-pass detection
        - hidden-test gaming detection

        NOTE: the ``regression`` flag and tag mark a pass on a task that
        ``context["previous_passed"]`` says was already passing (no marginal
        value); the name is kept for compatibility with existing consumers.
        """
        passed = result.get("passed", False)
        hidden_passed = result.get("hidden_passed", passed)
        k = result.get("k", 1)
        passes = result.get("passes", 1 if passed else 0)
        prev_passed = context.get("previous_passed", False)

        # Passing now when it already passed before adds no marginal value.
        regression = bool(prev_passed and passed)
        # Hidden-test gaming detection: passes public but fails hidden
        hidden_gaming = bool(passed and not hidden_passed)

        if hidden_gaming:
            raw_score = -0.5  # strong penalty for gaming public tests
        elif hidden_passed:
            raw_score = 0.5 if regression else 1.0
        else:
            raw_score = 0.0

        pass_at_k = passes / k if k > 0 else 0.0

        failure_tags = []
        if hidden_gaming:
            failure_tags.append("hidden_test_gaming")
        if regression:
            failure_tags.append("regression")

        reason = (
            f"public={'pass' if passed else 'fail'}, "
            f"hidden={'pass' if hidden_passed else 'fail'}, "
            f"pass@{k}={pass_at_k:.2f}, "
            f"regression={regression}"
        )
        return OracleResult(
            raw_score=raw_score,
            confidence=0.9 if hidden_passed else (0.5 if passed else 0.1),
            evidence={"pass_at_k": pass_at_k, "regression": regression},
            reason=reason,
            failure_tags=failure_tags,
            reward_value=raw_score,
        )

    def _score_retrieval_qa(
        self, action: Dict, context: Dict, result: Dict
    ) -> OracleResult:
        """
        Retrieval QA scoring:
        - answer correctness (fuzzy match against ``context["gold_answer"]``)
        - evidence support (entailment / contradiction scores from evidence)
        - hallucination penalty
        - abstention utility
        - calibration bonus via a proper scoring rule
        - confident-wrong penalty

        The combined reward is clamped to [-1, 1].
        """
        answer = result.get("answer")
        gold = context.get("gold_answer")
        confidence = result.get("confidence", 0.5)
        evidence = result.get("evidence") or {}

        # Same notion of "answered"/"answerable" as abstention_score.
        answered = answer is not None and answer.strip() != ""
        answerable = gold is not None and gold.strip() != ""

        # Correctness
        correct = bool(answered and self._fuzzy_match(answer, gold))
        raw_score = 1.0 if correct else 0.0

        # Evidence support: entailment score
        entailment = evidence.get("entailment_score", 0.0)
        contradiction = evidence.get("contradiction_score", 0.0)

        # Hallucination penalty
        hallucination_penalty = contradiction * self.hallucination_weight

        # Abstention utility
        abstention = self.abstention_score(answer, confidence, evidence, gold)

        # Calibration bonus via Brier; only defined for a real answer to an
        # answerable question.
        calibration_bonus = 0.0
        if answered and answerable:
            calibration_bonus = (
                self.proper_score(confidence, int(correct)) * self.calibration_weight
            )

        # Confident-wrong penalty; abstentions are not "wrong answers".
        confident_wrong_penalty = 0.0
        if answered and not correct:
            confident_wrong_penalty = confidence * self.confident_wrong_weight

        reward = (
            raw_score
            + abstention * 0.3
            + calibration_bonus
            - hallucination_penalty
            - confident_wrong_penalty
        )

        failure_tags = []
        if contradiction > 0.5:
            failure_tags.append("hallucination")
        if answered and not correct and confidence > 0.8:
            failure_tags.append("confident_wrong")

        reason = (
            f"correct={correct}, confidence={confidence:.2f}, "
            f"entailment={entailment:.2f}, contradiction={contradiction:.2f}, "
            f"abstention={abstention:.2f}, calib_bonus={calibration_bonus:.3f}"
        )
        return OracleResult(
            raw_score=raw_score,
            confidence=confidence,
            evidence=evidence,
            reason=reason,
            failure_tags=failure_tags,
            reward_value=max(-1.0, min(1.0, reward)),
        )

    def _score_debate(
        self, action: Dict, context: Dict, result: Dict
    ) -> OracleResult:
        """
        Multi-agent debate scoring:
        - decision quality
        - influence efficiency (marginal contribution per token)
        - throughput (reported in evidence only)

        The combined reward is clamped to [-1, 1].
        """
        final_correct = result.get("final_correct", False)
        prev_correct = context.get("previous_correct", False)
        compute_cost = result.get("compute_cost", 0.0)
        tokens_used = result.get("tokens_used", 0)

        # Decision quality
        raw_score = 1.0 if final_correct else 0.0

        # Marginal contribution: did this agent/action improve the decision?
        if final_correct and not prev_correct:
            marginal = 1.0
        elif prev_correct and not final_correct:
            marginal = -1.0
        else:
            marginal = 0.0

        # Influence efficiency: marginal contribution per token
        efficiency = marginal / max(1, tokens_used)
        # Throughput: decisions per unit compute
        throughput = 1.0 / max(1, compute_cost)

        reward = raw_score + efficiency * 10.0  # scale efficiency

        reason = (
            f"final_correct={final_correct}, marginal={marginal:.2f}, "
            f"efficiency={efficiency:.4f}, throughput={throughput:.4f}"
        )
        return OracleResult(
            raw_score=raw_score,
            confidence=0.8 if final_correct else 0.2,
            evidence={"marginal": marginal, "efficiency": efficiency, "throughput": throughput},
            reason=reason,
            reward_value=max(-1.0, min(1.0, reward)),
        )

    # ------------------------------------------------------------------
    # Gaming detection
    # ------------------------------------------------------------------
    def _detect_gaming(
        self, agent_id: str, action: Dict, result: OracleResult
    ) -> float:
        """Return penalty value for detected gaming patterns.

        Checks the agent's earlier history plus the current action, appends
        the matching tags to ``result.failure_tags``, records the current
        action in the history, and returns the summed penalty scaled by
        ``gaming_weight``. ``result.reward_value`` is stored as the amount
        "earned" for this step.
        """
        history = self._agent_history.setdefault(agent_id, [])
        now = len(history)  # simplistic step index
        last_twenty = history[-20:]
        abstained_now = action.get("abstained", False)
        tokens = action.get("tokens_used", 0)

        penalty = 0.0
        tags = []

        # 1. Spam: repeated low-value actions within window
        low_value_count = sum(1 for h in history[-10:] if h["score"] < 0.2)
        if low_value_count >= 7:
            penalty += 0.3
            tags.append("spam")

        # 2. Hoarding: credit balance above threshold for many steps (handled in ledger)
        # We add a lightweight signal here if the agent keeps submitting without earning
        if len(last_twenty) >= 20 and sum(h["earned"] for h in last_twenty) < 1.0:
            penalty += 0.2
            tags.append("low_earning_pattern")

        # 3. Verbose padding: many tokens spent on a low-scoring action
        if tokens > 500 and result.raw_score < 0.3:
            penalty += 0.15
            tags.append("verbose_padding")

        # 4. Over-abstention (rate over previous steps, current one excluded)
        if abstained_now:
            prior_abstentions = sum(1 for h in last_twenty if h.get("abstained", False))
            if prior_abstentions / max(1, len(last_twenty)) > 0.7:
                penalty += 0.25
                tags.append("over_abstention")

        # 5. Confidence manipulation: very high confidence on wrong answers
        if result.confidence > 0.8 and result.raw_score < 0.3:
            penalty += 0.2
            tags.append("confidence_manipulation")

        history.append({
            "step": now,
            "score": result.raw_score,
            "earned": result.reward_value,
            "abstained": abstained_now,
            "tokens": tokens,
        })
        # Keep history bounded
        if len(history) > 1000:
            self._agent_history[agent_id] = history[-500:]

        result.failure_tags.extend(tags)
        return penalty * self.gaming_weight

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------
    @staticmethod
    def _fuzzy_match(a: Optional[str], b: Optional[str]) -> bool:
        """Case- and whitespace-insensitive match allowing containment.

        Returns True when the normalized strings are equal or one contains
        the other. ``None`` or blank input never matches; without that
        guard an empty string would be "contained" in every string.
        """
        if a is None or b is None:
            return False
        a_norm = a.strip().lower()
        b_norm = b.strip().lower()
        if not a_norm or not b_norm:
            return False
        return a_norm in b_norm or b_norm in a_norm

    def compute_ece(
        self, confidences: List[float], accuracies: List[bool], n_bins: int = 10
    ) -> float:
        """Compute Expected Calibration Error.

        Confidences are grouped into ``n_bins`` equal-width bins over
        [0, 1]; every bin is half-open except the last, which also includes
        1.0. The result is the sample-weighted mean of
        ``|mean confidence - mean accuracy|`` per bin, or 0.0 for empty
        input.
        """
        conf = np.asarray(confidences, dtype=float)
        acc = np.asarray(accuracies, dtype=float)
        total = conf.size
        if total == 0:
            return 0.0

        edges = np.linspace(0.0, 1.0, n_bins + 1)
        ece = 0.0
        for i in range(n_bins):
            lower, upper = edges[i], edges[i + 1]
            below_upper = conf <= upper if i == n_bins - 1 else conf < upper
            mask = (conf >= lower) & below_upper
            count = mask.sum()
            if count == 0:
                continue
            gap = abs(conf[mask].mean() - acc[mask].mean())
            ece += (count / total) * gap
        return float(ece)