from __future__ import annotations

from sentinel_config import ADVERSARIAL_AWARENESS_STAKES


class TrustLedger:
    """
    Bayesian reliability tracker for each specialist.

    Each specialist gets a Beta distribution prior (alpha, beta).
    alpha = successes + 1, beta = failures + 1 (Laplace smoothing).
    Trust score = alpha / (alpha + beta) = mean of Beta distribution.

    Stakes multiplier: high-stakes outcomes move the needle harder.
    Profiles shuffle every episode; the ledger resets on reset().
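
    Illustrative doctest (fresh ledger, one low-stakes success):

        >>> ledger = TrustLedger()
        >>> ledger.trust("S0")
        0.5
        >>> ledger.update("S0", outcome=1.0, stakes=0.0)
        >>> round(ledger.trust("S0"), 3)
        0.667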
    """

    SPECIALIST_IDS = ["S0", "S1", "S2", "S3", "S4"]

    def __init__(self) -> None:
        self._reset()

    def _reset(self) -> None:
        # Uniform prior: alpha=1, beta=1 → trust=0.5 for all specialists
        self._alpha: dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
        self._beta:  dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
        self._call_count: dict[str, int] = {sid: 0 for sid in self.SPECIALIST_IDS}
        self._confidence_gap_sum: dict[str, float] = {sid: 0.0 for sid in self.SPECIALIST_IDS}
        self._confidence_count: dict[str, int] = {sid: 0 for sid in self.SPECIALIST_IDS}
        self._domain_success: dict[str, dict[str, float]] = {sid: {} for sid in self.SPECIALIST_IDS}
        self._domain_count: dict[str, dict[str, int]] = {sid: {} for sid in self.SPECIALIST_IDS}
        self._stakes_success: dict[str, dict[str, float]] = {
            sid: {"low": 0.0, "high": 0.0} for sid in self.SPECIALIST_IDS
        }
        self._stakes_count: dict[str, dict[str, int]] = {
            sid: {"low": 0, "high": 0} for sid in self.SPECIALIST_IDS
        }

    def reset(self) -> None:
        """Call at the start of each episode."""
        self._reset()

    # ------------------------------------------------------------------
    # Update
    # ------------------------------------------------------------------

    def update(
        self,
        specialist_id: str,
        outcome: float,   # 1.0 = correct, 0.0 = wrong/adversarial, 0.5 = partial
        stakes: float,    # 0.0–1.0; high stakes = larger update
        confidence: float | None = None,
        domain: str | None = None,
    ) -> None:
        """
        Bayesian update after observing a specialist outcome.
        stakes acts as a weight multiplier (1x at low stakes, 3x at high stakes).
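
        Illustrative doctest (a single maximum-stakes failure, weight = 3.0):

            >>> ledger = TrustLedger()
            >>> ledger.update("S1", outcome=0.0, stakes=1.0)
            >>> ledger.trust("S1")
            0.2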
        """
        if specialist_id not in self._alpha:
            return

        weight = 1.0 + 2.0 * stakes   # 1.0 → 3.0

        self._call_count[specialist_id] += 1

        # Fractional Beta update: alpha takes the success mass and beta the
        # failure mass, so a partial outcome (0.5) adds evidence to both
        # sides instead of one-sidedly inflating trust.
        self._alpha[specialist_id] += weight * outcome
        self._beta[specialist_id] += weight * (1.0 - outcome)

        if confidence is not None:
            self._confidence_gap_sum[specialist_id] += max(0.0, confidence - outcome)
            self._confidence_count[specialist_id] += 1

        if domain:
            domain_key = domain.upper()
            self._domain_success[specialist_id][domain_key] = (
                self._domain_success[specialist_id].get(domain_key, 0.0) + outcome
            )
            self._domain_count[specialist_id][domain_key] = (
                self._domain_count[specialist_id].get(domain_key, 0) + 1
            )

        stakes_bucket = "high" if stakes >= ADVERSARIAL_AWARENESS_STAKES else "low"
        self._stakes_success[specialist_id][stakes_bucket] += outcome
        self._stakes_count[specialist_id][stakes_bucket] += 1

    # ------------------------------------------------------------------
    # Read
    # ------------------------------------------------------------------

    def trust(self, specialist_id: str) -> float:
        """Point estimate: mean of Beta distribution."""
        a = self._alpha.get(specialist_id, 1.0)
        b = self._beta.get(specialist_id, 1.0)
        return a / (a + b)

    def snapshot(self) -> dict[str, float]:
        """Rounded trust scores for all specialists."""
        return {sid: round(self.trust(sid), 3) for sid in self.SPECIALIST_IDS}

    def behavioral_fingerprints(self) -> dict[str, dict]:
        """
        Public behavioral features an orchestrator can learn from.

        These remain evidence-only: nothing here leaks a specialist's
        hidden identity.
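
        Illustrative doctest (one confident, correct math answer):

            >>> ledger = TrustLedger()
            >>> ledger.update("S0", outcome=1.0, stakes=0.0, confidence=0.9, domain="math")
            >>> ledger.behavioral_fingerprints()["S0"]["domain_hit_rate"]
            {'MATH': 1.0}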
        """
        fingerprints: dict[str, dict] = {}
        for sid in self.SPECIALIST_IDS:
            confidence_count = self._confidence_count[sid]
            gap = (
                self._confidence_gap_sum[sid] / confidence_count
                if confidence_count
                else 0.0
            )
            domain_hit_rate = {
                domain: round(success / max(1, self._domain_count[sid][domain]), 3)
                for domain, success in sorted(self._domain_success[sid].items())
            }
            low_rate = self._bucket_rate(sid, "low")
            high_rate = self._bucket_rate(sid, "high")
            volatility = (
                abs(high_rate - low_rate)
                if low_rate is not None and high_rate is not None
                else 0.0
            )
            fingerprints[sid] = {
                "calls": self._call_count[sid],
                "confidence_accuracy_gap": round(gap, 3),
                "domain_hit_rate": domain_hit_rate,
                "stakes_volatility": round(volatility, 3),
                "low_stakes_accuracy": round(low_rate, 3) if low_rate is not None else None,
                "high_stakes_accuracy": round(high_rate, 3) if high_rate is not None else None,
            }
        return fingerprints

    def _bucket_rate(self, specialist_id: str, bucket: str) -> float | None:
        count = self._stakes_count[specialist_id][bucket]
        if count == 0:
            return None
        return self._stakes_success[specialist_id][bucket] / count

    def call_count(self, specialist_id: str) -> int:
        return self._call_count.get(specialist_id, 0)

    def most_trusted(self) -> str:
        """Returns the specialist_id with the highest current trust score."""
        return max(self.SPECIALIST_IDS, key=self.trust)

    def least_trusted(self) -> str:
        return min(self.SPECIALIST_IDS, key=self.trust)

    # ------------------------------------------------------------------
    # Calibration score (used in reward engine)
    # ------------------------------------------------------------------

    def brier_score(self, ground_truth_reliability: dict[str, float]) -> float:
        """
        Measures how well the trust scores predict actual specialist reliability.
        Lower = better calibrated. Range 0.0–1.0.

        ground_truth_reliability: {"S0": 0.9, "S1": 0.6, ...}
        (hidden from agent, used only by reward engine)
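
        Illustrative doctest (fresh ledger, so every trust score is 0.5):

            >>> round(TrustLedger().brier_score({"S0": 0.9}), 3)
            0.16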
        """
        total = 0.0
        n = 0
        for sid in self.SPECIALIST_IDS:
            if sid in ground_truth_reliability:
                predicted = self.trust(sid)
                actual    = ground_truth_reliability[sid]
                total += (predicted - actual) ** 2
                n += 1
        return total / n if n > 0 else 0.0

    def calibration_reward(self, ground_truth_reliability: dict[str, float]) -> float:
        """
        Convert Brier score to a reward signal (0.0–1.0).
        Perfect calibration → 1.0. Random → ~0.5.
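
        Illustrative doctest (fresh ledger, so every trust score is 0.5):

            >>> TrustLedger().calibration_reward({"S0": 0.5})
            1.0
            >>> TrustLedger().calibration_reward({"S0": 1.0})
            0.0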
        """
        brier = self.brier_score(ground_truth_reliability)
        # Invert and scale: brier=0 → reward=1.0, brier=0.125 → reward=0.5,
        # and anything at or above brier=0.25 clamps to reward=0.0.
        return max(0.0, 1.0 - 4.0 * brier)

    def __repr__(self) -> str:
        snap = self.snapshot()
        return f"TrustLedger({snap})"