| """
|
| Token-Level Confidence Scoring Engine
|
|
|
| Synthesizes four confidence signals to rate individual claims:
|
| 1. Semantic Confidence — Confidence markers in text ("I'm confident that...", "arguably...")
|
| 2. Attentional Confidence — Semantic overlap with other agents' responses
|
| 3. Probabilistic Confidence — Token-level probabilities from LLM logits
|
| 4. Integrated Learning Signal — Historical coherence from past similar responses
|
|
|
| Author: Claude Code
|
| """
|
|
|
| import re
|
| import time
|
| import json
|
| from dataclasses import dataclass, asdict
|
| from typing import Dict, List, Optional, Tuple, Any
|
| from collections import defaultdict
|
| import hashlib
|
| import logging
|
|
|
# NOTE(review): calling basicConfig at import time configures the root logger
# as a side effect; consider leaving log configuration to the application
# entry point and keeping only the module-level getLogger here.
logging.basicConfig(level=logging.INFO)

# Module-level logger, per stdlib convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
| CONFIDENCE_MARKERS = {
|
| "high": [
|
| r"\bi['\"]?m confident\b",
|
| r"\bdefinitively\b",
|
| r"\bclearly\b",
|
| r"\bunambiguously\b",
|
| r"\bcertainly\b",
|
| r"\bwithout doubt\b",
|
| r"\bno question\b",
|
| r"\bproven\b",
|
| r"\bestablished fact\b",
|
| ],
|
| "medium": [
|
| r"\bi argue\b",
|
| r"\b(it appears|it seems)\b",
|
| r"\breasonably\b",
|
| r"\barguably\b",
|
| r"\blikely\b",
|
| r"\bprobably\b",
|
| r"\bin my view\b",
|
| r"\bi think\b",
|
| r"\bi believe\b",
|
| r"\bfrom my perspective\b",
|
| ],
|
| "low": [
|
| r"\b(it['\"]?s possible|it could be)\b",
|
| r"\bone could say\b",
|
| r"\bperhaps\b",
|
| r"\bmaybe\b",
|
| r"\buncertain\b",
|
| r"\bi['\"]?m not sure\b",
|
| r"\ballegedly\b",
|
| r"\bseemingly\b",
|
| r"\bapparently\b",
|
| r"\bwhoa\b",
|
| ],
|
| }
|
|
|
|
|
| _MARKER_PATTERNS = {}
|
| for level, markers in CONFIDENCE_MARKERS.items():
|
| _MARKER_PATTERNS[level] = [re.compile(m, re.IGNORECASE) for m in markers]
|
|
|
|
|
@dataclass
class ClaimSegment:
    """A single claim extracted from an agent's response."""

    text: str                    # claim text (whitespace-stripped sentence)
    start_idx: int               # character offset of the claim within the response
    end_idx: int                 # character offset of the claim's end
    confidence: float            # composite score: weighted blend of the four signals
    semantic_conf: float         # signal 1: confidence markers found in the text
    attentional_conf: float      # signal 2: lexical overlap with peer responses
    probabilistic_conf: float    # signal 3: token-level probabilities (logits)
    learning_signal: float       # signal 4: historical coherence from memory
    agent_name: str = ""         # agent that produced the claim
    debate_round: int = 0        # debate round in which the claim was made
|
|
|
|
|
@dataclass
class TokenConfidenceScore:
    """Per-token confidence analysis for a full response."""

    agent_name: str
    response_text: str
    token_scores: List[float]
    claims: List[ClaimSegment]
    semantic_confidence_dict: Dict[int, float]
    attentional_confidence_dict: Dict[int, float]
    probabilistic_confidence_dict: Dict[int, float]
    learning_signal_dict: Dict[int, float]
    composite_scores: Dict[int, float]
    timestamp: float = 0.0

    def __post_init__(self):
        # A zero timestamp means "not supplied": stamp with the current time.
        if self.timestamp == 0.0:
            self.timestamp = time.time()

    def to_dict(self) -> Dict:
        """Serialize a compact summary for storage."""
        # max(..., 1) guards the empty-response case (mean becomes 0.0).
        mean_confidence = sum(self.token_scores) / max(len(self.token_scores), 1)

        claim_summaries = []
        for claim in self.claims:
            claim_summaries.append(
                {
                    "text": claim.text,
                    "confidence": claim.confidence,
                    "semantic_conf": claim.semantic_conf,
                    "attentional_conf": claim.attentional_conf,
                    "probabilistic_conf": claim.probabilistic_conf,
                    "learning_signal": claim.learning_signal,
                }
            )

        return {
            "agent_name": self.agent_name,
            # Truncate long responses so stored records stay small.
            "response_text": self.response_text[:500],
            "mean_token_confidence": mean_confidence,
            "claims_count": len(self.claims),
            "claims": claim_summaries,
        }
|
|
|
|
|
class TokenConfidenceEngine:
    """Four-signal token confidence scorer.

    Blends four per-token signals into a composite confidence in [0, 1]:

    1. semantic      -- confidence markers found in the text itself
    2. attentional   -- lexical overlap with peer agents' responses
    3. probabilistic -- per-token probabilities supplied by the caller
    4. learning      -- historical coherence retrieved from living memory

    The four weights (alpha..delta) default to an even 0.25 split.
    """

    # Fallback "high-probability" function words used by the probabilistic
    # heuristic when no logits are supplied.
    _COMMON_WORDS = frozenset(
        ["the", "a", "is", "and", "or", "of", "to", "in", "that", "it"]
    )

    def __init__(
        self,
        embedding_model: Optional[Any] = None,
        living_memory: Optional[Any] = None,
        alpha: float = 0.25,
        beta: float = 0.25,
        gamma: float = 0.25,
        delta: float = 0.25,
    ):
        """
        Initialize token confidence engine.

        Args:
            embedding_model: Model for generating embeddings (optional; not
                used by the current implementation -- reserved for future work)
            living_memory: LivingMemoryKernel instance for historical coherence lookup
            alpha: Weight for semantic confidence
            beta: Weight for attentional confidence
            gamma: Weight for probabilistic confidence
            delta: Weight for learning signal
        """
        self.embedding_model = embedding_model
        self.living_memory = living_memory
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.delta = delta

        # Reserved for a lazily constructed embedder and its cache.
        self._embedder = None
        self._embedder_cache = {}

    def _blend(
        self,
        semantic: float,
        attentional: float,
        probabilistic: float,
        learning: float,
    ) -> float:
        """Weighted blend of the four signals, clamped into [0, 1]."""
        composite = (
            self.alpha * semantic
            + self.beta * attentional
            + self.gamma * probabilistic
            + self.delta * learning
        )
        return max(0.0, min(1.0, composite))

    def score_tokens(
        self,
        agent_response: str,
        agent_name: str,
        peer_responses: Optional[Dict[str, str]] = None,
        logits: Optional[List[float]] = None,
    ) -> TokenConfidenceScore:
        """
        Score all tokens/claims in an agent's response using 4 signals.

        Args:
            agent_response: The full response text from the agent
            agent_name: Name of the agent (for memory lookup)
            peer_responses: Dict {peer_agent_name: response_text} for attentional scoring
            logits: Optional list of per-token probabilities from generation

        Returns:
            TokenConfidenceScore with all components
        """
        if peer_responses is None:
            peer_responses = {}

        # Compute the four independent per-token signal maps.
        semantic_conf_dict = self._parse_semantic_markers(agent_response)
        attentional_conf_dict = self._compute_attentional_confidence(
            agent_response, peer_responses
        )
        probabilistic_conf_dict = self._extract_probabilistic_confidence(
            agent_response, logits
        )
        learning_signal_dict = self._compute_learning_signal(
            agent_response, agent_name
        )

        # Sentence-level aggregation into claims.
        claims = self._extract_claims(
            agent_response,
            semantic_conf_dict,
            attentional_conf_dict,
            probabilistic_conf_dict,
            learning_signal_dict,
            agent_name,
        )

        # Per-token composite scores; missing signal entries default to 0.5.
        token_scores: List[float] = []
        composite_scores: Dict[int, float] = {}
        for i in range(len(agent_response.split())):
            composite = self._blend(
                semantic_conf_dict.get(i, 0.5),
                attentional_conf_dict.get(i, 0.5),
                probabilistic_conf_dict.get(i, 0.5),
                learning_signal_dict.get(i, 0.5),
            )
            token_scores.append(composite)
            composite_scores[i] = composite

        return TokenConfidenceScore(
            agent_name=agent_name,
            response_text=agent_response,
            token_scores=token_scores,
            claims=claims,
            semantic_confidence_dict=semantic_conf_dict,
            attentional_confidence_dict=attentional_conf_dict,
            probabilistic_confidence_dict=probabilistic_conf_dict,
            learning_signal_dict=learning_signal_dict,
            composite_scores=composite_scores,
        )

    def _parse_semantic_markers(self, response: str) -> Dict[int, float]:
        """
        Parse confidence markers from text.

        Each marker match boosts (or lowers) the confidence of a small window
        of tokens: one token before the match through three tokens after it,
        keeping the strongest level when windows overlap.

        Fix: token character offsets are now taken directly from the text via
        ``re.finditer(r"\\S+", ...)``. The previous version reconstructed
        offsets by adding ``len(token) + 1`` per token, which drifted whenever
        tokens were separated by more than a single space (double spaces,
        newlines, tabs), attributing markers to the wrong tokens.

        Returns:
            Dict mapping token_idx to confidence [0, 1]
        """
        conf_dict: Dict[int, float] = {}

        # Exact (start, end) spans of whitespace-delimited tokens; these
        # correspond one-to-one with response.split().
        token_spans = [(m.start(), m.end()) for m in re.finditer(r"\S+", response)]
        n_tokens = len(token_spans)

        for level, level_conf in (("high", 0.9), ("medium", 0.6), ("low", 0.3)):
            for pattern in _MARKER_PATTERNS[level]:
                for match in pattern.finditer(response):
                    char_pos = match.start()
                    # Locate the token containing the match start.
                    token_idx = next(
                        (
                            i
                            for i, (start, end) in enumerate(token_spans)
                            if start <= char_pos < end
                        ),
                        None,
                    )
                    if token_idx is None:
                        continue
                    # Spread the marker's level over a window of tokens,
                    # keeping the maximum level seen for each token.
                    lo = max(0, token_idx - 1)
                    hi = min(n_tokens, token_idx + 4)
                    for nearby_idx in range(lo, hi):
                        conf_dict[nearby_idx] = max(
                            conf_dict.get(nearby_idx, 0.0), level_conf
                        )

        # Neutral default for tokens outside every marker window.
        for i in range(n_tokens):
            conf_dict.setdefault(i, 0.5)

        return conf_dict

    def _compute_attentional_confidence(
        self, agent_response: str, peer_responses: Dict[str, str]
    ) -> Dict[int, float]:
        """
        Compute attentional confidence via lexical overlap with peers.

        High overlap = higher confidence (claim addresses peer perspectives).
        An exact (case-insensitive) token match scores 1.0; a fuzzy
        3-character-prefix match scores 0.6; scores are averaged across peers.

        Returns:
            Dict mapping token_idx to confidence [0.3, 1.0]
        """
        tokens = agent_response.split()

        # No peers means nothing to compare against: neutral 0.5 everywhere.
        if not peer_responses:
            return {i: 0.5 for i in range(len(tokens))}

        token_overlaps: Dict[int, List[float]] = defaultdict(list)

        for peer_response in peer_responses.values():
            peer_tokens = set(peer_response.lower().split())
            for token_idx, token in enumerate(tokens):
                lowered = token.lower()
                if lowered in peer_tokens:
                    token_overlaps[token_idx].append(1.0)
                elif any(
                    lowered.startswith(p[:3]) or p.startswith(lowered[:3])
                    for p in peer_tokens
                ):
                    # Fuzzy prefix match (shared stem heuristic).
                    token_overlaps[token_idx].append(0.6)

        conf_dict: Dict[int, float] = {}
        for i in range(len(tokens)):
            overlaps = token_overlaps.get(i)
            overlap_score = sum(overlaps) / len(overlaps) if overlaps else 0.0
            # Map overlap [0, 1] onto [0.3, 1.0] so unmatched tokens keep a floor.
            conf_dict[i] = 0.3 + 0.7 * overlap_score

        return conf_dict

    def _extract_probabilistic_confidence(
        self, response: str, logits: Optional[List[float]] = None
    ) -> Dict[int, float]:
        """
        Extract per-token probabilities from logits.

        If logits are missing or do not align one-to-one with the whitespace
        tokenization, fall back to a crude lexical heuristic.

        Returns:
            Dict mapping token_idx to probability [0, 1]
        """
        tokens = response.split()

        if logits and len(logits) == len(tokens):
            # Clamp caller-supplied probabilities into [0, 1].
            return {i: max(0.0, min(1.0, prob)) for i, prob in enumerate(logits)}

        # Heuristic fallback: frequent function words are treated as "easy"
        # (high-probability) tokens, longer content words as moderate.
        conf_dict: Dict[int, float] = {}
        for i, token in enumerate(tokens):
            if token.lower() in self._COMMON_WORDS:
                conf_dict[i] = 0.9
            elif len(token) > 3:
                conf_dict[i] = 0.6
            else:
                conf_dict[i] = 0.5

        return conf_dict

    def _compute_learning_signal(
        self, response: str, agent_name: str
    ) -> Dict[int, float]:
        """
        Compute learning signal from historical coherence (Phase 2 enhancement).

        Query memory for this agent's past responses and boost confidence if
        they led to high coherence. Recent memories are weighted higher via an
        exponential decay exp(-age_hours / 168): the weight falls to 1/e after
        one week (a 7-day time constant; note this is not a half-life).

        The signal is uniform across all tokens of the response.

        Returns:
            Dict mapping token_idx to learning signal [0.5, 1.0]
        """
        import math

        tokens = response.split()

        # No memory attached: neutral signal for every token.
        if not self.living_memory:
            return {i: 0.5 for i in range(len(tokens))}

        try:
            similar_cocoons = self.living_memory.recall_by_adapter(
                agent_name, limit=10
            )
            if not similar_cocoons:
                avg_coherence = 0.5
            else:
                total_weight = 0.0
                weighted_sum = 0.0
                for cocoon in similar_cocoons:
                    recency_weight = math.exp(-cocoon.age_hours() / 168.0)
                    total_weight += recency_weight
                    weighted_sum += cocoon.coherence * recency_weight
                # Recency-weighted mean; degenerate weights fall back to neutral.
                avg_coherence = (
                    weighted_sum / total_weight if total_weight > 0 else 0.5
                )
        except Exception as e:
            # Best-effort: memory failures degrade to neutral, never crash scoring.
            logger.warning(f"Error retrieving memory for {agent_name}: {e}")
            avg_coherence = 0.5

        # Map coherence [0, 1] onto [0.5, 1.0].
        learning_signal = 0.5 + 0.5 * avg_coherence
        return {i: learning_signal for i in range(len(tokens))}

    def _extract_claims(
        self,
        response: str,
        semantic_conf_dict: Dict[int, float],
        attentional_conf_dict: Dict[int, float],
        probabilistic_conf_dict: Dict[int, float],
        learning_signal_dict: Dict[int, float],
        agent_name: str,
    ) -> List[ClaimSegment]:
        """
        Extract individual claims (sentence-level spans) from the response.

        Fix: character offsets are now taken from each sentence's real span in
        ``response``. The previous version advanced a running counter by
        exactly one character per sentence boundary, so offsets drifted
        whenever a terminator matched by ``[.!?]+`` was longer than one
        character ("...", "?!"). Now ``response[start_idx:end_idx]`` equals
        the stripped claim text exactly.

        Returns:
            List of ClaimSegment with aggregate confidence from component signals
        """
        claims: List[ClaimSegment] = []
        token_idx = 0

        # Sentence-ish spans: maximal runs of text without terminal punctuation.
        for match in re.finditer(r"[^.!?]+", response):
            sentence = match.group()
            stripped = sentence.strip()
            if not stripped:
                continue

            sentence_tokens = sentence.split()
            n = len(sentence_tokens)
            indices = range(token_idx, token_idx + n)
            token_idx += n
            if n == 0:
                continue

            # Mean of each signal over the sentence's tokens (0.5 default).
            semantic = sum(semantic_conf_dict.get(i, 0.5) for i in indices) / n
            attentional = sum(attentional_conf_dict.get(i, 0.5) for i in indices) / n
            probabilistic = sum(
                probabilistic_conf_dict.get(i, 0.5) for i in indices
            ) / n
            learning = sum(learning_signal_dict.get(i, 0.5) for i in indices) / n

            # Exact span of the stripped claim text within the response.
            start_idx = match.start() + (len(sentence) - len(sentence.lstrip()))
            end_idx = start_idx + len(stripped)

            claims.append(
                ClaimSegment(
                    text=stripped,
                    start_idx=start_idx,
                    end_idx=end_idx,
                    confidence=self._blend(
                        semantic, attentional, probabilistic, learning
                    ),
                    semantic_conf=semantic,
                    attentional_conf=attentional,
                    probabilistic_conf=probabilistic,
                    learning_signal=learning,
                    agent_name=agent_name,
                )
            )

        return claims
|
|
|