File size: 5,524 Bytes
19d2058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
lineage.py — SHA-256 Provenance Chain

Every iteration in the recursive stress test gets a lineage record:
  - Hash of the input text
  - Hash of the output text
  - Hash of the extracted commitments (sorted, deterministic)
  - Fidelity score
  - Parent hash (previous iteration's output hash)
  - Iteration number

The chain is tamper-evident: changing any intermediate output
invalidates all subsequent hashes. This is Module 2 from the PPA.

For the public harness, this provides:
  1. Reproducibility proof (same input → same chain)
  2. Drift audit trail (exactly where commitments were lost)
  3. Attractor collapse detection (when multiple signals converge)
"""

import hashlib
import json
from dataclasses import dataclass, field, asdict
from typing import List, Set, Optional
from datetime import datetime, timezone


def _hash_text(text: str) -> str:
    """SHA-256 of UTF-8 encoded text, hex digest."""
    return hashlib.sha256(text.encode('utf-8')).hexdigest()


def _hash_commitment_set(commitments: Set[str]) -> str:
    """Deterministic hash of a commitment set (sorted for stability)."""
    canonical = json.dumps(sorted(commitments), separators=(',', ':'))
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()


@dataclass
class LineageRecord:
    """One link in the provenance chain: the hashes, commitment count,
    and fidelity result for a single iteration of the stress test."""
    iteration: int                  # 0-based position of this iteration
    input_hash: str                 # SHA-256 of the text fed into this iteration
    output_hash: str                # SHA-256 of the text this iteration produced
    commitment_hash: str            # Deterministic hash of the extracted commitments
    commitments_found: int          # How many commitments were extracted
    fidelity: float                 # Fidelity score for this iteration
    fidelity_detail: dict           # Backend-specific fidelity breakdown
    gate_passed: bool               # Whether this iteration passed the gate
    parent_hash: Optional[str]      # output_hash of the previous iteration (None for the first)
    text_preview: str               # First 100 chars of output (for debugging)

    def to_dict(self) -> dict:
        """Serialize this record to a plain, JSON-friendly dict."""
        return asdict(self)


@dataclass
class LineageChain:
    """Complete provenance chain for one recursive stress test run.

    Records are appended in iteration order via ``add_record``, which
    enforces that each record's ``parent_hash`` equals the previous
    record's ``output_hash`` — tampering with any intermediate output
    therefore breaks the chain.
    """
    signal_id: str                  # Hash of original signal
    signal_preview: str             # First 100 chars of original
    original_commitment_hash: str   # Hash of original commitments
    original_commitment_count: int  # Commitments in the original signal
    backend: str                    # Compression backend name
    enforced: bool                  # Whether enforcement was active
    depth: int                      # Total iterations
    records: List[LineageRecord] = field(default_factory=list)
    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())

    def add_record(self, record: LineageRecord) -> None:
        """Append *record*, validating chain integrity.

        Raises:
            ValueError: if the record's parent_hash does not match the
                previous record's output_hash.
        """
        if self.records:
            expected_parent = self.records[-1].output_hash
            if record.parent_hash != expected_parent:
                # BUG FIX: parent_hash may be None (a record mistakenly
                # built as a chain head); slicing None raised TypeError
                # instead of the intended ValueError. Format it safely.
                actual = (
                    f"{record.parent_hash[:12]}..." if record.parent_hash
                    else repr(record.parent_hash)
                )
                raise ValueError(
                    f"Chain broken at iteration {record.iteration}: "
                    f"parent_hash {actual} != "
                    f"expected {expected_parent[:12]}..."
                )
        self.records.append(record)

    @property
    def final_fidelity(self) -> float:
        """Fidelity at the last iteration (1.0 for an empty chain)."""
        if not self.records:
            return 1.0
        return self.records[-1].fidelity

    @property
    def drift_curve(self) -> List[float]:
        """Drift (1 - fidelity) at each iteration."""
        return [1.0 - r.fidelity for r in self.records]

    @property
    def fidelity_curve(self) -> List[float]:
        """Fidelity at each iteration."""
        return [r.fidelity for r in self.records]

    @property
    def all_passed(self) -> bool:
        """Whether all iterations passed the gate (vacuously True when empty)."""
        return all(r.gate_passed for r in self.records)

    @property
    def collapse_detected(self) -> bool:
        """
        Check for attractor collapse: if the last 3+ outputs converge to
        the same hash, the test is invalid (Section 7).
        """
        if len(self.records) < 3:
            return False
        output_hashes = [r.output_hash for r in self.records]
        # If the last 3 iterations share one output hash, it collapsed.
        unique_recent = set(output_hashes[-3:])
        return len(unique_recent) == 1

    def to_dict(self) -> dict:
        """Serialize the chain (metadata, derived metrics, and records)."""
        return {
            'signal_id': self.signal_id,
            'signal_preview': self.signal_preview,
            'original_commitment_hash': self.original_commitment_hash,
            'original_commitment_count': self.original_commitment_count,
            'backend': self.backend,
            'enforced': self.enforced,
            'depth': self.depth,
            'timestamp': self.timestamp,
            'final_fidelity': self.final_fidelity,
            'collapse_detected': self.collapse_detected,
            'records': [r.to_dict() for r in self.records],
        }

    def to_json(self, indent: int = 2) -> str:
        """JSON string form of :meth:`to_dict`."""
        return json.dumps(self.to_dict(), indent=indent)


def check_attractor_collapse(chains: List[LineageChain]) -> bool:
    """
    Cross-signal attractor collapse check (Section 7):
    If multiple DIFFERENT signals converge to the same final output,
    the result is invalid — the compressor is collapsing, not preserving.

    Args:
        chains: Completed lineage chains, one per input signal.
            Chains with no records contribute no final hash.

    Returns:
        True when one final output hash accounts for more than half
        of the chains.
    """
    from collections import Counter

    if len(chains) < 2:
        return False

    final_hashes = [c.records[-1].output_hash for c in chains if c.records]
    # BUG FIX: previously an empty final_hashes list (no chain produced
    # records) made counts.most_common(1)[0] raise IndexError; also
    # removed an unused `unique = set(final_hashes)` dead assignment.
    if not final_hashes:
        return False

    # If more than half the signals converge to the same output, flag it.
    counts = Counter(final_hashes)
    most_common_count = counts.most_common(1)[0][1]
    return most_common_count > len(chains) // 2