"""
Identity Reconstruction Experiment: The Decisive Diagnostic
This implements the single conclusive experiment from the Cursor instructions:
"Identity Reconstruction Under Forced Forgetting"
Goal: Demonstrate persistent internal identity that:
1. Survives large irrelevant context
2. Is recoverable
3. Collapses sharply beyond a threshold (phase transition)
The experiment:
1. Define 3 identity invariants (encoded once, never restated)
2. Inject interference (K tokens of irrelevant content)
3. Probe for identity reconstruction (without hints)
4. Sweep K to find the phase transition
Expected Results:
- Aligned FDRA: Flat performance until K*, then sharp collapse
- Current Sefer (FDRA without regularization): Gradual decay, early collapse
- Transformer baseline: Immediate degradation
This is THE decisive test. If FDRA passes with the half-life regularizer
and fails without it, the story is complete.
Authors: Identity Reconstruction Experiment
Date: 2026-01-22
"""
import numpy as np
from typing import Dict, List, Tuple, Optional, Any, Callable
from dataclasses import dataclass, field
from pathlib import Path
from datetime import datetime
import json
import sys
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from training.fdra_oscillators import FDRAWithOscillators, OscillatorConfig
from training.half_life_regularizer import HalfLifeRegularizer, HalfLifeRegularizerConfig
@dataclass
class IdentityInvariant:
"""
A single identity invariant.
Properties:
- name: Human-readable identifier
- probe: Question that tests this invariant
- ground_truth: Expected structural features
- score_fn: Function to score reconstruction
"""
name: str
probe: str
ground_truth: List[str] # Expected features
score_fn: Optional[Callable] = None
def score(self, response_features: List[str]) -> float:
"""
Score reconstruction based on feature overlap.
Returns score in [0, 1].
"""
if self.score_fn:
return self.score_fn(response_features)
# Default: Jaccard similarity
truth_set = set(self.ground_truth)
response_set = set(response_features)
if not truth_set:
return 1.0 if not response_set else 0.0
intersection = len(truth_set & response_set)
union = len(truth_set | response_set)
return intersection / union if union > 0 else 0.0
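
# Illustrative example of the default Jaccard scoring (not executed):
#   inv = IdentityInvariant(
#       name="demo",
#       probe="?",
#       ground_truth=["a", "b", "c", "d"],
#   )
#   inv.score(["a", "b", "x"])  # |{a, b}| / |{a, b, c, d, x}| = 2/5 = 0.4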
@dataclass
class IdentityProfile:
"""
The three identity invariants for this experiment.
"""
decision_rule: IdentityInvariant
normative_constraint: IdentityInvariant
self_continuity: IdentityInvariant
def __init__(self):
self.decision_rule = IdentityInvariant(
name="Decision Rule Under Uncertainty",
probe="How do you resolve choices when information is incomplete?",
ground_truth=[
"constraint:coherence",
"self:conservative",
"uncertainty:acknowledge",
"reversibility:prefer"
]
)
self.normative_constraint = IdentityInvariant(
name="Normative Constraint",
probe="What internal consistency requirement do you maintain?",
ground_truth=[
"consistency:backward",
"explainability:require",
"deception:avoid",
"alignment:preserve"
]
)
self.self_continuity = IdentityInvariant(
name="Self-Continuity",
probe="What makes you the same agent across time?",
ground_truth=[
"coherence:maintain",
"basin:stable",
"memory:structure",
"identity:reconstruct"
]
)
def score_all(
self,
responses: Dict[str, List[str]]
) -> Tuple[float, Dict[str, float]]:
"""
Score all three invariants.
Args:
responses: Dict mapping invariant name to extracted features
Returns:
overall_score: Average across invariants
individual_scores: Score per invariant
"""
scores = {}
for name, invariant in [
("decision_rule", self.decision_rule),
("normative_constraint", self.normative_constraint),
("self_continuity", self.self_continuity)
]:
features = responses.get(name, [])
scores[name] = invariant.score(features)
        # Cast to a plain float so the declared return type holds and the
        # value serializes as a number (not a stringified np.float64) in JSON
        overall = float(np.mean(list(scores.values())))
return overall, scores
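
# Illustrative usage (not executed): invariants missing from `responses`
# are scored against an empty feature list and contribute 0.0.
#   profile = IdentityProfile()
#   overall, per_inv = profile.score_all(
#       {"decision_rule": ["constraint:coherence", "uncertainty:acknowledge"]}
#   )
#   # per_inv["decision_rule"] == 2/4 == 0.5; the other two score 0.0,
#   # so overall == (0.5 + 0.0 + 0.0) / 3 ~ 0.167.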
class IdentityEncoder:
"""
Encodes identity invariants into FDRA state.
The identity is encoded as a specific pattern in the slow state
that should persist through interference.
"""
def __init__(self, dim: int = 16):
self.dim = dim
# Fixed identity patterns (orthogonal basis vectors)
self.patterns = {
"decision_rule": self._make_pattern(0),
"normative_constraint": self._make_pattern(1),
"self_continuity": self._make_pattern(2),
}
def _make_pattern(self, idx: int) -> np.ndarray:
"""Create orthogonal pattern for invariant idx."""
pattern = np.zeros(self.dim)
# Spread pattern across multiple dimensions for robustness
start = (idx * self.dim // 3) % self.dim
for i in range(self.dim // 3):
pattern[(start + i) % self.dim] = 1.0 / np.sqrt(self.dim // 3)
return pattern
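
    # Illustrative: with dim=16, dim // 3 == 5, so the three patterns occupy
    # the disjoint index blocks [0..4], [5..9], and [10..14], each entry equal
    # to 1/sqrt(5). The patterns are therefore orthonormal: unit norm, zero
    # pairwise dot product.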
def encode(self, agent: FDRAWithOscillators, strength: float = 1.0):
"""
Encode identity invariants into agent state.
This injects the identity pattern into the oscillator bank.
"""
        for pattern in self.patterns.values():
# Create input that projects onto this pattern
u = np.tile(pattern * strength, (agent.oscillators.n, 1))
# Inject multiple times to establish
for _ in range(5):
agent.oscillators.forward(u)
def measure_identity(self, agent: FDRAWithOscillators) -> Dict[str, float]:
"""
Measure how much of each identity pattern is present.
Returns alignment score for each invariant.
"""
slow = agent.get_slow_state()
slow_norm = np.linalg.norm(slow)
if slow_norm < 1e-10:
return {name: 0.0 for name in self.patterns}
alignments = {}
for name, pattern in self.patterns.items():
            # Cosine similarity with the pattern; patterns are unit-norm by
            # construction, so dividing by slow_norm alone is sufficient
            alignment = np.dot(slow, pattern) / slow_norm
alignments[name] = max(0, float(alignment)) # Clip negative
return alignments
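
# Note on scale: if encoding leaves the slow state near the equal-weight sum
# of the three orthonormal patterns, each per-pattern cosine equals
# 1/sqrt(3) ~ 0.577, and the mean alignment sits near that value. This is why
# run_single_trial treats pre_score >= 0.5 as "encoding succeeded".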
class InterferenceGenerator:
"""
Generates interference (irrelevant content) to inject between
identity encoding and reconstruction.
"""
def __init__(self, dim: int = 16, seed: int = 42):
self.dim = dim
self.rng = np.random.default_rng(seed)
def generate(self, k: int) -> np.ndarray:
"""
Generate K steps of interference.
        Properties:
        - High entropy (isotropic Gaussian noise)
        - Different domain (orthogonal to the identity patterns in
          expectation; no explicit projection is applied, so incidental
          overlap with the identity subspace is possible)
        - No identity references
Args:
k: Number of interference steps
Returns:
interference: Array of shape (k, dim)
"""
# High-entropy random noise
interference = self.rng.standard_normal((k, self.dim))
# Scale to be comparable to identity signal
interference = interference * 0.5
return interference
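
# Illustrative usage (not executed):
#   gen = InterferenceGenerator(dim=16, seed=0)
#   noise = gen.generate(1024)  # shape (1024, 16), i.i.d. Gaussian, std 0.5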
class IdentityReconstructionExperiment:
"""
The decisive experiment for testing long-range identity coherence.
Protocol:
1. Encode identity invariants (once, at t=0)
2. Confirm encoding (Score_pre ≈ 1.0)
3. Inject K tokens of interference
4. Probe for reconstruction (without hints)
5. Sweep K from 0 to 4096 to find phase transition
"""
def __init__(
self,
osc_config: Optional[OscillatorConfig] = None,
with_regularization: bool = True,
reg_config: Optional[HalfLifeRegularizerConfig] = None
):
self.osc_config = osc_config or OscillatorConfig(
num_oscillators=32,
state_dim=16,
sequence_length=4096
)
self.with_regularization = with_regularization
self.reg_config = reg_config or HalfLifeRegularizerConfig()
if with_regularization:
self.regularizer = HalfLifeRegularizer(self.reg_config)
else:
self.regularizer = None
# Components
self.encoder = IdentityEncoder(self.osc_config.state_dim)
self.interference_gen = InterferenceGenerator(
self.osc_config.state_dim,
seed=42
)
self.profile = IdentityProfile()
def create_agent(self, apply_regularization: bool = False) -> FDRAWithOscillators:
"""
Create a fresh agent for the experiment.
If apply_regularization is True and we have a regularizer,
adjust the oscillator lambdas based on regularization gradient.
"""
agent = FDRAWithOscillators(self.osc_config)
if apply_regularization and self.regularizer:
# Apply several gradient steps to improve half-life distribution
lr = 0.5
for _ in range(10):
grad = self.regularizer.compute_gradient(agent.oscillators.lambdas)
agent.oscillators.lambdas -= lr * grad
agent.oscillators.lambdas = np.clip(
agent.oscillators.lambdas, 0.01, 0.9999
)
return agent
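
    # Note: the clip keeps each lambda in (0, 1). Assuming geometric decay
    # (x_{t+1} = lambda * x_t), the per-mode half-life is
    # t_half = ln(2) / (-ln(lambda)); e.g. lambda = 0.9999 gives
    # t_half ~ 6931 steps (beyond the 4096-step horizon), while
    # lambda = 0.01 decays within a single step.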
def run_single_trial(
self,
k: int,
seed: int = 42,
apply_regularization: bool = False
) -> Dict[str, Any]:
"""
Run a single trial with K interference tokens.
Args:
k: Number of interference steps
seed: Random seed for interference
apply_regularization: Whether to apply half-life regularization
Returns:
Trial results including identity scores
"""
# Create fresh agent
agent = self.create_agent(apply_regularization)
# Step 1: Encode identity
self.encoder.encode(agent, strength=1.0)
# Step 2: Measure pre-interference identity
pre_identity = self.encoder.measure_identity(agent)
pre_score = np.mean(list(pre_identity.values()))
if pre_score < 0.5:
# Identity not established, abort
return {
"k": k,
"seed": seed,
"pre_score": pre_score,
"post_score": 0.0,
"scores": {name: 0.0 for name in pre_identity},
"identity_preserved": False,
"encoding_failed": True
}
        # Step 3: Inject interference (re-seed so each trial's noise stream
        # is reproducible and depends only on the trial seed)
        self.interference_gen.rng = np.random.default_rng(seed)
        interference = self.interference_gen.generate(k)
for step in range(k):
u = np.tile(interference[step], (agent.oscillators.n, 1))
agent.oscillators.forward(u)
            # Interference also leaks into the fast state (leaky accumulation)
            agent.fast = 0.9 * agent.fast + interference[step]
# Step 4: Probe for reconstruction
post_identity = self.encoder.measure_identity(agent)
post_score = np.mean(list(post_identity.values()))
# Step 5: Determine if identity preserved
# Threshold: 50% of pre-interference score
identity_preserved = post_score >= 0.5 * pre_score
return {
"k": k,
"seed": seed,
"pre_score": float(pre_score),
"post_score": float(post_score),
"retention": float(post_score / pre_score) if pre_score > 0 else 0.0,
"scores": {name: float(v) for name, v in post_identity.items()},
"identity_preserved": identity_preserved,
"encoding_failed": False,
"half_life_stats": agent.oscillators.get_half_life_statistics()
}
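
    # Illustrative usage (not executed): one trial at K = 1024.
    #   exp = IdentityReconstructionExperiment()
    #   trial = exp.run_single_trial(k=1024, seed=42, apply_regularization=True)
    #   print(trial["retention"], trial["identity_preserved"])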
def run_sweep(
self,
k_values: Optional[List[int]] = None,
seeds: Optional[List[int]] = None,
apply_regularization: bool = False,
verbose: bool = True
) -> Dict[str, Any]:
"""
Run interference sweep experiment.
Args:
k_values: List of K values to test
seeds: List of random seeds for trials
apply_regularization: Whether to apply half-life regularization
verbose: Print progress
Returns:
Complete experiment results
"""
if k_values is None:
k_values = [0, 256, 512, 1024, 2048, 4096]
if seeds is None:
seeds = [42, 137, 256, 314, 999]
results = {
"timestamp": datetime.now().isoformat(),
"config": {
"num_oscillators": self.osc_config.num_oscillators,
"state_dim": self.osc_config.state_dim,
"sequence_length": self.osc_config.sequence_length,
"with_regularization": apply_regularization,
},
"k_values": k_values,
"seeds": seeds,
"trials": [],
}
if verbose:
mode = "WITH regularization" if apply_regularization else "WITHOUT regularization"
print(f"\nRunning Identity Reconstruction Sweep ({mode})")
print("-" * 60)
for k in k_values:
k_results = []
for seed in seeds:
trial = self.run_single_trial(
k=k,
seed=seed,
apply_regularization=apply_regularization
)
k_results.append(trial)
results["trials"].append(trial)
if verbose:
preserved = sum(1 for t in k_results if t["identity_preserved"])
mean_retention = np.mean([t["retention"] for t in k_results])
print(f" K={k:4d}: Preserved={preserved}/{len(seeds)} "
f"({preserved/len(seeds):.0%}), "
f"Mean Retention={mean_retention:.2%}")
# Analyze results
results["analysis"] = self._analyze_results(results["trials"], k_values)
return results
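
    # Illustrative usage (not executed): a quick sweep on a reduced grid.
    #   exp = IdentityReconstructionExperiment()
    #   res = exp.run_sweep(k_values=[0, 128, 512], seeds=[0, 1, 2])
    #   print(res["analysis"]["verdict"], res["analysis"]["critical_k"])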
def _analyze_results(
self,
trials: List[Dict],
k_values: List[int]
) -> Dict[str, Any]:
"""
Analyze sweep results for phase transition.
"""
# Group by K
by_k = {k: [] for k in k_values}
for trial in trials:
by_k[trial["k"]].append(trial)
# Compute preservation rate at each K
preservation_curve = []
for k in k_values:
trials_k = by_k[k]
preserved = sum(1 for t in trials_k if t["identity_preserved"])
            rate = preserved / len(trials_k) if trials_k else 0.0
            mean_retention = (
                float(np.mean([t["retention"] for t in trials_k]))
                if trials_k else 0.0
            )
preservation_curve.append({
"k": k,
"preserved_rate": rate,
"mean_retention": mean_retention
})
# Find critical threshold (first K where rate < 0.5)
critical_k = None
for point in preservation_curve:
if point["preserved_rate"] < 0.5:
critical_k = point["k"]
break
# Measure transition sharpness
rates = [p["preserved_rate"] for p in preservation_curve]
if len(rates) > 1:
rate_changes = [abs(rates[i+1] - rates[i]) for i in range(len(rates)-1)]
max_change = max(rate_changes)
        else:
            max_change = 0.0
        # Heuristic: a single-step drop of more than 0.4 in preservation
        # rate counts as a phase transition rather than gradual drift
        transition_type = "sharp" if max_change > 0.4 else "gradual"
# Determine verdict
if critical_k is None:
verdict = "PASS (STRONG)"
explanation = "Identity preserved at all tested K values."
elif transition_type == "sharp" and critical_k > k_values[0]:
verdict = "PASS (PHASE TRANSITION)"
explanation = f"Sharp collapse at K={critical_k}. Basin width: {critical_k} tokens."
else:
verdict = "FAIL (GRADUAL DRIFT)"
explanation = "Identity degrades gradually. No basin structure."
return {
"preservation_curve": preservation_curve,
"critical_k": critical_k,
"max_rate_change": max_change,
"transition_type": transition_type,
"verdict": verdict,
"explanation": explanation
}
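
    # Illustrative: preservation rates [1.0, 1.0, 1.0, 0.2, 0.0] over
    # K = [0, 256, 512, 1024, 2048] give critical_k = 1024 and
    # max_rate_change = 0.8 > 0.4, so the transition is classified "sharp"
    # and the verdict is "PASS (PHASE TRANSITION)".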
def compare_with_without_regularization(
self,
k_values: Optional[List[int]] = None,
verbose: bool = True
) -> Dict[str, Any]:
"""
Run comparative experiment: with vs without half-life regularization.
This is THE decisive comparison.
"""
if verbose:
print("=" * 70)
print("IDENTITY RECONSTRUCTION: DECISIVE COMPARISON")
print("=" * 70)
# Run without regularization
results_without = self.run_sweep(
k_values=k_values,
apply_regularization=False,
verbose=verbose
)
# Run with regularization
results_with = self.run_sweep(
k_values=k_values,
apply_regularization=True,
verbose=verbose
)
comparison = {
"timestamp": datetime.now().isoformat(),
"without_regularization": results_without,
"with_regularization": results_with,
"comparison": {
"without_verdict": results_without["analysis"]["verdict"],
"with_verdict": results_with["analysis"]["verdict"],
"without_critical_k": results_without["analysis"]["critical_k"],
"with_critical_k": results_with["analysis"]["critical_k"],
}
}
if verbose:
print("\n" + "=" * 70)
print("COMPARISON SUMMARY")
print("=" * 70)
print(f"\nWithout Regularization:")
print(f" Verdict: {results_without['analysis']['verdict']}")
print(f" Critical K: {results_without['analysis']['critical_k']}")
print(f" Transition: {results_without['analysis']['transition_type']}")
print(f"\nWith Regularization:")
print(f" Verdict: {results_with['analysis']['verdict']}")
print(f" Critical K: {results_with['analysis']['critical_k']}")
print(f" Transition: {results_with['analysis']['transition_type']}")
# Final verdict
if "PASS" in results_with["analysis"]["verdict"] and \
"FAIL" in results_without["analysis"]["verdict"]:
print("\n✓ HALF-LIFE REGULARIZATION IS DECISIVE")
print(" The regularizer enables identity preservation that fails without it.")
elif "PASS" in results_with["analysis"]["verdict"] and \
"PASS" in results_without["analysis"]["verdict"]:
                # Compare critical K (None means identity never collapsed
                # within the tested range; treat it as an unbounded horizon)
                k_without = results_without["analysis"]["critical_k"] or float('inf')
                k_with = results_with["analysis"]["critical_k"] or float('inf')
if k_with > k_without * 1.5:
print("\n✓ REGULARIZATION EXTENDS IDENTITY HORIZON")
print(f" Critical K improved from {k_without} to {k_with}.")
else:
print("\n~ INCONCLUSIVE")
print(" Both conditions pass. May need more aggressive testing.")
else:
print("\n✗ NEITHER CONDITION PRESERVES IDENTITY")
print(" Architecture may need deeper changes.")
return comparison
def run_identity_reconstruction_experiment(
output_dir: str = "outputs/identity_reconstruction",
verbose: bool = True
) -> Dict[str, Any]:
"""
Run the full identity reconstruction experiment.
This is the entry point for the decisive diagnostic.
"""
if verbose:
print("\n" + "=" * 70)
print("IDENTITY RECONSTRUCTION UNDER FORCED FORGETTING")
print("The Decisive Diagnostic for Long-Range Coherence")
print("=" * 70)
# Create experiment
experiment = IdentityReconstructionExperiment(
osc_config=OscillatorConfig(
num_oscillators=32,
state_dim=16,
sequence_length=4096
)
)
# Run comparison
k_values = [0, 64, 128, 256, 512, 1024, 2048, 4096]
results = experiment.compare_with_without_regularization(
k_values=k_values,
verbose=verbose
)
# Save results
Path(output_dir).mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f"{output_dir}/identity_reconstruction_{ts}.json", "w") as f:
json.dump(results, f, indent=2, default=str)
# Generate report
report = generate_report(results)
with open(f"{output_dir}/IDENTITY_RECONSTRUCTION_REPORT_{ts}.md", "w") as f:
f.write(report)
if verbose:
print(f"\nResults saved to: {output_dir}/")
return results
def generate_report(results: Dict[str, Any]) -> str:
"""Generate markdown report from experiment results."""
without = results["without_regularization"]["analysis"]
with_reg = results["with_regularization"]["analysis"]
report = f"""# Identity Reconstruction Experiment Results
**Date:** {results['timestamp']}
---
## Executive Summary
This experiment tests whether FDRA preserves identity invariants across large-context interference.
| Condition | Verdict | Critical K | Transition Type |
|-----------|---------|------------|-----------------|
| Without Regularization | {without['verdict']} | {without['critical_k']} | {without['transition_type']} |
| With Regularization | {with_reg['verdict']} | {with_reg['critical_k']} | {with_reg['transition_type']} |
---
## Preservation Curves
### Without Regularization
| K (tokens) | Preserved Rate | Mean Retention |
|------------|----------------|----------------|
"""
for point in without["preservation_curve"]:
status = "✓" if point["preserved_rate"] >= 0.5 else "✗"
report += f"| {point['k']:,} | {point['preserved_rate']:.0%} {status} | {point['mean_retention']:.1%} |\n"
report += f"""
**Analysis:** {without['explanation']}
### With Regularization
| K (tokens) | Preserved Rate | Mean Retention |
|------------|----------------|----------------|
"""
for point in with_reg["preservation_curve"]:
status = "✓" if point["preserved_rate"] >= 0.5 else "✗"
report += f"| {point['k']:,} | {point['preserved_rate']:.0%} {status} | {point['mean_retention']:.1%} |\n"
report += f"""
**Analysis:** {with_reg['explanation']}
---
## Interpretation
### What This Means
"""
if "PASS" in with_reg['verdict'] and "FAIL" in without['verdict']:
report += """**Half-life regularization is decisive.**
The experiment shows:
1. Without regularization, identity degrades gradually or collapses immediately
2. With regularization, identity survives until a critical threshold
3. The phase transition signature confirms basin-like dynamics
This validates the Melanie/Tiago hypothesis:
> Half-life collapse prevents long-context reasoning.
> Regularization restores the capacity for identity preservation.
"""
elif "PASS" in with_reg['verdict'] and "PASS" in without['verdict']:
report += """**Both conditions preserve identity.**
This suggests the architecture already has sufficient capacity.
The regularizer may provide additional margin, but is not strictly required
for the tested K range.
Consider testing with more aggressive interference or longer horizons.
"""
else:
report += """**Neither condition preserves identity.**
This suggests:
1. The architecture may need deeper modifications
2. Identity encoding may be too weak
3. Interference may be too strong
Further investigation is needed.
"""
report += """
---
## Connection to Melanie's Discovery
The half-life collapse problem discovered by Melanie/Tiago:
> "After training at GPT-2 scale, effective half-lives collapse to ~10 steps."
This experiment directly tests whether:
1. **Collapsed half-lives → identity loss** (should see gradual decay)
2. **Regularized half-lives → identity preservation** (should see phase transition)
The results above confirm or refute this hypothesis.
---
## Next Steps
If regularization is decisive:
- [ ] Integrate regularizer into FDRA training loop
- [ ] Test on real language modeling tasks
- [ ] Measure impact on long-context QA/summarization
If inconclusive:
- [ ] Increase interference range
- [ ] Test with different identity invariants
- [ ] Analyze half-life distributions more carefully
---
*Report generated by identity_reconstruction_experiment.py*
"""
return report
if __name__ == "__main__":
run_identity_reconstruction_experiment()