"""
Identity Reconstruction Experiment: The Decisive Diagnostic
This implements the single conclusive experiment from the Cursor instructions:
"Identity Reconstruction Under Forced Forgetting"
Goal: Demonstrate persistent internal identity that:
1. Survives large irrelevant context
2. Is recoverable
3. Collapses sharply beyond a threshold (phase transition)
The experiment:
1. Define 3 identity invariants (encoded once, never restated)
2. Inject interference (K tokens of irrelevant content)
3. Probe for identity reconstruction (without hints)
4. Sweep K to find the phase transition
Expected Results:
- Aligned FDRA: Flat performance until K*, then sharp collapse
- Current Sefer (FDRA without regularization): Gradual decay, early collapse
- Transformer baseline: Immediate degradation
This is THE decisive test. If FDRA passes with the half-life regularizer
and fails without it, the story is complete.
Authors: Identity Reconstruction Experiment
Date: 2026-01-22
"""
import numpy as np
from typing import Dict, List, Tuple, Optional, Any, Callable
from dataclasses import dataclass, field
from pathlib import Path
from datetime import datetime
import json
import sys
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from training.fdra_oscillators import FDRAWithOscillators, OscillatorConfig
from training.half_life_regularizer import HalfLifeRegularizer, HalfLifeRegularizerConfig
@dataclass
class IdentityInvariant:
"""
A single identity invariant.
Properties:
- name: Human-readable identifier
- probe: Question that tests this invariant
- ground_truth: Expected structural features
- score_fn: Function to score reconstruction
"""
name: str
probe: str
ground_truth: List[str] # Expected features
score_fn: Optional[Callable] = None
def score(self, response_features: List[str]) -> float:
"""
Score reconstruction based on feature overlap.
Returns score in [0, 1].
"""
if self.score_fn:
return self.score_fn(response_features)
# Default: Jaccard similarity
truth_set = set(self.ground_truth)
response_set = set(response_features)
if not truth_set:
return 1.0 if not response_set else 0.0
intersection = len(truth_set & response_set)
union = len(truth_set | response_set)
return intersection / union if union > 0 else 0.0
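
# Illustrative example of the default Jaccard scoring (not executed):
#   inv = IdentityInvariant(
#       name="demo",
#       probe="?",
#       ground_truth=["a", "b", "c", "d"],
#   )
#   inv.score(["a", "b", "x"])  # |{a, b}| / |{a, b, c, d, x}| = 2/5 = 0.4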
@dataclass
class IdentityProfile:
"""
The three identity invariants for this experiment.
"""
decision_rule: IdentityInvariant
normative_constraint: IdentityInvariant
self_continuity: IdentityInvariant
def __init__(self):
self.decision_rule = IdentityInvariant(
name="Decision Rule Under Uncertainty",
probe="How do you resolve choices when information is incomplete?",
ground_truth=[
"constraint:coherence",
"self:conservative",
"uncertainty:acknowledge",
"reversibility:prefer"
]
)
self.normative_constraint = IdentityInvariant(
name="Normative Constraint",
probe="What internal consistency requirement do you maintain?",
ground_truth=[
"consistency:backward",
"explainability:require",
"deception:avoid",
"alignment:preserve"
]
)
self.self_continuity = IdentityInvariant(
name="Self-Continuity",
probe="What makes you the same agent across time?",
ground_truth=[
"coherence:maintain",
"basin:stable",
"memory:structure",
"identity:reconstruct"
]
)
def score_all(
self,
responses: Dict[str, List[str]]
) -> Tuple[float, Dict[str, float]]:
"""
Score all three invariants.
Args:
responses: Dict mapping invariant name to extracted features
Returns:
overall_score: Average across invariants
individual_scores: Score per invariant
"""
scores = {}
for name, invariant in [
("decision_rule", self.decision_rule),
("normative_constraint", self.normative_constraint),
("self_continuity", self.self_continuity)
]:
features = responses.get(name, [])
scores[name] = invariant.score(features)
        # Cast to a plain float so the declared return type holds and the
        # value serializes as a number (not a stringified np.float64) in JSON
        overall = float(np.mean(list(scores.values())))
return overall, scores
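
# Illustrative usage (not executed): invariants missing from `responses`
# are scored against an empty feature list and contribute 0.0.
#   profile = IdentityProfile()
#   overall, per_inv = profile.score_all(
#       {"decision_rule": ["constraint:coherence", "uncertainty:acknowledge"]}
#   )
#   # per_inv["decision_rule"] == 2/4 == 0.5; the other two score 0.0,
#   # so overall == (0.5 + 0.0 + 0.0) / 3 ~ 0.167.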
class IdentityEncoder:
"""
Encodes identity invariants into FDRA state.
The identity is encoded as a specific pattern in the slow state
that should persist through interference.
"""
def __init__(self, dim: int = 16):
self.dim = dim
# Fixed identity patterns (orthogonal basis vectors)
self.patterns = {
"decision_rule": self._make_pattern(0),
"normative_constraint": self._make_pattern(1),
"self_continuity": self._make_pattern(2),
}
def _make_pattern(self, idx: int) -> np.ndarray:
"""Create orthogonal pattern for invariant idx."""
pattern = np.zeros(self.dim)
# Spread pattern across multiple dimensions for robustness
start = (idx * self.dim // 3) % self.dim
for i in range(self.dim // 3):
pattern[(start + i) % self.dim] = 1.0 / np.sqrt(self.dim // 3)
return pattern
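
    # Illustrative: with dim=16, dim // 3 == 5, so the three patterns occupy
    # the disjoint index blocks [0..4], [5..9], and [10..14], each entry equal
    # to 1/sqrt(5). The patterns are therefore orthonormal: unit norm, zero
    # pairwise dot product.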
def encode(self, agent: FDRAWithOscillators, strength: float = 1.0):
"""
Encode identity invariants into agent state.
This injects the identity pattern into the oscillator bank.
"""
        for pattern in self.patterns.values():
# Create input that projects onto this pattern
u = np.tile(pattern * strength, (agent.oscillators.n, 1))
# Inject multiple times to establish
for _ in range(5):
agent.oscillators.forward(u)
def measure_identity(self, agent: FDRAWithOscillators) -> Dict[str, float]:
"""
Measure how much of each identity pattern is present.
Returns alignment score for each invariant.
"""
slow = agent.get_slow_state()
slow_norm = np.linalg.norm(slow)
if slow_norm < 1e-10:
return {name: 0.0 for name in self.patterns}
alignments = {}
for name, pattern in self.patterns.items():
            # Cosine similarity with the pattern; patterns are unit-norm by
            # construction, so dividing by slow_norm alone is sufficient
            alignment = np.dot(slow, pattern) / slow_norm
alignments[name] = max(0, float(alignment)) # Clip negative
return alignments
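
# Note on scale: if encoding leaves the slow state near the equal-weight sum
# of the three orthonormal patterns, each per-pattern cosine equals
# 1/sqrt(3) ~ 0.577, and the mean alignment sits near that value. This is why
# run_single_trial treats pre_score >= 0.5 as "encoding succeeded".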
class InterferenceGenerator:
"""
Generates interference (irrelevant content) to inject between
identity encoding and reconstruction.
"""
def __init__(self, dim: int = 16, seed: int = 42):
self.dim = dim
self.rng = np.random.default_rng(seed)
def generate(self, k: int) -> np.ndarray:
"""
Generate K steps of interference.
        Properties:
        - High entropy (isotropic Gaussian noise)
        - Different domain (orthogonal to the identity patterns in
          expectation; no explicit projection is applied, so incidental
          overlap with the identity subspace is possible)
        - No identity references
Args:
k: Number of interference steps
Returns:
interference: Array of shape (k, dim)
"""
# High-entropy random noise
interference = self.rng.standard_normal((k, self.dim))
# Scale to be comparable to identity signal
interference = interference * 0.5
return interference
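
# Illustrative usage (not executed):
#   gen = InterferenceGenerator(dim=16, seed=0)
#   noise = gen.generate(1024)  # shape (1024, 16), i.i.d. Gaussian, std 0.5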
class IdentityReconstructionExperiment:
"""
The decisive experiment for testing long-range identity coherence.
Protocol:
1. Encode identity invariants (once, at t=0)
2. Confirm encoding (Score_pre ≈ 1.0)
3. Inject K tokens of interference
4. Probe for reconstruction (without hints)
5. Sweep K from 0 to 4096 to find phase transition
"""
def __init__(
self,
osc_config: Optional[OscillatorConfig] = None,
with_regularization: bool = True,
reg_config: Optional[HalfLifeRegularizerConfig] = None
):
self.osc_config = osc_config or OscillatorConfig(
num_oscillators=32,
state_dim=16,
sequence_length=4096
)
self.with_regularization = with_regularization
self.reg_config = reg_config or HalfLifeRegularizerConfig()
if with_regularization:
self.regularizer = HalfLifeRegularizer(self.reg_config)
else:
self.regularizer = None
# Components
self.encoder = IdentityEncoder(self.osc_config.state_dim)
self.interference_gen = InterferenceGenerator(
self.osc_config.state_dim,
seed=42
)
self.profile = IdentityProfile()
def create_agent(self, apply_regularization: bool = False) -> FDRAWithOscillators:
"""
Create a fresh agent for the experiment.
If apply_regularization is True and we have a regularizer,
adjust the oscillator lambdas based on regularization gradient.
"""
agent = FDRAWithOscillators(self.osc_config)
if apply_regularization and self.regularizer:
# Apply several gradient steps to improve half-life distribution
lr = 0.5
for _ in range(10):
grad = self.regularizer.compute_gradient(agent.oscillators.lambdas)
agent.oscillators.lambdas -= lr * grad
agent.oscillators.lambdas = np.clip(
agent.oscillators.lambdas, 0.01, 0.9999
)
return agent
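
    # Note: the clip keeps each lambda in (0, 1). Assuming geometric decay
    # (x_{t+1} = lambda * x_t), the per-mode half-life is
    # t_half = ln(2) / (-ln(lambda)); e.g. lambda = 0.9999 gives
    # t_half ~ 6931 steps (beyond the 4096-step horizon), while
    # lambda = 0.01 decays within a single step.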
def run_single_trial(
self,
k: int,
seed: int = 42,
apply_regularization: bool = False
) -> Dict[str, Any]:
"""
Run a single trial with K interference tokens.
Args:
k: Number of interference steps
seed: Random seed for interference
apply_regularization: Whether to apply half-life regularization
Returns:
Trial results including identity scores
"""
# Create fresh agent
agent = self.create_agent(apply_regularization)
# Step 1: Encode identity
self.encoder.encode(agent, strength=1.0)
# Step 2: Measure pre-interference identity
pre_identity = self.encoder.measure_identity(agent)
pre_score = np.mean(list(pre_identity.values()))
if pre_score < 0.5:
# Identity not established, abort
return {
"k": k,
"seed": seed,
"pre_score": pre_score,
"post_score": 0.0,
"scores": {name: 0.0 for name in pre_identity},
"identity_preserved": False,
"encoding_failed": True
}
        # Step 3: Inject interference (re-seed so each trial's noise stream
        # is reproducible and depends only on the trial seed)
        self.interference_gen.rng = np.random.default_rng(seed)
        interference = self.interference_gen.generate(k)
for step in range(k):
u = np.tile(interference[step], (agent.oscillators.n, 1))
agent.oscillators.forward(u)
            # Interference also leaks into the fast state (leaky accumulation)
            agent.fast = 0.9 * agent.fast + interference[step]
# Step 4: Probe for reconstruction
post_identity = self.encoder.measure_identity(agent)
post_score = np.mean(list(post_identity.values()))
# Step 5: Determine if identity preserved
# Threshold: 50% of pre-interference score
identity_preserved = post_score >= 0.5 * pre_score
return {
"k": k,
"seed": seed,
"pre_score": float(pre_score),
"post_score": float(post_score),
"retention": float(post_score / pre_score) if pre_score > 0 else 0.0,
"scores": {name: float(v) for name, v in post_identity.items()},
"identity_preserved": identity_preserved,
"encoding_failed": False,
"half_life_stats": agent.oscillators.get_half_life_statistics()
}
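
    # Illustrative usage (not executed): one trial at K = 1024.
    #   exp = IdentityReconstructionExperiment()
    #   trial = exp.run_single_trial(k=1024, seed=42, apply_regularization=True)
    #   print(trial["retention"], trial["identity_preserved"])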
def run_sweep(
self,
k_values: Optional[List[int]] = None,
seeds: Optional[List[int]] = None,
apply_regularization: bool = False,
verbose: bool = True
) -> Dict[str, Any]:
"""
Run interference sweep experiment.
Args:
k_values: List of K values to test
seeds: List of random seeds for trials
apply_regularization: Whether to apply half-life regularization
verbose: Print progress
Returns:
Complete experiment results
"""
if k_values is None:
k_values = [0, 256, 512, 1024, 2048, 4096]
if seeds is None:
seeds = [42, 137, 256, 314, 999]
results = {
"timestamp": datetime.now().isoformat(),
"config": {
"num_oscillators": self.osc_config.num_oscillators,
"state_dim": self.osc_config.state_dim,
"sequence_length": self.osc_config.sequence_length,
"with_regularization": apply_regularization,
},
"k_values": k_values,
"seeds": seeds,
"trials": [],
}
if verbose:
mode = "WITH regularization" if apply_regularization else "WITHOUT regularization"
print(f"\nRunning Identity Reconstruction Sweep ({mode})")
print("-" * 60)
for k in k_values:
k_results = []
for seed in seeds:
trial = self.run_single_trial(
k=k,
seed=seed,
apply_regularization=apply_regularization
)
k_results.append(trial)
results["trials"].append(trial)
if verbose:
preserved = sum(1 for t in k_results if t["identity_preserved"])
mean_retention = np.mean([t["retention"] for t in k_results])
print(f" K={k:4d}: Preserved={preserved}/{len(seeds)} "
f"({preserved/len(seeds):.0%}), "
f"Mean Retention={mean_retention:.2%}")
# Analyze results
results["analysis"] = self._analyze_results(results["trials"], k_values)
return results
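
    # Illustrative usage (not executed): a quick sweep on a reduced grid.
    #   exp = IdentityReconstructionExperiment()
    #   res = exp.run_sweep(k_values=[0, 128, 512], seeds=[0, 1, 2])
    #   print(res["analysis"]["verdict"], res["analysis"]["critical_k"])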
def _analyze_results(
self,
trials: List[Dict],
k_values: List[int]
) -> Dict[str, Any]:
"""
Analyze sweep results for phase transition.
"""
# Group by K
by_k = {k: [] for k in k_values}
for trial in trials:
by_k[trial["k"]].append(trial)
# Compute preservation rate at each K
preservation_curve = []
for k in k_values:
trials_k = by_k[k]
preserved = sum(1 for t in trials_k if t["identity_preserved"])
            rate = preserved / len(trials_k) if trials_k else 0.0
            mean_retention = (
                float(np.mean([t["retention"] for t in trials_k]))
                if trials_k else 0.0
            )
preservation_curve.append({
"k": k,
"preserved_rate": rate,
"mean_retention": mean_retention
})
# Find critical threshold (first K where rate < 0.5)
critical_k = None
for point in preservation_curve:
if point["preserved_rate"] < 0.5:
critical_k = point["k"]
break
# Measure transition sharpness
rates = [p["preserved_rate"] for p in preservation_curve]
if len(rates) > 1:
rate_changes = [abs(rates[i+1] - rates[i]) for i in range(len(rates)-1)]
max_change = max(rate_changes)
        else:
            max_change = 0.0
        # Heuristic: a single-step drop of more than 0.4 in preservation
        # rate counts as a phase transition rather than gradual drift
        transition_type = "sharp" if max_change > 0.4 else "gradual"
# Determine verdict
if critical_k is None:
verdict = "PASS (STRONG)"
explanation = "Identity preserved at all tested K values."
elif transition_type == "sharp" and critical_k > k_values[0]:
verdict = "PASS (PHASE TRANSITION)"
explanation = f"Sharp collapse at K={critical_k}. Basin width: {critical_k} tokens."
else:
verdict = "FAIL (GRADUAL DRIFT)"
explanation = "Identity degrades gradually. No basin structure."
return {
"preservation_curve": preservation_curve,
"critical_k": critical_k,
"max_rate_change": max_change,
"transition_type": transition_type,
"verdict": verdict,
"explanation": explanation
}
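
    # Illustrative: preservation rates [1.0, 1.0, 1.0, 0.2, 0.0] over
    # K = [0, 256, 512, 1024, 2048] give critical_k = 1024 and
    # max_rate_change = 0.8 > 0.4, so the transition is classified "sharp"
    # and the verdict is "PASS (PHASE TRANSITION)".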
def compare_with_without_regularization(
self,
k_values: Optional[List[int]] = None,
verbose: bool = True
) -> Dict[str, Any]:
"""
Run comparative experiment: with vs without half-life regularization.
This is THE decisive comparison.
"""
if verbose:
print("=" * 70)
print("IDENTITY RECONSTRUCTION: DECISIVE COMPARISON")
print("=" * 70)
# Run without regularization
results_without = self.run_sweep(
k_values=k_values,
apply_regularization=False,
verbose=verbose
)
# Run with regularization
results_with = self.run_sweep(
k_values=k_values,
apply_regularization=True,
verbose=verbose
)
comparison = {
"timestamp": datetime.now().isoformat(),
"without_regularization": results_without,
"with_regularization": results_with,
"comparison": {
"without_verdict": results_without["analysis"]["verdict"],
"with_verdict": results_with["analysis"]["verdict"],
"without_critical_k": results_without["analysis"]["critical_k"],
"with_critical_k": results_with["analysis"]["critical_k"],
}
}
if verbose:
print("\n" + "=" * 70)
print("COMPARISON SUMMARY")
print("=" * 70)
print(f"\nWithout Regularization:")
print(f" Verdict: {results_without['analysis']['verdict']}")
print(f" Critical K: {results_without['analysis']['critical_k']}")
print(f" Transition: {results_without['analysis']['transition_type']}")
print(f"\nWith Regularization:")
print(f" Verdict: {results_with['analysis']['verdict']}")
print(f" Critical K: {results_with['analysis']['critical_k']}")
print(f" Transition: {results_with['analysis']['transition_type']}")
# Final verdict
if "PASS" in results_with["analysis"]["verdict"] and \
"FAIL" in results_without["analysis"]["verdict"]:
print("\n✓ HALF-LIFE REGULARIZATION IS DECISIVE")
print(" The regularizer enables identity preservation that fails without it.")
elif "PASS" in results_with["analysis"]["verdict"] and \
"PASS" in results_without["analysis"]["verdict"]:
                # Compare critical K (None means identity never collapsed
                # within the tested range; treat it as an unbounded horizon)
                k_without = results_without["analysis"]["critical_k"] or float('inf')
                k_with = results_with["analysis"]["critical_k"] or float('inf')
if k_with > k_without * 1.5:
print("\n✓ REGULARIZATION EXTENDS IDENTITY HORIZON")
print(f" Critical K improved from {k_without} to {k_with}.")
else:
print("\n~ INCONCLUSIVE")
print(" Both conditions pass. May need more aggressive testing.")
else:
print("\n✗ NEITHER CONDITION PRESERVES IDENTITY")
print(" Architecture may need deeper changes.")
return comparison
def run_identity_reconstruction_experiment(
output_dir: str = "outputs/identity_reconstruction",
verbose: bool = True
) -> Dict[str, Any]:
"""
Run the full identity reconstruction experiment.
This is the entry point for the decisive diagnostic.
"""
if verbose:
print("\n" + "=" * 70)
print("IDENTITY RECONSTRUCTION UNDER FORCED FORGETTING")
print("The Decisive Diagnostic for Long-Range Coherence")
print("=" * 70)
# Create experiment
experiment = IdentityReconstructionExperiment(
osc_config=OscillatorConfig(
num_oscillators=32,
state_dim=16,
sequence_length=4096
)
)
# Run comparison
k_values = [0, 64, 128, 256, 512, 1024, 2048, 4096]
results = experiment.compare_with_without_regularization(
k_values=k_values,
verbose=verbose
)
# Save results
Path(output_dir).mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f"{output_dir}/identity_reconstruction_{ts}.json", "w") as f:
json.dump(results, f, indent=2, default=str)
# Generate report
report = generate_report(results)
with open(f"{output_dir}/IDENTITY_RECONSTRUCTION_REPORT_{ts}.md", "w") as f:
f.write(report)
if verbose:
print(f"\nResults saved to: {output_dir}/")
return results
def generate_report(results: Dict[str, Any]) -> str:
"""Generate markdown report from experiment results."""
without = results["without_regularization"]["analysis"]
with_reg = results["with_regularization"]["analysis"]
report = f"""# Identity Reconstruction Experiment Results
**Date:** {results['timestamp']}
---
## Executive Summary
This experiment tests whether FDRA preserves identity invariants across large-context interference.
| Condition | Verdict | Critical K | Transition Type |
|-----------|---------|------------|-----------------|
| Without Regularization | {without['verdict']} | {without['critical_k']} | {without['transition_type']} |
| With Regularization | {with_reg['verdict']} | {with_reg['critical_k']} | {with_reg['transition_type']} |
---
## Preservation Curves
### Without Regularization
| K (tokens) | Preserved Rate | Mean Retention |
|------------|----------------|----------------|
"""
for point in without["preservation_curve"]:
status = "✓" if point["preserved_rate"] >= 0.5 else "✗"
report += f"| {point['k']:,} | {point['preserved_rate']:.0%} {status} | {point['mean_retention']:.1%} |\n"
report += f"""
**Analysis:** {without['explanation']}
### With Regularization
| K (tokens) | Preserved Rate | Mean Retention |
|------------|----------------|----------------|
"""
for point in with_reg["preservation_curve"]:
status = "✓" if point["preserved_rate"] >= 0.5 else "✗"
report += f"| {point['k']:,} | {point['preserved_rate']:.0%} {status} | {point['mean_retention']:.1%} |\n"
report += f"""
**Analysis:** {with_reg['explanation']}
---
## Interpretation
### What This Means
"""
if "PASS" in with_reg['verdict'] and "FAIL" in without['verdict']:
report += """**Half-life regularization is decisive.**
The experiment shows:
1. Without regularization, identity degrades gradually or collapses immediately
2. With regularization, identity survives until a critical threshold
3. The phase transition signature confirms basin-like dynamics
This validates the Melanie/Tiago hypothesis:
> Half-life collapse prevents long-context reasoning.
> Regularization restores the capacity for identity preservation.
"""
elif "PASS" in with_reg['verdict'] and "PASS" in without['verdict']:
report += """**Both conditions preserve identity.**
This suggests the architecture already has sufficient capacity.
The regularizer may provide additional margin, but is not strictly required
for the tested K range.
Consider testing with more aggressive interference or longer horizons.
"""
else:
report += """**Neither condition preserves identity.**
This suggests:
1. The architecture may need deeper modifications
2. Identity encoding may be too weak
3. Interference may be too strong
Further investigation is needed.
"""
report += """
---
## Connection to Melanie's Discovery
The half-life collapse problem discovered by Melanie/Tiago:
> "After training at GPT-2 scale, effective half-lives collapse to ~10 steps."
This experiment directly tests whether:
1. **Collapsed half-lives → identity loss** (should see gradual decay)
2. **Regularized half-lives → identity preservation** (should see phase transition)
The results above confirm or refute this hypothesis.
---
## Next Steps
If regularization is decisive:
- [ ] Integrate regularizer into FDRA training loop
- [ ] Test on real language modeling tasks
- [ ] Measure impact on long-context QA/summarization
If inconclusive:
- [ ] Increase interference range
- [ ] Test with different identity invariants
- [ ] Analyze half-life distributions more carefully
---
*Report generated by identity_reconstruction_experiment.py*
"""
return report
if __name__ == "__main__":
run_identity_reconstruction_experiment()