Upload training/fdra_oscillators.py with huggingface_hub

5daceea verified 3 months ago

12.3 kB

	"""
	FDRA Oscillator Implementation with Explicit Decay Parameters

	This implements the core FDRA oscillator dynamics where each oscillator has:
	- A decay parameter λ_i ∈ (0, 1)
	- Half-life τ_i = ln(0.5) / ln(λ_i)

	The key problem this addresses (from Melanie/Tiago's discovery):
	- During training at GPT-2 scale, all half-lives τ_i collapse to very short values (τ ≈ 10 steps, i.e. λ_i ≈ 0.93, since λ = 0.5^(1/τ))
	- This means oscillators only attend to ~10 tokens instead of full context length
	- The model works for short-context tasks but fails on long-context reasoning

	Solution: Half-life regularization to maintain diversity across temporal scales.

	Authors: FDRA Half-Life Regularization Implementation
	Date: 2026-01-22
	"""

	import numpy as np
	from typing import Dict, List, Tuple, Optional, Any
	from dataclasses import dataclass
	import json
	from pathlib import Path


	@dataclass
	class OscillatorConfig:
	    """Configuration for an FDRA oscillator bank.

	    Half-lives (``tau_*``) are expressed in steps. No validation is
	    performed here; values are consumed as given.
	    """

	    # Number of oscillators in the bank.
	    num_oscillators: int = 32
	    # Dimension of each oscillator's hidden state.
	    state_dim: int = 16
	    # Maximum sequence length (L).
	    sequence_length: int = 4096
	    # Minimum half-life.
	    tau_min: float = 1.0
	    # Maximum half-life (typically equal to sequence_length).
	    tau_max: float = 4096.0

	    # Initialization scheme for the decay parameters:
	    # "log_uniform" or "random".
	    init_method: str = "log_uniform"


	@dataclass
	class OscillatorState:
	    """Snapshot of an oscillator bank: hidden states plus decay parameters."""

	    # Hidden states, shape (num_oscillators, state_dim).
	    h: np.ndarray
	    # Decay parameters, shape (num_oscillators,).
	    lambdas: np.ndarray

	    def copy(self) -> 'OscillatorState':
	        """Return a new state whose arrays are independent copies."""
	        return OscillatorState(
	            h=self.h.copy(),
	            lambdas=self.lambdas.copy(),
	        )


	class FDRAOscillatorBank:
	    """
	    FDRA oscillator bank with explicit decay parameters.

	    Each oscillator i evolves as:
	        h_i(t+1) = λ_i * h_i(t) + u_i(t)

	    Where:
	        λ_i ∈ (0, 1) is the decay parameter
	        τ_i = ln(0.5) / ln(λ_i) is the half-life

	    Half-life interpretation:
	        τ_i = number of steps for the oscillator state to decay to 50%

	    The goal of half-life regularization:
	        Maintain a log-uniform distribution of τ_i across [τ_min, τ_max]
	        so oscillators can attend to both short and long contexts.
	    """

	    def __init__(self, config: "OscillatorConfig"):
	        """Build the bank from ``config`` with zeroed hidden states.

	        Args:
	            config: Object exposing ``num_oscillators``, ``state_dim``,
	                ``sequence_length``, ``tau_min``, ``tau_max`` and
	                ``init_method``.
	        """
	        self.config = config
	        self.n = config.num_oscillators
	        self.d = config.state_dim
	        self.L = config.sequence_length

	        # Decay parameters λ_i, shape (n,)
	        self.lambdas = self._init_lambdas()

	        # Hidden states, shape (n, d)
	        self.h = np.zeros((self.n, self.d))

	        # Container for analysis records; no method of this class
	        # appends to it.
	        self.history: List[Dict[str, Any]] = []

	    def _init_lambdas(self) -> np.ndarray:
	        """
	        Initialize decay parameters λ_i.

	        For log-uniform half-lives we want:
	            τ_i ~ LogUniform(τ_min, τ_max)

	        Since τ = ln(0.5) / ln(λ), we have:
	            λ = 0.5^(1/τ)

	        So for log-uniform τ:
	            log(τ) evenly spaced in [log(τ_min), log(τ_max)]
	            τ = exp(log_τ)
	            λ = 0.5^(1/τ)

	        Any ``init_method`` other than "log_uniform" (including
	        misspellings) falls through to the random branch.

	        Returns:
	            Array of decay parameters, shape (num_oscillators,).
	        """
	        if self.config.init_method == "log_uniform":
	            log_tau_min = np.log(self.config.tau_min)
	            log_tau_max = np.log(self.config.tau_max)

	            # Deterministic grid, evenly spaced in log space
	            log_taus = np.linspace(log_tau_min, log_tau_max, self.n)
	            taus = np.exp(log_taus)

	            # Convert half-lives to decay parameters:
	            # λ = exp(ln(0.5) / τ) = 0.5^(1/τ)
	            lambdas = np.power(0.5, 1.0 / taus)
	        else:
	            # Random initialization (not recommended)
	            lambdas = np.random.uniform(0.5, 0.99, self.n)

	        return lambdas

	    def get_half_lives(self) -> np.ndarray:
	        """
	        Compute half-lives from decay parameters.

	        τ_i = ln(0.5) / ln(λ_i)

	        Returns:
	            Array of half-lives, shape (num_oscillators,).
	        """
	        # Clamp away from 0 and 1 so log() is finite and non-zero
	        safe_lambdas = np.clip(self.lambdas, 1e-10, 1.0 - 1e-10)
	        return np.log(0.5) / np.log(safe_lambdas)

	    def get_log_half_lives(self) -> np.ndarray:
	        """Get log of half-lives: z_i = log(τ_i)."""
	        return np.log(self.get_half_lives())

	    def forward(self, u: np.ndarray) -> np.ndarray:
	        """
	        One step of oscillator dynamics.

	        h_i(t+1) = λ_i * h_i(t) + u_i(t)

	        Args:
	            u: Input signal, shape (num_oscillators, state_dim)

	        Returns:
	            Copy of the updated hidden states,
	            shape (num_oscillators, state_dim)
	        """
	        # Broadcast lambdas across the state dimension: (n,) -> (n, 1)
	        lambdas_broadcast = self.lambdas[:, np.newaxis]

	        self.h = lambdas_broadcast * self.h + u

	        # Return a copy so callers cannot mutate internal state
	        return self.h.copy()

	    def reset(self):
	        """Reset oscillator states to zero (decay parameters are kept)."""
	        self.h = np.zeros((self.n, self.d))

	    def get_half_life_statistics(self) -> Dict[str, float]:
	        """
	        Compute statistics of the half-life distribution.

	        Returns:
	            Dictionary with min, max, mean and median of τ
	            ("tau_*" keys) and mean, std, min and max of log(τ)
	            ("log_tau_*" keys).
	        """
	        taus = self.get_half_lives()
	        z = np.log(taus)

	        return {
	            "tau_min": float(np.min(taus)),
	            "tau_max": float(np.max(taus)),
	            "tau_mean": float(np.mean(taus)),
	            "tau_median": float(np.median(taus)),
	            "log_tau_mean": float(np.mean(z)),
	            "log_tau_std": float(np.std(z)),
	            "log_tau_min": float(np.min(z)),
	            "log_tau_max": float(np.max(z)),
	        }

	    def get_state(self) -> "OscillatorState":
	        """Get a snapshot (array copies) of the current oscillator state."""
	        return OscillatorState(
	            h=self.h.copy(),
	            lambdas=self.lambdas.copy(),
	        )

	    def set_state(self, state: "OscillatorState"):
	        """Overwrite hidden states and decay parameters with copies from ``state``."""
	        self.h = state.h.copy()
	        self.lambdas = state.lambdas.copy()


	class FDRAWithOscillators:
	    """
	    Full FDRA agent with an oscillator bank for memory.

	    This extends the basic FDRA agent to use an oscillator bank
	    with explicit decay parameters that can be regularized.

	    Architecture:
	        Input → [Oscillator Bank] → Slow State → Output
	                      ↑                 ↓
	                  Fast State ←──────────────
	    """

	    def __init__(
	        self,
	        osc_config: Optional["OscillatorConfig"] = None,
	        wlc_threshold: float = 1.0
	    ):
	        """Create the agent.

	        Args:
	            osc_config: Oscillator configuration; a default
	                ``OscillatorConfig()`` is used when omitted.
	            wlc_threshold: Stored on the instance; no method of this
	                class reads it.
	        """
	        self.config = osc_config or OscillatorConfig()
	        self.oscillators = FDRAOscillatorBank(self.config)
	        self.wlc_threshold = wlc_threshold

	        # Fast state (reactive, for computation), shape (state_dim,)
	        self.fast = np.zeros(self.config.state_dim)

	        # Accumulated energy, kept as a plain Python float
	        self.energy = 0.0

	        # One diagnostics dict per call to step()
	        self.history: List[Dict[str, Any]] = []

	    def get_slow_state(self) -> np.ndarray:
	        """
	        Aggregate the slow state from the oscillator bank.

	        The slow state is a weighted sum of oscillator states,
	        with weights proportional to half-life (normalized to sum to 1).

	        Returns:
	            Slow state, shape (state_dim,).
	        """
	        taus = self.oscillators.get_half_lives()
	        weights = taus / np.sum(taus)

	        # Weighted sum across oscillators: (n, d) * (n, 1) -> sum over n
	        weighted_h = self.oscillators.h * weights[:, np.newaxis]
	        return np.sum(weighted_h, axis=0)

	    def forward_dynamics(self, action: np.ndarray) -> np.ndarray:
	        """
	        Forward dynamics with the oscillator bank.

	        1. Distribute action across oscillators
	        2. Update oscillator bank
	        3. Compute slow state from oscillators
	        4. Update fast state and energy

	        Args:
	            action: Input vector, shape (state_dim,).

	        Returns:
	            Slow state after the update, shape (state_dim,).
	        """
	        # Same input to every oscillator (they differ only in decay): (n, d)
	        u = np.tile(action, (self.config.num_oscillators, 1))

	        # Scale by oscillator-specific factors (optional: could learn these)
	        scale = 0.1 * np.ones((self.config.num_oscillators, 1))
	        u = u * scale

	        self.oscillators.forward(u)

	        slow = self.get_slow_state()

	        # Fast state is a short-memory leaky integrator of the raw action
	        self.fast = 0.9 * self.fast + action

	        # Cast so energy stays a plain float, matching the other
	        # diagnostics that step() reports
	        self.energy += float(np.linalg.norm(action)) * 0.1

	        return slow

	    def get_coherence(self) -> float:
	        """Cosine similarity between slow and fast states (0.0 if either is ~zero)."""
	        slow = self.get_slow_state()
	        slow_norm = np.linalg.norm(slow)
	        fast_norm = np.linalg.norm(self.fast)

	        # Guard against division by (near) zero
	        if slow_norm < 1e-10 or fast_norm < 1e-10:
	            return 0.0

	        return float(np.dot(slow, self.fast) / (slow_norm * fast_norm))

	    def step(self, action: np.ndarray) -> Dict[str, Any]:
	        """Execute one step, record diagnostics in ``history`` and return them."""
	        slow = self.forward_dynamics(action)
	        coherence = self.get_coherence()

	        stats = self.oscillators.get_half_life_statistics()

	        result = {
	            "slow_norm": float(np.linalg.norm(slow)),
	            "fast_norm": float(np.linalg.norm(self.fast)),
	            "coherence": coherence,
	            "energy": self.energy,
	            **stats
	        }

	        self.history.append(result)
	        return result

	    def reset(self):
	        """Reset hidden states, fast state, energy and history."""
	        self.oscillators.reset()
	        self.fast = np.zeros(self.config.state_dim)
	        self.energy = 0.0
	        self.history = []


	def demo_oscillators():
	    """Demonstrate oscillator bank behavior.

	    Prints the initial half-life distribution, a per-oscillator bar chart
	    of log half-lives, and the fraction of a single input pulse that the
	    first few oscillators retain after 10, 50, 100, 500 and 1000
	    zero-input steps.
	    """
	    print("=" * 60)
	    print("FDRA OSCILLATOR BANK DEMONSTRATION")
	    print("=" * 60)

	    config = OscillatorConfig(
	        num_oscillators=16,
	        state_dim=8,
	        sequence_length=4096,
	        tau_min=1.0,
	        tau_max=4096.0
	    )

	    bank = FDRAOscillatorBank(config)

	    print("\n1. Initial Half-Life Distribution")
	    print("-" * 40)
	    stats = bank.get_half_life_statistics()
	    print(f" τ range: [{stats['tau_min']:.1f}, {stats['tau_max']:.1f}]")
	    print(f" τ mean: {stats['tau_mean']:.1f}")
	    print(f" log(τ) mean: {stats['log_tau_mean']:.3f}")
	    print(f" log(τ) std: {stats['log_tau_std']:.3f}")

	    print("\n2. Half-Lives per Oscillator")
	    print("-" * 40)
	    taus = bank.get_half_lives()
	    for i, tau in enumerate(taus):
	        # Bar length grows with log(τ): three cells per e-fold
	        bar = "█" * int(np.log(tau) * 3)
	        print(f" Osc {i:2d}: τ = {tau:7.1f} steps {bar}")

	    print("\n3. Simulating Input Sequence")
	    print("-" * 40)

	    # Pulse input at t=0; the bank starts at zero, so h == u afterwards
	    u = np.random.randn(config.num_oscillators, config.state_dim)
	    bank.forward(u)
	    initial_norms = np.linalg.norm(bank.h, axis=1)

	    # Let the pulse decay under zero input, reporting at each checkpoint
	    decay_steps = [10, 50, 100, 500, 1000]
	    zero_input = np.zeros((config.num_oscillators, config.state_dim))

	    step = 0
	    for target in decay_steps:
	        while step < target:
	            bank.forward(zero_input)
	            step += 1

	        current_norms = np.linalg.norm(bank.h, axis=1)
	        retention = current_norms / (initial_norms + 1e-10)

	        print(f"\n After {step} steps:")
	        for i, (tau, ret) in enumerate(zip(taus, retention)):
	            # Retention after `step` steps is 0.5 ** (step / τ), so the
	            # 50% crossover sits at τ == step.
	            if tau < step:
	                expected = "✗ (should be < 50%)"
	            else:
	                expected = "✓ (should be > 50%)"
	            print(f" Osc {i:2d}: τ={tau:7.1f}, retention={ret:.1%} {expected}")
	            # Only show the first four oscillators per checkpoint
	            if i >= 3:
	                print(f" ... ({len(taus) - 4} more)")
	                break

	    print("\n" + "=" * 60)
	    print("OBSERVATION: Oscillators with τ > t retain more than 50% of signal")
	    print("This is the desired behavior for long-context modeling.")
	    print("=" * 60)


	# Run the demonstration only when executed as a script.
	if __name__ == "__main__":
	    demo_oscillators()