Upload training_validation/fdra_oscillators_with_routing.py with huggingface_hub

Browse files

Files changed (1) hide show

training_validation/fdra_oscillators_with_routing.py +427 -0

training_validation/fdra_oscillators_with_routing.py ADDED Viewed

	@@ -0,0 +1,427 @@

+"""
+FDRA Oscillator Implementation with Explicit Decay Parameters
+This implements the core FDRA oscillator dynamics where each oscillator has:
+- A decay parameter λ_i ∈ (0, 1)
+- Half-life τ_i = ln(0.5) / ln(λ_i)
+The key problem this addresses (from Melanie/Tiago's discovery):
+- During training at GPT-2 scale, all half-lives τ_i collapse to very small values (λ_i drifts well below 1.0; e.g. τ ≈ 10 ⇒ λ ≈ 0.93)
+- This means oscillators only attend to ~10 tokens instead of full context length
+- The model works for short-context tasks but fails on long-context reasoning
+Solution: Half-life regularization to maintain diversity across temporal scales.
+Authors: FDRA Half-Life Regularization Implementation
+Date: 2026-01-22
+"""
+import numpy as np
+from typing import Dict, List, Tuple, Optional, Any
+from dataclasses import dataclass
+import json
+from pathlib import Path
@dataclass
class OscillatorConfig:
    """Hyperparameters describing an FDRA oscillator bank.

    Attributes:
        num_oscillators: Number of oscillators in the bank.
        state_dim: Dimension of each oscillator's hidden state.
        sequence_length: Max sequence length (L).
        tau_min: Minimum half-life.
        tau_max: Maximum half-life (typically equal to L).
        init_method: How decay parameters are initialised; either
            "log_uniform" or "random".
    """

    num_oscillators: int = 32
    state_dim: int = 16
    sequence_length: int = 4096
    tau_min: float = 1.0
    tau_max: float = 4096.0
    init_method: str = "log_uniform"
@dataclass
class OscillatorState:
    """Snapshot of an oscillator bank: hidden states plus decay parameters."""

    # Hidden states, one row per oscillator: (num_oscillators, state_dim)
    h: np.ndarray
    # Decay parameters λ_i, one per oscillator: (num_oscillators,)
    lambdas: np.ndarray

    def copy(self) -> 'OscillatorState':
        """Return a new state whose arrays do not share memory with this one."""
        h_clone = self.h.copy()
        lambdas_clone = self.lambdas.copy()
        return OscillatorState(h_clone, lambdas_clone)
class FDRAOscillatorBank:
    """
    FDRA Oscillator Bank with explicit decay parameters.

    Each oscillator i has:
        h_i(t+1) = λ_i * h_i(t) + u_i(t)

    Where:
        λ_i ∈ (0, 1) is the decay parameter
        τ_i = ln(0.5) / ln(λ_i) is the half-life

    Half-life interpretation:
        τ_i = number of steps for oscillator state to decay to 50%

    The goal of half-life regularization:
        Maintain log-uniform distribution of τ_i across [τ_min, τ_max]
        This ensures oscillators can attend to both short and long contexts.

    Attributes:
        config: The configuration object the bank was built from.
        n: Number of oscillators (config.num_oscillators).
        d: State dimension per oscillator (config.state_dim).
        L: Sequence length (config.sequence_length).
        lambdas: Decay parameters, shape (n,).
        h: Hidden states, shape (n, d); starts at zero.
        history: List reserved for analysis records; nothing in this class
            appends to it.
    """

    def __init__(self, config: "OscillatorConfig"):
        """Build the bank: initialise decay parameters and zero the states.

        Args:
            config: Oscillator configuration (sizes, half-life range and
                initialisation method).

        Raises:
            ValueError: If the configured initialisation is invalid
                (see `_init_lambdas`).
        """
        self.config = config
        self.n = config.num_oscillators
        self.d = config.state_dim
        self.L = config.sequence_length
        # Initialize decay parameters
        self.lambdas = self._init_lambdas()
        # Initialize hidden states
        self.h = np.zeros((self.n, self.d))
        # Track history for analysis
        self.history: List[Dict[str, Any]] = []

    def _init_lambdas(self) -> np.ndarray:
        """
        Initialize decay parameters λ_i.

        For log-uniform half-lives, we want:
            τ_i ~ LogUniform(τ_min, τ_max)
        Since τ = ln(0.5) / ln(λ), we have:
            λ = 0.5^(1/τ)
        So the half-lives are spaced evenly in log space between
        log(τ_min) and log(τ_max) and then mapped to λ = 0.5^(1/τ).

        For "random", λ_i is drawn uniformly from [0.5, 0.99) using the
        global NumPy random state (not recommended).

        Returns:
            Decay parameters, shape (num_oscillators,).

        Raises:
            ValueError: If `init_method` is neither "log_uniform" nor
                "random", or if "log_uniform" is requested with a
                non-positive (or NaN) tau_min / tau_max.
        """
        method = self.config.init_method

        if method == "log_uniform":
            tau_min = self.config.tau_min
            tau_max = self.config.tau_max
            # log() of a non-positive bound is -inf/NaN and would silently
            # poison every λ_i, so reject it up front. Written as a negated
            # conjunction so that NaN bounds are rejected as well.
            if not (tau_min > 0 and tau_max > 0):
                raise ValueError(
                    "log_uniform init requires tau_min > 0 and tau_max > 0, "
                    f"got tau_min={tau_min}, tau_max={tau_max}"
                )
            # Evenly spaced in log space
            log_taus = np.linspace(np.log(tau_min), np.log(tau_max), self.n)
            taus = np.exp(log_taus)
            # Convert half-lives to decay parameters:
            # λ = exp(ln(0.5) / τ) = 0.5^(1/τ)
            return np.power(0.5, 1.0 / taus)

        if method == "random":
            # Random initialization (not recommended)
            return np.random.uniform(0.5, 0.99, self.n)

        # A misspelled method must not silently fall back to random init.
        raise ValueError(f"Unknown init method: {method}")

    def get_half_lives(self) -> np.ndarray:
        """
        Compute half-lives from decay parameters.

        τ_i = ln(0.5) / ln(λ_i)

        Returns:
            Half-lives, shape (num_oscillators,).
        """
        # Clamp lambdas into (0, 1) so log(λ) is finite and strictly
        # negative (avoids log(1) = 0 in the denominator).
        safe_lambdas = np.clip(self.lambdas, 1e-10, 1.0 - 1e-10)
        return np.log(0.5) / np.log(safe_lambdas)

    def get_log_half_lives(self) -> np.ndarray:
        """Get log of half-lives: z_i = log(τ_i)."""
        return np.log(self.get_half_lives())

    def forward(self, u: np.ndarray) -> np.ndarray:
        """
        One step of oscillator dynamics.

        h_i(t+1) = λ_i * h_i(t) + u_i(t)

        Args:
            u: Input signal, shape (num_oscillators, state_dim)

        Returns:
            A copy of the updated hidden states,
            shape (num_oscillators, state_dim)
        """
        # Broadcast each oscillator's λ across its state dimensions: (n, 1)
        lambdas_broadcast = self.lambdas[:, np.newaxis]
        # Apply dynamics
        self.h = lambdas_broadcast * self.h + u
        # Return a copy so callers cannot mutate the internal state.
        return self.h.copy()

    def reset(self):
        """Reset oscillator states to zero (decay parameters are kept)."""
        self.h = np.zeros((self.n, self.d))

    def get_half_life_statistics(self) -> Dict[str, float]:
        """
        Compute statistics of half-life distribution.

        Returns:
            Dictionary with min, max, mean and median of τ, plus mean, std,
            min and max of log(τ), all as Python floats.
        """
        taus = self.get_half_lives()
        z = np.log(taus)
        return {
            "tau_min": float(np.min(taus)),
            "tau_max": float(np.max(taus)),
            "tau_mean": float(np.mean(taus)),
            "tau_median": float(np.median(taus)),
            "log_tau_mean": float(np.mean(z)),
            "log_tau_std": float(np.std(z)),
            "log_tau_min": float(np.min(z)),
            "log_tau_max": float(np.max(z)),
        }

    def get_state(self) -> "OscillatorState":
        """Get a snapshot (copied arrays) of the current oscillator state."""
        return OscillatorState(
            h=self.h.copy(),
            lambdas=self.lambdas.copy()
        )

    def set_state(self, state: "OscillatorState"):
        """Set oscillator state from a snapshot; the arrays are copied."""
        self.h = state.h.copy()
        self.lambdas = state.lambdas.copy()
class FDRAWithOscillators:
    """
    Full FDRA agent with oscillator bank for memory.

    This extends the basic FDRA agent to use an oscillator bank
    with explicit decay parameters that can be regularized.

    Architecture:
        Input → [Oscillator Bank] → Slow State → Output
                     ↑                    ↓
                 Fast State ←──────────────

    Routing Modes (validated in routing ablation):
        - "uniform": Equal weight to all oscillators (baseline)
        - "tau_weighted": Weight ∝ τ (soft routing to slow modes)
        - "tau_gated": Only write to τ > threshold oscillators

    Attributes:
        config: Oscillator configuration in use.
        oscillators: The underlying FDRAOscillatorBank.
        wlc_threshold: Stored as given; not read by any method of this class.
        routing_mode: One of the routing modes listed above.
        routing_min, routing_max: Clamp range for "tau_weighted" weights.
        gating_threshold: Fraction of sequence_length used as the τ cut-off
            in "tau_gated" mode.
        input_scale: Factor applied to the routed input before it is written
            to the oscillators.
        fast_decay: Per-step decay of the fast state.
        energy_scale: Factor applied to ||action|| when accumulating energy.
        fast: Fast (reactive) state, shape (state_dim,).
        energy: Accumulated energy, a Python float.
        history: One diagnostics dict per call to `step`.
    """

    def __init__(
        self,
        osc_config: Optional["OscillatorConfig"] = None,
        wlc_threshold: float = 1.0,
        routing_mode: str = "uniform",  # "uniform", "tau_weighted", or "tau_gated"
        *,
        input_scale: float = 0.1,
        fast_decay: float = 0.9,
        energy_scale: float = 0.1,
    ):
        """Create the agent and its oscillator bank.

        Args:
            osc_config: Oscillator configuration; a default OscillatorConfig
                is created when omitted.
            wlc_threshold: Stored on the instance (unused in this class).
            routing_mode: "uniform", "tau_weighted", or "tau_gated". An
                unknown mode is reported when routing weights are computed.
            input_scale: Base scale applied to oscillator inputs.
            fast_decay: Decay factor of the fast state per step.
            energy_scale: Scale applied to the action norm for energy.
        """
        self.config = osc_config or OscillatorConfig()
        self.oscillators = FDRAOscillatorBank(self.config)
        self.wlc_threshold = wlc_threshold
        self.routing_mode = routing_mode
        # Routing config
        self.routing_min = 0.25  # Minimum routing weight
        self.routing_max = 4.0   # Maximum routing weight
        self.gating_threshold = 0.25  # Fraction of L for gating threshold
        # Dynamics constants (previously hard-coded in forward_dynamics)
        self.input_scale = input_scale
        self.fast_decay = fast_decay
        self.energy_scale = energy_scale
        # Fast state (reactive, for computation)
        self.fast = np.zeros(self.config.state_dim)
        # Energy tracking
        self.energy = 0.0
        self.history: List[Dict[str, Any]] = []

    def _compute_routing_weights(self) -> np.ndarray:
        """
        Compute routing weights based on routing mode.

        Returns:
            Routing weights, shape (num_oscillators,)

        Raises:
            ValueError: If `routing_mode` is not a known mode.
        """
        mode = self.routing_mode
        n = self.config.num_oscillators

        if mode == "uniform":
            # Equal weight to all oscillators; half-lives are not needed.
            return np.ones(n)

        if mode == "tau_weighted":
            taus = self.oscillators.get_half_lives()
            # Weight ∝ τ, normalized by mean, then clamped for stability
            return np.clip(taus / np.mean(taus), self.routing_min, self.routing_max)

        if mode == "tau_gated":
            taus = self.oscillators.get_half_lives()
            # Hard gating: only oscillators with τ > threshold
            threshold = self.gating_threshold * self.config.sequence_length
            mask = (taus > threshold).astype(float)
            passing = np.sum(mask)
            if passing == 0:
                # Fallback to uniform if no oscillators pass
                return np.ones(n)
            # Normalize so total weight is same as uniform
            return mask * (n / passing)

        raise ValueError(f"Unknown routing mode: {self.routing_mode}")

    def get_slow_state(self) -> np.ndarray:
        """
        Aggregate slow state from oscillator bank.

        The slow state is a weighted sum of oscillator states,
        with weights proportional to half-life (weights sum to 1).

        Returns:
            Slow state, shape (state_dim,)
        """
        taus = self.oscillators.get_half_lives()
        weights = taus / np.sum(taus)  # Normalize
        # Weighted sum across oscillators
        weighted_h = self.oscillators.h * weights[:, np.newaxis]
        return np.sum(weighted_h, axis=0)

    def forward_dynamics(self, action: np.ndarray) -> np.ndarray:
        """
        Forward dynamics with oscillator bank.

        1. Compute routing weights based on mode
        2. Distribute action across oscillators (weighted by routing)
        3. Update oscillator bank
        4. Compute slow state from oscillators
        5. Update fast state and accumulated energy

        Args:
            action: Input vector; it is tiled once per oscillator, so it is
                expected to have shape (state_dim,).

        Returns:
            The slow state after the update, shape (state_dim,)
        """
        routing_weights = self._compute_routing_weights()  # (n,)
        # One copy of the action per oscillator, scaled by that oscillator's
        # routing weight and then by the base input scale: (n, d)
        u = np.tile(action, (self.config.num_oscillators, 1))
        u = u * routing_weights[:, np.newaxis]
        u = u * self.input_scale
        # Update oscillators
        self.oscillators.forward(u)
        # Get slow state from oscillators
        slow = self.get_slow_state()
        # Update fast state (reactive)
        self.fast = self.fast_decay * self.fast + action
        # Energy; cast so it stays a plain float like the other diagnostics
        self.energy += float(np.linalg.norm(action)) * self.energy_scale
        return slow

    def get_coherence(self) -> float:
        """Cosine similarity between slow and fast states.

        Returns:
            0.0 when either state has (near-)zero norm, otherwise the
            cosine of the angle between them.
        """
        slow = self.get_slow_state()
        slow_norm = np.linalg.norm(slow)
        fast_norm = np.linalg.norm(self.fast)
        if slow_norm < 1e-10 or fast_norm < 1e-10:
            return 0.0
        return float(np.dot(slow, self.fast) / (slow_norm * fast_norm))

    def step(self, action: np.ndarray) -> Dict[str, Any]:
        """Execute one step and return diagnostics.

        The returned dict (also appended to `history`) holds the slow/fast
        state norms, coherence, accumulated energy and the oscillator
        bank's half-life statistics.
        """
        slow = self.forward_dynamics(action)
        coherence = self.get_coherence()
        stats = self.oscillators.get_half_life_statistics()
        result = {
            "slow_norm": float(np.linalg.norm(slow)),
            "fast_norm": float(np.linalg.norm(self.fast)),
            "coherence": coherence,
            "energy": self.energy,
            **stats
        }
        self.history.append(result)
        return result

    def reset(self):
        """Reset oscillator states, fast state, energy and history."""
        self.oscillators.reset()
        self.fast = np.zeros(self.config.state_dim)
        self.energy = 0.0
        self.history = []
def demo_oscillators():
    """Demonstrate oscillator bank behavior.

    Builds a 16-oscillator bank with log-uniform half-lives, prints the
    half-life distribution, injects a single random pulse and then lets the
    bank decay with zero input, reporting the retained fraction of the
    pulse at several checkpoints.
    """
    banner = "=" * 60
    rule = "-" * 40

    print(banner)
    print("FDRA OSCILLATOR BANK DEMONSTRATION")
    print(banner)

    config = OscillatorConfig(
        num_oscillators=16,
        state_dim=8,
        sequence_length=4096,
        tau_min=1.0,
        tau_max=4096.0
    )
    bank = FDRAOscillatorBank(config)

    print("\n1. Initial Half-Life Distribution")
    print(rule)
    stats = bank.get_half_life_statistics()
    print(f"   τ range: [{stats['tau_min']:.1f}, {stats['tau_max']:.1f}]")
    print(f"   τ mean: {stats['tau_mean']:.1f}")
    print(f"   log(τ) mean: {stats['log_tau_mean']:.3f}")
    print(f"   log(τ) std: {stats['log_tau_std']:.3f}")

    print("\n2. Half-Lives per Oscillator")
    print(rule)
    taus = bank.get_half_lives()
    for i, tau in enumerate(taus):
        # Bar length grows with log(τ) so the log-uniform spacing is visible.
        bar = "█" * int(np.log(tau) * 3)
        print(f"   Osc {i:2d}: τ = {tau:7.1f} steps  {bar}")

    print("\n3. Simulating Input Sequence")
    print(rule)
    # Pulse input at t=0
    u = np.random.randn(config.num_oscillators, config.state_dim)
    bank.forward(u)
    initial_norms = np.linalg.norm(bank.h, axis=1)

    # Let the pulse decay with zero input, reporting at each checkpoint
    # (cumulative step counts, up to 1000 steps in total).
    decay_steps = [10, 50, 100, 500, 1000]
    zero_input = np.zeros((config.num_oscillators, config.state_dim))
    shown = 4  # only the first few oscillators are printed per checkpoint
    step = 0
    for target in decay_steps:
        while step < target:
            bank.forward(zero_input)
            step += 1
        current_norms = np.linalg.norm(bank.h, axis=1)
        # Small epsilon guards against a zero-norm initial pulse.
        retention = current_norms / (initial_norms + 1e-10)
        print(f"\n   After {step} steps:")
        for i, (tau, ret) in enumerate(zip(taus[:shown], retention[:shown])):
            # Retention after t steps is 0.5 ** (t / τ), so it falls below
            # 50% exactly when the half-life is shorter than the elapsed time.
            if tau < step:
                expected = "✗ (should be < 50%)"
            else:
                expected = "✓ (should be > 50%)"
            print(f"      Osc {i:2d}: τ={tau:7.1f}, retention={ret:.1%} {expected}")
        remaining = len(taus) - shown
        if remaining > 0:
            print(f"      ... ({remaining} more)")

    print("\n" + banner)
    print("OBSERVATION: Oscillators with τ > t retain more than 50% of signal")
    print("This is the desired behavior for long-context modeling.")
    print(banner)
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo_oscillators()