Update modeling_neollm.py
Browse files- modeling_neollm.py +268 -85
modeling_neollm.py
CHANGED
|
@@ -10,6 +10,7 @@ Updated to include:
|
|
| 10 |
- SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
|
| 11 |
- Dropout regularization at strategic locations
|
| 12 |
- ResFormer: Feature residual connections from first layer (applied before projections)
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
import math
|
|
@@ -26,7 +27,6 @@ from transformers.masking_utils import create_causal_mask
|
|
| 26 |
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
|
| 27 |
from transformers.modeling_layers import GradientCheckpointingLayer
|
| 28 |
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
| 29 |
-
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
|
| 30 |
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
| 31 |
from transformers.processing_utils import Unpack
|
| 32 |
from transformers.utils import TransformersKwargs, logging
|
|
@@ -238,67 +238,178 @@ class NeoLLMRMSNormGated(nn.Module):
|
|
| 238 |
return hidden_states.to(input_dtype)
|
| 239 |
|
| 240 |
|
| 241 |
-
class
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
super().__init__()
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
self.
|
| 253 |
-
|
| 254 |
-
self.config = config
|
| 255 |
-
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
| 256 |
-
|
| 257 |
-
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 258 |
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
|
| 304 |
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
|
@@ -323,10 +434,18 @@ def eager_attention_forward(
|
|
| 323 |
dropout: float = 0.0,
|
| 324 |
**kwargs: Unpack[TransformersKwargs],
|
| 325 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
key_states = repeat_kv(key, module.num_key_value_groups)
|
| 327 |
value_states = repeat_kv(value, module.num_key_value_groups)
|
| 328 |
|
|
|
|
| 329 |
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
|
|
|
|
| 330 |
if attention_mask is not None:
|
| 331 |
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
| 332 |
attn_weights = attn_weights + causal_mask
|
|
@@ -342,10 +461,14 @@ def eager_attention_forward(
|
|
| 342 |
class NeoLLMAttention(nn.Module):
|
| 343 |
"""
|
| 344 |
Multi-headed attention with FANformer integration, SeeDNorm for Q/K normalization,
|
| 345 |
-
and ResFormer feature residual connections
|
| 346 |
|
| 347 |
ResFormer enhancement: Applies learnable feature residual connections from the first layer
|
| 348 |
BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
"""
|
| 350 |
|
| 351 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
@@ -353,7 +476,11 @@ class NeoLLMAttention(nn.Module):
|
|
| 353 |
self.config = config
|
| 354 |
self.layer_idx = layer_idx
|
| 355 |
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
| 356 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
self.scaling = self.head_dim**-0.5
|
| 358 |
self.attention_dropout = config.attention_dropout
|
| 359 |
self.is_causal = True
|
|
@@ -369,22 +496,34 @@ class NeoLLMAttention(nn.Module):
|
|
| 369 |
|
| 370 |
# QKV projections operate on FAN-transformed features
|
| 371 |
self.q_proj = nn.Linear(
|
| 372 |
-
fan_output_dim,
|
| 373 |
)
|
| 374 |
self.k_proj = nn.Linear(
|
| 375 |
-
fan_output_dim,
|
| 376 |
)
|
| 377 |
self.v_proj = nn.Linear(
|
| 378 |
-
fan_output_dim,
|
| 379 |
)
|
| 380 |
self.o_proj = nn.Linear(
|
| 381 |
-
|
| 382 |
)
|
| 383 |
|
| 384 |
# SeeDNorm for Q/K normalization (replaces RMSNorm)
|
| 385 |
self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 386 |
self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 387 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 388 |
# Dropout for attention output
|
| 389 |
self.dropout = nn.Dropout(config.dropout_rate)
|
| 390 |
|
|
@@ -401,34 +540,61 @@ class NeoLLMAttention(nn.Module):
|
|
| 401 |
**kwargs: Unpack[FlashAttentionKwargs],
|
| 402 |
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
|
| 403 |
input_shape = hidden_states.shape[:-1]
|
|
|
|
| 404 |
|
| 405 |
# Apply FANformer transformation first
|
| 406 |
hidden_states_fan = self.fan_layer(hidden_states)
|
| 407 |
|
| 408 |
# ResFormer: Apply feature residual connection BEFORE projections
|
| 409 |
-
# This ensures dimensional compatibility across all layer types
|
| 410 |
if first_layer_fan is not None:
|
| 411 |
hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
|
| 412 |
|
| 413 |
# Store current FAN features for potential use as first_layer_fan in subsequent layers
|
| 414 |
current_layer_fan = hidden_states_fan.clone()
|
| 415 |
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
# Use FAN-transformed features (with residual applied) for projections
|
| 419 |
query_states, gate = torch.chunk(
|
| 420 |
-
self.q_proj(hidden_states_fan).view(
|
|
|
|
| 421 |
)
|
| 422 |
-
gate = gate.reshape(
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
value_states = self.v_proj(hidden_states_fan).view(
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
attention_interface: Callable = eager_attention_forward
|
| 433 |
if self.config._attn_implementation != "eager":
|
| 434 |
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
|
@@ -444,7 +610,10 @@ class NeoLLMAttention(nn.Module):
|
|
| 444 |
**kwargs,
|
| 445 |
)
|
| 446 |
|
| 447 |
-
|
|
|
|
|
|
|
|
|
|
| 448 |
attn_output = attn_output * torch.sigmoid(gate)
|
| 449 |
|
| 450 |
attn_output = self.o_proj(attn_output)
|
|
@@ -998,6 +1167,7 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
|
|
| 998 |
module.lambda_1.data.fill_(0.5)
|
| 999 |
if hasattr(module, 'lambda_2'):
|
| 1000 |
module.lambda_2.data.fill_(0.5)
|
|
|
|
| 1001 |
elif isinstance(module, GPAS):
|
| 1002 |
# Initialize GPAS alpha to 0 as per paper
|
| 1003 |
module.alpha.data.fill_(0.0)
|
|
@@ -1010,6 +1180,9 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
|
|
| 1010 |
# beta (β) initialized to 0 (default in Parameter definition)
|
| 1011 |
# alpha (α) initialized to 1 (default in Parameter definition)
|
| 1012 |
pass
|
|
|
|
|
|
|
|
|
|
| 1013 |
|
| 1014 |
|
| 1015 |
class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
@@ -1023,7 +1196,15 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1023 |
)
|
| 1024 |
# SeeDNorm for final output normalization (replaces RMSNorm)
|
| 1025 |
self.norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 1026 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1027 |
self.gradient_checkpointing = False
|
| 1028 |
|
| 1029 |
# ResFormer: storage for first layer's FAN features (H_fan_1)
|
|
@@ -1061,8 +1242,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1061 |
|
| 1062 |
hidden_states = inputs_embeds
|
| 1063 |
|
| 1064 |
-
#
|
| 1065 |
-
position_embeddings
|
|
|
|
| 1066 |
|
| 1067 |
# ResFormer: reset first_layer_fan at the start of each forward pass
|
| 1068 |
self.first_layer_fan = None
|
|
@@ -1193,6 +1375,7 @@ __all__ = [
|
|
| 1193 |
"NeoLLMConfig",
|
| 1194 |
"FANLayer",
|
| 1195 |
"SeeDNorm",
|
|
|
|
| 1196 |
]
|
| 1197 |
|
| 1198 |
# Register the configuration and model for AutoClass support
|
|
|
|
| 10 |
- SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
|
| 11 |
- Dropout regularization at strategic locations
|
| 12 |
- ResFormer: Feature residual connections from first layer (applied before projections)
|
| 13 |
+
- PoPE (Polar Coordinate Position Embedding): Decouples 'what' and 'where' for superior length extrapolation
|
| 14 |
"""
|
| 15 |
|
| 16 |
import math
|
|
|
|
| 27 |
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
|
| 28 |
from transformers.modeling_layers import GradientCheckpointingLayer
|
| 29 |
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
|
|
|
| 30 |
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
| 31 |
from transformers.processing_utils import Unpack
|
| 32 |
from transformers.utils import TransformersKwargs, logging
|
|
|
|
| 238 |
return hidden_states.to(input_dtype)
|
| 239 |
|
| 240 |
|
| 241 |
+
class PolarPositionalEmbedding(nn.Module):
    """Polar Coordinate Position Embedding (PoPE) — FlashAttention2-compatible.

    From "Decoupling the 'What' and 'Where' with Polar Coordinate Positional
    Embedding". Content and position are decoupled:

    - Magnitudes carry content only: mu_q = softplus(q), mu_k = softplus(k)
    - Phases carry position only:    phi_c = t * theta_c

    Instead of complex arithmetic, queries and keys are mapped to concatenated
    Cartesian coordinates [mu*cos(phi); mu*sin(phi)] of size 2*d, so that the
    ordinary dot product Q'.K' = sum(x_q*x_k + y_q*y_k) reproduces the PoPE
    attention score Re[q~^H k~] exactly. This doubles head_dim (~2x attention
    cost) but lets any stock attention kernel (incl. FlashAttention2) be used
    without custom kernels, while retaining PoPE's length-extrapolation
    benefits.

    Args:
        dim: Original per-head dimension d (outputs use 2*d).
        max_position_embeddings: Maximum sequence length (stored for reference).
        base: Base wavelength (theta) for the frequency ladder.
        device: Optional device on which to build the frequency buffer.
    """

    def __init__(
        self,
        dim: int,
        max_position_embeddings: int = 2048,
        base: float = 10000.0,
        device=None,
    ):
        super().__init__()
        self.dim = dim  # original head_dim (d)
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        # theta_c = base^(-(c-1)/d) for c = 1..d; PoPE uses d frequencies
        # (not d/2 like RoPE). Fix: honor the `device` argument, which was
        # previously accepted but ignored.
        inv_freq = 1.0 / (
            self.base
            ** (torch.arange(0, self.dim, 1, dtype=torch.float32, device=device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        position_ids: torch.LongTensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Map q/k into concatenated [real; imag] PoPE coordinates.

        Args:
            q: Query tensor of shape (batch, num_heads, seq_len, head_dim).
            k: Key tensor of shape (batch, num_kv_heads, seq_len, head_dim).
            position_ids: Position indices of shape (batch, seq_len).

        Returns:
            Tuple (Q', K') with the last dimension doubled to 2*head_dim,
            laid out as [x; y] = [mu*cos(phi); mu*sin(phi)].
        """
        # Step 1: magnitudes from content only (paper Eq. 3).
        mu_q = F.softplus(q)
        mu_k = F.softplus(k)

        # Step 2: phases from position only (paper Eq. 4).
        # freqs broadcasts as (batch, 1, seq_len, head_dim).
        inv_freq_expanded = self.inv_freq[None, None, None, :].to(q.device)
        position_ids_expanded = position_ids[:, None, :, None].float()
        freqs = position_ids_expanded * inv_freq_expanded

        # Step 3: Cartesian coordinates x = mu*cos(phi), y = mu*sin(phi)
        # (paper Eqs. 7-8).
        cos_freqs = torch.cos(freqs)
        sin_freqs = torch.sin(freqs)

        q_real = mu_q * cos_freqs  # x_q component
        q_imag = mu_q * sin_freqs  # y_q component
        k_real = mu_k * cos_freqs  # x_k component
        k_imag = mu_k * sin_freqs  # y_k component

        # Step 4: concatenate [real; imag] so a plain dot product computes
        # sum(x_q*x_k + y_q*y_k).
        q_pope = torch.cat([q_real, q_imag], dim=-1)  # (batch, num_heads, seq_len, 2*head_dim)
        k_pope = torch.cat([k_real, k_imag], dim=-1)  # (batch, num_kv_heads, seq_len, 2*head_dim)

        return q_pope, k_pope
|
| 342 |
+
|
| 343 |
+
def apply_pope_embedding(
    q_pope: torch.Tensor,
    k_pope: torch.Tensor,
    delta_bias: Optional[torch.Tensor] = None,
    num_key_value_groups: int = 1
) -> tuple[torch.Tensor, torch.Tensor]:
    """Apply the learnable phase bias delta_c to PoPE keys (paper Eq. 6).

    With the bias, the attention score becomes
    a^PoPE_ts = sum mu_q mu_k cos((s - t)*theta_c + delta_c),
    realized here by rotating the key by exp(i*delta) in the concatenated
    [real; imag] representation. Queries are never modified.

    Args:
        q_pope: Query after PoPE, shape (batch, num_heads, seq_len, 2*head_dim),
            laid out as [x_q; y_q].
        k_pope: Key after PoPE, shape (batch, num_kv_heads, seq_len, 2*head_dim),
            laid out as [x_k; y_k].
        delta_bias: Per-head/per-dim phase bias of shape
            (num_attention_heads, head_dim), clamped to [-2*pi, 0] per the paper.
        num_key_value_groups: Query heads per kv head (GQA); group biases are
            averaged down to one bias per kv head.

    Returns:
        (q_out, k_out): query unchanged, key rotated by the bias; both keep the
        2*head_dim layout.
    """
    if delta_bias is None:
        # No bias configured: both tensors pass through untouched.
        return q_pope, k_pope

    half = k_pope.shape[-1] // 2  # original head_dim (d)
    real_part = k_pope[..., :half]
    imag_part = k_pope[..., half:]

    # Paper Section 3 bounds the bias to [-2*pi, 0].
    bias = delta_bias.clamp(min=-2 * math.pi, max=0)

    # GQA: collapse per-attention-head biases to one per kv head by averaging
    # across each group of query heads.
    if num_key_value_groups > 1:
        kv_heads = bias.shape[0] // num_key_value_groups
        bias = bias.reshape(kv_heads, num_key_value_groups, half).mean(dim=1)

    # Broadcast over batch and sequence: (1, num_kv_heads, 1, head_dim).
    bias = bias[None, :, None, :]

    # Rotate k by exp(i*delta): (a + bi)(cos + i*sin)
    #   real: a*cos - b*sin;  imag: a*sin + b*cos
    cos_b = torch.cos(bias)
    sin_b = torch.sin(bias)
    k_out = torch.cat(
        [
            real_part * cos_b - imag_part * sin_b,
            real_part * sin_b + imag_part * cos_b,
        ],
        dim=-1,
    )

    # The phase bias acts on keys only; queries are returned as-is.
    return q_pope, k_out
|
| 413 |
|
| 414 |
|
| 415 |
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
|
|
|
| 434 |
dropout: float = 0.0,
|
| 435 |
**kwargs: Unpack[TransformersKwargs],
|
| 436 |
):
|
| 437 |
+
"""
|
| 438 |
+
Standard eager attention implementation for PoPE.
|
| 439 |
+
|
| 440 |
+
Note: query and key have 2*head_dim due to PoPE concatenation [real; imag].
|
| 441 |
+
Value is padded to match this dimension for kernel compatibility.
|
| 442 |
+
"""
|
| 443 |
key_states = repeat_kv(key, module.num_key_value_groups)
|
| 444 |
value_states = repeat_kv(value, module.num_key_value_groups)
|
| 445 |
|
| 446 |
+
# Standard attention computation
|
| 447 |
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
|
| 448 |
+
|
| 449 |
if attention_mask is not None:
|
| 450 |
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
| 451 |
attn_weights = attn_weights + causal_mask
|
|
|
|
| 461 |
class NeoLLMAttention(nn.Module):
|
| 462 |
"""
|
| 463 |
Multi-headed attention with FANformer integration, SeeDNorm for Q/K normalization,
|
| 464 |
+
PoPE for positional encoding, and ResFormer feature residual connections.
|
| 465 |
|
| 466 |
ResFormer enhancement: Applies learnable feature residual connections from the first layer
|
| 467 |
BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
|
| 468 |
+
|
| 469 |
+
PoPE enhancement: Decouples 'what' and 'where' via polar coordinates for superior
|
| 470 |
+
length extrapolation and content/position independent matching. Uses concatenated
|
| 471 |
+
[real; imag] representation for FlashAttention2 compatibility (2× head_dim overhead).
|
| 472 |
"""
|
| 473 |
|
| 474 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
|
|
| 476 |
self.config = config
|
| 477 |
self.layer_idx = layer_idx
|
| 478 |
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
| 479 |
+
self.num_attention_heads = config.num_attention_heads
|
| 480 |
+
self.num_key_value_heads = config.num_key_value_heads
|
| 481 |
+
self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
|
| 482 |
+
|
| 483 |
+
# PoPE uses original head_dim for scaling (not 2*head_dim)
|
| 484 |
self.scaling = self.head_dim**-0.5
|
| 485 |
self.attention_dropout = config.attention_dropout
|
| 486 |
self.is_causal = True
|
|
|
|
| 496 |
|
| 497 |
# QKV projections operate on FAN-transformed features
|
| 498 |
self.q_proj = nn.Linear(
|
| 499 |
+
fan_output_dim, self.num_attention_heads * self.head_dim * 2, bias=config.attention_bias
|
| 500 |
)
|
| 501 |
self.k_proj = nn.Linear(
|
| 502 |
+
fan_output_dim, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
| 503 |
)
|
| 504 |
self.v_proj = nn.Linear(
|
| 505 |
+
fan_output_dim, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
| 506 |
)
|
| 507 |
self.o_proj = nn.Linear(
|
| 508 |
+
self.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
| 509 |
)
|
| 510 |
|
| 511 |
# SeeDNorm for Q/K normalization (replaces RMSNorm)
|
| 512 |
self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 513 |
self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 514 |
|
| 515 |
+
# PoPE: Learnable phase bias δc for each head and dimension
|
| 516 |
+
# Initialized based on pope_bias_init config: 'zero' or 'uniform'
|
| 517 |
+
pope_bias_init = getattr(config, 'pope_bias_init', 'zero')
|
| 518 |
+
if pope_bias_init == 'uniform':
|
| 519 |
+
# Uniform initialization in [-2π, 0]
|
| 520 |
+
delta_init = torch.empty(self.num_attention_heads, self.head_dim).uniform_(-2 * math.pi, 0)
|
| 521 |
+
else:
|
| 522 |
+
# Zero initialization (better for length extrapolation)
|
| 523 |
+
delta_init = torch.zeros(self.num_attention_heads, self.head_dim)
|
| 524 |
+
|
| 525 |
+
self.delta_bias = nn.Parameter(delta_init)
|
| 526 |
+
|
| 527 |
# Dropout for attention output
|
| 528 |
self.dropout = nn.Dropout(config.dropout_rate)
|
| 529 |
|
|
|
|
| 540 |
**kwargs: Unpack[FlashAttentionKwargs],
|
| 541 |
) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
|
| 542 |
input_shape = hidden_states.shape[:-1]
|
| 543 |
+
batch_size, seq_len = input_shape
|
| 544 |
|
| 545 |
# Apply FANformer transformation first
|
| 546 |
hidden_states_fan = self.fan_layer(hidden_states)
|
| 547 |
|
| 548 |
# ResFormer: Apply feature residual connection BEFORE projections
|
|
|
|
| 549 |
if first_layer_fan is not None:
|
| 550 |
hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
|
| 551 |
|
| 552 |
# Store current FAN features for potential use as first_layer_fan in subsequent layers
|
| 553 |
current_layer_fan = hidden_states_fan.clone()
|
| 554 |
|
| 555 |
+
# Project to Q, K, V
|
|
|
|
|
|
|
| 556 |
query_states, gate = torch.chunk(
|
| 557 |
+
self.q_proj(hidden_states_fan).view(batch_size, seq_len, self.num_attention_heads, self.head_dim * 2),
|
| 558 |
+
2, dim=-1
|
| 559 |
)
|
| 560 |
+
gate = gate.reshape(batch_size, seq_len, -1)
|
| 561 |
+
|
| 562 |
+
key_states = self.k_proj(hidden_states_fan).view(
|
| 563 |
+
batch_size, seq_len, self.num_key_value_heads, self.head_dim
|
| 564 |
+
)
|
| 565 |
+
value_states = self.v_proj(hidden_states_fan).view(
|
| 566 |
+
batch_size, seq_len, self.num_key_value_heads, self.head_dim
|
| 567 |
+
)
|
| 568 |
+
|
| 569 |
+
# Apply SeeDNorm to Q and K before PoPE
|
| 570 |
+
query_states = self.q_norm(query_states)
|
| 571 |
+
key_states = self.k_norm(key_states)
|
| 572 |
+
|
| 573 |
+
# Transpose to (batch, num_heads, seq_len, head_dim)
|
| 574 |
+
query_states = query_states.transpose(1, 2)
|
| 575 |
+
key_states = key_states.transpose(1, 2)
|
| 576 |
+
value_states = value_states.transpose(1, 2)
|
| 577 |
+
|
| 578 |
+
# Apply PoPE: position_embeddings is (pope_emb, position_ids)
|
| 579 |
+
pope_emb, position_ids = position_embeddings
|
| 580 |
+
|
| 581 |
+
# Get PoPE embeddings with concatenated [real; imag] representation
|
| 582 |
+
# Returns Q', K' with shape (..., 2*head_dim)
|
| 583 |
+
query_states, key_states = pope_emb(query_states, key_states, position_ids)
|
| 584 |
+
|
| 585 |
+
# Apply learnable phase bias δc
|
| 586 |
+
# (rotates keys by exp(i*δ); queries pass through unchanged)
|
| 587 |
+
query_states, key_states = apply_pope_embedding(
|
| 588 |
+
query_states,
|
| 589 |
+
key_states,
|
| 590 |
+
self.delta_bias,
|
| 591 |
+
num_key_value_groups=self.num_key_value_groups  # adapt per-head bias for grouped-query attention
|
| 592 |
+
)
|
| 593 |
+
# Pad value to 2*head_dim for dimension compatibility
|
| 594 |
+
# Only first head_dim components are used in output
|
| 595 |
+
value_states = F.pad(value_states, (0, self.head_dim), value=0.0)
|
| 596 |
+
|
| 597 |
+
# Call attention with doubled head_dim
|
| 598 |
attention_interface: Callable = eager_attention_forward
|
| 599 |
if self.config._attn_implementation != "eager":
|
| 600 |
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
|
|
|
|
| 610 |
**kwargs,
|
| 611 |
)
|
| 612 |
|
| 613 |
+
# Extract only the first head_dim components (discard padding)
|
| 614 |
+
attn_output = attn_output[..., :self.head_dim]
|
| 615 |
+
|
| 616 |
+
attn_output = attn_output.reshape(batch_size, seq_len, -1).contiguous()
|
| 617 |
attn_output = attn_output * torch.sigmoid(gate)
|
| 618 |
|
| 619 |
attn_output = self.o_proj(attn_output)
|
|
|
|
| 1167 |
module.lambda_1.data.fill_(0.5)
|
| 1168 |
if hasattr(module, 'lambda_2'):
|
| 1169 |
module.lambda_2.data.fill_(0.5)
|
| 1170 |
+
# PoPE delta_bias already initialized in __init__
|
| 1171 |
elif isinstance(module, GPAS):
|
| 1172 |
# Initialize GPAS alpha to 0 as per paper
|
| 1173 |
module.alpha.data.fill_(0.0)
|
|
|
|
| 1180 |
# beta (β) initialized to 0 (default in Parameter definition)
|
| 1181 |
# alpha (α) initialized to 1 (default in Parameter definition)
|
| 1182 |
pass
|
| 1183 |
+
elif isinstance(module, PolarPositionalEmbedding):
|
| 1184 |
+
# PoPE frequency initialization handled in __init__
|
| 1185 |
+
pass
|
| 1186 |
|
| 1187 |
|
| 1188 |
class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
|
|
| 1196 |
)
|
| 1197 |
# SeeDNorm for final output normalization (replaces RMSNorm)
|
| 1198 |
self.norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 1199 |
+
|
| 1200 |
+
# PoPE positional embedding (replaces RoPE)
|
| 1201 |
+
head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
| 1202 |
+
self.pope_emb = PolarPositionalEmbedding(
|
| 1203 |
+
dim=head_dim,
|
| 1204 |
+
max_position_embeddings=config.max_position_embeddings,
|
| 1205 |
+
base=getattr(config, 'rope_theta', 10000.0), # Use rope_theta for backward compatibility
|
| 1206 |
+
)
|
| 1207 |
+
|
| 1208 |
self.gradient_checkpointing = False
|
| 1209 |
|
| 1210 |
# ResFormer: storage for first layer's FAN features (H_fan_1)
|
|
|
|
| 1242 |
|
| 1243 |
hidden_states = inputs_embeds
|
| 1244 |
|
| 1245 |
+
# Create position embeddings for PoPE
|
| 1246 |
+
# position_embeddings is a tuple of (pope_emb, position_ids)
|
| 1247 |
+
position_embeddings = (self.pope_emb, position_ids)
|
| 1248 |
|
| 1249 |
# ResFormer: reset first_layer_fan at the start of each forward pass
|
| 1250 |
self.first_layer_fan = None
|
|
|
|
| 1375 |
"NeoLLMConfig",
|
| 1376 |
"FANLayer",
|
| 1377 |
"SeeDNorm",
|
| 1378 |
+
"PolarPositionalEmbedding",
|
| 1379 |
]
|
| 1380 |
|
| 1381 |
# Register the configuration and model for AutoClass support
|