File size: 6,449 Bytes
f24563f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""
Embedding layers for the LLM model.
"""

import jax
import jax.numpy as jnp
import flax.linen as nn
from typing import Optional, Tuple, Callable
import math


class TokenEmbedding(nn.Module):
    """
    Learned lookup table mapping integer token IDs to dense vectors.

    Attributes:
        vocab_size: Number of entries in the vocabulary.
        embed_dim: Width of each embedding vector.
        dtype: Numeric dtype of the embedding table.
    """
    vocab_size: int
    embed_dim: int
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # Table of shape [vocab_size, embed_dim], initialized N(0, 0.02^2).
        self.embedding = self.param(
            'embedding',
            nn.initializers.normal(stddev=0.02),
            (self.vocab_size, self.embed_dim),
            self.dtype,
        )

    def __call__(self, input_ids: jnp.ndarray) -> jnp.ndarray:
        """
        Look up the embedding row for each token ID.

        Args:
            input_ids: Integer token IDs [batch_size, seq_len]

        Returns:
            Token embeddings [batch_size, seq_len, embed_dim]
        """
        # Row gather along axis 0; equivalent to jnp.take(..., axis=0).
        return self.embedding[input_ids]


class PositionalEmbedding(nn.Module):
    """
    Learned absolute positional embedding.

    Attributes:
        max_seq_len: Largest position index supported (table height).
        embed_dim: Width of each embedding vector.
        dtype: Numeric dtype of the embedding table.
    """
    max_seq_len: int
    embed_dim: int
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # Table of shape [max_seq_len, embed_dim], initialized N(0, 0.02^2).
        self.embedding = self.param(
            'embedding',
            nn.initializers.normal(stddev=0.02),
            (self.max_seq_len, self.embed_dim),
            self.dtype,
        )

    def __call__(self, positions: jnp.ndarray) -> jnp.ndarray:
        """
        Look up the embedding row for each position index.

        Args:
            positions: Position indices [batch_size, seq_len]

        Returns:
            Positional embeddings [batch_size, seq_len, embed_dim]
        """
        # Row gather along axis 0; equivalent to jnp.take(..., axis=0).
        return self.embedding[positions]


class RotaryPositionalEmbedding(nn.Module):
    """
    Rotary Positional Embedding (RoPE) with support for long sequences.

    Attributes:
        dim: Dimension of the embedding (must be even; rotation acts on pairs)
        max_seq_len: Maximum sequence length
        base: Base for frequency computation
        scale: Scaling factor for RoPE frequencies (for longer contexts)
        dtype: Data type for embeddings
        use_dynamic_scaling: Whether to use dynamic scaling for longer contexts
        original_max_seq_len: Original max sequence length for scaling
    """
    dim: int
    max_seq_len: int = 32768  # Increased to support longer contexts
    base: int = 10000
    scale: float = 1.0  # Scaling factor for RoPE frequencies
    dtype: jnp.dtype = jnp.float32
    use_dynamic_scaling: bool = True  # Enable dynamic scaling for longer contexts
    original_max_seq_len: int = 4096  # Original max sequence length for scaling

    def setup(self):
        # NTK-aware base adjustment for contexts longer than the length the
        # base frequencies were tuned for.  With the default scale of 1.0
        # this is a no-op (1.0 ** anything == 1.0).
        effective_base = self.base
        if self.use_dynamic_scaling and self.max_seq_len > self.original_max_seq_len:
            scaling_factor = math.log(self.max_seq_len / self.original_max_seq_len) / math.log(2)
            effective_base = self.base * (self.scale ** scaling_factor)

        # Inverse frequency per dimension pair: shape [dim // 2].
        freqs = effective_base ** (-jnp.arange(0, self.dim, 2) / self.dim)
        pos = jnp.arange(self.max_seq_len)
        # Angle table (position x frequency): [max_seq_len, dim // 2].
        angles = jnp.outer(pos, freqs).astype(self.dtype)
        # Cache half-width cos/sin tables; __call__ duplicates them to the
        # full feature width when applying the rotation.
        self.cos_cached = jnp.cos(angles)
        self.sin_cached = jnp.sin(angles)

    def _rotate_half(self, x: jnp.ndarray) -> jnp.ndarray:
        """Map the (x1, x2) halves of the last axis to (-x2, x1)."""
        x1, x2 = jnp.split(x, 2, axis=-1)
        return jnp.concatenate([-x2, x1], axis=-1)

    def __call__(self, x: jnp.ndarray, positions: Optional[jnp.ndarray] = None) -> jnp.ndarray:
        """
        Apply rotary positional embedding.

        Args:
            x: Input tensor [batch_size, seq_len, ..., dim]; the feature
                axis is last, and any number of axes (e.g. a heads axis)
                may sit between seq_len and dim.
            positions: Optional position indices of shape [seq_len] or
                [batch_size, seq_len]. Defaults to 0..seq_len-1.

        Returns:
            Tensor of the same shape as ``x`` with the rotation applied.
        """
        seq_len = x.shape[1]

        if positions is None:
            positions = jnp.arange(seq_len)

        # Keep indices inside the cached table.
        positions = jnp.clip(positions, 0, self.max_seq_len - 1)

        # Gather per-position angles: (*positions.shape, dim // 2).
        # FIX: the previous `[:, :seq_len]` slice wrongly truncated the
        # *frequency* axis for 1-D positions; the gathered position axis
        # already has length seq_len, so no slicing is needed.
        cos = jnp.take(self.cos_cached, positions, axis=0)
        sin = jnp.take(self.sin_cached, positions, axis=0)

        # FIX: duplicate the half-width tables to the full feature width so
        # they line up elementwise with _rotate_half's (-x2, x1) layout;
        # without this, `x * cos` cannot broadcast (dim vs dim // 2).
        cos = jnp.concatenate([cos, cos], axis=-1)
        sin = jnp.concatenate([sin, sin], axis=-1)

        # FIX: insert singleton axes *between* seq_len and dim (the original
        # appended them after dim, misaligning the feature axis). Right-
        # alignment then supplies the batch axis when positions are 1-D.
        pad = (1,) * (x.ndim - 3)
        cos = cos.reshape(cos.shape[:-1] + pad + (cos.shape[-1],))
        sin = sin.reshape(sin.shape[:-1] + pad + (sin.shape[-1],))

        # Rotation: x * cos + rotate_half(x) * sin.
        return x * cos + self._rotate_half(x) * sin


def get_rope_embedding(
    dim: int,
    max_seq_len: int = 32768,
    base: int = 10000,
    scale: float = 1.0,
    use_dynamic_scaling: bool = True,
    original_max_seq_len: int = 4096,
    dtype: jnp.dtype = jnp.float32
) -> Tuple[jnp.ndarray, jnp.ndarray]:
    """
    Build rotary positional embedding (RoPE) cos/sin lookup tables.

    Args:
        dim: Feature dimension the rotation acts on.
        max_seq_len: Number of positions to tabulate (default: 32768 for
            long context).
        base: Base of the geometric frequency progression.
        scale: Scaling factor applied under dynamic scaling.
        use_dynamic_scaling: Apply NTK-aware base adjustment when
            max_seq_len exceeds original_max_seq_len.
        original_max_seq_len: Context length the base frequencies target.
        dtype: Dtype of the returned tables.

    Returns:
        Tuple (cos, sin), each of shape [max_seq_len, dim // 2].
    """
    # NTK-aware adjustment: grow the base when tabulating more positions
    # than the frequencies were originally tuned for. No-op at scale=1.0.
    adjusted_base = base
    if use_dynamic_scaling and max_seq_len > original_max_seq_len:
        exponent = math.log(max_seq_len / original_max_seq_len) / math.log(2)
        adjusted_base = base * (scale ** exponent)

    # Inverse frequency per dimension pair: shape [dim // 2].
    inv_freq = adjusted_base ** (-jnp.arange(0, dim, 2) / dim)
    # Angle table via broadcasting (position x frequency): [max_seq_len, dim // 2].
    angles = (jnp.arange(max_seq_len)[:, None] * inv_freq[None, :]).astype(dtype)
    return jnp.cos(angles), jnp.sin(angles)