"""
MARS: Multi-scale Adaptive Recurrence with State compression
============================================================

A method for modeling very long interaction sequences in sequential recommendation.

Key innovations:
1. Temporal-Aware Delta Network (TADN) for O(n) long-range modeling
   - Explicit exponential temporal decay in state updates
   - Input-dependent gating for selective memory retention
   
2. Compressive Memory Tokens
   - Fixed-size learnable memory that compresses arbitrarily long histories
   - Acts as information bottleneck (denoising effect per Rec2PM)
   
3. Dual-Branch Architecture with Learned Fusion
   - Long-term branch: TADN layers processing full history at O(n) cost
   - Short-term branch: Standard self-attention on recent K interactions
   - Adaptive gating fusion that balances long/short-term signals per user

4. Multi-Scale Temporal Encoding
   - Log-scaled relative time deltas + periodic (sin/cos) components
   - Captures hourly/daily/weekly/monthly patterns in user behavior

This combines ideas from HyTRec (2602.18283), Rec2PM (2602.11605), 
SIGMA (2408.11451), and HSTU (2402.17152) into a unified architecture.
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Dict


class TemporalEncoding(nn.Module):
    """Multi-scale temporal encoding built from raw timestamps.

    Two signals are projected to ``embed_dim`` and summed, then layer-normed:

    * the log-scaled gap (in seconds) since the previous interaction, and
    * sin/cos features of the timestamp at fixed periods
      (hour, day, week, 30 days).

    Args:
        embed_dim: size of the returned embedding.
        max_periods: how many of the built-in periods to use, taken in the
            order hour, day, week, 30 days. Only four periods exist, so
            larger values use all four.
    """

    def __init__(self, embed_dim: int, max_periods: int = 4):
        super().__init__()
        self.embed_dim = embed_dim

        # Projection of the scalar log time delta.
        self.time_delta_proj = nn.Linear(1, embed_dim)

        # Cycle lengths in seconds: hour, day, week, 30 days.
        periods = [3600, 86400, 604800, 2592000][:max_periods]
        self.register_buffer('periods', torch.tensor(periods, dtype=torch.float32))
        # Size the projection from the periods actually kept: the list is
        # capped at four entries, so `max_periods * 2` would not match the
        # feature width whenever max_periods > 4.
        self.periodic_proj = nn.Linear(len(periods) * 2, embed_dim)  # sin + cos

        # Normalizes the sum of the two temporal signals.
        self.layernorm = nn.LayerNorm(embed_dim)

    def forward(self, timestamps: torch.Tensor) -> torch.Tensor:
        """
        Args:
            timestamps: (batch, seq_len) absolute timestamps in seconds
        Returns:
            temporal_emb: (batch, seq_len, embed_dim)
        """
        # 1. Relative time deltas (seconds since previous interaction).
        # The first position has no predecessor and gets a delta of 0;
        # negative gaps (e.g. stepping into zero padding) are clamped to 0.
        time_deltas = torch.zeros_like(timestamps)
        time_deltas[:, 1:] = timestamps[:, 1:] - timestamps[:, :-1]
        time_deltas = time_deltas.clamp(min=0)
        # Log-scale for better numerical properties
        log_deltas = torch.log1p(time_deltas).unsqueeze(-1)  # (B, T, 1)
        # Match the projection dtype so integer / float64 timestamps work too.
        log_deltas = log_deltas.to(self.time_delta_proj.weight.dtype)
        delta_emb = self.time_delta_proj(log_deltas)  # (B, T, D)

        # 2. Periodic components: one sin/cos pair per period.
        ts_expanded = timestamps.unsqueeze(-1)  # (B, T, 1)
        periods = self.periods.view(1, 1, -1)   # (1, 1, P)
        angles = 2 * math.pi * ts_expanded / periods  # (B, T, P)
        periodic_features = torch.cat(
            [torch.sin(angles), torch.cos(angles)], dim=-1
        )  # (B, T, 2*P)
        periodic_features = periodic_features.to(self.periodic_proj.weight.dtype)
        periodic_emb = self.periodic_proj(periodic_features)  # (B, T, D)

        # 3. Combine
        return self.layernorm(delta_emb + periodic_emb)


class TADNLayer(nn.Module):
    """Temporal-Aware Delta Network Layer.

    Recurrent layer that walks the sequence once (one state update per
    position) with:
    - Delta-rule style state updates (inspired by HyTRec)
    - Explicit temporal decay gating
    - Input-dependent selective memory

    The state S has shape (state_dim, embed_dim) per batch row and is updated
    element-wise as implemented below:

        E_t = clamp((beta_t * k_t) g_t^T, 0, 1)                 # erase
        S_t = clamp(S_{t-1} * (1 - E_t) + (beta_t * k_t) v_t^T, -10, 10)
        o_t = q_t^T S_t                                          # read

    with the gate

        g_t   = a * sigmoid(W_g [x_t, x_t - x_{t-1}]) * tau_t
                + (1 - a) * sigmoid(g_static),      a = sigmoid(alpha)
        tau_t = exp(-log1p(dt_t / 3600) / (10 * |time_scale| + 1))

    where dt_t is the time between position t and the last valid position of
    the same row. Without timestamps, tau_t = exp(-steps_back_t / T) with
    steps_back_t the number of positions between t and the last valid one.

    Args:
        embed_dim: feature size of inputs and outputs.
        state_dim: size of the query/key space (rows of the state matrix).
        dropout: dropout applied to the read-out before the output projection.
    """

    def __init__(self, embed_dim: int, state_dim: int = 64, dropout: float = 0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.state_dim = state_dim

        # Query, Key, Value projections
        self.q_proj = nn.Linear(embed_dim, state_dim)
        self.k_proj = nn.Linear(embed_dim, state_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

        # Gating mechanism
        self.gate_proj = nn.Linear(embed_dim * 2, embed_dim)
        self.beta_proj = nn.Linear(embed_dim, state_dim)

        # Temporal decay parameters (alpha is passed through a sigmoid in forward)
        self.alpha = nn.Parameter(torch.tensor(0.5))
        self.time_scale = nn.Parameter(torch.tensor(1.0))

        # Static gate (learnable baseline, passed through a sigmoid in forward)
        self.gate_static = nn.Parameter(torch.ones(embed_dim) * 0.5)

        # Output
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.layernorm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def _recency_decay(
        self,
        x: torch.Tensor,
        timestamps: Optional[torch.Tensor],
        mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Return tau of shape (B, T, 1): 1 at the reference position, smaller further back."""
        B, T, _ = x.shape
        positions = torch.arange(T, device=x.device)

        # Reference = last VALID position of each row. Taking column -1
        # unconditionally would pick a padding entry for right-padded rows,
        # which (with zero padding) clamps every delta to 0 and turns the
        # decay off entirely.
        if mask is None:
            ref_idx = torch.full((B, 1), T - 1, dtype=torch.long, device=x.device)
        else:
            ref_idx = (mask.to(torch.long) * positions).max(dim=1, keepdim=True).values

        if timestamps is not None:
            t_ref = timestamps.gather(1, ref_idx)            # (B, 1)
            elapsed = (t_ref - timestamps).clamp(min=0)       # (B, T), seconds
            # Log-normalize on an hours scale so ranges from seconds to years
            # stay numerically manageable.
            log_delta = torch.log1p(elapsed / 3600.0)
            # Learnable time scale controls the decay rate
            tau = torch.exp(
                -log_delta / (torch.abs(self.time_scale) * 10.0 + 1.0)
            )
        else:
            # Fallback: use positions as surrogate time. Measured backwards
            # from the reference so the most recent item gets tau = 1, the
            # same direction as the timestamp branch.
            steps_back = (ref_idx - positions).clamp(min=0).float()  # (B, T)
            tau = torch.exp(-steps_back / (T + 1e-6))

        return tau.unsqueeze(-1)

    def forward(
        self,
        x: torch.Tensor,
        timestamps: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            x: (batch, seq_len, embed_dim) input sequence
            timestamps: (batch, seq_len) timestamps for temporal decay
            mask: (batch, seq_len) boolean mask (True = valid)
        Returns:
            output: (batch, seq_len, embed_dim)
        """
        B, T, D = x.shape

        # Project to Q, K, V
        q = self.q_proj(x)   # (B, T, state_dim)
        k = self.k_proj(x)   # (B, T, state_dim)
        v = self.v_proj(x)   # (B, T, D)

        # Beta (key importance scaling)
        beta = torch.sigmoid(self.beta_proj(x))  # (B, T, state_dim)

        tau = self._recency_decay(x, timestamps, mask)  # (B, T, 1)

        # Dynamic gating with temporal awareness: the gate sees the current
        # input and its change from the previous step (zero before step 0).
        x_shifted = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1)
        gate_input = torch.cat([x, x - x_shifted], dim=-1)  # (B, T, 2*D)

        alpha = torch.sigmoid(self.alpha)
        g_dynamic = torch.sigmoid(self.gate_proj(gate_input))  # (B, T, D)
        g = alpha * g_dynamic * tau + (1 - alpha) * torch.sigmoid(self.gate_static)

        # Loop-invariant pieces of the recurrence.
        scaled_k = beta * k  # (B, T, state_dim)
        valid = None
        if mask is not None:
            valid = mask.to(x.dtype).view(B, T, 1, 1)

        # The recurrence is inherently sequential: one state update per step.
        S = x.new_zeros(B, self.state_dim, D)  # State matrix
        outputs = []
        for t in range(T):
            key_t = scaled_k[:, t]  # (B, state_dim)

            # Delta rule update: erase old, write new.
            # Clamp erase to [0, 1] for stability.
            erase = torch.einsum('bs,bd->bsd', key_t, g[:, t]).clamp(0, 1)
            write = torch.einsum('bs,bd->bsd', key_t, v[:, t])

            if valid is not None:
                # Padding steps leave the state untouched.
                step_valid = valid[:, t]  # (B, 1, 1)
                S = S * (1 - erase * step_valid) + write * step_valid
            else:
                S = S * (1 - erase) + write

            # Clamp state for numerical stability
            S = S.clamp(-10, 10)

            # Read from state
            outputs.append(torch.einsum('bs,bsd->bd', q[:, t], S))

        output = torch.stack(outputs, dim=1)  # (B, T, D)
        output = self.out_proj(self.dropout(output))
        output = self.layernorm(x + output)  # Residual connection

        return output


class CompressiveMemory(nn.Module):
    """Compressive Memory Module.

    Compresses a sequence into a fixed number of memory tokens.
    Acts as information bottleneck (denoising per Rec2PM theory).

    Uses cross-attention: learnable memory queries attend to the sequence,
    followed by a residual feed-forward block.

    Args:
        embed_dim: feature size of sequence and memory tokens.
        num_memory_tokens: number of memory tokens produced per batch row.
        num_heads: attention heads of the cross-attention.
        dropout: dropout rate used in the attention and the feed-forward block.
    """

    def __init__(
        self,
        embed_dim: int,
        num_memory_tokens: int = 8,
        num_heads: int = 2,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.num_memory_tokens = num_memory_tokens

        # Learnable memory query tokens
        self.memory_queries = nn.Parameter(
            torch.randn(num_memory_tokens, embed_dim) * 0.02
        )

        # Cross-attention: memory queries attend to sequence
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            batch_first=True,
            dropout=dropout
        )

        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim * 4, embed_dim),
            nn.Dropout(dropout),
        )

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(
        self,
        sequence: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            sequence: (batch, seq_len, embed_dim) - encoded sequence
            mask: (batch, seq_len) boolean mask (True = valid, False = padding)
        Returns:
            memory: (batch, num_memory_tokens, embed_dim). Rows whose mask is
            entirely False receive no attention contribution, so their memory
            depends only on the learned queries.
        """
        B = sequence.shape[0]

        # Expand memory queries for batch
        queries = self.memory_queries.unsqueeze(0).expand(B, -1, -1)  # (B, M, D)

        # nn.MultiheadAttention expects key_padding_mask where True = IGNORE
        key_padding_mask = None
        empty_rows = None
        if mask is not None:
            key_padding_mask = ~mask.bool()
            # A row with every key masked makes the attention softmax return
            # NaN. Unmask such rows for the attention call and discard their
            # attention output afterwards.
            empty_rows = key_padding_mask.all(dim=1)  # (B,)
            key_padding_mask = key_padding_mask & ~empty_rows.unsqueeze(1)

        attn_out, _ = self.cross_attn(
            query=queries,
            key=sequence,
            value=sequence,
            key_padding_mask=key_padding_mask
        )
        if empty_rows is not None:
            attn_out = attn_out.masked_fill(empty_rows.view(B, 1, 1), 0.0)

        memory = self.norm1(queries + attn_out)
        memory = self.norm2(memory + self.ffn(memory))

        return memory


class ShortTermAttention(nn.Module):
    """Standard self-attention block for short-term (recent) interactions.

    Uses standard causal multi-head attention — full expressiveness
    for the most recent K items where O(K²) is acceptable.

    Args:
        embed_dim: feature size of inputs and outputs.
        num_heads: attention heads per layer.
        num_layers: number of stacked encoder layers.
        dropout: dropout rate inside each encoder layer.
    """

    def __init__(self, embed_dim: int, num_heads: int = 2, num_layers: int = 2, dropout: float = 0.1):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            x: (batch, K, embed_dim) recent interactions
            mask: (batch, K) boolean mask (True = valid). Valid entries are
                expected first (right padding): with the causal mask, a padded
                position that precedes every valid one would have no key left
                to attend to.
        Returns:
            output: (batch, K, embed_dim). Rows whose mask is entirely False
            are returned as zeros.
        """
        T = x.shape[1]

        # Causal mask: True above the diagonal = position may not be attended.
        causal_mask = torch.triu(
            torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1
        )

        if mask is None:
            return self.encoder(x, mask=causal_mask)

        # Padding mask (True = ignore this key).
        padding = ~mask.bool()
        # A row with no valid key at all makes every attention softmax NaN.
        # Run such rows unmasked and blank their output afterwards.
        empty_rows = padding.all(dim=1)  # (B,)
        padding = padding & ~empty_rows.unsqueeze(1)

        output = self.encoder(
            x,
            mask=causal_mask,
            src_key_padding_mask=padding
        )
        return output.masked_fill(empty_rows.view(-1, 1, 1), 0.0)


class AdaptiveFusionGate(nn.Module):
    """Element-wise learned blend of a long-term and a short-term vector.

    A small MLP looks at the long-term vector, the short-term vector and the
    memory summary, and emits one mixing weight in (0, 1) per feature:

        output = w * long_term + (1 - w) * short_term
    """

    def __init__(self, embed_dim: int):
        super().__init__()
        # Linear -> GELU -> Linear -> Sigmoid, fed with the three concatenated inputs.
        layers = [
            nn.Linear(3 * embed_dim, embed_dim),
            nn.GELU(),
            nn.Linear(embed_dim, embed_dim),
            nn.Sigmoid(),
        ]
        self.gate = nn.Sequential(*layers)

    def forward(
        self,
        long_term: torch.Tensor,
        short_term: torch.Tensor,
        memory: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            long_term: (batch, embed_dim)
            short_term: (batch, embed_dim)
            memory: (batch, embed_dim) compressed memory summary; it only
                informs the mixing weight and is not part of the blend itself
        Returns:
            fused: (batch, embed_dim)
        """
        features = torch.cat((long_term, short_term, memory), dim=-1)
        weight = self.gate(features)
        long_part = weight * long_term
        short_part = (1 - weight) * short_term
        return long_part + short_part


class MARS(nn.Module):
    """
    MARS: Multi-scale Adaptive Recurrence with State compression

    Architecture:
        Input: Full user interaction sequence + timestamps
            |
            v
        [Item Embedding + Temporal Encoding]
            |
            +---- Long-term Branch (TADN layers, O(n))
            |         |
            |     [Compressive Memory] → memory tokens
            |         |
            +---- Short-term Branch (Self-Attention on recent K items)
            |
            v
        [Adaptive Fusion Gate]
            |
            v
        [Prediction Head] → next item scores

    Sequences are expected to be right-padded: the valid items of a row
    occupy positions 0 .. length-1.

    Args:
        num_items: number of unique items
        embed_dim: embedding dimension
        max_seq_len: maximum sequence length (can be very long, e.g. 2048);
            positions beyond it share the last position embedding
        short_term_len: number of recent items for short-term branch
        num_memory_tokens: number of compressive memory tokens
        num_tadn_layers: number of TADN layers in long-term branch
        num_attn_layers: number of attention layers in short-term branch
        num_heads: number of attention heads
        state_dim: state dimension for TADN
        dropout: dropout rate
    """

    def __init__(
        self,
        num_items: int,
        embed_dim: int = 64,
        max_seq_len: int = 512,
        short_term_len: int = 50,
        num_memory_tokens: int = 8,
        num_tadn_layers: int = 3,
        num_attn_layers: int = 2,
        num_heads: int = 2,
        state_dim: int = 64,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.num_items = num_items
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.short_term_len = short_term_len
        self.num_memory_tokens = num_memory_tokens

        # Item embeddings (0 = padding)
        self.item_embedding = nn.Embedding(num_items + 1, embed_dim, padding_idx=0)

        # Temporal encoding
        self.temporal_encoding = TemporalEncoding(embed_dim)

        # Learnable position encoding (shared by both branches)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)

        # Input processing
        self.input_norm = nn.LayerNorm(embed_dim)
        self.input_dropout = nn.Dropout(dropout)

        # Long-term branch: stack of TADN layers
        self.tadn_layers = nn.ModuleList([
            TADNLayer(embed_dim, state_dim, dropout)
            for _ in range(num_tadn_layers)
        ])

        # Compressive memory
        self.compressive_memory = CompressiveMemory(
            embed_dim, num_memory_tokens, num_heads
        )

        # Short-term branch: standard self-attention
        self.short_term_attn = ShortTermAttention(
            embed_dim, num_heads, num_attn_layers, dropout
        )

        # Adaptive fusion
        self.fusion_gate = AdaptiveFusionGate(embed_dim)

        # Output projection
        self.output_norm = nn.LayerNorm(embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize matrices with a truncated normal and biases with zeros.

        Only parameters whose name contains 'weight' (with >= 2 dims) or
        'bias' are touched; 1-D weights such as LayerNorm scales keep their
        defaults.
        """
        for name, param in self.named_parameters():
            if 'weight' in name and param.dim() >= 2:
                nn.init.trunc_normal_(param, std=0.02)
            elif 'bias' in name:
                nn.init.zeros_(param)

        # Special init for item embeddings
        nn.init.trunc_normal_(self.item_embedding.weight, std=0.02)
        nn.init.zeros_(self.item_embedding.weight[0])  # Padding = zero

    @property
    def item_embeddings(self):
        """Access item embedding table (for evaluation)."""
        return self.item_embedding

    def _embed(
        self,
        item_ids: torch.Tensor,
        timestamps: Optional[torch.Tensor]
    ) -> torch.Tensor:
        """Item + temporal + position embeddings, layer-normed. Returns (B, L, D)."""
        emb = self.item_embedding(item_ids)
        if timestamps is not None:
            emb = emb + self.temporal_encoding(timestamps.float())

        positions = torch.arange(item_ids.shape[1], device=item_ids.device).unsqueeze(0)
        # Sequences longer than the table reuse the last position embedding.
        positions = positions.clamp(max=self.max_seq_len - 1)
        return self.input_norm(emb + self.position_embedding(positions))

    @staticmethod
    def _recent_window(
        item_ids: torch.Tensor,
        timestamps: Optional[torch.Tensor],
        lengths: torch.Tensor,
        window: int
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        """Slice the last `window` valid items of every right-padded row.

        Args:
            item_ids: (B, T) item indices
            timestamps: (B, T) timestamps or None
            lengths: (B,) integer count of valid leading items per row
            window: width K of the returned window
        Returns:
            (ids, ts, mask), each (B, K) (`ts` is None without timestamps).
            Rows shorter than K are right-padded with zeros and mask=False.
        """
        T = item_ids.shape[1]
        offsets = torch.arange(window, device=item_ids.device).unsqueeze(0)  # (1, K)

        window_mask = offsets < lengths.clamp(max=window).unsqueeze(1)       # (B, K)
        start = (lengths - window).clamp(min=0).unsqueeze(1)                 # (B, 1)
        # start + offset never exceeds T - 1 when lengths <= T; the clamp only
        # guards the gather against inconsistent lengths.
        gather_idx = (start + offsets).clamp(max=max(T - 1, 0))              # (B, K)

        window_ids = item_ids.gather(1, gather_idx).masked_fill(~window_mask, 0)
        window_ts = None
        if timestamps is not None:
            window_ts = timestamps.gather(1, gather_idx).masked_fill(~window_mask, 0)
        return window_ids, window_ts, window_mask

    def encode(
        self,
        item_ids: torch.Tensor,
        timestamps: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Encode a full sequence into user representations.

        Args:
            item_ids: (batch, seq_len) item indices (0 = padding)
            timestamps: (batch, seq_len) timestamps in seconds
            mask: (batch, seq_len) boolean mask (True = valid); derived from
                `item_ids != 0` when omitted
        Returns:
            user_emb: (batch, embed_dim) final user representation
        """
        B, T = item_ids.shape
        batch_idx = torch.arange(B, device=item_ids.device)

        # Create mask from padding if not provided. Downstream modules invert
        # the mask with `~`, which needs a boolean tensor.
        if mask is None:
            mask = (item_ids != 0)
        else:
            mask = mask.bool()
        lengths = mask.sum(dim=1).long()  # (B,)

        # 1. Long-term Branch: TADN over full sequence
        long_term_repr = self.input_dropout(self._embed(item_ids, timestamps))
        for tadn in self.tadn_layers:
            long_term_repr = tadn(long_term_repr, timestamps, mask)

        # Compress long-term into memory tokens
        memory = self.compressive_memory(long_term_repr, mask)  # (B, M, D)
        memory_summary = memory.mean(dim=1)  # (B, D) - aggregated memory

        # Last valid long-term representation (index 0 for empty rows).
        long_term_last = long_term_repr[batch_idx, (lengths - 1).clamp(min=0)]  # (B, D)

        # 2. Short-term Branch: Attention on recent K items.
        # The window is gathered for the whole batch at once; a per-row
        # Python loop with `.item()` would force one host/device sync per row.
        K = min(self.short_term_len, T)
        short_item_ids, short_ts, short_mask = self._recent_window(
            item_ids, timestamps, lengths, K
        )

        # The window is re-embedded with positions 0..K-1 (no input dropout).
        short_emb = self._embed(short_item_ids, short_ts)
        short_term_repr = self.short_term_attn(short_emb, short_mask)

        # Last valid short-term representation
        short_lengths = short_mask.sum(dim=1).long()
        short_term_last = short_term_repr[
            batch_idx, (short_lengths - 1).clamp(min=0)
        ]  # (B, D)

        # 3. Adaptive Fusion
        user_emb = self.fusion_gate(long_term_last, short_term_last, memory_summary)
        user_emb = self.output_proj(self.output_norm(user_emb))

        return user_emb

    def forward(
        self,
        batch: Dict[str, torch.Tensor]
    ) -> torch.Tensor:
        """
        Forward pass; behavior depends on `self.training`.

        Expected batch format (flat tensors, matching Yambda convention):
            - item_ids: (batch, max_seq_len) padded item sequences
            - timestamps: (batch, max_seq_len) padded timestamps (optional)
            - mask: (batch, max_seq_len) boolean mask (optional)
            - positive_ids: (batch,) positive next items (training only)
            - negative_ids: (batch, num_neg) negative items (training only)

        Returns:
            Training mode: scalar BCE loss.
            Eval mode: (batch, embed_dim) user embeddings.
        """
        if self.training:
            return self._training_forward(batch)
        else:
            return self._eval_forward(batch)

    def _training_forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Compute training loss with next-item prediction."""
        item_ids = batch['item_ids']       # (B, T)
        timestamps = batch.get('timestamps')  # (B, T) or None
        mask = batch.get('mask')           # (B, T) or None
        pos_ids = batch['positive_ids']    # (B,)
        neg_ids = batch['negative_ids']    # (B, num_neg)

        # Encode user sequence
        user_emb = self.encode(item_ids, timestamps, mask)  # (B, D)

        # Score positive and negative items by dot product
        pos_emb = self.item_embedding(pos_ids)   # (B, D)
        neg_emb = self.item_embedding(neg_ids)   # (B, num_neg, D)

        pos_scores = (user_emb * pos_emb).sum(dim=-1)  # (B,)
        neg_scores = torch.einsum('bd,bnd->bn', user_emb, neg_emb)  # (B, num_neg)

        # Binary cross-entropy: positives towards 1, negatives towards 0.
        # Each term is a mean, so the positive and the negative set weigh the
        # same regardless of num_neg.
        pos_labels = torch.ones_like(pos_scores)
        neg_labels = torch.zeros_like(neg_scores)

        loss_pos = F.binary_cross_entropy_with_logits(pos_scores, pos_labels)
        loss_neg = F.binary_cross_entropy_with_logits(neg_scores, neg_labels)

        loss = loss_pos + loss_neg
        return loss

    def _eval_forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Eval forward: returns user embeddings."""
        item_ids = batch['item_ids']
        timestamps = batch.get('timestamps')
        mask = batch.get('mask')

        user_emb = self.encode(item_ids, timestamps, mask)
        return user_emb


class SASRecBaseline(nn.Module):
    """
    Standard SASRec baseline for comparison.
    Uses causal self-attention (O(n²) complexity).

    Sequences are expected to be right-padded (valid items first).

    Args:
        num_items: number of unique items (index 0 is reserved for padding)
        embed_dim: embedding dimension
        max_seq_len: size of the position table; longer inputs reuse the
            last position embedding
        num_heads: number of attention heads
        num_layers: number of encoder layers
        dropout: dropout rate
    """

    def __init__(
        self,
        num_items: int,
        embed_dim: int = 64,
        max_seq_len: int = 200,
        num_heads: int = 2,
        num_layers: int = 2,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.num_items = num_items
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        self.item_embedding = nn.Embedding(num_items + 1, embed_dim, padding_idx=0)
        self.position_embedding = nn.Embedding(max_seq_len, embed_dim)

        self.input_norm = nn.LayerNorm(embed_dim)
        self.input_dropout = nn.Dropout(dropout)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.output_norm = nn.LayerNorm(embed_dim)

        self._init_weights()

    def _init_weights(self):
        """Truncated-normal init for matrices, zeros for biases and the padding row."""
        for name, param in self.named_parameters():
            if 'weight' in name and param.dim() >= 2:
                nn.init.trunc_normal_(param, std=0.02)
            elif 'bias' in name:
                nn.init.zeros_(param)
        # Padding embedding = zero. Done under no_grad because the row is a
        # view of a parameter that requires grad.
        with torch.no_grad():
            self.item_embedding.weight[0].zero_()

    @property
    def item_embeddings(self):
        """Access item embedding table (for evaluation)."""
        return self.item_embedding

    def encode(self, item_ids, timestamps=None, mask=None):
        """Encode sequences into user representations.

        Args:
            item_ids: (batch, seq_len) item indices (0 = padding)
            timestamps: accepted for interface parity with MARS; unused
            mask: (batch, seq_len) boolean mask (True = valid); derived from
                `item_ids != 0` when omitted
        Returns:
            user_emb: (batch, embed_dim). Rows without any valid item are
            returned as zeros.
        """
        B, T = item_ids.shape
        device = item_ids.device
        if mask is None:
            mask = (item_ids != 0)
        else:
            mask = mask.bool()  # `~mask` below needs a boolean tensor

        positions = torch.arange(T, device=device).unsqueeze(0)
        positions = positions.clamp(max=self.max_seq_len - 1)
        item_emb = self.item_embedding(item_ids) + self.position_embedding(positions)
        item_emb = self.input_dropout(self.input_norm(item_emb))

        causal_mask = torch.triu(
            torch.ones(T, T, device=device, dtype=torch.bool), diagonal=1
        )
        src_key_padding_mask = ~mask
        # A row with every key masked turns the attention softmax into NaN.
        # Run such rows unmasked and blank their embedding afterwards.
        empty_rows = src_key_padding_mask.all(dim=1)  # (B,)
        src_key_padding_mask = src_key_padding_mask & ~empty_rows.unsqueeze(1)

        output = self.encoder(
            item_emb, mask=causal_mask, src_key_padding_mask=src_key_padding_mask
        )

        # Representation at the last valid position of each row.
        lengths = mask.sum(dim=1).long()
        user_emb = output[torch.arange(B, device=device), (lengths - 1).clamp(min=0)]
        user_emb = self.output_norm(user_emb)

        return user_emb.masked_fill(empty_rows.unsqueeze(1), 0.0)

    def forward(self, batch):
        """Return the scalar BCE loss in training mode, user embeddings otherwise.

        Expected keys: 'item_ids'; optional 'timestamps' and 'mask'; in
        training mode also 'positive_ids' (batch,) and 'negative_ids'
        (batch, num_neg).
        """
        if self.training:
            item_ids = batch['item_ids']
            timestamps = batch.get('timestamps')
            mask = batch.get('mask')
            pos_ids = batch['positive_ids']
            neg_ids = batch['negative_ids']

            user_emb = self.encode(item_ids, timestamps, mask)
            pos_emb = self.item_embedding(pos_ids)
            neg_emb = self.item_embedding(neg_ids)

            # Dot-product scores against the positive and the negative items.
            pos_scores = (user_emb * pos_emb).sum(dim=-1)
            neg_scores = torch.einsum('bd,bnd->bn', user_emb, neg_emb)

            loss_pos = F.binary_cross_entropy_with_logits(pos_scores, torch.ones_like(pos_scores))
            loss_neg = F.binary_cross_entropy_with_logits(neg_scores, torch.zeros_like(neg_scores))

            return loss_pos + loss_neg
        else:
            return self.encode(batch['item_ids'], batch.get('timestamps'), batch.get('mask'))