import math

import torch
import torch.nn as nn
import torch.nn.functional as F

class PhaseFormerTransformerLayer(nn.Module):
    """
    Transformer layer with phase-based temporal gating applied
    to the attention and feed-forward residual paths.

    Args:
        d_model (int): Input/output dimension.
        nhead (int): Number of attention heads.
        dim_feedforward (int): FFN hidden layer size.
        dropout (float): Dropout probability.
        decay_rate (float): Decay coefficient lambda in the temporal
            envelope D(t) = exp(-lambda * t).
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, decay_rate=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.decay_rate = decay_rate
        # Projects each token to a per-dimension phase used by the temporal gate.
        self.phase_proj = nn.Linear(d_model, d_model)

    def forward(self, src, t: float):
        # Temporal decay envelope D(t) = exp(-lambda * t).
        D_t = math.exp(-self.decay_rate * t)
        # Per-token, per-dimension phase; the gate is a decayed sinusoid of it.
        phase = self.phase_proj(src)
        g = D_t * torch.sin(phase)
        # Self-attention, with its residual contribution modulated by the gate.
        attn_out, _ = self.self_attn(src, src, src)
        src2 = self.norm1(src + g * attn_out)
        # Feed-forward block, gated the same way on its residual path.
        ff = self.linear2(self.dropout(F.relu(self.linear1(src2))))
        return self.norm2(src2 + g * ff)
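

# Minimal usage sketch (not from the original source): the hyperparameters,
# tensor shapes, and the time value t below are illustrative assumptions.
# Inputs follow the default (seq_len, batch, d_model) layout expected by
# nn.MultiheadAttention, since batch_first is not set in the layer above.
if __name__ == "__main__":
    layer = PhaseFormerTransformerLayer(d_model=512, nhead=8)
    src = torch.randn(16, 4, 512)   # (seq_len, batch, d_model)
    out = layer(src, t=3.0)         # t is the elapsed time fed to the decay gate
    print(out.shape)                # torch.Size([16, 4, 512])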