Abner0803
/

Transformer-RPB

Model card Files Files and versions

Abner0803 committed on Jan 14

Commit

113dff4

·

verified ·

1 Parent(s): 011fa3c

Update README.md

Files changed (1) hide show

README.md +81 -0

README.md CHANGED Viewed

@@ -152,6 +152,87 @@ class BaseTransformerComp(nn.Module):
         return mask
 ```
 ### RPB Components
 ```python

         return mask
 ```
+### Transformer Encoder Layer with RPB
+```python
+class TransformerEncoderLayerWithRPB(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        nhead: int,
+        dim_feedforward: int,
+        dropout: float,
+        rbp,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.nhead = nhead
+        self.rbp = rbp
+        # QKV projections
+        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
+        self.out_proj = nn.Linear(d_model, d_model)
+        # FFN layers
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        # Normalization and dropout
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.activation = F.relu
+    def forward(
+        self,
+        src: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        src_key_padding_mask: Optional[torch.Tensor] = None,
+        is_causal: bool = False,
+    ) -> torch.Tensor:
+        seq_len, batch_size, d_model = src.shape
+        head_dim = d_model // self.nhead
+        qkv = self.qkv_proj(src)
+        q, k, v = qkv.chunk(3, dim=-1)
+        q = q.reshape(seq_len, batch_size, self.nhead, head_dim).permute(1, 2, 0, 3)
+        k = k.reshape(seq_len, batch_size, self.nhead, head_dim).permute(1, 2, 0, 3)
+        v = v.reshape(seq_len, batch_size, self.nhead, head_dim).permute(1, 2, 0, 3)
+        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)
+        # Add RBP after QK^T
+        rbp_bias = self.rbp(
+            seq_len, seq_len, device=src.device
+        )  # [nhead, seq_len, seq_len]
+        attn_weights = attn_weights + rbp_bias.unsqueeze(
+            0
+        )  # [batch, nhead, seq_len, seq_len]
+        if src_mask is not None:
+            attn_weights = attn_weights + src_mask.unsqueeze(0).unsqueeze(0)
+        if src_key_padding_mask is not None:
+            attn_weights = attn_weights.masked_fill(
+                src_key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf")
+            )
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = self.dropout1(attn_weights)
+        attn_output = torch.matmul(attn_weights, v)  # [batch, nhead, seq_len, head_dim]
+        attn_output = attn_output.permute(2, 0, 1, 3).reshape(
+            seq_len, batch_size, d_model
+        )
+        attn_output = self.out_proj(attn_output)
+        src2 = src + self.dropout1(attn_output)
+        src2 = self.norm1(src2)
+        ffn_output = self.linear2(self.dropout(self.activation(self.linear1(src2))))
+        src3 = src2 + self.dropout2(ffn_output)
+        src3 = self.norm2(src3)
+        return src3
+```
 ### RPB Components
 ```python