Update encoder.py

encoder.py  +27 -59
@@ -203,10 +203,8 @@ class MultiHeadAttention(nn.Module, ABC):
         return self.linear_out(x)
 
 
+
 class RelPositionMultiHeadAttention(MultiHeadAttention):
-    """
-    Relative Position Multi-Head Attention module.
-    """
 
     def __init__(self, n_head: int, n_feat: int):
         super().__init__(n_head, n_feat)
@@ -214,19 +212,20 @@ class RelPositionMultiHeadAttention(MultiHeadAttention):
         self.pos_bias_u = nn.Parameter(torch.FloatTensor(self.h, self.d_k))
         self.pos_bias_v = nn.Parameter(torch.FloatTensor(self.h, self.d_k))
 
-
+    @staticmethod
+    def rel_shift(x: Tensor) -> Tensor:
         b, h, qlen, pos_len = x.size()
         x = torch.nn.functional.pad(x, pad=(1, 0))
         x = x.view(b, h, -1, qlen)
         return x[:, :, 1:].view(b, h, qlen, pos_len)
 
     def forward(
-
-
-
-
-
-
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        pos_emb: Tensor,
+        mask: Optional[Tensor] = None,
     ) -> Tensor:
         q, k, v = self.forward_qkv(query, key, value)
         q = q.transpose(1, 2)
@@ -243,17 +242,14 @@ class RelPositionMultiHeadAttention(MultiHeadAttention):
 
 
 class RotaryPositionMultiHeadAttention(MultiHeadAttention):
-    """
-    Rotary Position Multi-Head Attention module.
-    """
 
     def forward(
-
-
-
-
-
-
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        pos_emb: List[Tensor],
+        mask: Optional[Tensor] = None,
     ) -> Tensor:
         b, t, _ = value.size()
         query = query.transpose(0, 1).view(t, b, self.h, self.d_k)
@@ -269,25 +265,13 @@ class RotaryPositionMultiHeadAttention(MultiHeadAttention):
             value.view(t, b, self.h * self.d_k).transpose(0, 1),
         )
 
-
-        scores = torch.matmul(q, k.transpose(-2, -1) / math.sqrt(self.d_k))
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
         out = self.forward_attention(v, scores, mask)
-        # else:
-        #     if mask is None:
-        #         scores = flash_attn_func(q, k, v)
-        #     else:
-        #         scores = apply_masked_flash_attn(q, k, v, mask, self.h, self.d_k)
-
-        #     scores = scores.view(b, -1, self.h * self.d_k)
-        #     out = self.linear_out(scores)
 
         return out
 
 
 class PositionalEncoding(nn.Module, ABC):
-    """
-    Base class of Positional Encodings.
-    """
 
     def __init__(self, dim: int, base: int):
         super().__init__()
@@ -295,14 +279,11 @@ class PositionalEncoding(nn.Module, ABC):
         self.base = base
 
     @abstractmethod
-    def create_pe(self, length: int
+    def create_pe(self, length: int) -> Optional[Tensor]:
         pass
 
-    def extend_pe(self, length: int
-        """
-        Extends the positional encoding buffer to process longer sequences.
-        """
-        pe = self.create_pe(length, device)
+    def extend_pe(self, length: int):
+        pe = self.create_pe(length)
         if pe is None:
             return
         if hasattr(self, "pe"):
@@ -312,17 +293,10 @@ class PositionalEncoding(nn.Module, ABC):
 
 
 class RelPositionalEmbedding(PositionalEncoding):
-    """
-    Relative Positional Embedding module.
-    """
-
-    def create_pe(self, length: int
-        """
-        Creates the relative positional encoding matrix.
-        """
+    def create_pe(self, length: int) -> Optional[Tensor]:
         if hasattr(self, "pe") and self.pe.shape[1] >= 2 * length - 1:
             return None
-        positions = torch.arange(length - 1, -length, -1
+        positions = torch.arange(length - 1, -length, -1).unsqueeze(1)
         pos_length = positions.size(0)
         pe = torch.zeros(pos_length, self.dim, device=positions.device)
         div_term = torch.exp(
@@ -342,29 +316,23 @@ class RelPositionalEmbedding(PositionalEncoding):
 
 
 class RotaryPositionalEmbedding(PositionalEncoding):
-    """
-    Rotary Positional Embedding module.
-    """
 
-    def create_pe(self, length: int
-        """
-        Creates or extends the rotary positional encoding matrix.
-        """
+    def create_pe(self, length: int) -> Optional[Tensor]:
         if hasattr(self, "pe") and self.pe.size(0) >= 2 * length:
             return None
-        positions = torch.arange(0, length, dtype=torch.float32
+        positions = torch.arange(0, length, dtype=torch.float32)
         inv_freq = 1.0 / (
-
+            self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)
        )
-        t = torch.arange(length, device=positions.device
+        t = torch.arange(length, device=positions.device).type_as(inv_freq)
         freqs = torch.einsum("i,j->ij", t, inv_freq)
         emb = torch.cat((freqs, freqs), dim=-1).to(positions.device)
         return torch.cat([emb.cos()[:, None, None, :], emb.sin()[:, None, None, :]])
 
     def forward(self, x: torch.Tensor) -> Tuple[Tensor, List[Tensor]]:
-        cos_emb = self.pe[0
+        cos_emb = self.pe[0: x.shape[1]]
         half_pe = self.pe.shape[0] // 2
-        sin_emb = self.pe[half_pe
+        sin_emb = self.pe[half_pe: half_pe + x.shape[1]]
         return x, [cos_emb, sin_emb]
 
 
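The `rel_shift` body is unchanged by this commit; the hunk only turns it into a typed `@staticmethod`. As a quick illustration of what the pad/reshape/slice sequence does, here is a minimal standalone sketch; the tensor shapes are made up for the example and are not taken from encoder.py.

import torch
from torch import Tensor


def rel_shift(x: Tensor) -> Tensor:
    # Same pad -> reshape -> drop-first-row -> reshape trick as in the diff:
    # successive query rows end up offset by one column, which aligns the
    # (2 * qlen - 1)-wide relative-position axis per query.
    b, h, qlen, pos_len = x.size()
    x = torch.nn.functional.pad(x, pad=(1, 0))
    x = x.view(b, h, -1, qlen)
    return x[:, :, 1:].view(b, h, qlen, pos_len)


# Illustrative shapes only: batch 2, 4 heads, 3 queries, 2 * 3 - 1 = 5 relative offsets.
scores = torch.randn(2, 4, 3, 5)
print(rel_shift(scores).shape)  # torch.Size([2, 4, 3, 5])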
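The updated `RotaryPositionMultiHeadAttention.forward` signature takes `pos_emb: List[Tensor]`, i.e. the `[cos_emb, sin_emb]` pair returned by `RotaryPositionalEmbedding.forward`, but the lines that actually consume that pair fall outside these hunks. A common way such a pair is applied is the rotate-half formulation sketched below; the helpers `rotate_half` and `apply_rotary` are illustrative names, not functions confirmed to exist in encoder.py.

import torch
from torch import Tensor


def rotate_half(x: Tensor) -> Tensor:
    # Swap the two halves of the last dimension with a sign flip.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary(q: Tensor, k: Tensor, cos: Tensor, sin: Tensor):
    # q, k: (seq, batch, heads, d_k); cos, sin broadcast from (seq, 1, 1, d_k),
    # matching the shapes produced by the rotary create_pe in the diff.
    q_rot = (q * cos) + (rotate_half(q) * sin)
    k_rot = (k * cos) + (rotate_half(k) * sin)
    return q_rot, k_rot


# Illustrative shapes: seq 6, batch 2, 4 heads, head dim 8.
q = torch.randn(6, 2, 4, 8)
k = torch.randn(6, 2, 4, 8)
cos = torch.randn(6, 1, 1, 8)
sin = torch.randn(6, 1, 1, 8)
q_rot, k_rot = apply_rotary(q, k, cos, sin)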
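The new cache check `self.pe.size(0) >= 2 * length` and the slicing added in `RotaryPositionalEmbedding.forward` both rely on the buffer layout that `create_pe` builds: cosine rows stacked on top of sine rows along dim 0, each shaped for broadcasting. The standalone re-derivation below follows the same formula as the added lines; `dim`, `base`, and `length` are made-up example values.

import torch

dim, base, length = 8, 10000, 6

# Mirrors the rotary create_pe in the diff: per-pair inverse frequencies,
# outer product with positions, then cos and sin stacked along dim 0.
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(length).type_as(inv_freq)
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
pe = torch.cat([emb.cos()[:, None, None, :], emb.sin()[:, None, None, :]])

print(pe.shape)  # torch.Size([12, 1, 1, 8]); pe.size(0) == 2 * length
cos_emb = pe[0:length]           # top half, as pe[0: x.shape[1]] in forward
sin_emb = pe[length:2 * length]  # bottom half, as pe[half_pe: half_pe + x.shape[1]]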