inwaves
/

os-solu

Model card Files Files and versions

xet

Community

inwaves committed on Sep 7, 2022

Commit

0b6a10a

1 Parent(s): 2896dec

Implemented unidirectional attention, moving on

Browse files

Files changed (2) hide show

model.py +67 -16
utils.py +10 -8

model.py CHANGED Viewed

@@ -3,46 +3,97 @@ import torch.nn as nn
 import torch.functional as F
 import torch.optim as optim
 import wandb
-import fancy_einsum
 from einops import rearrange, repeat, reduce
 class OsSoluModel(nn.Module):
-    def __init__(self, config) -> None:
         super().__init__()
         self.config = config
         self.transformer_block = TransformerBlock(config)
     def forward(self, x: t.Tensor) -> t.Tensor:
-        pass
 class TransformerBlock(nn.Module):
-    def __init__(self, config) -> None:
-        super().__init__()
         self.config = config
-        self.embed = nn.Embedding(config.num_embeddings, config.d_model)
         self.linear = nn.Sequential(
             nn.Linear(config.d_model, config.d_model),
             SoLU(),
         )
-        self.layer_norm = nn.LayerNorm(normalized_shape)
         self.unembed = nn.Embedding(config.num_embeddings, config.d_model)
     def forward(self, x: t.Tensor) -> t.Tensor:
         pass
-class RotaryAttention(nn.Module):
-    def __init__(self, config) -> None:
         super().__init__()
-    def forward(self, x: t.Tensor, attention_mask: t.Tensor) -> t.Tensor:
-        # Compute pre-softmax attention scores
-        # Apply attention mask
-        # Compute softmax
-        # Apply final einsum
-        # Return attention output
-        pass

 import torch.functional as F
 import torch.optim as optim
 import wandb
+import fancy_einsum as einsum
 from einops import rearrange, repeat, reduce
+from utils import OsSoluConfig
 class OsSoluModel(nn.Module):
+    """Model wrapper owning the embeddings, the transformer block and the output layers.
+
+    Only the embedding stage is wired into ``forward`` so far; the transformer
+    block, final layer norm and unembedding are constructed but not yet applied.
+    """
+
+    def __init__(self, config: OsSoluConfig) -> None:
+        """Build the sub-modules from ``config``.
+
+        Args:
+            config: Hyperparameters. Reads ``d_model``, ``vocab_size``,
+                ``max_positional_embeddings`` and ``ln_eps``.
+        """
         super().__init__()
         self.config = config
+        self.embed_positions = nn.Embedding(config.max_positional_embeddings, config.d_model)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
         self.transformer_block = TransformerBlock(config)
+        # Layer norm runs over the hidden dimension. The previous code referenced an
+        # undefined name (`normalized_shape`) here and raised NameError.
+        self.final_ln = nn.LayerNorm(config.d_model, config.ln_eps)
+        # NOTE(review): the original statement was left unfinished (`self.unembed = nn`).
+        # A d_model -> vocab_size projection is assumed here -- confirm the intended design.
+        self.unembed = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
     def forward(self, x: t.Tensor) -> t.Tensor:
+        """Embed a batch of token ids.
+
+        Args:
+            x: Integer token ids; dimension 1 is treated as the sequence axis.
+
+        Returns:
+            The sum of the token embeddings and the learned positional embeddings.
+        """
+        # Create the position ids on the input's device so the lookup also works off-CPU.
+        positions = t.arange(x.size(1), device=x.device)
+        positional_embeddings = self.embed_positions(positions)
+        token_embeddings = self.embed_tokens(x)
+        embeddings = positional_embeddings + token_embeddings
+        # TODO: apply transformer_block, final_ln and unembed once the block's forward pass exists.
+        return embeddings
 class TransformerBlock(nn.Module):
+    """Container for one block's attention layer, SoLU feed-forward layer and layer norm.
+
+    The forward pass is not implemented yet.
+    """
+
+    def __init__(self, config: OsSoluConfig) -> None:
+        """Build the block's sub-modules.
+
+        Args:
+            config: Hyperparameters. Reads ``self_attention_type``, ``d_model``,
+                ``ln_eps`` and ``num_embeddings``.
+        """
+        super().__init__()
         self.config = config
+        # Any value other than "unidirectional" falls back to rotary attention.
+        if config.self_attention_type == "unidirectional":
+            self.attention = UnidirectionalAttention(config)
+        else:
+            self.attention = RotaryAttention(config)
         self.linear = nn.Sequential(
             nn.Linear(config.d_model, config.d_model),
             SoLU(),
         )
+        # Normalise over the hidden dimension. The previous code passed an undefined
+        # name (`normalized_shape`) and raised NameError on construction.
+        self.layer_norm = nn.LayerNorm(config.d_model, config.ln_eps)
         self.unembed = nn.Embedding(config.num_embeddings, config.d_model)
     def forward(self, x: t.Tensor) -> t.Tensor:
+        """Placeholder: currently does nothing and returns ``None``."""
         pass
+class UnidirectionalAttention(nn.Module):
+    """Multi-head self-attention in which each position attends only to itself and earlier positions."""
+
+    def __init__(self, config: OsSoluConfig) -> None:
+        """Create the query/key/value/output projections.
+
+        Args:
+            config: Hyperparameters. Reads ``num_heads`` and ``d_model``;
+                ``d_model`` must be divisible by ``num_heads`` for the head split to work.
+        """
         super().__init__()
+        self.num_heads = config.num_heads
+        self.d_model = config.d_model
+        # The input to this layer is d_model wide (forward() unpacks it as the hidden
+        # size), so the projections map d_model -> d_model. They previously used
+        # config.num_embeddings as the input width, which does not match the input.
+        self.project_q = nn.Linear(config.d_model, config.d_model)
+        self.project_k = nn.Linear(config.d_model, config.d_model)
+        self.project_v = nn.Linear(config.d_model, config.d_model)
+        self.project_out = nn.Linear(config.d_model, config.d_model)
+        # Score written into masked positions so they get ~0 weight after softmax.
+        self.LARGE_NEGATIVE_VALUE = -1e5
+
+    def hidden_to_heads(self, tensor: t.Tensor) -> t.Tensor:
+        """Split the last axis into heads: (b, s, nh * hs) -> (b, nh, s, hs)."""
+        return rearrange(tensor, "b s (nh hs) -> b nh s hs", nh=self.num_heads)
+
+    def compute_pre_softmax_attn_pattern(self, x: t.Tensor) -> t.Tensor:
+        """Return the raw query-key dot products (unscaled, unmasked).
+
+        Args:
+            x: Input of shape (batch, seqlen, d_model).
+
+        Returns:
+            Tensor of shape (batch, num_heads, seqlen_q, seqlen_k).
+        """
+        Q = self.hidden_to_heads(self.project_q(x))
+        K = self.hidden_to_heads(self.project_k(x))
+        # `einsum` is bound to the fancy_einsum *module* by the file's import, so the
+        # function is reached as einsum.einsum; the operands must be passed explicitly.
+        attention_pattern = einsum.einsum(
+            "batch num_heads seqlen_q head_size, batch num_heads seqlen_k head_size -> batch num_heads seqlen_q seqlen_k",
+            Q,
+            K,
+        )
+        return attention_pattern
+
+    def forward(self, x: t.Tensor) -> t.Tensor:
+        """Apply causal multi-head self-attention.
+
+        Args:
+            x: Input of shape (batch, seqlen, d_model).
+
+        Returns:
+            Tensor of shape (batch, seqlen, d_model).
+        """
+        _, seqlen, _ = x.shape
+        head_size = self.d_model // self.num_heads
+
+        # Scale the scores by sqrt(head_size) before masking and softmax.
+        attention_pattern = self.compute_pre_softmax_attn_pattern(x) / head_size ** 0.5
+
+        # Mask out future tokens: entry [q, k] is True when key k comes after query q.
+        # The positions are created on the input's device (nn.Module has no `device`).
+        positions = t.arange(seqlen, device=x.device)
+        future_mask = positions.unsqueeze(1) < positions.unsqueeze(0)
+        attention_pattern = attention_pattern.masked_fill(future_mask, self.LARGE_NEGATIVE_VALUE)
+
+        attention_score = attention_pattern.softmax(dim=-1)
+        V = self.hidden_to_heads(self.project_v(x))
+        out = einsum.einsum(
+            "batch num_heads seqlen_q seqlen_k, batch num_heads seqlen_k head_size -> batch num_heads seqlen_q head_size",
+            attention_score,
+            V,
+        )
+        # Merge the heads back into a single hidden axis before the output projection.
+        out = rearrange(out, "b nh s hs -> b s (nh hs)")
+        return self.project_out(out)
+class RotaryAttention(nn.Module):
+    """Placeholder for rotary attention; only stores the config for now."""
+
+    def __init__(self, config: OsSoluConfig) -> None:
+        """Keep a reference to ``config`` for the eventual implementation."""
+        super().__init__()
+        self.config = config
+
+    def forward(self, x: t.Tensor) -> t.Tensor:
+        """Not implemented yet: does nothing and returns ``None``."""
+        return None

utils.py CHANGED Viewed

@@ -1,10 +1,12 @@
 @dataclass
 class OsSoluConfig:
-    d_model: int = 512                  # Hidden size of the model.
-    vocab_size: int = 65536             # Vocabulary size of the input sequence. Unsure about this.
-    learning_rate: float = 1e-3         # Learning rate for the optimiser.
-    num_embeddings: int = 1024          # Number of embeddings. Unsure about this.
-    num_blocks: int = 1                 # Number of transformer blocks.
-    dropout: float = 0.1                # Probability of dropout.
-    ln_eps: float = 1e-3                # Layer norm epsilon.
-    num_heads: int = 4                  # Number of attention heads in each attention layer.

 @dataclass
 class OsSoluConfig:
+    d_model: int = 512                              # Hidden size of the model.
+    vocab_size: int = 65536                         # Vocabulary size of the input sequence. Unsure about this.
+    learning_rate: float = 1e-3                     # Learning rate for the optimiser.
+    num_embeddings: int = 1024                      # Number of embeddings. Unsure about this.
+    num_blocks: int = 1                             # Number of transformer blocks.
+    dropout: float = 0.1                            # Probability of dropout.
+    ln_eps: float = 1e-3                            # Layer norm epsilon.
+    num_heads: int = 4                              # Number of attention heads in each attention layer.
+    self_attention_type: str = "unidirectional"     # What type of attention to use: rotary or unidirectional.
+    max_positional_embeddings: int = 1024           # Maximum number of positional embeddings.