Add trained transformer model

Browse files

Files changed (15) hide show

__init__.py +0 -0
main.py +32 -0
model/__init__.py +0 -0
model/__pycache__/__init__.cpython-310.pyc +0 -0
model/__pycache__/decoder.cpython-310.pyc +0 -0
model/__pycache__/encoder.cpython-310.pyc +0 -0
model/__pycache__/sublayers.cpython-310.pyc +0 -0
model/__pycache__/transformer.cpython-310.pyc +0 -0
model/decoder.py +135 -0
model/encoder.py +87 -0
model/sublayers.py +194 -0
model/transformer.py +205 -0
params.json +1 -0
trained_model/transformer-model.pt → pytorch_transformer_model.pt +2 -2
vocab.pt +3 -0

__init__.py ADDED Viewed

File without changes

main.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# torch packages
+import torch
+from model.transformer import Transformer
+import json
if __name__ == "__main__":
    # Rebuild the Transformer described by params.json (Multi30K settings)
    # and restore its trained weights.
    with open('params.json') as json_data:
        config = json.load(json_data)
    print(config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Keyword arguments keep every value bound to the intended parameter:
    # Transformer.__init__ takes src_pad_idx and target_pad_idx between
    # num_decoders and dim_multiplier, so a positional call that omits them
    # silently feeds dim_multiplier/pdropout in as padding indices.
    model = Transformer(
        dk=config["dk"],
        dv=config["dv"],
        h=config["h"],
        src_vocab_size=config["src_vocab_size"],
        target_vocab_size=config["target_vocab_size"],
        num_encoders=config["num_encoders"],
        num_decoders=config["num_decoders"],
        src_pad_idx=config["src_pad_idx"],
        target_pad_idx=config["target_pad_idx"],
        dim_multiplier=config["dim_multiplier"],
        pdropout=config["pdropout"],
        device=device)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load('pytorch_transformer_model.pt',
                            map_location=device)
    model.load_state_dict(state_dict)
    # The module itself was built on the CPU; move it to the selected device.
    model.to(device)
    print(model)

model/__init__.py ADDED Viewed

File without changes

model/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (169 Bytes). View file

model/__pycache__/decoder.cpython-310.pyc ADDED Viewed

Binary file (3.65 kB). View file

model/__pycache__/encoder.cpython-310.pyc ADDED Viewed

Binary file (2.76 kB). View file

model/__pycache__/sublayers.cpython-310.pyc ADDED Viewed

Binary file (6.17 kB). View file

model/__pycache__/transformer.cpython-310.pyc ADDED Viewed

Binary file (4.4 kB). View file

model/decoder.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import math
+import copy
+import time
+import random
+import spacy
+import numpy as np
+import os
+# torch packages
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+import torch.optim as optim
+from model.sublayers import (
+                        MultiHeadAttention,
+                        PositionalEncoding,
+                        PositionwiseFeedForward,
+                        Embedding)
class DecoderLayer(nn.Module):
    """
    Single decoder block made of three sub-layers:
    1. masked multi-head self-attention over the target sequence
    2. multi-head attention over the encoder output
    3. position-wise feed-forward network
    Every sub-layer output goes through dropout, is added to the sub-layer
    input (residual connection) and is then layer-normalised.
    """
    def __init__(
                self,
                dk,
                dv,
                h,
                dim_multiplier = 4,
                pdropout = 0.1):
        super().__init__()
        # Model width is heads * key size (paper page 5, chapter 3.2.2);
        # the feed-forward hidden width is a multiple of it (chapter 3.3).
        model_dim = dk * h
        hidden_dim = model_dim * dim_multiplier
        # Sub-layer 1: masked self-attention + its normalisation
        self.masked_attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.masked_attn_norm = nn.LayerNorm(model_dim)
        # Sub-layer 2: attention over the encoder output + its normalisation
        self.attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.attn_norm = nn.LayerNorm(model_dim)
        # Sub-layer 3: feed-forward network + its normalisation
        self.ff = PositionwiseFeedForward(model_dim, hidden_dim, pdropout=pdropout)
        self.ff_norm = nn.LayerNorm(model_dim)
        # One dropout module reused on every sub-layer output
        self.dropout = nn.Dropout(p = pdropout)

    def forward(self,
                trg: Tensor,
                src: Tensor,
                trg_mask: Tensor,
                src_mask: Tensor):
        """
        Args:
            trg:          embedded target sequences   (batch_size, trg_seq_length, d_model)
            src:          encoder output              (batch_size, src_seq_length, d_model)
            trg_mask:     mask for the target         (batch_size, 1, trg_seq_length, trg_seq_length)
            src_mask:     mask for the source         (batch_size, 1, 1, src_seq_length)
        Returns:
            trg:          target representation after the three sub-layers
                          (batch_size, trg_seq_length, d_model)
            attn_probs:   softmax scores of the encoder-decoder attention
                          (batch_size, n_heads, trg_seq_length, src_seq_length)
        """
        # Masked self-attention; its attention scores are not returned.
        self_attended, _ = self.masked_attention(
                                query = trg,
                                key = trg,
                                val = trg,
                                mask = trg_mask)
        # Dropout -> residual -> LayerNorm (paper page 7, chapter 5.4)
        trg = self.masked_attn_norm(trg + self.dropout(self_attended))
        # Encoder-decoder attention: queries come from the decoder, keys and
        # values from the encoder output (paper page 5, chapter 3.2.3 point 1).
        cross_attended, attn_probs = self.attention(
                                query = trg,
                                key = src,
                                val = src,
                                mask = src_mask)
        trg = self.attn_norm(trg + self.dropout(cross_attended))
        # Position-wise feed-forward network followed by the last Add & Norm
        fed_forward = self.ff(trg)
        trg = self.ff_norm(trg + self.dropout(fed_forward))
        return trg, attn_probs
class Decoder(nn.Module):
    """
    Stack of `num_decoders` DecoderLayer blocks applied one after another.

    After every forward pass the attention probabilities returned by the
    last layer are kept in `self.attn_probs` (None when the stack is empty).
    """
    def __init__(
                self,
                dk,
                dv,
                h,
                num_decoders,
                dim_multiplier = 4,
                pdropout=0.1):
        """
        dk(int), dv(int), h(int): attention sizes forwarded to each DecoderLayer
        num_decoders(int): number of stacked decoder layers
        dim_multiplier(int): feed-forward width multiplier forwarded to each layer
        pdropout(float): dropout probability forwarded to each layer
        """
        super().__init__()
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(dk,
                         dv,
                         h,
                         dim_multiplier,
                         pdropout) for _ in range(num_decoders)
        ])
        # Defined up front so the attribute exists before the first forward pass.
        self.attn_probs = None

    def forward(self, target_inputs, src_inputs, target_mask, src_mask):
        """
        Input from the Embedding layer
        target_inputs = embedded sequences    (batch_size, trg_seq_length, d_model)
        src_inputs = embedded sequences       (batch_size, src_seq_length, d_model)
        target_mask = mask for the sequences  (batch_size, 1, trg_seq_length, trg_seq_length)
        src_mask = mask for the sequences     (batch_size, 1, 1, src_seq_length)

        Returns the target representation produced by the last layer; with an
        empty stack the input is returned unchanged.
        """
        target_representation = target_inputs
        # Start from None so an empty stack does not hit an unbound local.
        attn_probs = None
        # Forward pass through decoder stack
        for layer in self.decoder_layers:
            target_representation, attn_probs = layer(
                                    target_representation,
                                    src_inputs,
                                    target_mask,
                                    src_mask)
        self.attn_probs = attn_probs
        return target_representation

model/encoder.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import math
+import copy
+import time
+import random
+import spacy
+import numpy as np
+import os
+# torch packages
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+import torch.optim as optim
+from model.sublayers import (
+                        MultiHeadAttention,
+                        PositionalEncoding,
+                        PositionwiseFeedForward,
+                        Embedding)
class EncoderLayer(nn.Module):
    """
    Single encoder block:
    1. multi-head self-attention
    2. Add & Norm (dropout, residual connection, layer normalisation)
    3. position-wise feed-forward network, followed by another Add & Norm
    """
    def __init__(self, dk, dv, h, dim_multiplier = 4, pdropout=0.1):
        super().__init__()
        # Model width is heads * key size (paper page 5, chapter 3.2.2);
        # the feed-forward hidden width is a multiple of it (chapter 3.3).
        model_dim = dk * h
        hidden_dim = model_dim * dim_multiplier
        self.attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.attn_norm = nn.LayerNorm(model_dim)
        self.ff = PositionwiseFeedForward(model_dim, hidden_dim, pdropout=pdropout)
        self.ff_norm = nn.LayerNorm(model_dim)
        # One dropout module reused on both sub-layer outputs
        self.dropout = nn.Dropout(p = pdropout)

    def forward(self, src_inputs, src_mask=None):
        """
        Run self-attention and the feed-forward network over `src_inputs`
        (paper page 3, chapter 3.1).

        Returns a tuple of the block output and the attention weights
        produced by the self-attention sub-layer.
        """
        attended, attention_wts = self.attention(
                                query = src_inputs,
                                key = src_inputs,
                                val = src_inputs,
                                mask = src_mask)
        # Dropout -> residual -> LayerNorm (paper page 7, chapter 5.4)
        normed = self.attn_norm(src_inputs + self.dropout(attended))
        # Feed-forward sub-layer with the same Add & Norm treatment
        fed_forward = self.ff(normed)
        return self.ff_norm(normed + self.dropout(fed_forward)), attention_wts
class Encoder(nn.Module):
    """
    Stack of `num_encoders` EncoderLayer blocks applied one after another.

    After every forward pass the attention weights returned by the last
    layer are kept in `self.attn_probs` (None when the stack is empty).
    """
    def __init__(self, dk, dv, h, num_encoders, dim_multiplier = 4, pdropout=0.1):
        """
        dk(int), dv(int), h(int): attention sizes forwarded to each EncoderLayer
        num_encoders(int): number of stacked encoder layers
        dim_multiplier(int): feed-forward width multiplier forwarded to each layer
        pdropout(float): dropout probability forwarded to each layer
        """
        super().__init__()
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(dk,
                         dv,
                         h,
                         dim_multiplier,
                         pdropout) for _ in range(num_encoders)
        ])
        # Defined up front so the attribute exists before the first forward pass.
        self.attn_probs = None

    def forward(self, src_inputs, src_mask = None):
        """
        Input from the Embedding layer
        src_inputs = (B - batch size, S/T - max token sequence length, D- model dimension)

        Returns the source representation produced by the last layer; with an
        empty stack the input is returned unchanged.
        """
        src_representation = src_inputs
        # Start from None so an empty stack does not hit an unbound local.
        attn_probs = None
        # Forward pass through encoder stack
        for enc in self.encoder_layers:
            src_representation, attn_probs = enc(src_representation, src_mask)
        self.attn_probs = attn_probs
        return src_representation

model/sublayers.py ADDED Viewed

	@@ -0,0 +1,194 @@

+# importing required libraries
+import math
+import copy
+import time
+import random
+import spacy
+import numpy as np
+import os
+# torch packages
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+import torch.optim as optim
class MultiHeadAttention(nn.Module):
    """
    Multi-head scaled dot-product attention.

    The per-head projections are fused: a single (dmodel, dmodel) linear layer
    per role (query / key / value) projects for all heads at once and the
    result is split into `h` heads afterwards.
    Background reading:
    https://medium.com/@hunter-j-phillips/multi-head-attention-7924371d477a
    """
    def __init__(self, dk, dv, h, pdropout=0.1):
        """
        Input Args:
        dk(int): per-head key/query dimension
        dv(int): per-head value dimension (must equal dk)
        h(int) : Number of heads in MHA
        pdropout(float): dropout probability applied to the attention weights
        """
        super().__init__()
        assert dk == dv, f"dk and dv must be equal, got dk={dk} and dv={dv}"
        self.dk = dk
        self.dv = dv
        self.h = h
        self.dmodel = self.dk * self.h  # model dimension
        # Fused projections for all heads
        self.WQ = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
        self.WK = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
        self.WV = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
        # Output Weights
        self.WO = nn.Linear(self.h*self.dv, self.dmodel)  # shape -> (dmodel, dmodel)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(p = pdropout)

    def _split_heads(self, projected, batch_size, head_dim):
        """Reshape (B, S, h*head_dim) into (B, h, S, head_dim)."""
        return projected.view(batch_size, -1, self.h, head_dim).permute(0, 2, 1, 3)

    def forward(self, query, key, val, mask=None):
        """
        Perform Scaled Dot Product Attention on multi head attention.
        Notation: B - batch size, S/T - max src/trg token-sequence length
        query shape = (B, S, dmodel)
        key shape = (B, S, dmodel)
        val shape = (B, S, dmodel)
        mask: optional tensor broadcastable to (B, h, S_query, S_key);
              positions where it equals 0 are excluded from attention.

        Returns:
            token_representation: attention output        (B, S_query, dmodel)
            attn_probs: softmax scores before dropout     (B, h, S_query, S_key)
        """
        batch_size = query.size(0)
        # Project, then split the last dimension into heads
        Q = self._split_heads(self.WQ(query), batch_size, self.dk)  # (B, h, S, dk)
        K = self._split_heads(self.WK(key), batch_size, self.dk)    # (B, h, S, dk)
        V = self._split_heads(self.WV(val), batch_size, self.dv)    # (B, h, S, dv)
        # dot product of Q and K, scaled by sqrt(dk)
        scaled_dot_product = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.dk)
        if mask is not None:
            # -1e10 is not representable in float16, so clamp the fill value to
            # the most negative finite number of the score dtype (for float32
            # this is still exactly -1e10).
            fill_value = max(-1e10, torch.finfo(scaled_dot_product.dtype).min)
            scaled_dot_product = scaled_dot_product.masked_fill(mask == 0, fill_value)
        attn_probs = self.softmax(scaled_dot_product)
        # Weighted sum of the values; dropout is applied to the weights only
        head = torch.matmul(self.dropout(attn_probs), V)  # (B, h, S, dv)
        # Bring heads next to each other and concatenate them
        head = head.permute(0, 2, 1, 3).contiguous()      # (B, S, h, dv)
        head = head.view(batch_size, -1, self.h * self.dv)  # (B, S, h*dv = dmodel)
        # Pass through output layer
        token_representation = self.WO(head)
        return token_representation, attn_probs
class Embedding(nn.Module):
    """
    Token embedding table whose output is fed to the positional
    encoding block. Looked-up vectors are scaled by sqrt(dmodel).
    """
    def __init__(self, vocab_size, dmodel):
        """
        vocab_size(int): number of rows in the lookup table
        dmodel(int): width of every embedding vector
        """
        super().__init__()
        self.embedding_lookup = nn.Embedding(vocab_size, dmodel)
        self.vocab_size = vocab_size
        self.dmodel = dmodel

    def forward(self, token_ids):
        """
        Map a (batch size, sequence length) tensor of token ids to their
        embedding vectors and, as in the paper, multiply them by sqrt(dmodel).
        """
        assert token_ids.ndim == 2, \
        f'Expected: (batch size, max token sequence length), got {token_ids.shape}'
        scale = math.sqrt(self.dmodel)
        looked_up = self.embedding_lookup(token_ids)
        return looked_up * scale
class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to a batch of embeddings and
    applies dropout to the sum.
    """
    def __init__(self, dmodel, max_seq_length = 5000, pdropout = 0.1):
        """
        dmodel(int): model dimensions
        max_seq_length(int): Maximum input sequence length
        pdropout(float): Dropout probability
        """
        super().__init__()
        self.dropout = nn.Dropout(p = pdropout)
        # Calculate frequencies
        position_ids = torch.arange(0, max_seq_length).unsqueeze(1)
        # -ve sign is added because the exponents are inverted when you multiply position and frequencies
        frequencies = torch.pow(10000, -torch.arange(0, dmodel, 2, dtype = torch.float)/ dmodel)
        angles = position_ids * frequencies  # (max_seq_length, ceil(dmodel / 2))
        # Even columns hold the sine, odd columns the cosine
        positional_encoding_table = torch.zeros(max_seq_length, dmodel)
        positional_encoding_table[:, 0::2] = torch.sin(angles)
        # For an odd dmodel there is one cosine column fewer than sine columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        positional_encoding_table[:, 1::2] = torch.cos(angles[:, :dmodel // 2])
        # A buffer is saved in the state_dict but is not a trainable parameter
        self.register_buffer("positional_encoding_table", positional_encoding_table)

    def forward(self, embeddings_batch):
        """
        embeddings_batch shape = (batch size, seq_length, dmodel)
        positional_encoding_table shape = (max_seq_length, dmodel)

        Returns the embeddings plus the encodings of the first seq_length
        positions, after dropout. Raises AssertionError when the input is not
        3-dimensional or its last dimension differs from the table width.
        """
        assert embeddings_batch.ndim == 3, \
        f"Embeddings batch should have dimension of 3 but got {embeddings_batch.ndim}"
        table_width = self.positional_encoding_table.size()[-1]
        assert embeddings_batch.size()[-1] == table_width, \
        f"Embedding batch width and positional_encoding_table width should match, got embedding width : {embeddings_batch.shape[-1]} while positional_encoding_table width : {table_width}"
        # Get encodings for the given input sequence length
        pos_encodings = self.positional_encoding_table[:embeddings_batch.shape[1]] # Choose only seq_length out of max_seq_length
        # Final output
        out = embeddings_batch + pos_encodings
        out = self.dropout(out)
        return out
class PositionwiseFeedForward(nn.Module):
    """
    Two-layer feed-forward network applied to the last dimension of its
    input: dmodel -> dff -> dmodel, with dropout and ReLU in between.
    """
    def __init__(self, dmodel, dff, pdropout = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p = pdropout)
        self.W1 = nn.Linear(dmodel, dff)  # expansion to the hidden width
        self.W2 = nn.Linear(dff, dmodel)  # projection back to the model width
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Perform Feedforward calculation
        x shape = (B - batch size, S/T - max token sequence length, D- model dimension).
        """
        hidden = self.W1(x)
        hidden = self.dropout(hidden)
        hidden = self.relu(hidden)
        return self.W2(hidden)

model/transformer.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import math
+import copy
+import time
+import random
+import spacy
+import numpy as np
+import os
+# torch packages
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+import torch.optim as optim
+from model.sublayers import (
+                        MultiHeadAttention,
+                        PositionalEncoding,
+                        PositionwiseFeedForward,
+                        Embedding)
+from model.encoder import Encoder
+from model.decoder import Decoder
class Transformer(nn.Module):
    """
    Encoder-decoder Transformer: token embeddings and positional encodings
    feed an Encoder and a Decoder stack, and a final linear layer maps the
    decoder output to target-vocabulary logits.
    """
    def __init__(self,
                dk,
                dv,
                h,
                src_vocab_size,
                target_vocab_size,
                num_encoders,
                num_decoders,
                src_pad_idx,
                target_pad_idx,
                dim_multiplier = 4,
                pdropout=0.1,
                device = "cpu"
                ):
        """
        dk(int), dv(int): per-head key / value dimensions
        h(int): number of attention heads (model width is dk * h)
        src_vocab_size(int), target_vocab_size(int): vocabulary sizes
        num_encoders(int), num_decoders(int): depth of the two stacks
        src_pad_idx(int), target_pad_idx(int): token ids masked out as padding
        dim_multiplier(int): feed-forward width as a multiple of the model width
        pdropout(float): dropout probability used throughout the model
        device: stored on the instance as `self.device`
        """
        super().__init__()
        # Reference page 5 chapter 3.2.2 Multi-head attention
        dmodel = dk*h
        # Modules required to build Encoder
        self.src_embeddings = Embedding(src_vocab_size, dmodel)
        # NOTE(review): the table length is tied to the vocabulary size rather
        # than to a maximum sentence length. The table is a registered buffer,
        # so its shape is part of saved checkpoints; it is left unchanged here.
        self.src_positional_encoding = PositionalEncoding(
                                        dmodel,
                                        max_seq_length = src_vocab_size,
                                        pdropout = pdropout
                                        )
        self.encoder = Encoder(
                                dk,
                                dv,
                                h,
                                num_encoders,
                                dim_multiplier=dim_multiplier,
                                pdropout=pdropout)
        # Modules required to build Decoder
        self.target_embeddings = Embedding(target_vocab_size, dmodel)
        self.target_positional_encoding = PositionalEncoding(
                                        dmodel,
                                        max_seq_length = target_vocab_size,
                                        pdropout = pdropout
                                        )
        # Forward the caller's hyper-parameters so the decoder is configured
        # the same way as the encoder instead of always using 4 and 0.1.
        self.decoder = Decoder(
                                dk,
                                dv,
                                h,
                                num_decoders,
                                dim_multiplier=dim_multiplier,
                                pdropout=pdropout)
        # Final projection to target-vocabulary logits (no softmax here)
        self.linear = nn.Linear(dmodel, target_vocab_size)
        self.device = device
        self.src_pad_idx = src_pad_idx
        self.target_pad_idx = target_pad_idx
        self.init_params()

    # This part wasn't mentioned in the paper, but it's super important!
    def init_params(self):
        """
        Apply Xavier-uniform initialisation to every parameter with more than
        one dimension (weight matrices and embedding tables); biases and
        LayerNorm vectors keep their default initialisation.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def make_src_mask(self, src):
        """
        Args:
            src: raw sequences with padding        (batch_size, seq_length)
        Returns:
            src_mask: True where the token differs from `self.src_pad_idx`
                                                   (batch_size, 1, 1, seq_length)
        """
        batch_size = src.shape[0]
        # True for tokens to attend to, False for padding; add two broadcast dims
        src_mask = (src != self.src_pad_idx).view(batch_size, 1, 1, -1)
        return src_mask

    def make_target_mask(self, target):
        """
        Args:
            target:  raw sequences with padding        (batch_size, seq_length)
        Returns:
            target_mask: padding mask combined with a lower-triangular mask
                         that hides later positions
                                                       (batch_size, 1, seq_length, seq_length)
        """
        seq_length = target.shape[1]
        batch_size = target.shape[0]
        # True for real tokens, False for padding     (batch_size, 1, 1, seq_length)
        target_mask = (target != self.target_pad_idx).view(batch_size, 1, 1, -1)
        # Lower-triangular mask                        (seq_length, seq_length)
        # Built on the input's device so both operands of `&` always agree,
        # even when the module was moved after construction.
        trg_sub_mask = torch.tril(
            torch.ones((seq_length, seq_length), device=target.device)).bool()
        # Broadcast "and": a position is visible only if it is not padding
        # and not after the query position
        target_mask = target_mask & trg_sub_mask
        return target_mask

    def forward(
        self,
        src_token_ids_batch,
        target_token_ids_batch):
        """
        Args:
            src_token_ids_batch:    source token ids   (batch_size, src_seq_length)
            target_token_ids_batch: target token ids   (batch_size, trg_seq_length)
        Returns:
            Unnormalised logits over the target vocabulary
            (batch_size, trg_seq_length, target_vocab_size).
        """
        # create source and target masks
        src_mask = self.make_src_mask(
                        src_token_ids_batch) # (batch_size, 1, 1, src_seq_length)
        target_mask = self.make_target_mask(
                        target_token_ids_batch) # (batch_size, 1, trg_seq_length, trg_seq_length)
        # Create embeddings
        src_representations = self.src_embeddings(src_token_ids_batch)
        src_representations = self.src_positional_encoding(src_representations)
        target_representations = self.target_embeddings(target_token_ids_batch)
        target_representations = self.target_positional_encoding(target_representations)
        # Encode
        encoded_src = self.encoder(src_representations, src_mask)
        # Decode
        decoded_output = self.decoder(
                                target_representations,
                                encoded_src,
                                target_mask,
                                src_mask)
        # Raw logits are returned on purpose; no softmax is applied here.
        out = self.linear(decoded_output)
        return out
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of `model`."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total
if __name__ == "__main__":
    # Smoke test: build the model with the hyper-parameters used for the
    # Multi30K dataset, print it and report its trainable parameter count.
    dk = 32
    dv = 32
    h = 8
    src_vocab_size = 7983
    target_vocab_size = 5979
    src_pad_idx = 2
    target_pad_idx = 2
    num_encoders = 3
    num_decoders = 3
    dim_multiplier = 4
    pdropout=0.1
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Keyword arguments keep every value bound to the intended parameter:
    # the padding indices sit between num_decoders and dim_multiplier in the
    # signature, so omitting them positionally shifts the remaining values.
    model = Transformer(
                    dk=dk,
                    dv=dv,
                    h=h,
                    src_vocab_size=src_vocab_size,
                    target_vocab_size=target_vocab_size,
                    num_encoders=num_encoders,
                    num_decoders=num_decoders,
                    src_pad_idx=src_pad_idx,
                    target_pad_idx=target_pad_idx,
                    dim_multiplier=dim_multiplier,
                    pdropout=pdropout,
                    device = device)
    # Moves the module to the GPU when one is available, no-op otherwise
    model.to(device)
    print(model)
    print(f'The model has {count_parameters(model):,} trainable parameters')

params.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"dk": 32, "dv": 32, "h": 8, "src_vocab_size": 8500, "target_vocab_size": 6500, "src_pad_idx": 2, "target_pad_idx": 2, "num_encoders": 3, "num_decoders": 3, "dim_multiplier": 4, "pdropout": 0.1, "lr": 0.0003, "N_EPOCHS": 50, "CLIP": 1, "patience": 5}

trained_model/transformer-model.pt → pytorch_transformer_model.pt RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8c72ccefd0a3594899f7f6e4d0266c74d18497b51e953261d4f678855a863258
-size 56911669

 version https://git-lfs.github.com/spec/v1
+oid sha256:bec7a1a3b8371fa8260fcfc9204e6695714f221cd54f121503e6241e31def867
+size 59573843

vocab.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:457ebb2e34df81149998f2fa2bfe6b7c3aac3964beff79b3dd24057c48341cb4
+size 249451