""" Source: https://github.com/karpathy/nanoGPT/blob/master/model.py """ from torch import nn from models.attention import CausalSelfAttention from models.decoder_ffn import FFNN class DecoderBlock(nn.Module): def __init__( self, d, H, T, bias=False, dropout=0.2, ): """ Arguments: d: size of embedding dimension H: number of attention heads T: maximum length of input sequences (in tokens) bias: whether or not to use bias in linear layers dropout: probability of dropout """ super().__init__() self.ln_1 = nn.LayerNorm(d) self.attn = CausalSelfAttention(d, H, T, bias, dropout) self.ln_2 = nn.LayerNorm(d) self.ffnn = FFNN(d, bias, dropout) def forward(self, x): x = x + self.attn(self.ln_1(x)) x = x + self.ffnn(self.ln_2(x)) return x