Transformers from Scratch
scratch_transformer.py (ADDED, +187 lines)

# Transformers from Scratch, following the "Attention Is All You Need" paper
# Modelling Scaled Dot-Product Attention, Multi-Head Attention, and Position-wise Feed-Forward Networks.

# Import Modules
import math

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Building single- and multi-head attention modules from scratch in pure PyTorch

# Initialise the seed for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Self-Attention Mechanism: Single Head
embdim = 256   # embedding dimension D
headdim = 64   # internal head dimension
tokens = torch.randn(1, 5, embdim)  # batch, tokens, embedding

# Defining the weights associated with query, key and value
Wq = torch.randn(embdim, headdim) / math.sqrt(embdim)
Wk = torch.randn(embdim, headdim) / math.sqrt(embdim)
Wv = torch.randn(embdim, embdim) / math.sqrt(embdim)

# Query, Key, Value projections
qis = torch.einsum("BSE,EH->BSH", tokens, Wq)  # batch x seqlen x headdim; queries, (1, 5, 64)
kis = torch.einsum("BTE,EH->BTH", tokens, Wk)  # batch x seqlen x headdim; keys
vis = torch.einsum("BTE,EF->BTF", tokens, Wv)  # batch x seqlen x embdim; values

# Start: Testing Code
random_mat1 = torch.randn(2, 5, 4)  # batch, tokens, dimensions
random_mat2 = torch.randn(2, 5, 4)

# (2, 5, 4) @ (2, 4, 5) -> (2, 5, 5)
torch.matmul(random_mat1, random_mat2.transpose(1, 2))  # 2, 5, 5
print(qis.shape)
print(kis.shape)
# (Q) N x D @ (K^T) D x N -> N x N
# End: Testing Code


scoremat = torch.matmul(qis, kis.transpose(1, 2))         # batch x seqlen (query) x seqlen (key)
attmat = F.softmax(scoremat / math.sqrt(headdim), dim=2)  # attention matrix

# Output of the attention mechanism
zis = torch.einsum("BST,BTF->BSF", attmat, vis)

# We can verify the output against PyTorch's scaled dot-product attention
attn_torch = F.scaled_dot_product_attention(qis, kis, vis)
assert torch.allclose(attn_torch, zis, atol=1e-6, rtol=1e-6)  # True
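
# (Extra sanity check, not in the original file: Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V,
# and because the softmax runs over the key dimension, each query's attention weights
# should sum to 1.)
assert torch.allclose(attmat.sum(dim=-1), torch.ones(1, 5), atol=1e-6)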

# Multi-Head Attention
embdim = 768
headcnt = 12
headdim = embdim // headcnt
# print(headdim)
assert headdim * headcnt == embdim
tokens = torch.randn(1, 5, embdim)  # batch, tokens, embedding

# The full embedding dim of 768 is used, split across the heads: 12 heads x 64 dims per head
Wq = torch.randn(embdim, headcnt * headdim) / math.sqrt(embdim)  # heads packed into a single dim
Wk = torch.randn(embdim, headcnt * headdim) / math.sqrt(embdim)  # heads packed into a single dim
Wv = torch.randn(embdim, headcnt * headdim) / math.sqrt(embdim)  # heads packed into a single dim

print(Wq.shape)
print(Wk.shape)
print(Wv.shape)

batch, token_num, _ = tokens.shape  # batch, tokens (n), embedding
# tokens: (B, N, E)

# Wq: (E, H) where H = headcnt * headdim
qis = torch.einsum("BSE,EH->BSH", tokens, Wq)  # (B, N, H) ~ (1, 5, 768)
kis = torch.einsum("BTE,EH->BTH", tokens, Wk)  # (B, N, H)
vis = torch.einsum("BTE,EH->BTH", tokens, Wv)  # (B, N, H)
# Split the single hidden dim into the heads

# Converting dimensions from (B, N, H) to (B, N, headcnt, headdim),
# so that for each batch, each token and each head there is a separate set of features.
qis_mh = qis.view(batch, token_num, headcnt, headdim)  # B, N, HC, HD
kis_mh = kis.view(batch, token_num, headcnt, headdim)
vis_mh = vis.view(batch, token_num, headcnt, headdim)

scoremat_mh = torch.einsum("BSHC,BTHC->BHST", qis_mh, kis_mh)  # in: (B, N, headcnt, headdim); out: (B, headcnt, query, key)
print(scoremat_mh.shape)  # 1, 12, 5, 5 -> 12 heads, each producing a 5 x 5 attention matrix

# batch x headcnt x seqlen (query) x seqlen (key)

attmat_mh = F.softmax(scoremat_mh / math.sqrt(headdim), dim=-1)
zis_mh = torch.einsum("BCST,BTCH->BSCH", attmat_mh, vis_mh)  # batch x seqlen (query) x headcnt x headdim
zis = zis_mh.reshape(batch, token_num, headcnt * headdim)

# Note: the reshape above concatenates the heads, but this manual block does not apply the final output linear projection.

# We can verify the result against nn.MultiheadAttention
mha = nn.MultiheadAttention(embdim, headcnt, batch_first=True)
print(mha.in_proj_weight.shape)  # (3 * embdim, embdim)
mha.in_proj_weight.data = torch.cat([Wq, Wk, Wv], dim=1).T
attn_out, attn_weights = mha(tokens, tokens, tokens, average_attn_weights=False)

# The per-head attention weights returned by mha are the same as attmat_mh
assert torch.allclose(attmat_mh, attn_weights, atol=1e-6, rtol=1e-6)  # True

print(attn_weights.shape)  # batch, heads, tokens, tokens
print(attn_out.shape)
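
# (Extra sketch, not in the original file: attn_out itself will not equal zis, because
# nn.MultiheadAttention also applies its output projection (out_proj) to the concatenated
# heads. Applying that same projection to zis should reproduce attn_out.)
zis_proj = F.linear(zis, mha.out_proj.weight, mha.out_proj.bias)
print(torch.allclose(attn_out, zis_proj, atol=1e-5))  # expected: True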

# Causal Mask from Scratch
# The causal mask is described in the paper for the decoder: a token should not attend to future tokens.

attn_mask = torch.ones(token_num, token_num)
attn_mask = -1E4 * torch.triu(attn_mask, 1)
print(attn_mask)
scoremat_mh_msk = torch.einsum("BSCH,BTCH->BCST", qis_mh, kis_mh)  # batch x headcnt x seqlen (query) x seqlen (key)
scoremat_mh_msk += attn_mask  # add the additive mask to the scores before the softmax normalization
attmat_mh_msk = F.softmax(scoremat_mh_msk / math.sqrt(headdim), dim=-1)
zis_mh_msk = torch.einsum("BCST,BTCH->BSCH", attmat_mh_msk, vis_mh)  # batch x seqlen (query) x headcnt x headdim
zis_msk = zis_mh_msk.reshape(batch, token_num, headcnt * headdim)

attn_out_causal, attn_weights_causal = mha(tokens, tokens, tokens, average_attn_weights=False, attn_mask=attn_mask)

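# (Extra sketch, not in the original file: F.scaled_dot_product_attention can build the same
# causal pattern via is_causal=True. It expects tensors shaped (batch, heads, seq, headdim),
# hence the transposes; its output should match zis_msk up to numerical precision.)
zis_sdpa = F.scaled_dot_product_attention(
    qis_mh.transpose(1, 2), kis_mh.transpose(1, 2), vis_mh.transpose(1, 2), is_causal=True
)
zis_sdpa = zis_sdpa.transpose(1, 2).reshape(batch, token_num, headcnt * headdim)
print(torch.allclose(zis_sdpa, zis_msk, atol=1e-5))  # expected: True
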
# Plotting all heads of the attention mechanism
plt.figure()
for head in range(headcnt):
    plt.subplot(3, 4, head + 1)
    plt.imshow(attn_weights_causal[0, head].detach().numpy())
    plt.title(f"head {head}")
    plt.axis("off")
plt.show()

# Transformer Block from Scratch

# Modelling the Transformer Block from scratch using PyTorch.
# A Transformer Block contains:
# - Layer norm
# - Skip connections
# - Multi-head attention
# - MLP / feed-forward net


class TransformerBlock(nn.Module):

    def __init__(self, embdim: int, headcnt: int, *args, dropout=0.0, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.ln1 = nn.LayerNorm(embdim)
        self.ln2 = nn.LayerNorm(embdim)
        self.attn = nn.MultiheadAttention(embdim, headcnt, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embdim, 4 * embdim),
            nn.GELU(),
            nn.Linear(4 * embdim, embdim),
            nn.Dropout(dropout),
        )

    def forward(self, x, is_causal=True):
        """
        The input x has shape (B, S, E); token and positional embeddings are assumed
        to have been added already.
        """
        batch, token_num, hidden_dim = x.shape
        if is_causal:
            attn_mask = torch.ones(token_num, token_num, device=x.device)
            attn_mask = -1E4 * torch.triu(attn_mask, 1)
        else:
            attn_mask = None

        # Multi-head attention with a residual (skip) connection, followed by layer norm
        residue = x
        attn_output, attn_weights = self.attn(x, x, x, attn_mask=attn_mask, average_attn_weights=False)
        x = residue + attn_output
        x = self.ln1(x)
        # Feed-forward network with a residual connection, followed by layer norm
        residue = x
        ffn_output = self.ffn(x)
        output = self.ln2(residue + ffn_output)
        return output



if __name__ == "__main__":
    # Testing the Transformer Block
    print("Testing the Transformer Block")
    transformer_block = TransformerBlock(embdim, headcnt)
    tokens = torch.randn(1, 5, embdim)
    output = transformer_block(tokens)
    print(output.shape)
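
    # (Extra sketch, not in the original file: because forward() defaults to is_causal=True,
    # several blocks can be chained with nn.Sequential into a minimal decoder-style stack;
    # the depth of 4 is an arbitrary choice for illustration.)
    mini_transformer = nn.Sequential(*[TransformerBlock(embdim, headcnt) for _ in range(4)])
    stacked_output = mini_transformer(tokens)
    print(stacked_output.shape)  # expected: torch.Size([1, 5, 768])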