Spaces:
Sleeping
Sleeping
| import torch | |
| from torch import nn | |
| from feed_forward_nn import feedforward | |
| from positional_encoding import Positional_Encoding | |
| from multihead_attention import MultiHeadAttention | |
# Model hyperparameters shared by the layers built below.
d_model = 512       # width of every token vector (main model dimension)
num_heads = 8       # attention heads per multi-head attention layer
d_ff = 2048         # hidden width of the feed-forward sub-layer
seq_len = 128       # longest input sequence, in tokens
vocab_size = 30000  # number of rows in the embedding table

# Module-level layers used by prepare_encoder_input.
# The embedding is created before the positional encoding, as in the original,
# so any RNG-dependent initialisation happens in the same order.
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
pos_encoding = Positional_Encoding(seq_len, d_model)
def prepare_encoder_input(token_ids) -> torch.Tensor:
    """Turn token IDs into the tensor that is fed to the encoder.

    Args:
        token_ids: Token IDs of a single sequence (flat list or 1-D tensor),
            or of an already padded batch shaped ``(batch, seq_len)``.

    Returns:
        The result of applying ``pos_encoding`` to the looked-up embeddings.
        For a single sequence this is ``(1, seq_len, d_model)``.
    """
    # as_tensor with an explicit integer dtype:
    #  * an empty list would otherwise be inferred as float32, which
    #    nn.Embedding rejects as an index tensor;
    #  * an input that is already a tensor is reused instead of triggering
    #    torch.tensor's copy-construct warning.
    token_ids = torch.as_tensor(token_ids, dtype=torch.long)

    # Add the batch axis only for an un-batched input; a (batch, seq_len)
    # tensor is passed through unchanged.
    if token_ids.dim() < 2:
        token_ids = token_ids.unsqueeze(0)  # (1, seq_len)

    # 1. Convert token IDs -> learned embeddings
    x = embedding_layer(token_ids)  # (batch, seq_len, d_model)
    # 2. Add the positional encoding (described as sinusoidal by the original
    #    comment; Positional_Encoding itself lives in another module).
    x = pos_encoding(x)  # (batch, seq_len, d_model)
    return x
# Worked example: "I Love you" tokenised as [7, 1542, 98]
#   7    -> "I"
#   1542 -> "Love"
#   98   -> "you"
x = prepare_encoder_input(token_ids=[7, 1542, 98])
print(x.size())  # (1, 3, 512)
# Reading the shape:
#   1   -> batch size
#   3   -> sequence length (["I", "love", "you"])
#   512 -> d_model (original transformer dimension)
#
# Pipeline: INPUT -> EMBEDDING -> POSITIONAL ENCODING -> ENCODER
# The input to the encoder always has the shape
# (batch_size, seq_len, d_model)
| # 1. "AI is transforming the world." | |
| # 2. "Deep learning models are powerful." | |
| # 3. "Transformers use attention mechanisms." | |
| # 4. "Neural networks learn patterns from data." | |
| # We will feed all 4 sentences to the Transformer together. | |
| # batch_size = 4 | |
| # seq_len (number of tokens per sentence) | |
| # Maximum input length that your encoder will accept. | |
| # Sentence 1: | |
| # "AI is transforming the world." | |
| # Tokens = | |
| # ["AI", "is", "transforming", "the", "world"] | |
| # -> seq_len = 5 | |
| # Sentence 2: | |
| # "Deep learning models are powerful." | |
| # Tokens = | |
| # ["Deep", "learning", "models", "are", "powerful"] | |
| # -> seq_len = 5 | |
| # Sentence 3: | |
| # "Transformers use attention mechanisms." | |
| # Tokens = | |
| # ["Transformers", "use", "attention", "mechanisms"] | |
| # -> seq_len = 4 | |
| # But we need the same length for all sentences in a batch, so pad it: | |
| # ["Transformers", "use", "attention", "mechanisms", "<PAD>"] | |
| # -> seq_len = 5 | |
| # Sentence 4: | |
| # "Neural networks learn patterns from data." | |
| # Tokens = | |
| # ["Neural", "networks", "learn", "patterns", "from", "data"] | |
| # -> seq_len = 6 | |
| # But the max length is 6, so all others must match: | |
| # Pad the rest: | |
| # Sentence 1 -> 5 tokens -> add 1 pad | |
| # Sentence 2 -> 5 tokens -> add 1 pad | |
| # Sentence 3 -> 4 tokens -> add 2 pads | |
| # Sentence 4 -> 6 tokens -> no pad | |
| # seq_len = 6 | |
| # d_model (embedding dimension) | |
| # Let's assume each token is converted to a vector of size 10 (small example). | |
| # d_model = 10 | |
| # Final Input Tensor to Transformer | |
| # (batch_size, seq_len, d_model) | |
| # [4, 6, 10] | |
class Encoder_block(nn.Module):
    """Single Transformer encoder layer with post-norm residual connections.

    Data flow implemented by ``forward``:
        x    -> [MHA] -> mha_out
        x    + Dropout(mha_out) -> Norm -> out1
        out1 -> [FFN] -> ffn_out
        out1 + Dropout(ffn_out) -> Norm -> out2
    """

    def __init__(self, d_model, d_ff, num_heads, dropout=0.1):
        """Build the sub-layers.

        Args:
            d_model: Size passed to the attention layer, the feed-forward
                network and both layer norms.
            d_ff: Second argument forwarded to ``feedforward``.
            num_heads: Second argument forwarded to ``MultiHeadAttention``.
            dropout: Drop probability of the dropout applied to each
                sub-layer output before the residual addition.
        """
        super().__init__()
        # Attribute names and creation order are kept stable: they define the
        # state_dict keys and the order in which parameters are initialised.
        self.ffn = feedforward(d_model, d_ff)
        # d_model is what the attention layer treats as its embed_dim.
        self.multi_att = MultiHeadAttention(d_model, num_heads)
        self.norm_layer1 = nn.LayerNorm(normalized_shape=d_model)
        self.norm_layer2 = nn.LayerNorm(normalized_shape=d_model)
        # One dropout module is reused after both sub-layers.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask=None):
        """Run attention and feed-forward, each followed by Add & Norm.

        Args:
            x: Input handed to the attention layer and used as the first
                residual.
            mask: Optional mask passed through to the attention layer.

        Returns:
            A tuple ``(out, attn)``: the output of the second layer norm and
            the second value returned by the attention layer.
        """
        # Sub-layer 1: multi-head self-attention, then residual + norm.
        attended, attn = self.multi_att(x, mask)
        out1 = self.norm_layer1(x + self.dropout(attended))

        # Sub-layer 2: feed-forward network, then residual + norm.
        out2 = self.norm_layer2(out1 + self.dropout(self.ffn(out1)))
        return out2, attn