# Page header recovered from the scraped file view -- Spaces status: Sleeping;
# file size: 4,045 bytes; revision f358631.
import torch
from torch import nn
from feed_forward_nn import feedforward
from positional_encoding import Positional_Encoding
from multihead_attention import MultiHeadAttention
# Model hyperparameters shared by the demo code in this file.
d_model = 512  # main model dimension (size of every token vector)
num_heads = 8  # number of attention heads
d_ff = 2048  # hidden dimension of the feed-forward network
seq_len = 128  # maximum input length
vocab_size = 30000  # number of distinct token IDs in the embedding table
# Embedding table created at import time: one d_model-sized vector per token ID.
embedding_layer = nn.Embedding(vocab_size, d_model)
# Positional-encoding module constructed with (seq_len, d_model).
# NOTE(review): its behavior is defined in positional_encoding.py, which is not
# visible here -- confirm it accepts inputs shorter than seq_len.
pos_encoding = Positional_Encoding(seq_len, d_model)
def prepare_encoder_input(token_ids):
    """Turn one sequence of token IDs into the encoder's input tensor.

    Args:
        token_ids: A 1-D sequence (list, tuple or tensor) of integer token IDs
            for a single sentence.

    Returns:
        ``pos_encoding(embedding_layer(ids))``, where ``ids`` is ``token_ids``
        with a leading batch dimension of 1. With the module-level layers this
        is expected to be of shape ``(1, len(token_ids), d_model)``.
    """
    # Force an integer dtype: torch.tensor([]) would infer float32, which
    # nn.Embedding rejects as an index tensor. as_tensor also avoids the
    # copy-construct warning when the caller already passes a tensor.
    ids = torch.as_tensor(token_ids, dtype=torch.long).unsqueeze(0)  # (1, seq_len)
    # 1. Convert token IDs -> learned embeddings.
    embedded = embedding_layer(ids)  # (1, seq_len, d_model)
    # 2. Add sinusoidal positional encoding (as provided by Positional_Encoding).
    return pos_encoding(embedded)  # (1, seq_len, d_model)
# Example: the token IDs [7, 1542, 98] stand for the sentence "I love you".
#   token 7    -> "I"
#   token 1542 -> "love"
#   token 98   -> "you"
x = prepare_encoder_input([7, 1542, 98])
print(x.shape)  # expected: (1, 3, 512) = (batch_size, seq_len, d_model)
# Meaning of each dimension:
# 1   -> batch size
# 3   -> sequence length (["I", "love", "you"])
# 512 -> d_model (the original Transformer's model dimension)
# Pipeline: INPUT -> EMBEDDING -> POSITIONAL ENCODING -> ENCODER
# The input to the encoder always has this shape:
# (batch_size, seq_len, d_model)
# 1. "AI is transforming the world."
# 2. "Deep learning models are powerful."
# 3. "Transformers use attention mechanisms."
# 4. "Neural networks learn patterns from data."
# We will feed all 4 sentences to the Transformer together, so:
# batch_size = 4
# seq_len = number of tokens per sentence.
# It is bounded by the maximum input length that the encoder will accept.
# Sentence 1:
# "AI is transforming the world."
# Tokens =
# ["AI", "is", "transforming", "the", "world"]
# -> seq_len = 5
# Sentence 2:
# "Deep learning models are powerful."
# Tokens =
# ["Deep", "learning", "models", "are", "powerful"]
# -> seq_len = 5
# Sentence 3:
# "Transformers use attention mechanisms."
# Tokens =
# ["Transformers", "use", "attention", "mechanisms"]
# -> seq_len = 4
# Every sentence in a batch must have the same length, so pad it:
# ["Transformers", "use", "attention", "mechanisms", "<PAD>"]
# -> seq_len = 5
# Sentence 4:
# "Neural networks learn patterns from data."
# Tokens =
# ["Neural", "networks", "learn", "patterns", "from", "data"]
# -> seq_len = 6
# The longest sentence now has 6 tokens, so all the others must match it.
# Pad the rest:
# Sentence 1 -> 5 tokens -> add 1 pad
# Sentence 2 -> 5 tokens -> add 1 pad
# Sentence 3 -> 4 tokens -> add 2 pads
# Sentence 4 -> 6 tokens -> no pad
# seq_len = 6
# d_model (embedding dimension)
# Let's assume each token is converted to a vector of size 10 (a small example).
# d_model = 10
# Final input tensor to the Transformer:
# (batch_size, seq_len, d_model)
# [4, 6, 10]
class Encoder_block(nn.Module):
    """One Transformer encoder layer: self-attention followed by feed-forward.

    Each sub-layer output goes through dropout, is added back to the
    sub-layer's input, and the sum is layer-normalised:

        x    -> [MHA] -> mha_out
        x    + Dropout(mha_out) -> Norm -> out1
        out1 -> [FFN] -> ffn_out
        out1 + Dropout(ffn_out) -> Norm -> out2
    """

    def __init__(self, d_model, d_ff, num_heads, dropout=0.1):
        """Create the attention, feed-forward, normalisation and dropout layers.

        Args:
            d_model: Feature size handed to the attention module, the
                feed-forward module and both LayerNorm layers.
            d_ff: Second argument forwarded to ``feedforward``.
            num_heads: Second argument forwarded to ``MultiHeadAttention``.
            dropout: Probability used by the single ``nn.Dropout`` that is
                applied to both sub-layer outputs.
        """
        super().__init__()
        # Sub-modules are registered in the same order as before so that
        # parameter initialisation order and state_dict key order are unchanged.
        self.ffn = feedforward(d_model, d_ff)
        self.multi_att = MultiHeadAttention(d_model, num_heads)  # d_model plays the role of embed_dim
        self.norm_layer1 = nn.LayerNorm(d_model)
        self.norm_layer2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``.

        Args:
            x: Input tensor whose last dimension is ``d_model``
                (presumably ``(batch_size, seq_len, d_model)`` -- confirm
                against MultiHeadAttention).
            mask: Optional mask passed straight through to the attention
                module as its second positional argument.

        Returns:
            A tuple ``(out, attn)``: the layer output after the second
            normalisation, and the second value returned by the attention
            module (presumably the attention weights).
        """
        # Sub-layer 1: multi-head self-attention, then Add & Norm.
        attended, attn = self.multi_att(x, mask)
        x = self.norm_layer1(x + self.dropout(attended))
        # Sub-layer 2: feed-forward network, then Add & Norm.
        x = self.norm_layer2(x + self.dropout(self.ffn(x)))
        return x, attn