File size: 4,045 Bytes
f358631
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import torch
from torch import nn
from feed_forward_nn import feedforward
from positional_encoding import Positional_Encoding
from multihead_attention import MultiHeadAttention

# Module-level hyperparameters shared by the embedding / positional-encoding
# objects below and by the demo call further down in this file.
d_model = 512  # model (embedding) dimension
num_heads = 8  # number of attention heads
d_ff = 2048    # hidden dimension of the feed-forward sub-layer
seq_len = 128  # maximum input length (first argument to Positional_Encoding)
vocab_size = 30000  # number of rows in the embedding table

# Learned lookup table: one d_model-sized vector per token ID in [0, vocab_size).
embedding_layer = nn.Embedding(vocab_size, d_model)
# Project-local positional encoding built for (seq_len, d_model);
# presumably sinusoidal, per the comment in prepare_encoder_input -- confirm
# against positional_encoding.py.
pos_encoding = Positional_Encoding(seq_len, d_model)

def prepare_encoder_input(token_ids):
    """Turn one sequence of token IDs into a batched encoder input.

    The IDs are converted to an int64 tensor, given a leading batch
    dimension of size 1, looked up in the module-level ``embedding_layer``
    and then passed through the module-level ``pos_encoding``.

    Args:
        token_ids: A single sequence of integer token IDs (for example a
            list of ints or a 1-D integer tensor).

    Returns:
        The output of ``pos_encoding`` applied to the embeddings. The
        embeddings have shape ``(1, len(token_ids), d_model)``; the
        positional encoding is assumed to preserve that shape
        (TODO confirm against ``Positional_Encoding``).
    """
    # Force int64 indices: with dtype inference an empty sequence becomes a
    # float32 tensor, which nn.Embedding rejects. as_tensor also avoids the
    # copy (and UserWarning) that torch.tensor() triggers for tensor inputs.
    token_ids = torch.as_tensor(token_ids, dtype=torch.long).unsqueeze(0)  # (1, seq_len)

    # 1. Convert token IDs -> learned embeddings
    x = embedding_layer(token_ids)                      # (1, seq_len, d_model)

    # 2. Add positional encoding
    x = pos_encoding(x)

    return x

# Demo: encode a toy three-token sentence and print the resulting shape.
# Illustrative token IDs for "I love you":
#   token 7    - "I"
#   token 1542 - "love"
#   token 98   - "you"

x = prepare_encoder_input([7, 1542, 98])
print(x.shape)      # expected: (1, 3, 512), assuming the positional encoding preserves shape

# Meaning:
# 1 → batch size
# 3 → sequence length (["I", "love", "you"])
# 512 → d_model (original transformer dimension)


# INPUT → EMBEDDING → POSITIONAL ENCODING → ENCODER
# The input to the encoder always has this shape:
# (batch_size, seq_len, d_model)

# 1. "AI is transforming the world."
# 2. "Deep learning models are powerful."
# 3. "Transformers use attention mechanisms."
# 4. "Neural networks learn patterns from data."

# We will feed all 4 sentences to the Transformer together.
# batch_size = 4
# seq_len (number of tokens per sentence)
# Maximum input length that your encoder will accept.

# Sentence 1:
# "AI is transforming the world."

# Tokens =
# ["AI", "is", "transforming", "the", "world"]
# → seq_len = 5

# Sentence 2:
# "Deep learning models are powerful."

# Tokens =
# ["Deep", "learning", "models", "are", "powerful"]
# → seq_len = 5


# Sentence 3:
# "Transformers use attention mechanisms."

# Tokens =
# ["Transformers", "use", "attention", "mechanisms"]
# → seq_len = 4
# But all sentences in a batch need the same length, so pad it:

# ["Transformers", "use", "attention", "mechanisms", "<PAD>"]
# → seq_len = 5

# Sentence 4:
# "Neural networks learn patterns from data."
# Tokens =
# ["Neural", "networks", "learn", "patterns", "from", "data"]

# → seq_len = 6
# This is the longest sentence, so the batch length is 6 and all others must match.
# Pad the rest:
# Sentence 1 → 5 tokens → add 1 pad
# Sentence 2 → 5 tokens → add 1 pad
# Sentence 3 → 4 tokens → add 2 pads
# Sentence 4 → 6 tokens → no pad

# seq_len = 6

# d_model (embedding dimension)
# Let's assume each token is converted to a vector of size 10 (small example).
# d_model = 10

# Final Input Tensor to Transformer
# (batch_size, seq_len, d_model)
# [4, 6, 10]


class Encoder_block(nn.Module):
    """Single Transformer encoder layer: self-attention, then feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(input + Dropout(sublayer(input)))``:

        x    -> [MHA] -> dropout -> (+ x)    -> LayerNorm -> out1
        out1 -> [FFN] -> dropout -> (+ out1) -> LayerNorm -> out2

    Args:
        d_model: Feature size handed to the attention module, the
            feed-forward module and both ``LayerNorm`` layers.
        d_ff: Second argument forwarded to ``feedforward`` (presumably its
            hidden width -- confirm against feed_forward_nn).
        num_heads: Second argument forwarded to ``MultiHeadAttention``.
        dropout: Probability for the one ``nn.Dropout`` shared by both
            residual branches.
    """

    def __init__(self, d_model, d_ff, num_heads, dropout=0.1):
        super().__init__()
        # Sub-layers (creation order kept stable so parameter order and
        # initialisation order do not change).
        self.ffn = feedforward(d_model, d_ff)
        self.multi_att = MultiHeadAttention(d_model, num_heads)  # d_model acts as embed_dim
        # One LayerNorm per residual branch.
        self.norm_layer1 = nn.LayerNorm(d_model)
        self.norm_layer2 = nn.LayerNorm(d_model)
        # Applied to each sub-layer output before it is added back.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the layer on ``x``.

        Args:
            x: Input tensor whose last dimension is ``d_model``.
            mask: Optional mask passed straight through to the attention
                module as its second positional argument.

        Returns:
            A pair ``(output, attn)``: the layer output after the second
            Add & Norm, and the second value returned by the attention
            module (presumably the attention weights -- confirm).
        """
        # Sub-layer 1: multi-head self-attention + Add & Norm.
        attended, attn = self.multi_att(x, mask)
        x = self.norm_layer1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward + Add & Norm.
        x = self.norm_layer2(x + self.dropout(self.ffn(x)))

        return x, attn