import torch
from torch import nn
import math


class Positional_Encoding(nn.Module):
    def __init__(self, seq_len, d_model):
        super().__init__()

        PE = torch.zeros(seq_len, d_model)

        position = torch.arange(0, seq_len).unsqueeze(-1)

        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))

        PE[:, 0::2] = torch.sin(position * div_term)

        PE[:, 1::2] = torch.cos(position * div_term)

        # PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
        # PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

        pe = PE.unsqueeze(0)
        # print("pe: ", pe)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :]
        
        return x 





# --- test ---
# x = torch.tensor([[[1.2]*512, [1.3]*512, [1.4]*512]])  # shape: (1, 3, 512)
# print(x[0:2, 0:2])
# pe = Positional_Encoding(seq_len=3, d_model=512)
# print(pe)
# out = pe(x)
# print(x.shape, out.shape, x.shape)
# print(out[0, 0, :10])  # first 10 dims of first word


# tensor([[[1.2000, 1.2000, 1.2000,  ..., 1.2000, 1.2000, 1.2000],
#          [1.3000, 1.3000, 1.3000,  ..., 1.3000, 1.3000, 1.3000]]])
# pe:  tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
#            0.0000e+00,  1.0000e+00],
#          [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
#            1.0366e-04,  1.0000e+00],
#          [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
#            2.0733e-04,  1.0000e+00]]])
# Positional_Encoding()
# torch.Size([1, 3, 512]) torch.Size([1, 3, 512]) torch.Size([1, 3, 512])
# tensor([1.2000, 2.2000, 1.2000, 2.2000, 1.2000, 2.2000, 1.2000, 2.2000, 1.2000,
#         2.2000])

# From the output above we can see that
# we have the x embeddings and the pe positional encodings, and then their sum:
# out = x + pe
# x => tensor([[[1.2000, 1.2000
# pe => tensor([[[ 0.0000e+00,  1.0000e+00
# out => tensor([1.2000, 2.2000
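
# A minimal runnable sketch of the commented-out test above, assuming the
# Positional_Encoding class defined earlier in this file; guarded with
# __main__ so importing this file stays silent.
if __name__ == "__main__":
    x = torch.tensor([[[1.2] * 512, [1.3] * 512, [1.4] * 512]])  # (1, 3, 512)
    pos = Positional_Encoding(seq_len=3, d_model=512)
    out = pos(x)
    # At position 0: sin(0)=0 on even dims and cos(0)=1 on odd dims,
    # so the first row alternates 1.2 (=1.2+0) and 2.2 (=1.2+1).
    print(out.shape)        # torch.Size([1, 3, 512])
    print(out[0, 0, :10])   # tensor([1.2, 2.2, 1.2, 2.2, ...])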



# Assume we have 3 words → embeddings:
# z1, z2, z3 → each of size d_model (say 512).
# So your input embeddings matrix is of shape (seq_len=3, d_model=512).

def positional_encoding(seq_len, d_model):
    PE = torch.zeros(seq_len, d_model)
    # We're creating a matrix to store positional encodings for each token position.
    # (seq_len=3, d_model=512)
    # 3 positions, each with a vector of size 512.
    # tensor([[0., 0., 0.,  ..., 0., 0., 0.],
    #     [0., 0., 0.,  ..., 0., 0., 0.],
    #     [0., 0., 0.,  ..., 0., 0., 0.]])


    position = torch.arange(0, seq_len).unsqueeze(-1)
    # tensor([0, 1, 2]) before unsqueeze
    # after unsqueeze:
    # tensor([[0],
    #         [1],
    #         [2]])
    # Shape → (3, 1)
    # Each row represents the position index of a token (z1, z2, z3).
    

    div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
    # print(div_term.shape)
    # torch.Size([256])

    # 👉 d_model = 512
    # 👉 we alternate, one sin, one cos
    # 👉 so we only need half (512/2 = 256) frequencies
    # 👉 each frequency fills 2 dimensions → sin + cos

    # So the final positional encoding has shape [seq_len, 512],
    # where every even index holds a sine value
    # and every odd index holds a cosine value.

    # Together they give every position a unique pattern,
    # which is how the model gets a sense of word order 🚀

    # torch.arange(0, d_model, 2) → [0, 2, 4, …]
    # ✅ This is exactly the 2i from the formula.

    # (-log(10000)/d_model) → take the log of 10000, divide by d_model, negate

    # Multiplying the two → 0*(-log(10000)/d_model) = 0 for i=0

    # Multiplying by 2, 4, … scales each frequency automatically,
    # since exp(2i * (-log(10000)/d_model)) = 1 / 10000^(2i/d_model)
    # (there is a runnable check of this identity after the function).



    # Apply sine to even indices, cosine to odd indices
    PE[:, 0::2] = torch.sin(position * div_term)
    # PE[:, 0::2]
    # : → all rows (all positions)
    # 0::2 → start at index 0, take every 2nd column
    # i.e. the even indices → [0, 2, 4, ...]
    # These columns are reserved for the sine values.

    PE[:, 1::2] = torch.cos(position * div_term)
    # PE[:, 1::2]
    # : → all rows (all positions)
    # 1::2 → start at index 1, take every 2nd column
    # i.e. the odd indices → [1, 3, 5, ...]
    # These columns are reserved for the cosine values.

    # PE, PE.shape >> torch.Size([3, 512])

    # tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
    #       0.0000e+00,  1.0000e+00],
    #     [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
    #       1.0366e-04,  1.0000e+00],
    #     [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
    #       2.0733e-04,  1.0000e+00]])


    # First row → position 0 → sin(0)=0 and cos(0)=1 everywhere
    # Second row → position 1 → sine/cos at different frequencies, gradually changing
    # Third row → position 2 → values keep changing according to the frequency scaling

    pe = PE.unsqueeze(0)            # add batch dimension → shape (1, seq_len, d_model)
    # in the nn.Module version this is saved as a non-trainable buffer:
    # self.register_buffer('pe', pe)

    return PE

# print(positional_encoding(3, 512))
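
# A small sanity check (an added sketch, not part of the original tests): the
# exp/log trick used for div_term should equal the direct formula
# 1 / 10000^(2i/d_model), and the functional version above should match the
# buffer built by the Positional_Encoding class.
if __name__ == "__main__":
    d_model, seq_len = 512, 3
    two_i = torch.arange(0, d_model, 2).float()                       # the 2i values: 0, 2, 4, ...
    direct = 1.0 / torch.pow(torch.tensor(10000.0), two_i / d_model)  # 1 / 10000^(2i/d_model)
    trick = torch.exp(two_i * (-math.log(10000.0) / d_model))         # exp(-2i * ln(10000) / d_model)
    print(torch.allclose(direct, trick))                              # True

    pe_fn = positional_encoding(seq_len, d_model)            # (3, 512) from the function
    pe_cls = Positional_Encoding(seq_len, d_model).pe[0]      # (3, 512) from the class buffer
    print(torch.allclose(pe_fn, pe_cls))                      # True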



class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        # Create matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        
        # Apply sine to even indices, cosine to odd indices
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (batch, seq_len, d_model)
        x = x + self.pe[:, :x.size(1)]
        # self.pe[:, :x.size(1)] → the first (batch) dimension is untouched (pe has only one batch dim)
        # x.size(1) → the actual sequence length of the input (say 50)
        # So self.pe[:, :x.size(1)] has shape (1, 50, 512),
        # i.e. pe is truncated to the input's seq_len (if max_len was 100, only the first 50 positions are used).

        return x
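

# A minimal end-to-end usage sketch (assumed setup, not from the original file):
# token ids -> nn.Embedding -> PositionalEncoding, as it would sit at the input
# of a Transformer encoder. The vocab size and sequence length are made up.
if __name__ == "__main__":
    vocab_size, d_model, batch, seq_len = 1000, 512, 2, 50
    embedding = nn.Embedding(vocab_size, d_model)
    pos_enc = PositionalEncoding(d_model, max_len=100)

    token_ids = torch.randint(0, vocab_size, (batch, seq_len))   # (2, 50)
    embedded = embedding(token_ids)                              # (2, 50, 512)
    encoded = pos_enc(embedded)                                  # (2, 50, 512), positions added
    print(embedded.shape, encoded.shape)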