# positional_encoding.py
import torch
from torch import nn
import math
class Positional_Encoding(nn.Module):
    def __init__(self, seq_len, d_model):
        super().__init__()
        PE = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len).unsqueeze(-1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
        # PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        # PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
        PE[:, 0::2] = torch.sin(position * div_term)
        PE[:, 1::2] = torch.cos(position * div_term)
        pe = PE.unsqueeze(0)  # add batch dimension -> (1, seq_len, d_model)
        # print("pe: ", pe)
        self.register_buffer("pe", pe)  # non-trainable; moves with the module's device

    def forward(self, x):
        # x shape: (batch, seq_len, d_model)
        x = x + self.pe[:, :x.size(1), :]
        return x
# --- test ---
# x = torch.tensor([[[1.2]*512, [1.3]*512, [1.4]*512]]) # shape: (1, 3, 512)
# print(x[0:2, 0:2])
# pe = Positional_Encoding(seq_len=3, d_model=512)
# print(pe)
# out = pe(x)
# print(x.shape, out.shape, x.shape)
# print(out[0, 0, :10]) # first 10 dims of first word
# tensor([[[1.2000, 1.2000, 1.2000, ..., 1.2000, 1.2000, 1.2000],
# [1.3000, 1.3000, 1.3000, ..., 1.3000, 1.3000, 1.3000]]])
# pe: tensor([[[ 0.0000e+00, 1.0000e+00, 0.0000e+00, ..., 1.0000e+00,
# 0.0000e+00, 1.0000e+00],
# [ 8.4147e-01, 5.4030e-01, 8.2186e-01, ..., 1.0000e+00,
# 1.0366e-04, 1.0000e+00],
# [ 9.0930e-01, -4.1615e-01, 9.3641e-01, ..., 1.0000e+00,
# 2.0733e-04, 1.0000e+00]]])
# Positional_Encoding()
# torch.Size([1, 3, 512]) torch.Size([1, 3, 512]) torch.Size([1, 3, 512])
# tensor([1.2000, 2.2000, 1.2000, 2.2000, 1.2000, 2.2000, 1.2000, 2.2000, 1.2000,
# 2.2000])
# From the above we can see that we have x (the token embeddings) and pe
# (the positional encodings), and the output is simply their sum:
# out = x + pe
# x   => tensor([[[1.2000, 1.2000, ...
# pe  => tensor([[[ 0.0000e+00, 1.0000e+00, ...
# out => tensor([1.2000, 2.2000, ...

# Assume we have 3 words -> embeddings z1, z2, z3, each of size d_model (say 512),
# so the input embedding matrix has shape (seq_len=3, d_model=512).
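# A minimal sketch of that setup (the names z and batch are illustrative, not from
# this repo); uncomment to run:
# z = torch.randn(3, 512)       # (seq_len=3, d_model=512): rows are z1, z2, z3
# batch = z.unsqueeze(0)        # (1, 3, 512): forward() expects a batch dimension
# out = Positional_Encoding(seq_len=3, d_model=512)(batch)
# print(out.shape)              # torch.Size([1, 3, 512])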
def positional_encoding(seq_len, d_model):
    PE = torch.zeros(seq_len, d_model)
    # We create a matrix to hold the positional encoding for each token position:
    # (seq_len=3, d_model=512) -> 3 positions, each with a vector of size 512.
    # tensor([[0., 0., 0., ..., 0., 0., 0.],
    #         [0., 0., 0., ..., 0., 0., 0.],
    #         [0., 0., 0., ..., 0., 0., 0.]])
    position = torch.arange(0, seq_len).unsqueeze(-1)
    # Before unsqueeze: tensor([0, 1, 2])
    # After unsqueeze:
    # tensor([[0],
    #         [1],
    #         [2]])
    # Shape -> (3, 1)
    # Each row is the position index of a token (z1, z2, z3).
    div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000) / d_model))
    # print(div_term.shape)  # torch.Size([256])
    # d_model = 512, and we alternate sin and cos, so we only need half
    # (512 / 2 = 256) frequencies; each frequency fills 2 dimensions (sin + cos).
    # The final positional encoding therefore has shape [seq_len, 512], with sine
    # values at every even index and cosine values at every odd index. Together
    # they give each position a unique pattern, which lets the model pick up word order.
    # torch.arange(0, d_model, 2) -> [0, 2, 4, ...], which plays the role of 2i.
    # (-log(10000) / d_model) -> the negated log divided by d_model, so that
    # exp(2i * (-log(10000) / d_model)) = 10000^(-2i/d_model) = 1 / 10000^(2i/d_model).
    # For i = 0 the exponent is 0 (frequency 1); larger 2i scales the frequency
    # down automatically.
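    # Sanity check of that identity (a sketch, not in the original file; uncomment to run):
    # expected = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))
    # assert torch.allclose(div_term, expected)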
    # Apply sine to even indices, cosine to odd indices.
    PE[:, 0::2] = torch.sin(position * div_term)
    # PE[:, 0::2]
    # :    -> all rows (all positions)
    # 0::2 -> start at column 0, take every 2nd column,
    #         i.e. the even indices [0, 2, 4, ...], reserved for the sine values.
    PE[:, 1::2] = torch.cos(position * div_term)
    # PE[:, 1::2]
    # :    -> all rows (all positions)
    # 1::2 -> start at column 1, take every 2nd column,
    #         i.e. the odd indices [1, 3, 5, ...], reserved for the cosine values.
    # PE.shape >> torch.Size([3, 512])
    # tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00, ...,  1.0000e+00,
    #           0.0000e+00,  1.0000e+00],
    #         [ 8.4147e-01,  5.4030e-01,  8.2186e-01, ...,  1.0000e+00,
    #           1.0366e-04,  1.0000e+00],
    #         [ 9.0930e-01, -4.1615e-01,  9.3641e-01, ...,  1.0000e+00,
    #           2.0733e-04,  1.0000e+00]])
    # First row  -> position 0 -> sin(0) = 0, cos(0) = 1 everywhere.
    # Second row -> position 1 -> sine/cosine at different frequencies, gradually changing.
    # Third row  -> position 2 -> values keep changing according to the frequency scaling.
    # pe = PE.unsqueeze(0)            # in the module version this adds the batch dimension
    # self.register_buffer('pe', pe)  # and saves it as a non-trainable buffer
    return PE
# print(positional_encoding(3, 512))
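# A small cross-check (a sketch, not in the original file): the functional and module
# versions should agree, and row 0 should show the alternating 0/1 pattern above.
# PE = positional_encoding(3, 512)
# print(torch.allclose(PE, Positional_Encoding(3, 512).pe[0]))              # True
# print(PE[0, 0::2].eq(0).all().item(), PE[0, 1::2].eq(1).all().item())     # True True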
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Create a matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # Apply sine to even indices, cosine to odd indices
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (batch, seq_len, d_model)
        x = x + self.pe[:, :x.size(1)]
        # self.pe[:, :x.size(1)] -> the single batch dim of pe stays unchanged;
        # x.size(1) is the input's actual sequence length (here, say, 50), so the
        # slice has shape (1, 50, 512): pe is truncated to the input's seq_len
        # (if pe's max_len was 100, we only take the first 50 rows) and is
        # broadcast over the batch.
        return x
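

# A short usage sketch mirroring the commented test above (the shapes here are
# illustrative, not from the original file); runs only when executed as a script:
if __name__ == "__main__":
    enc = PositionalEncoding(d_model=512, max_len=100)
    x = torch.zeros(2, 50, 512)                  # (batch=2, seq_len=50, d_model=512)
    out = enc(x)
    print(out.shape)                             # torch.Size([2, 50, 512])
    print(torch.allclose(out, enc.pe[:, :50]))   # True: zeros + pe == truncated pe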