import math
import torch.nn as nn
import torch
from torch.nn import functional as F
from timm.models.layers.helpers import to_2tuple
from typing import Tuple, Union, List
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
assert isinstance(img_size, tuple)
patch_size = to_2tuple(patch_size)
        # Round the requested image size up to the next multiple of the patch size.
        self.img_size = (patch_size[0] * math.ceil(img_size[0] / patch_size[0]),
                         patch_size[1] * math.ceil(img_size[1] / patch_size[1]))
self.grid_size = (self.img_size[0] // patch_size[0], self.img_size[1] // patch_size[1])
self.patch_size = patch_size
self.num_patches = self.grid_size[0] * self.grid_size[1]
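        # Non-overlapping projection: each patch_size x patch_size block becomes one embed_dim token.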
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
def forward(self, x):
_, _, H, W = x.shape
        # Pad on the bottom/right so height and width become multiples of the patch size.
        pad_H = self.patch_size[0] * math.ceil(H / self.patch_size[0]) - H
        pad_W = self.patch_size[1] * math.ceil(W / self.patch_size[1]) - W
        x = F.pad(x, (0, pad_W, 0, pad_H))
assert x.shape[2] % self.patch_size[0] == 0 and x.shape[3] % self.patch_size[1] == 0
        # (B, C, H', W') -> (B, embed_dim, H'/p, W'/p) -> (B, num_patches, embed_dim)
        proj_x = self.proj(x).flatten(2).transpose(1, 2)
        # Return the tokens, the padded spatial size, and whether that size differs from self.img_size.
        return proj_x, {'height': x.shape[2], 'width': x.shape[3]}, (x.shape[2] != self.img_size[0] or x.shape[3] != self.img_size[1])
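# A minimal usage sketch, not part of the original source; the _demo_patch_embed name,
# the 224x100 input, and the printed shapes are illustrative assumptions.
def _demo_patch_embed():
    embed = PatchEmbed(img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768)
    x = torch.zeros(2, 3, 224, 100)  # width 100 is not a multiple of 16
    tokens, size_info, was_resized = embed(x)
    # Width is padded from 100 to 112, giving a 14x7 grid = 98 tokens of dim 768.
    print(tokens.shape, size_info, was_resized)  # torch.Size([2, 98, 768]) {'height': 224, 'width': 112} True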
class HybridEmbed(nn.Module):
""" CNN Feature Map Embedding
Extract feature map from CNN, flatten, project to embedding dim.
"""
    def __init__(self, backbone, img_size: Tuple[int, int], patch_size: Union[List[int], int], feature_size=None, in_chans=3, embed_dim=768):
super().__init__()
assert isinstance(backbone, nn.Module)
if isinstance(patch_size, int):
patch_size = to_2tuple(patch_size)
else:
patch_size = tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.backbone = backbone
if feature_size is None:
with torch.no_grad():
# NOTE Most reliable way of determining output dims is to run forward pass
training = backbone.training
if training:
backbone.eval()
o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
if isinstance(o, (list, tuple)):
o = o[-1] # last feature if backbone outputs list/tuple of features
feature_size = o.shape[-2:]
feature_dim = o.shape[1]
backbone.train(training)
else:
feature_size = to_2tuple(feature_size)
if hasattr(self.backbone, 'feature_info'):
feature_dim = self.backbone.feature_info.channels()[-1]
else:
feature_dim = self.backbone.num_features
assert feature_size[0] >= patch_size[0] and feature_size[1] >= patch_size[1]
        self.feature_size = (patch_size[0] * math.ceil(feature_size[0] / patch_size[0]),
                             patch_size[1] * math.ceil(feature_size[1] / patch_size[1]))
assert self.feature_size[0] % patch_size[0] == 0 and self.feature_size[1] % patch_size[1] == 0
self.grid_size = (self.feature_size[0] // patch_size[0], self.feature_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
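        # Project the (possibly padded) CNN feature map with non-overlapping patch_size blocks.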
self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=patch_size, stride=patch_size)
    def forward(self, x):
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features
        f_h, f_w = x.shape[2:]
        # Pad the feature map on the bottom/right so it divides evenly into patches.
        pad_H = self.patch_size[0] * math.ceil(f_h / self.patch_size[0]) - f_h
        pad_W = self.patch_size[1] * math.ceil(f_w / self.patch_size[1]) - f_w
        x = F.pad(x, (0, pad_W, 0, pad_H))
        assert x.shape[2] % self.patch_size[0] == 0 and x.shape[3] % self.patch_size[1] == 0
        proj_x = self.proj(x).flatten(2).transpose(1, 2)
        return proj_x, (pad_W, pad_H), {'height': x.shape[2], 'width': x.shape[3]}, (x.shape[2] != self.feature_size[0] or x.shape[3] != self.feature_size[1])
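# A minimal usage sketch, not part of the original source; the toy two-conv backbone,
# the 32x128 input size, and the printed shapes are illustrative assumptions.
def _demo_hybrid_embed():
    backbone = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
    )
    embed = HybridEmbed(backbone, img_size=(32, 128), patch_size=2, in_chans=3, embed_dim=256)
    x = torch.zeros(2, 3, 32, 128)
    tokens, pads, size_info, was_resized = embed(x)
    # The backbone downsamples by 4 to an 8x32 feature map; a 2x2 patch grid gives
    # 4*16 = 64 tokens of dim 256, with no padding needed.
    print(tokens.shape, pads, size_info, was_resized)  # torch.Size([2, 64, 256]) (0, 0) {'height': 8, 'width': 32} False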
class HybridEmbed1D(nn.Module):
""" CNN Feature Map Embedding which using 1D embed patching
from https://arxiv.org/pdf/2111.08314.pdf, which benefits for text recognition task.Check paper for more detail
Extract feature map from CNN, flatten, project to embedding dim.
"""
    def __init__(self, backbone, img_size: Tuple[int, int], feature_size=None, patch_size=1, in_chans=3, embed_dim=768):
super().__init__()
assert isinstance(backbone, nn.Module)
self.img_size = img_size
self.backbone = backbone
self.embed_dim = embed_dim
if feature_size is None:
with torch.no_grad():
# NOTE Most reliable way of determining output dims is to run forward pass
training = backbone.training
if training:
backbone.eval()
o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
if isinstance(o, (list, tuple)):
o = o[-1] # last feature if backbone outputs list/tuple of features
feature_size = o.shape[-2:]
feature_dim = o.shape[1]
backbone.train(training)
else:
feature_size = to_2tuple(feature_size)
if hasattr(self.backbone, 'feature_info'):
feature_dim = self.backbone.feature_info.channels()[-1]
else:
feature_dim = self.backbone.num_features
self.window_width = patch_size
assert feature_size[1] >= self.window_width
        self.feature_size = (feature_size[0], self.window_width * math.ceil(feature_size[1] / self.window_width))
assert self.feature_size[1] % self.window_width == 0
self.grid_size = (1, self.feature_size[1] // self.window_width)
self.num_patches = self.grid_size[1]
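        # Shared 1D projection applied row by row in forward(); rows are averaged afterwards.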
self.proj = nn.Conv1d(feature_dim, embed_dim, kernel_size=self.window_width, stride=self.window_width, bias=True)
def forward(self, x):
batch_size = x.shape[0]
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features
        f_h, f_w = x.shape[2:]
        assert f_w >= self.window_width
        # Pad on the right so the width divides evenly into windows.
        pad_W = self.window_width * math.ceil(f_w / self.window_width) - f_w
        x = F.pad(x, (0, pad_W))
        assert x.shape[3] % self.window_width == 0
        # Project each feature-map row with the shared Conv1d, then average over rows.
        proj_x = torch.zeros(batch_size, self.embed_dim, f_h, x.shape[3] // self.window_width, device=x.device, dtype=x.dtype)
        for i in range(f_h):
            proj_x[:, :, i, :] = self.proj(x[:, :, i, :])
        proj_x = proj_x.mean(dim=2).transpose(1, 2)  # (B, C, H, W') -> (B, W', C)
return proj_x, (pad_W, ), {'height': x.shape[2], 'width': x.shape[3]}, (x.shape[2] != self.feature_size[0] or x.shape[3] != self.feature_size[1])
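# A minimal usage sketch, not part of the original source; the toy backbone and sizes
# are the same illustrative assumptions as in _demo_hybrid_embed above.
def _demo_hybrid_embed_1d():
    backbone = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
    )
    embed = HybridEmbed1D(backbone, img_size=(32, 128), patch_size=2, in_chans=3, embed_dim=256)
    x = torch.zeros(2, 3, 32, 128)
    tokens, pads, size_info, was_resized = embed(x)
    # The 8x32 feature map is split into 16 width-2 windows; each window is projected
    # with the shared Conv1d and the 8 rows are averaged, giving 16 tokens of dim 256.
    print(tokens.shape, pads, size_info, was_resized)  # torch.Size([2, 16, 256]) (0,) {'height': 8, 'width': 32} False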