import torch
import torch.nn as nn
from torch.nn import functional as F
from timm.models.layers.helpers import to_2tuple
from typing import Tuple, Union, List


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding """

    def __init__(self, img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(img_size, tuple)
        patch_size = to_2tuple(patch_size)
        # Round the image size up to the nearest multiple of the patch size.
        div_h, mod_h = divmod(img_size[0], patch_size[0])
        div_w, mod_w = divmod(img_size[1], patch_size[1])
        self.img_size = (patch_size[0] * (div_h + (1 if mod_h > 0 else 0)),
                         patch_size[1] * (div_w + (1 if mod_w > 0 else 0)))
        self.grid_size = (self.img_size[0] // patch_size[0], self.img_size[1] // patch_size[1])
        self.patch_size = patch_size
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        _, _, H, W = x.shape
        # Zero-pad on the bottom/right so both spatial dims are divisible by the patch size.
        div_h, mod_h = divmod(H, self.patch_size[0])
        div_w, mod_w = divmod(W, self.patch_size[1])
        pad_H = self.patch_size[0] * (div_h + (1 if mod_h > 0 else 0)) - H
        pad_W = self.patch_size[1] * (div_w + (1 if mod_w > 0 else 0)) - W
        x = F.pad(x, (0, pad_W, 0, pad_H))
        assert x.shape[2] % self.patch_size[0] == 0 and x.shape[3] % self.patch_size[1] == 0
        proj_x = self.proj(x).flatten(2).transpose(1, 2)  # B x num_patches x embed_dim
        return proj_x, {'height': x.shape[2], 'width': x.shape[3]}, \
            (x.shape[2] != self.img_size[0] or x.shape[3] != self.img_size[1])


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract a feature map from a CNN backbone, flatten it, and project it to the embedding dim.
    """

    def __init__(self, backbone, img_size: Tuple[int, int], patch_size: Union[List[int], int] = 1,
                 feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        if isinstance(patch_size, int):
            patch_size = to_2tuple(patch_size)
        else:
            patch_size = tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # NOTE Most reliable way of determining output dims is to run forward pass
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
                if isinstance(o, (list, tuple)):
                    o = o[-1]  # last feature if backbone outputs list/tuple of features
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            if hasattr(self.backbone, 'feature_info'):
                feature_dim = self.backbone.feature_info.channels()[-1]
            else:
                feature_dim = self.backbone.num_features
        assert feature_size[0] >= patch_size[0] and feature_size[1] >= patch_size[1]
        # Round the feature size up to the nearest multiple of the patch size.
        div_h, mod_h = divmod(feature_size[0], patch_size[0])
        div_w, mod_w = divmod(feature_size[1], patch_size[1])
        self.feature_size = (patch_size[0] * (div_h + (1 if mod_h > 0 else 0)),
                             patch_size[1] * (div_w + (1 if mod_w > 0 else 0)))
        assert self.feature_size[0] % patch_size[0] == 0 and self.feature_size[1] % patch_size[1] == 0
        self.grid_size = (self.feature_size[0] // patch_size[0], self.feature_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features
        f_h, f_w = x.shape[2:]
        # Zero-pad so the feature map is divisible by the patch size.
        div_h, mod_h = divmod(f_h, self.patch_size[0])
        div_w, mod_w = divmod(f_w, self.patch_size[1])
        pad_H = self.patch_size[0] * (div_h + (1 if mod_h > 0 else 0)) - f_h
        pad_W = self.patch_size[1] * (div_w + (1 if mod_w > 0 else 0)) - f_w
        x = F.pad(x, (0, pad_W, 0, pad_H))
        assert x.shape[2] % self.patch_size[0] == 0 and x.shape[3] % self.patch_size[1] == 0
        proj_x = self.proj(x).flatten(2).transpose(1, 2)  # B x num_patches x embed_dim
        return proj_x, (pad_W, pad_H), {'height': x.shape[2], 'width': x.shape[3]}, \
            (x.shape[2] != self.feature_size[0] or x.shape[3] != self.feature_size[1])
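# Illustrative usage sketch (not part of the original module): it shows how
# PatchEmbed pads inputs whose spatial size is not a multiple of the patch
# size, and how HybridEmbed probes a backbone's output shape with a dummy
# forward pass. The toy backbone and all concrete shapes below are assumptions
# made for the example only.
def _embed_usage_example():
    embed = PatchEmbed(img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768)
    x = torch.randn(2, 3, 220, 230)            # neither spatial dim divisible by 16
    tokens, size_info, resized = embed(x)      # pads 220 -> 224 and 230 -> 240 internally
    assert tokens.shape == (2, 14 * 15, 768)   # 14 x 15 patch grid after padding
    assert size_info == {'height': 224, 'width': 240} and resized

    backbone = nn.Sequential(                  # stand-in CNN that downsamples by 4
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1), nn.ReLU(),
        nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
    )
    hybrid = HybridEmbed(backbone, img_size=(32, 128), patch_size=1, embed_dim=768)
    tokens, pads, size_info, resized = hybrid(torch.randn(2, 3, 32, 128))
    assert tokens.shape == (2, 8 * 32, 768)    # 8 x 32 feature map -> 256 tokens
    assert pads == (0, 0) and not resized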
class HybridEmbed1D(nn.Module):
    """ CNN Feature Map Embedding with 1D patching
    Uses the 1D patch embedding from https://arxiv.org/pdf/2111.08314.pdf,
    which benefits text recognition tasks; see the paper for details.
    Extract a feature map from a CNN backbone, flatten it, and project it to the embedding dim.
    """

    def __init__(self, backbone, img_size: Tuple[int, int], feature_size=None,
                 patch_size=1, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        self.img_size = img_size
        self.backbone = backbone
        self.embed_dim = embed_dim
        if feature_size is None:
            with torch.no_grad():
                # NOTE Most reliable way of determining output dims is to run forward pass
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
                if isinstance(o, (list, tuple)):
                    o = o[-1]  # last feature if backbone outputs list/tuple of features
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            if hasattr(self.backbone, 'feature_info'):
                feature_dim = self.backbone.feature_info.channels()[-1]
            else:
                feature_dim = self.backbone.num_features
        self.window_width = patch_size
        assert feature_size[1] >= self.window_width
        # Round the feature width up to the nearest multiple of the window width.
        div_w, mod_w = divmod(feature_size[1], self.window_width)
        self.feature_size = (feature_size[0], self.window_width * (div_w + (1 if mod_w > 0 else 0)))
        assert self.feature_size[1] % self.window_width == 0
        self.grid_size = (1, self.feature_size[1] // self.window_width)
        self.num_patches = self.grid_size[1]
        self.proj = nn.Conv1d(feature_dim, embed_dim, kernel_size=self.window_width,
                              stride=self.window_width, bias=True)

    def forward(self, x):
        batch_size = x.shape[0]
        x = self.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features
        f_h, f_w = x.shape[2:]
        assert f_w >= self.window_width
        # Zero-pad on the right so the feature width is divisible by the window width.
        div_w, mod_w = divmod(f_w, self.window_width)
        pad_W = self.window_width * (div_w + (1 if mod_w > 0 else 0)) - f_w
        x = F.pad(x, (0, pad_W))
        assert x.shape[3] % self.window_width == 0
        # Project each feature row with the shared 1D conv, then average over rows.
        proj_x = torch.zeros(batch_size, self.embed_dim, f_h, x.shape[3] // self.window_width,
                             device=x.device, dtype=x.dtype)
        for i in range(f_h):
            proj_x[:, :, i, :] = self.proj(x[:, :, i, :])
        proj_x = proj_x.mean(dim=2).transpose(1, 2)  # B x C x H x W' -> B x W' x C
        return proj_x, (pad_W,), {'height': x.shape[2], 'width': x.shape[3]}, \
            (x.shape[2] != self.feature_size[0] or x.shape[3] != self.feature_size[1])
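# Minimal smoke test for the 1D variant (illustrative sketch; the toy backbone
# below is an assumption made for the example, not a backbone shipped with
# this module). Each of the 8 feature rows is projected by the shared Conv1d
# and the row results are averaged into a single 1D token sequence.
if __name__ == "__main__":
    backbone = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1), nn.ReLU(),
        nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
    )
    embed_1d = HybridEmbed1D(backbone, img_size=(32, 128), patch_size=2, embed_dim=768)
    tokens, pads, size_info, resized = embed_1d(torch.randn(2, 3, 32, 128))
    # Feature map is (2, 128, 8, 32); window width 2 gives 16 tokens per image.
    print(tokens.shape)  # torch.Size([2, 16, 768])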