Spaces:

duycse1603
/

math2tex

Runtime error

App Files Files Community

math2tex / HybridViT /module /component /seq_modeling /addon_module /patchembed.py

duycse1603

[Add] source

6163604 over 2 years ago

raw

history blame contribute delete

7.8 kB

	import math
	import torch.nn as nn
	import torch
	from torch.nn import functional as F
	from timm.models.layers.helpers import to_2tuple
	from typing import Tuple, Union, List

	class PatchEmbed(nn.Module):
	    """Image to patch embedding.

	    The input is padded on its right and bottom edges until both spatial
	    sides are multiples of the patch size, then a convolution whose kernel
	    and stride both equal the patch size turns every patch into one
	    ``embed_dim``-dimensional token.

	    Attributes set by the constructor:
	        img_size: nominal image size rounded up to a multiple of the patch size.
	        grid_size: number of patches along (height, width) for ``img_size``.
	        patch_size: patch size as returned by ``to_2tuple``.
	        num_patches: ``grid_size[0] * grid_size[1]``.
	        proj: the ``nn.Conv2d`` that embeds the patches.
	    """

	    def __init__(self, img_size=(224, 224), patch_size=16, in_chans=3, embed_dim=768):
	        super().__init__()
	        assert isinstance(img_size, tuple)
	        patch_size = to_2tuple(patch_size)
	        # Ceil-division: how many patches are needed to cover each side.
	        rows = -(-img_size[0] // patch_size[0])
	        cols = -(-img_size[1] // patch_size[1])
	        self.img_size = (patch_size[0] * rows, patch_size[1] * cols)
	        self.grid_size = (rows, cols)
	        self.patch_size = patch_size
	        self.num_patches = rows * cols
	        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

	    def forward(self, x):
	        """Pad ``x`` to a multiple of the patch size and embed its patches.

	        Args:
	            x: 4-D tensor; the last two dimensions are treated as height and width.

	        Returns:
	            A 3-tuple ``(tokens, size, size_changed)`` where ``tokens`` is the
	            projected tensor flattened over the patch grid and transposed to
	            (batch, patches, embed_dim), ``size`` is a dict holding the padded
	            ``'height'`` and ``'width'``, and ``size_changed`` is True when the
	            padded size differs from ``self.img_size``.
	        """
	        _, _, height, width = x.shape
	        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
	        # Distance from each side up to the next multiple of the patch size.
	        extra_h = -height % patch_h
	        extra_w = -width % patch_w
	        padded = F.pad(x, (0, extra_w, 0, extra_h))
	        padded_h, padded_w = padded.shape[2], padded.shape[3]
	        assert padded_h % patch_h == 0 and padded_w % patch_w == 0
	        tokens = self.proj(padded).flatten(2).transpose(1, 2)
	        size_changed = padded_h != self.img_size[0] or padded_w != self.img_size[1]
	        return tokens, {'height': padded_h, 'width': padded_w}, size_changed

	class HybridEmbed(nn.Module):
	    """CNN feature map embedding.

	    Extracts a feature map with ``backbone``, pads it on the right and bottom
	    so both spatial sides are multiples of ``patch_size``, and projects every
	    patch to ``embed_dim`` with a convolution whose kernel and stride equal
	    the patch size.

	    Args:
	        backbone: ``nn.Module`` producing a feature map, or a list/tuple of
	            feature maps of which the last one is used.
	        img_size: (height, width) of the dummy input used to probe the
	            backbone when ``feature_size`` is not given.
	        patch_size: int, or a (height, width) sequence, giving the patch size
	            on the feature map.
	        feature_size: spatial size of the backbone output. When None it is
	            measured by running the backbone once on a zero tensor; otherwise
	            the channel count is read from ``backbone.feature_info`` or
	            ``backbone.num_features``.
	        in_chans: channel count of the dummy probe input.
	        embed_dim: output channels of the projection.
	    """

	    def __init__(self, backbone, img_size: Tuple[int, int], patch_size: Union[List[int], int] = 1,
	                 feature_size=None, in_chans=3, embed_dim=768):
	        # NOTE: ``patch_size`` used to default to the typing object
	        # ``Union[List, int]`` (an annotation written as a default value), which
	        # crashed as soon as the argument was omitted. It now defaults to 1.
	        super().__init__()
	        assert isinstance(backbone, nn.Module)
	        if isinstance(patch_size, int):
	            patch_size = to_2tuple(patch_size)
	        else:
	            patch_size = tuple(patch_size)
	        self.img_size = img_size
	        self.patch_size = patch_size
	        self.backbone = backbone
	        if feature_size is None:
	            # NOTE Most reliable way of determining output dims is to run forward pass
	            training = backbone.training
	            if training:
	                backbone.eval()
	            try:
	                with torch.no_grad():
	                    o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
	            finally:
	                # Restore the caller's train/eval mode even if the probe fails.
	                backbone.train(training)
	            if isinstance(o, (list, tuple)):
	                o = o[-1]  # last feature if backbone outputs list/tuple of features
	            feature_size = o.shape[-2:]
	            feature_dim = o.shape[1]
	        else:
	            feature_size = to_2tuple(feature_size)
	            if hasattr(self.backbone, 'feature_info'):
	                feature_dim = self.backbone.feature_info.channels()[-1]
	            else:
	                feature_dim = self.backbone.num_features

	        assert feature_size[0] >= patch_size[0] and feature_size[1] >= patch_size[1]

	        # Nominal feature size rounded up to a whole number of patches.
	        self.feature_size = (
	            self._round_up(feature_size[0], patch_size[0]),
	            self._round_up(feature_size[1], patch_size[1]),
	        )
	        assert self.feature_size[0] % patch_size[0] == 0 and self.feature_size[1] % patch_size[1] == 0
	        self.grid_size = (self.feature_size[0] // patch_size[0], self.feature_size[1] // patch_size[1])
	        self.num_patches = self.grid_size[0] * self.grid_size[1]
	        self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=patch_size, stride=patch_size)

	    @staticmethod
	    def _round_up(size, multiple):
	        """Return the smallest multiple of ``multiple`` that is >= ``size``."""
	        div, mod = divmod(size, multiple)
	        return multiple * (div + (1 if mod > 0 else 0))

	    def forward(self, x):
	        """Embed the backbone features of ``x``.

	        Returns:
	            A 4-tuple ``(tokens, (pad_W, pad_H), size, size_changed)``:
	            ``tokens`` is the projection flattened over the patch grid and
	            transposed to (batch, patches, embed_dim); ``pad_W`` / ``pad_H``
	            are the columns / rows added on the right / bottom; ``size`` is a
	            dict with the padded feature ``'height'`` and ``'width'``; and
	            ``size_changed`` is True when the padded size differs from
	            ``self.feature_size``.
	        """
	        x = self.backbone(x)
	        # Unwrap before touching ``.shape``: a list/tuple has no shape attribute.
	        if isinstance(x, (list, tuple)):
	            x = x[-1]  # last feature if backbone outputs list/tuple of features
	        f_h, f_w = x.shape[2:]

	        pad_H = self._round_up(f_h, self.patch_size[0]) - f_h
	        pad_W = self._round_up(f_w, self.patch_size[1]) - f_w
	        x = F.pad(x, (0, pad_W, 0, pad_H))

	        assert x.shape[2] % self.patch_size[0] == 0 and x.shape[3] % self.patch_size[1] == 0

	        proj_x = self.proj(x).flatten(2).transpose(1, 2)
	        size_changed = x.shape[2] != self.feature_size[0] or x.shape[3] != self.feature_size[1]
	        return proj_x, (pad_W, pad_H), {'height': x.shape[2], 'width': x.shape[3]}, size_changed

	class HybridEmbed1D(nn.Module):
	    """CNN feature map embedding using 1D patching along the width.

	    From https://arxiv.org/pdf/2111.08314.pdf, which reports a benefit for
	    text recognition; check the paper for details. The backbone feature map
	    is padded on the right to a multiple of the window width, every feature
	    row is projected with the same ``nn.Conv1d`` (kernel and stride equal to
	    the window width), and the projected rows are averaged over the height.

	    Args:
	        backbone: ``nn.Module`` producing a feature map, or a list/tuple of
	            feature maps of which the last one is used.
	        img_size: (height, width) of the dummy input used to probe the
	            backbone when ``feature_size`` is not given.
	        feature_size: spatial size of the backbone output. When None it is
	            measured by running the backbone once on a zero tensor; otherwise
	            the channel count is read from ``backbone.feature_info`` or
	            ``backbone.num_features``.
	        patch_size: window width along the feature-map width axis.
	        in_chans: channel count of the dummy probe input.
	        embed_dim: output channels of the projection.
	    """

	    def __init__(self, backbone, img_size: Tuple[int, int], feature_size=None, patch_size=1, in_chans=3, embed_dim=768):
	        super().__init__()
	        assert isinstance(backbone, nn.Module)
	        self.img_size = img_size
	        self.backbone = backbone
	        self.embed_dim = embed_dim
	        if feature_size is None:
	            # NOTE Most reliable way of determining output dims is to run forward pass
	            training = backbone.training
	            if training:
	                backbone.eval()
	            try:
	                with torch.no_grad():
	                    o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))
	            finally:
	                # Restore the caller's train/eval mode even if the probe fails.
	                backbone.train(training)
	            if isinstance(o, (list, tuple)):
	                o = o[-1]  # last feature if backbone outputs list/tuple of features
	            feature_size = o.shape[-2:]
	            feature_dim = o.shape[1]
	        else:
	            feature_size = to_2tuple(feature_size)
	            if hasattr(self.backbone, 'feature_info'):
	                feature_dim = self.backbone.feature_info.channels()[-1]
	            else:
	                feature_dim = self.backbone.num_features

	        self.window_width = patch_size
	        assert feature_size[1] >= self.window_width
	        # Nominal feature width rounded up to a whole number of windows.
	        self.feature_size = (feature_size[0], self._round_up(feature_size[1], self.window_width))
	        assert self.feature_size[1] % self.window_width == 0
	        self.grid_size = (1, self.feature_size[1] // self.window_width)
	        self.num_patches = self.grid_size[1]
	        self.proj = nn.Conv1d(feature_dim, embed_dim, kernel_size=self.window_width, stride=self.window_width, bias=True)

	    @staticmethod
	    def _round_up(size, multiple):
	        """Return the smallest multiple of ``multiple`` that is >= ``size``."""
	        div, mod = divmod(size, multiple)
	        return multiple * (div + (1 if mod > 0 else 0))

	    def forward(self, x):
	        """Embed the backbone features of ``x`` as a 1D sequence over the width.

	        Returns:
	            A 4-tuple ``(tokens, (pad_W,), size, size_changed)``: ``tokens``
	            is the row-averaged projection transposed to
	            (batch, windows, embed_dim); ``pad_W`` is the number of columns
	            added on the right; ``size`` is a dict with the padded feature
	            ``'height'`` and ``'width'``; and ``size_changed`` is True when
	            the padded size differs from ``self.feature_size``.
	        """
	        batch_size = x.shape[0]
	        x = self.backbone(x)
	        # Unwrap before touching ``.shape``: a list/tuple has no shape attribute.
	        if isinstance(x, (list, tuple)):
	            x = x[-1]  # last feature if backbone outputs list/tuple of features
	        f_h, f_w = x.shape[2:]
	        assert f_w >= self.window_width

	        pad_W = self._round_up(f_w, self.window_width) - f_w
	        x = F.pad(x, (0, pad_W))
	        assert x.shape[3] % self.window_width == 0

	        proj_x = torch.zeros(batch_size, self.embed_dim, f_h, x.shape[3] // self.window_width, device=x.device, dtype=x.dtype)

	        # Project each feature row independently with the shared Conv1d.
	        for i in range(f_h):
	            proj_x[:, :, i, :] = self.proj(x[:, :, i, :])

	        # Average over the height axis, then move the window axis before channels.
	        proj_x = proj_x.mean(dim=2).transpose(1, 2)

	        size_changed = x.shape[2] != self.feature_size[0] or x.shape[3] != self.feature_size[1]
	        return proj_x, (pad_W, ), {'height': x.shape[2], 'width': x.shape[3]}, size_changed