# NOTE(review): removed a non-Python artifact header ("maxhuber's picture /
# Upload 14 files / 3336231 verified") left over from a file-hosting page
# scrape; it was not valid Python and caused a SyntaxError at import time.
"""
Patch-to-Patch Vision Transformer Models
----------------------------------------
This module implements two variants of Vision Transformer (ViT) architectures
for dense regression tasks, designed around a patch-to-patch learning paradigm.
Both models decompose images into non-overlapping patches, process them through
a transformer encoder, and then reconstruct the output image.
Components:
- Patchify:
Splits an input image into flattened non-overlapping patches.
- Unpatchify:
Reconstructs an image tensor from a sequence of flattened patches.
- ViT_Patch2Patch (Version 1):
• Patchify + linear projection into embedding space.
• Sinusoidal positional encoding.
• Transformer encoder with configurable depth/heads.
• Linear decoder back to patch space, then Unpatchify.
- ViT_Patch2Patch_ver2 (Version 2):
• Patch embedding via Conv2d (stride = patch size).
• Learned positional embeddings with dropout.
• Transformer encoder with configurable depth/heads.
• CNN-based decoder with PixelShuffle layers for super-resolution-style
upsampling back to the original image resolution.
Utilities:
- test_model:
Simple wrapper to run a forward pass and log output shapes.
- main:
Runs lightweight tests of both model variants on a dummy input tensor.
Usage Example:
>>> import torch
>>> from vit_patch2patch import ViT_Patch2Patch, ViT_Patch2Patch_ver2
>>> model = ViT_Patch2Patch(img_size=512, patch_size=8, in_ch=3, out_ch=3)
>>> dummy = torch.randn(1, 3, 512, 512)
>>> out = model(dummy)
>>> print(out.shape) # torch.Size([1, 3, 512, 512])
Notes:
- Logging is used to track initialization parameters and parameter counts.
- Default settings assume square images (H = W = img_size).
- PixelShuffle decoder in Version 2 assumes patch_size divisible by upscaling factors.
"""
import logging
import math

import torch
import torch.nn as nn
from torch import Tensor

# import timm
##############################################################################################################
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (Vaswani et al., 2017) added to token embeddings."""

    def __init__(self, emb_size: int, max_len: int = 1000):
        """
        Sinusoidal Positional Encoding Module.
        Args:
            emb_size (int): The size of the embedding dimension (assumed even,
                so sin/cos halves have equal width).
            max_len (int): The maximum length of the sequence.
        """
        super(PositionalEncoding, self).__init__()
        self.logger = logging.getLogger(self.__class__.__name__)
        # BUGFIX: the message hard-coded "max_len=22,472"; log the actual value.
        self.logger.info(
            f"Initializing PositionalEncoding with emb_size={emb_size}, max_len={max_len}"
        )
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # BUGFIX: torch.log() requires a Tensor argument and raises TypeError on a
        # Python float; use math.log for the scalar constant.
        div_term = torch.exp(
            torch.arange(0, emb_size, 2).float() * (-math.log(10000.0) / emb_size)
        )
        pe = torch.zeros(max_len, emb_size)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Buffer (not Parameter): moves with module.to(...)/cuda() and is saved in
        # state_dict, but is never trained.
        self.register_buffer("positional_encoding", pe.unsqueeze(0))

    def forward(self, x: Tensor) -> Tensor:
        """
        Add positional encoding to the input tensor.
        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_len, emb_size).
                seq_len must be <= max_len (the slice below silently truncates
                the table, it does not extend it).
        Returns:
            Tensor: Positional encoded tensor of the same shape as input.
        """
        seq_len = x.size(1)
        self.logger.debug(f"Adding positional encoding to tensor of shape {x.shape}")
        # BUGFIX: do not reassign the registered buffer on every forward pass;
        # buffers already follow module.to(device), so a local .to() on the slice
        # is enough and leaves the module state untouched.
        return x + self.positional_encoding[:, :seq_len, :].to(x.device)
##############################################################################################################
class Patchify(nn.Module):
    """Split an image batch into flattened non-overlapping square patches.

    Input:  (B, C, H, W)
    Output: (B, N, C * p * p) with N = (H // p) * (W // p), patches ordered
    row-major over the grid; each row is a (C, p, p) patch flattened.
    """

    def __init__(self, patch_size: int):
        """
        Args:
            patch_size (int): Side length p of the square patches.
        """
        super().__init__()
        self.patch_size = patch_size
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info(
            f"Initializing {self.__class__.__name__} | patch_size = {self.patch_size}"
        )

    def forward(self, x):  # (B, C, H, W)
        """Patchify a batch of images.

        Raises:
            ValueError: if H or W is not divisible by patch_size.
        """
        B, C, H, W = x.shape
        p = self.patch_size
        # BUGFIX: a bare assert is stripped under `python -O` and carries no
        # message; raise an explicit, descriptive error instead.
        if H % p != 0 or W % p != 0:
            raise ValueError(
                f"Image size ({H}x{W}) must be divisible by patch_size={p}"
            )
        x = x.unfold(2, p, p).unfold(3, p, p)  # (B, C, H//p, W//p, p, p)
        x = x.permute(0, 2, 3, 1, 4, 5).flatten(1, 3)  # (B, N, C, p, p)
        return x.reshape(B, -1, C * p * p)  # (B, N, patch_dim)
class Unpatchify(nn.Module):
    """Inverse of Patchify: rebuild an image tensor from flattened patch rows.

    Input:  (B, N, C * p * p), patches in row-major grid order.
    Output: (B, C, H, W) with (H, W) taken from `image_size`.
    """

    def __init__(self, patch_size, out_channels, image_size):
        super().__init__()
        self.patch_size = patch_size
        self.out_channels = out_channels
        self.image_size = image_size
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info(
            f"Initializing {self.__class__.__name__} | patch_size = {self.patch_size} | out_channels={out_channels} | image_size={image_size}"
        )

    def forward(self, x):  # (B, N, patch_dim)
        batch = x.shape[0]
        p, ch = self.patch_size, self.out_channels
        height, width = self.image_size
        grid_h, grid_w = height // p, width // p
        # One patch per grid cell, channels-first inside the cell:
        # (B, grid_h, grid_w, C, p, p)
        patches = x.reshape(batch, grid_h, grid_w, ch, p, p)
        # Interleave grid axes with intra-patch axes so rows/cols line up,
        # then collapse into the full image: (B, C, grid_h, p, grid_w, p) -> (B, C, H, W)
        image = patches.permute(0, 3, 1, 4, 2, 5).reshape(batch, ch, height, width)
        return image
class ViT_Patch2Patch(nn.Module):
    """Patch-to-patch ViT, version 1.

    Pipeline: Patchify -> linear projection -> sinusoidal positional encoding
    -> TransformerEncoder -> linear decode to patch space -> Unpatchify.
    Maps (B, in_ch, img_size, img_size) to (B, out_ch, img_size, img_size).
    """

    def __init__(
        self,
        img_size=512,
        patch_size=8,
        in_ch=3,
        out_ch=3,
        embed_dim=512,
        depth=6,
        heads=8,
    ):
        """
        Args:
            img_size (int): Input height/width (square images assumed).
            patch_size (int): Patch side length; must divide img_size.
            in_ch (int): Input channels.
            out_ch (int): Output channels.
            embed_dim (int): Transformer embedding dim (must be divisible by heads).
            depth (int): Number of encoder layers.
            heads (int): Number of attention heads.
        """
        super().__init__()
        self.patch_size = patch_size
        self.img_size = img_size
        self.num_patches = (img_size // patch_size) ** 2
        self.patch_dim = in_ch * patch_size * patch_size
        self.output_dim = out_ch * patch_size * patch_size
        self.logger = logging.getLogger(self.__class__.__name__)
        # BUGFIX: the second half of this message was a plain (non-f) string, so
        # "{in_ch}" etc. were logged literally; also fixed the missing separator
        # space and the "Initalizing" typo.
        self.logger.info(
            f"Initializing {self.__class__.__name__} | img_size={img_size} | patch_size={patch_size}"
            f" | in_ch={in_ch} | out_ch={out_ch} | embed_dim={embed_dim} | depth={depth} | heads={heads}"
        )
        # Modules
        self.patchify = Patchify(patch_size)
        self.proj = nn.Linear(self.patch_dim, embed_dim)
        self.pos_encoding = PositionalEncoding(
            emb_size=embed_dim, max_len=self.num_patches
        )
        encoder_layer = nn.TransformerEncoderLayer(
            embed_dim, heads, dim_feedforward=embed_dim * 4, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.decoder = nn.Linear(embed_dim, self.output_dim)
        self.unpatchify = Unpatchify(patch_size, out_ch, (img_size, img_size))
        self._log_parameter_count()

    def _log_parameter_count(self):
        """
        Logs total and trainable parameters in the model, summarized by top-level modules.
        """
        self.logger.info(
            f"{self.__class__.__name__} Parameter Summary (Top-Level Modules):"
        )
        self.logger.info("-" * 80)
        total_params = 0
        trainable_params = 0
        for name, module in self.named_children():  # Only top-level children
            mod_total = sum(p.numel() for p in module.parameters())
            mod_trainable = sum(
                p.numel() for p in module.parameters() if p.requires_grad
            )
            total_params += mod_total
            trainable_params += mod_trainable
            # CONSISTENCY: thousands separators on both counts (Total previously
            # had width-only formatting while Trainable used grouping).
            self.logger.info(
                f"{name:<25} | Total: {mod_total:<20,} | Trainable: {mod_trainable:,}"
            )
        self.logger.info("-" * 80)
        self.logger.info(f"Total Parameters: {total_params:,}")
        self.logger.info(f"Trainable Parameters: {trainable_params:,}")

    def forward(self, x):
        """Map (B, in_ch, H, W) -> (B, out_ch, H, W)."""
        x = self.patchify(x)      # (B, N, patch_dim)
        x = self.proj(x)          # (B, N, embed_dim)
        x = self.pos_encoding(x)  # (B, N, embed_dim)
        x = self.encoder(x)       # (B, N, embed_dim)
        x = self.decoder(x)       # (B, N, output_dim)
        x = self.unpatchify(x)    # (B, out_ch, H, W)
        return x
class ViT_Patch2Patch_ver2(nn.Module):
    """Patch-to-patch ViT, version 2.

    Some changes from version 1:
    - learned patch embedding via a Conv2d with kernel_size = stride = patch_size
    - learned positional embedding (no longer sinusoidal), with dropout
    - decoder: the simple linear projection back to patch space is replaced by a
      CNN using the PixelShuffle super-resolution technique
      https://docs.pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html
    """

    def __init__(
        self,
        img_size=512,
        patch_size=8,
        in_ch=3,
        out_ch=3,
        embed_dim=512,
        depth=6,
        heads=8,
        dropout=0.0,
    ):
        """
        Args:
            img_size (int): Input height/width (square images assumed).
            patch_size (int): Patch side length; must divide img_size.
            in_ch (int): Input channels.
            out_ch (int): Output channels.
            embed_dim (int): Transformer embedding dim (must be divisible by heads).
            depth (int): Number of encoder layers.
            heads (int): Number of attention heads.
            dropout (float): Dropout used for positional embedding and encoder.
        """
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.embed_dim = embed_dim
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info(
            f"Initialized {self.__class__.__name__} with img_size={img_size}, patch_size={patch_size}"
        )
        # Patch embedding via conv
        self.patch_embed = nn.Conv2d(
            in_ch, embed_dim, kernel_size=patch_size, stride=patch_size
        )
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim))
        self.pos_dropout = nn.Dropout(p=dropout)
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        # BUGFIX: the original decoder always applied two PixelShuffle(4) stages,
        # i.e. a fixed 16x upsampling, so the output only matched the input
        # resolution when patch_size == 16 — yet the default (and main()'s call)
        # uses patch_size=8, silently producing a 2x-too-large image. The two
        # shuffle factors are now derived from patch_size so r1 * r2 == patch_size;
        # for patch_size == 16 this reproduces the original architecture exactly.
        if patch_size % 4 == 0:
            r1, r2 = 4, patch_size // 4
        elif patch_size % 2 == 0:
            r1, r2 = 2, patch_size // 2
        else:
            r1, r2 = patch_size, 1
        # Decoder: maps the (H//p, W//p) token grid back to full resolution.
        self.decoder = nn.Sequential(
            nn.Conv2d(embed_dim, 512, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(512, 64 * r1 * r1, 3, padding=1),
            nn.ReLU(),
            nn.PixelShuffle(r1),  # channels 64*r1^2 -> 64, spatial x r1
            nn.Conv2d(64, 64, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 4 * r2 * r2, 3, padding=1),
            nn.ReLU(),
            nn.PixelShuffle(r2),  # channels 4*r2^2 -> 4, spatial x r2
            nn.Conv2d(4, out_ch, 3, padding=1),
        )
        self._log_parameter_count()

    def _log_parameter_count(self):
        """
        Logs total and trainable parameters in the model, summarized by top-level modules.
        """
        self.logger.info(
            f"{self.__class__.__name__} Parameter Summary (Top-Level Modules):"
        )
        self.logger.info("-" * 80)
        total_params = 0
        trainable_params = 0
        for name, module in self.named_children():  # Only top-level children
            mod_total = sum(p.numel() for p in module.parameters())
            mod_trainable = sum(
                p.numel() for p in module.parameters() if p.requires_grad
            )
            total_params += mod_total
            trainable_params += mod_trainable
            # CONSISTENCY: thousands separators on both counts (Total previously
            # had width-only formatting while Trainable used grouping).
            self.logger.info(
                f"{name:<25} | Total: {mod_total:<20,} | Trainable: {mod_trainable:,}"
            )
        self.logger.info("-" * 80)
        self.logger.info(f"Total Parameters: {total_params:,}")
        self.logger.info(f"Trainable Parameters: {trainable_params:,}")

    def forward(self, x):
        """Map (B, in_ch, H, W) -> (B, out_ch, H, W)."""
        B = x.size(0)
        # Patch embedding
        x = self.patch_embed(x)  # (B, embed_dim, H//p, W//p)
        H_p, W_p = x.shape[2], x.shape[3]
        x = x.flatten(2).transpose(1, 2)  # (B, N, embed_dim)
        # Add learned positional embedding (sliced in case N < num_patches)
        x = x + self.pos_embed[:, : x.size(1), :]
        x = self.pos_dropout(x)
        # Transformer
        x = self.encoder(x)  # (B, N, embed_dim)
        # Reshape tokens back to the 2D grid
        x = x.transpose(1, 2).reshape(
            B, self.embed_dim, H_p, W_p
        )  # (B, embed_dim, H//p, W//p)
        # Decode to full-res output
        out = self.decoder(x)  # (B, out_ch, H, W)
        return out
# ============================================================================================================
# TESTING
# ============================================================================================================
def test_model(model, name, input_tensor):
try:
print(f"Testing {name}...")
out = model(input_tensor)
print(f"{name} output shape: {out.shape}\n")
except Exception as e:
print(f"{name} failed with error: {e}\n")
def main():
    """Smoke-test both patch-to-patch ViT variants on a dummy 512x512 input."""
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    print("Starting model tests...\n")
    # Set image parameters
    B, C, H, W = 1, 3, 512, 512
    dummy_input = torch.randn(B, C, H, W)
    # 1. ViT Patch2Patch (version 1)
    model1 = ViT_Patch2Patch(img_size=512, patch_size=8, in_ch=3, out_ch=3)
    test_model(model1, "ViT_Patch2Patch (version 1)", dummy_input)
    print("==" * 50)
    # 2. ViT Patch2Patch (version 2)
    # BUGFIX: this previously rebound `model1`; use a distinct name so both
    # models remain referable and the intent is clear.
    model2 = ViT_Patch2Patch_ver2(img_size=512, patch_size=8, in_ch=3, out_ch=3)
    test_model(model2, "ViT_Patch2Patch (version 2)", dummy_input)
    print("==" * 50)


if __name__ == "__main__":
    main()