from typing import cast, List, Optional, Tuple, Union

import torch
import torch.nn as nn

import kornia
from kornia.augmentation.base import _AugmentationBase, MixAugmentationBase, TensorWithTransformMat
from kornia.augmentation.container.base import SequentialBase
from kornia.augmentation.container.utils import InputApplyInverse, MaskApplyInverse

from .image import ImageSequential, ParamItem

__all__ = ["VideoSequential"]

class VideoSequential(ImageSequential):
    r"""VideoSequential for processing 5-dim video data like (B, T, C, H, W) and (B, C, T, H, W).

    `VideoSequential` is used to replace `nn.Sequential` for processing video data augmentations.
    By default, `VideoSequential` enables `same_on_frame` to make sure the same augmentations happen
    across the temporal dimension. It does not affect other augmentation behaviours such as the
    settings of `same_on_batch`, etc.

    Args:
        *args: a list of augmentation modules.
        data_format: only BCTHW and BTCHW are supported.
        same_on_frame: apply the same transformation across the channel per frame.
        random_apply: randomly select a sublist (order agnostic) of args to
            apply transformation.
            If int, a fixed number of transformations will be selected.
            If (a,), x number of transformations (a <= x <= len(args)) will be selected.
            If (a, b), x number of transformations (a <= x <= b) will be selected.
            If None, the whole list of args will be processed as a sequence.

    Note:
        The returned transformation matrix only considers the transformations applied by the
        ``kornia.augmentation`` module. Transformations from ``kornia.geometry`` will not be
        taken into account.

    Example:
        If `same_on_frame` is set to True, the same augmentation is expected to be applied to
        each timeframe.

        >>> input, label = torch.randn(2, 3, 1, 5, 6).repeat(1, 1, 4, 1, 1), torch.tensor([0, 1])
        >>> aug_list = VideoSequential(
        ...     kornia.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=1.0),
        ...     kornia.color.BgrToRgb(),
        ...     kornia.augmentation.RandomAffine(360, p=1.0),
        ...     random_apply=10,
        ...     data_format="BCTHW",
        ...     same_on_frame=True)
        >>> output = aug_list(input)
        >>> (output[0, :, 0] == output[0, :, 1]).all()
        tensor(True)
        >>> (output[0, :, 1] == output[0, :, 2]).all()
        tensor(True)
        >>> (output[0, :, 2] == output[0, :, 3]).all()
        tensor(True)

        If `same_on_frame` is set to False:

        >>> aug_list = VideoSequential(
        ...     kornia.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=1.0),
        ...     kornia.augmentation.RandomAffine(360, p=1.0),
        ...     kornia.augmentation.RandomMixUp(p=1.0),
        ...     data_format="BCTHW",
        ...     same_on_frame=False)
        >>> output, lab = aug_list(input)
        >>> output.shape, lab.shape
        (torch.Size([2, 3, 4, 5, 6]), torch.Size([2, 4, 3]))
        >>> (output[0, :, 0] == output[0, :, 1]).all()
        tensor(False)

        Reproduce with the recorded params:

        >>> out2, lab2 = aug_list(input, label, params=aug_list._params)
        >>> torch.equal(output, out2)
        True
    """

    def __init__(
        self,
        *args: nn.Module,
        data_format: str = "BTCHW",
        same_on_frame: bool = True,
        random_apply: Union[int, bool, Tuple[int, int]] = False,
    ) -> None:
        super().__init__(*args, same_on_batch=None, return_transform=None, keepdim=None, random_apply=random_apply)
        self.same_on_frame = same_on_frame
        self.data_format = data_format.upper()
        if self.data_format not in ["BCTHW", "BTCHW"]:
            raise AssertionError(f"Only `BCTHW` and `BTCHW` are supported. Got `{data_format}`.")
        self._temporal_channel: int
        if self.data_format == "BCTHW":
            self._temporal_channel = 2
        elif self.data_format == "BTCHW":
            self._temporal_channel = 1

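    # Illustrative sketch (assumed, minimal; inspects the private attribute
    # ``_temporal_channel``): the index of the temporal axis follows ``data_format``.
    #
    #   >>> VideoSequential(kornia.augmentation.RandomHorizontalFlip(p=1.0))._temporal_channel
    #   1
    #   >>> VideoSequential(
    #   ...     kornia.augmentation.RandomHorizontalFlip(p=1.0), data_format="BCTHW")._temporal_channel
    #   2
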
    def __infer_channel_exclusive_batch_shape__(self, batch_shape: torch.Size, channel_index: int) -> torch.Size:
        # Drop the temporal dimension from the batch shape, e.g. (B, C, T, H, W) -> (B, C, H, W).
        return cast(torch.Size, batch_shape[:channel_index] + batch_shape[channel_index + 1:])

    def __repeat_param_across_channels__(self, param: torch.Tensor, frame_num: int) -> torch.Tensor:
        """Repeat parameters across the temporal dimension.

        The input is shaped as (B, ...), while the output is shaped as (B * frame_num, ...),
        which guarantees that the same transformation is applied to each frame.

        (B1, B2, ..., Bn) => (B1, ..., B1, B2, ..., B2, ..., Bn, ..., Bn)
                             | frame_num | | frame_num | ..., | frame_num |
        """
        repeated = param[:, None, ...].repeat(1, frame_num, *([1] * len(param.shape[1:])))
        return repeated.reshape(-1, *list(param.shape[1:]))

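    # Illustrative sketch (not part of the class API): for a hypothetical
    # per-sample parameter of shape (2,) and ``frame_num=3``, the repeat above
    # expands each sample's value over its frames:
    #
    #   >>> param = torch.tensor([0., 1.])
    #   >>> param[:, None].repeat(1, 3).reshape(-1)
    #   tensor([0., 0., 0., 1., 1., 1.])
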
    def _input_shape_convert_in(
        self, input: torch.Tensor, label: Optional[torch.Tensor], frame_num: int
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if self.data_format == "BCTHW":
            # Convert (B, C, T, H, W) to (B, T, C, H, W).
            input = input.transpose(1, 2)
        if self.data_format == "BTCHW":
            pass

        if label is not None:
            if label.shape == input.shape[:2]:
                # One label per frame: flatten (B, T) to (B * T,).
                label = label.view(-1)
            elif label.shape == input.shape[:1]:
                # One label per video: repeat it for every frame.
                label = label[..., None].repeat(1, frame_num).view(-1)
            elif label.shape == torch.Size([input.shape[0] * input.shape[1]]):
                # Already flattened to (B * T,).
                pass
            else:
                raise NotImplementedError(f"Invalid label shape of {label.shape}.")
        # Flatten batch and temporal dimensions: (B, T, C, H, W) -> (B * T, C, H, W).
        input = input.reshape(-1, *input.shape[2:])
        return input, label

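    # Illustrative sketch (assumed shapes): a (2, 3, 4, 5, 6) BCTHW batch is
    # transposed to BTCHW and flattened to (8, 3, 5, 6), so every frame is
    # treated as an independent image by the wrapped 2D augmentations:
    #
    #   >>> video = torch.randn(2, 3, 4, 5, 6)  # (B, C, T, H, W)
    #   >>> video.transpose(1, 2).reshape(-1, 3, 5, 6).shape
    #   torch.Size([8, 3, 5, 6])
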
    def _input_shape_convert_back(
        self, input: torch.Tensor, label: Optional[torch.Tensor], frame_num: int
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Restore the temporal dimension: (B * T, C, H, W) -> (B, T, C, H, W).
        input = input.view(-1, frame_num, *input.shape[1:])
        if self.data_format == "BCTHW":
            input = input.transpose(1, 2)
        if self.data_format == "BTCHW":
            pass

        if label is not None:
            label = label.view(input.size(0), frame_num, -1)
        return input, label

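    # Illustrative round trip (assumed shapes): the flattened frames fold back
    # into the original BCTHW layout:
    #
    #   >>> flat = torch.randn(8, 3, 5, 6)  # (B * T, C, H, W) with T = 4
    #   >>> flat.view(-1, 4, 3, 5, 6).transpose(1, 2).shape
    #   torch.Size([2, 3, 4, 5, 6])
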
    def forward_parameters(self, batch_shape: torch.Size) -> List[ParamItem]:
        frame_num = batch_shape[self._temporal_channel]
        named_modules = self.get_forward_sequence()
        # Generate parameters against the frame-exclusive shape (B, C, H, W).
        batch_shape = self.__infer_channel_exclusive_batch_shape__(batch_shape, self._temporal_channel)

        if not self.same_on_frame:
            # Expand the batch size to B * T so that each frame draws its own parameters.
            batch_shape = torch.Size([batch_shape[0] * frame_num, *batch_shape[1:]])

        params = []
        for name, module in named_modules:
            if isinstance(module, (SequentialBase,)):
                seq_param = module.forward_parameters(batch_shape)
                if self.same_on_frame:
                    raise ValueError("Sequential is currently unsupported for ``same_on_frame``.")
                param = ParamItem(name, seq_param)
            elif isinstance(module, (_AugmentationBase, MixAugmentationBase)):
                mod_param = module.forward_parameters(batch_shape)
                if self.same_on_frame:
                    for k, v in mod_param.items():
                        # The `order` param of ColorJitter is frame-invariant and must not be repeated.
                        if not (k == "order" and isinstance(module, kornia.augmentation.ColorJitter)):
                            mod_param.update({k: self.__repeat_param_across_channels__(v, frame_num)})
                param = ParamItem(name, mod_param)
            else:
                param = ParamItem(name, None)
            params.append(param)
        return params

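    # Illustrative sketch (``stream_a``/``stream_b`` are hypothetical 5-dim
    # tensors): pre-generated parameters can be reused to apply identical
    # augmentations to two synchronized video streams:
    #
    #   >>> aug = VideoSequential(kornia.augmentation.RandomAffine(360, p=1.0))
    #   >>> params = aug.forward_parameters(torch.Size([2, 4, 3, 5, 6]))  # BTCHW
    #   >>> out_a = aug(stream_a, params=params)
    #   >>> out_b = aug(stream_b, params=params)
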
    def inverse(self, input: torch.Tensor, params: Optional[List[ParamItem]] = None) -> torch.Tensor:
        """Inverse transformation.

        Used to invert a tensor according to the transformation performed by a forward pass,
        or with respect to provided parameters.
        """
        if self.apply_inverse_func in (InputApplyInverse, MaskApplyInverse):
            frame_num: int = input.size(self._temporal_channel)
            input, _ = self._input_shape_convert_in(input, None, frame_num)
        else:
            batch_size: int = input.size(0)
            input = input.view(-1, *input.shape[2:])

        input = super().inverse(input, params)
        if self.apply_inverse_func in (InputApplyInverse, MaskApplyInverse):
            input, _ = self._input_shape_convert_back(input, None, frame_num)
        else:
            input = input.view(batch_size, -1, *input.shape[1:])

        return input

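    # Illustrative sketch (assumed usage; ``video`` is a hypothetical 5-dim
    # tensor): invert the geometric augmentations recorded by the last forward
    # pass via the ``_params`` attribute shown in the class docstring:
    #
    #   >>> aug = VideoSequential(kornia.augmentation.RandomAffine(360, p=1.0))
    #   >>> out = aug(video)
    #   >>> restored = aug.inverse(out, params=aug._params)
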
    def forward(
        self, input: torch.Tensor, label: Optional[torch.Tensor] = None, params: Optional[List[ParamItem]] = None
    ) -> Union[TensorWithTransformMat, Tuple[TensorWithTransformMat, torch.Tensor]]:
        """Define the video computation performed."""
        if len(input.shape) != 5:
            raise AssertionError(f"Input must be a 5-dim tensor. Got {input.shape}.")

        if params is None:
            params = self.forward_parameters(input.shape)

        # Flatten the temporal dimension so the wrapped 2D augmentations run per frame.
        if self.apply_inverse_func in (InputApplyInverse, MaskApplyInverse):
            frame_num: int = input.size(self._temporal_channel)
            input, label = self._input_shape_convert_in(input, label, frame_num)
        else:
            if label is not None:
                raise ValueError(f"Invalid label value. Got {label}")
            batch_size: int = input.size(0)
            input = input.view(-1, *input.shape[2:])

        out = super().forward(input, label, params)
        if self.return_label:
            output, label = cast(Tuple[TensorWithTransformMat, torch.Tensor], out)
        else:
            output = cast(TensorWithTransformMat, out)

        # Restore the 5-dim video shape (and the label shape, if any).
        if isinstance(output, (tuple, list)):
            if self.apply_inverse_func in (InputApplyInverse, MaskApplyInverse):
                _out, label = self._input_shape_convert_back(output[0], label, frame_num)
                output = (_out, output[1])
            else:
                if label is not None:
                    raise ValueError(f"Invalid label value. Got {label}")
                output = output[0].view(batch_size, -1, *output[0].shape[1:])
        else:
            if self.apply_inverse_func in (InputApplyInverse, MaskApplyInverse):
                output, label = self._input_shape_convert_back(output, label, frame_num)
            else:
                if label is not None:
                    raise ValueError(f"Invalid label value. Got {label}")
                output = output.view(batch_size, -1, *output.shape[1:])

        return self.__packup_output__(output, label)