diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9f11b755a17d8192c60f61cb17b8902dffbd9f23 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea/ diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md index 52ee3d9aa040cf0f348ec7c19537c1dae33c466c..6f05ed973ee898958435fe7855307338a521faa3 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,84 @@ ---- -title: FashionFlow -emoji: 🏆 -colorFrom: green -colorTo: purple -sdk: streamlit -sdk_version: 1.37.1 -app_file: app.py -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +
+ +

FashionFlow: Leveraging Diffusion Models for Dynamic Fashion Video Synthesis from Static Imagery

+ +

+This repository has the official code for 'FashionFlow: Leveraging Diffusion Models for Dynamic Fashion Video Synthesis from Static Imagery'. +We have included the pre-trained checkpoint, dataset and results. +

+ +> **Abstract:** *Our study introduces a new image-to-video generator called FashionFlow to generate fashion videos. By utilising a diffusion model, we are able to create short videos from still fashion images. Our approach involves developing and connecting relevant components with the diffusion model, which results in the creation of high-fidelity videos that are aligned with the conditional image. The components include the use of pseudo-3D convolutional layers to generate videos efficiently. VAE and CLIP encoders capture vital characteristics from still images to condition the diffusion model at a global level. Our research demonstrates a successful synthesis of fashion videos featuring models posing from various angles, showcasing the fit and appearance of the garment. Our findings hold great promise for improving and enhancing the shopping experience for the online fashion industry.* + + +## Teaser +![image](sample/teaser.gif) + +## Requirements +- Python 3.9 +- PyTorch 1.11+ +- tensorboardX +- cv2 +- transformers +- diffusers + +## Model Specification + +The model was developed using PyTorch and loads pretrained weights for VAE and CLIP. The latent diffusion model consists of a 1D convolutional layer stacked against a 2D convolutional layer (forming a pseudo 3D convolution) and includes attention layers. See the ```model_structure.txt``` file to see the exact layers of our LDM. + +## Installation + +Clone this repository: + +``` +git clone https://github.com/1702609/FashionFlow +cd ./FashionFlow/ +``` + +Install PyTorch and other dependencies: + +``` +pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113 +pip install -r requirements.txt +``` + +## Dataset + +Download the Fashion dataset by clicking on this link: +[[Fashion dataset]](https://vision.cs.ubc.ca/datasets/fashion/) + +Extract the files and place them in the ```fashion_dataset``` directory. 
The dataset should be organised as follows: + +``` +fashion_dataset + test + |-- 91-3003CN5S.mp4 + |-- 91BjuE6irxS.mp4 + |-- 91bxAN6BjAS.mp4 + |-- ... + train + |-- 81FyMPk-WIS.mp4 + |-- 91+bCFG1jOS.mp4 + |-- 91+PxmDyrgS.mp4 + |-- ... +``` + +Feel free to add your own dataset while following the provided file and folder structure. + +## Pre-trained Checkpoint + +Download the checkpoint by clicking on this link: +[[Pre-trained checkpoints]](https://www.dropbox.com/scl/fi/p9fv7o3j7ti0yu2umsgmv/FashionFlow_checkpoint.pth?rlkey=mqsto9i4ujh6xhvab0e2s6n7d&dl=0) +Extract the files and place them in the ```checkpoint``` directory. + +## Inference +To run the inference of our model, execute ```python inference.py```. The results will be saved in the ```result``` directory. + +## Train + +Before training, images and videos have to be projected to latent space for efficient training. Execute ```python project_latent_space.py```; the tensors will be saved in the ```fashion_dataset_tensor``` directory. + +Run ```python -m torch.distributed.launch --nproc_per_node=NUM_GPUS train.py``` to train the model, replacing ```NUM_GPUS``` with the number of GPUs to use. The checkpoints will be saved in the ```checkpoint``` directory periodically. Also, you can view the training progress using tensorboardX located in ```video_progress``` or find the generated ```.mp4``` files in the ```training_sample``` directory. 
+ +## Comparison + +![image](sample/comparison.gif) \ No newline at end of file diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..5071edd996f5c60f3287fa056ed4f136116d1181 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,68 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5b3c9fac-51c3-4ecc-8606-5a298076560e", + "metadata": {}, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "51fe1170-c6eb-4b3a-a055-663faf35ab5a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6c6d20c8f5e847d7985f6b49a7206a2d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='
class ChanLayerNorm(nn.Module):
    """Layer normalisation over the channel axis (dim 1) with a learned gain.

    The gain ``g`` has shape ``(dim, 1, 1, 1)``, so it lines up with the
    channel axis of a 5-D ``(batch, channels, d1, d2, d3)`` input through
    right-aligned broadcasting.  There is no learned bias.
    """

    def __init__(self, dim):
        super().__init__()
        self.g = nn.Parameter(torch.ones(dim, 1, 1, 1))

    def forward(self, x):
        # A larger floor on the variance is used for anything that is not
        # float32 (e.g. half precision) to keep rsqrt well-behaved.
        if x.dtype == torch.float32:
            eps = 1e-5
        else:
            eps = 1e-3

        centered = x - x.mean(dim=1, keepdim=True)
        variance = x.var(dim=1, unbiased=False, keepdim=True)
        inv_std = variance.clamp(min=eps).rsqrt()
        return centered * inv_std * self.g
class CrossAttention(nn.Module):
    """Multi-head cross-attention: queries come from ``x``, keys/values from ``y``.

    Args:
        n_heads: number of attention heads.
        d_embed: feature size of ``x`` (and of the output); must be a
            multiple of ``n_heads``.
        d_cross: feature size of the context ``y``.
        in_proj_bias: whether the q/k/v projections carry a bias.
        out_proj_bias: whether the output projection carries a bias.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide
            ``d_embed``.  Without this check the mismatch only surfaces
            later as an opaque ``view`` size error inside ``forward``.
    """

    def __init__(self, n_heads, d_embed, d_cross, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        if n_heads <= 0 or d_embed % n_heads != 0:
            raise ValueError(
                f'd_embed ({d_embed}) must be divisible by a positive n_heads ({n_heads})'
            )

        self.q_proj = nn.Linear(d_embed, d_embed, bias=in_proj_bias)
        self.k_proj = nn.Linear(d_cross, d_embed, bias=in_proj_bias)
        self.v_proj = nn.Linear(d_cross, d_embed, bias=in_proj_bias)
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        self.d_head = d_embed // n_heads

    def forward(self, x, y):
        """Attend from ``x`` (batch, seq_q, d_embed) to ``y`` (batch, seq_kv, d_cross).

        Returns a tensor with the same shape as ``x``.
        """
        input_shape = x.shape
        batch_size = input_shape[0]
        # (batch, seq, heads, d_head); seq is inferred so q and k/v may differ in length.
        interim_shape = (batch_size, -1, self.n_heads, self.d_head)

        q = self.q_proj(x).view(interim_shape).transpose(1, 2)
        k = self.k_proj(y).view(interim_shape).transpose(1, 2)
        v = self.v_proj(y).view(interim_shape).transpose(1, 2)

        # Scaled dot-product attention over the context sequence.
        weight = (q @ k.transpose(-1, -2)) / math.sqrt(self.d_head)
        weight = F.softmax(weight, dim=-1)

        output = weight @ v
        # Merge the heads back into the embedding dimension.
        output = output.transpose(1, 2).contiguous().view(input_shape)
        return self.out_proj(output)
class PseudoConv3d(nn.Module):
    """Factorised ("pseudo") 3D convolution: a 2D spatial conv followed by a 1D temporal conv.

    The spatial convolution is applied to every frame independently.  The
    temporal convolution, which only exists when ``kernel_size > 1``, then
    mixes information across frames at each spatial location.  It is
    initialised to the identity (Dirac weights, zero bias), so a freshly
    built layer behaves like the plain 2D convolution.

    Args:
        dim: number of input channels.
        dim_out: number of output channels; defaults to ``dim``.
        kernel_size: spatial kernel size (padding is ``kernel_size // 2``).
        temporal_kernel_size: temporal kernel size; defaults to ``kernel_size``.
        **kwargs: accepted for call-site compatibility but NOT forwarded to
            the underlying convolutions (so e.g. a ``bias=False`` passed by a
            caller has no effect).  Forwarding them would change the set of
            parameters and therefore existing checkpoints.
    """

    def __init__(
            self,
            dim,
            dim_out=None,
            kernel_size=3,
            *,
            temporal_kernel_size=None,
            **kwargs
    ):
        super().__init__()
        dim_out = default(dim_out, dim)
        temporal_kernel_size = default(temporal_kernel_size, kernel_size)

        self.spatial_conv = nn.Conv2d(dim, dim_out, kernel_size=kernel_size, padding=kernel_size // 2)
        self.temporal_conv = nn.Conv1d(dim_out, dim_out, kernel_size=temporal_kernel_size,
                                       padding=temporal_kernel_size // 2) if kernel_size > 1 else None

        if exists(self.temporal_conv):
            nn.init.dirac_(self.temporal_conv.weight.data)  # initialized to be identity
            nn.init.zeros_(self.temporal_conv.bias.data)

    def forward(
            self,
            x,
            enable_time=True
    ):
        """Apply the spatial conv and, for videos with ``enable_time``, the temporal conv.

        ``x`` is either an image batch ``(b, c, h, w)`` or a video batch
        ``(b, c, f, h, w)``; the temporal conv is skipped for images.
        """
        b = x.shape[0]

        is_video = x.ndim == 5
        enable_time &= is_video

        if is_video:
            # Fold frames into the batch so the 2D conv sees one image per frame.
            x = rearrange(x, 'b c f h w -> (b f) c h w')

        x = self.spatial_conv(x)

        if is_video:
            x = rearrange(x, '(b f) c h w -> b c f h w', b=b)

        if not enable_time or not exists(self.temporal_conv):
            return x

        # Read the spatial size AFTER the spatial conv: with an even kernel
        # size the conv changes height/width, and using the input size here
        # would make the inverse rearrange below fail.
        h, w = x.shape[-2:]

        # Fold every pixel into the batch so the 1D conv runs along frames.
        x = rearrange(x, 'b c f h w -> (b h w) c f')

        x = self.temporal_conv(x)

        x = rearrange(x, '(b h w) c f -> b c f h w', h=h, w=w)

        return x
class SpatioTemporalAttention(nn.Module):
    """Factorised attention: spatial self-attention, then temporal self-attention.

    Spatial attention runs over the ``h * w`` positions of each frame and
    temporal attention over the frames at each position; both are residual
    and use a learned continuous relative position bias.  An optional
    residual feed-forward block follows.
    """

    def __init__(
            self,
            dim,
            *,
            dim_head=64,
            heads=8,
            add_feed_forward=True,
            ff_mult=4
    ):
        super().__init__()
        attn_config = dict(dim=dim, dim_head=dim_head, heads=heads)
        bias_width = dim // 2

        self.spatial_attn = Attention(**attn_config)
        self.spatial_rel_pos_bias = ContinuousPositionBias(dim=bias_width, heads=heads, num_dims=2)

        self.temporal_attn = Attention(**attn_config)
        self.temporal_rel_pos_bias = ContinuousPositionBias(dim=bias_width, heads=heads, num_dims=1)

        self.has_feed_forward = add_feed_forward
        if add_feed_forward:
            self.ff = FeedForward(dim=dim, mult=ff_mult)

    def forward(
            self,
            x,
            enable_time=True
    ):
        batch, _channels, *_, height, width = x.shape
        is_video = x.ndim == 5
        enable_time &= is_video

        # --- spatial attention: one token per pixel, frames folded into batch ---
        if is_video:
            tokens = rearrange(x, 'b c f h w -> (b f) (h w) c')
        else:
            tokens = rearrange(x, 'b c h w -> b (h w) c')

        spatial_bias = self.spatial_rel_pos_bias(height, width)
        tokens = self.spatial_attn(tokens, rel_pos_bias=spatial_bias) + tokens

        if is_video:
            x = rearrange(tokens, '(b f) (h w) c -> b c f h w', b=batch, h=height, w=width)
        else:
            x = rearrange(tokens, 'b (h w) c -> b c h w', h=height, w=width)

        # --- temporal attention: one token per frame, pixels folded into batch ---
        if enable_time:
            x = rearrange(x, 'b c f h w -> (b h w) f c')

            num_frames = x.shape[1]
            temporal_bias = self.temporal_rel_pos_bias(num_frames)
            x = self.temporal_attn(x, rel_pos_bias=temporal_bias) + x

            x = rearrange(x, '(b h w) f c -> b c f h w', w=width, h=height)

        if self.has_feed_forward:
            x = self.ff(x, enable_time=enable_time) + x

        return x
class ResnetBlock(nn.Module):
    """Two stacked ``Block`` layers with a residual connection.

    When ``timestep_cond_dim`` is given, a small MLP turns the timestep
    embedding into a per-channel (scale, shift) pair that modulates the
    first block.  The skip path is a 1x1 ``PseudoConv3d`` when the channel
    count changes and an identity otherwise.
    """

    def __init__(
            self,
            dim,
            dim_out,
            *,
            timestep_cond_dim=None,
            groups=8
    ):
        super().__init__()

        if exists(timestep_cond_dim):
            # Emits 2 * dim_out values: one half is the scale, the other the shift.
            self.timestep_mlp = nn.Sequential(
                nn.SiLU(),
                nn.Linear(timestep_cond_dim, dim_out * 2)
            )
        else:
            self.timestep_mlp = None

        self.block1 = Block(dim, dim_out, groups=groups)
        self.block2 = Block(dim_out, dim_out, groups=groups)

        if dim != dim_out:
            self.res_conv = PseudoConv3d(dim, dim_out, 1)
        else:
            self.res_conv = nn.Identity()

    def forward(
            self,
            x,
            timestep_emb=None,
            enable_time=True
    ):
        has_embedding = exists(timestep_emb)
        has_mlp = exists(self.timestep_mlp)
        # An embedding must be supplied exactly when the block was built to use one.
        assert not (has_embedding ^ has_mlp)

        scale_shift = None
        if has_mlp and has_embedding:
            modulation = self.timestep_mlp(timestep_emb)
            # Append singleton axes so the modulation broadcasts over
            # (frames,) height and width.
            if x.ndim == 5:
                modulation = rearrange(modulation, 'b c -> b c 1 1 1')
            else:
                modulation = rearrange(modulation, 'b c -> b c 1 1')
            scale_shift = modulation.chunk(2, dim=1)

        hidden = self.block1(x, scale_shift=scale_shift, enable_time=enable_time)
        hidden = self.block2(hidden, enable_time=enable_time)

        return hidden + self.res_conv(x)
class Upsample(nn.Module):
    """Pixel-shuffle style upsampling in space (x2 per side) and optionally in time (x2).

    Each path is a 1x1 convolution that expands the channel count, an
    optional SiLU, and a rearrange that moves the extra channels into the
    upsampled axes.  The conv weights are initialised so that every group of
    ``factor`` consecutive output channels starts out identical.
    """

    def __init__(
            self,
            dim,
            upsample_space=True,
            upsample_time=False,
            nonlin=False
    ):
        super().__init__()
        assert upsample_space or upsample_time

        if upsample_space:
            self.up_space = nn.Sequential(
                nn.Conv2d(dim, dim * 4, 1),
                nn.SiLU() if nonlin else nn.Identity(),
                Rearrange('b (c p1 p2) h w -> b c (h p1) (w p2)', p1=2, p2=2)
            )
        else:
            self.up_space = None

        if upsample_time:
            self.up_time = nn.Sequential(
                nn.Conv3d(dim, dim * 2, 1),
                nn.SiLU() if nonlin else nn.Identity(),
                Rearrange('b (c p) f h w -> b c (f p) h w', p=2)
            )
        else:
            self.up_time = None

        self.init_()

    def init_(self):
        """Re-initialise the expanding convolutions of whichever paths exist."""
        for path, factor in ((self.up_space, 4), (self.up_time, 2)):
            if exists(path):
                self.init_conv_(path[0], factor)

    def init_conv_(self, conv, factor):
        """Kaiming-initialise ``out / factor`` filters and repeat each ``factor`` times; zero the bias."""
        out_channels, *other_dims = conv.weight.shape
        base_weight = torch.empty(out_channels // factor, *other_dims)
        nn.init.kaiming_uniform_(base_weight)
        repeated_weight = repeat(base_weight, 'o ... -> (o r) ...', r=factor)

        conv.weight.data.copy_(repeated_weight)
        nn.init.zeros_(conv.bias.data)

    def forward(
            self,
            x,
            enable_time=True
    ):
        is_video = x.ndim == 5

        if is_video:
            # Fold frames into the batch so the 2D spatial path can run per frame.
            x = rearrange(x, 'b c f h w -> b f c h w')
            x, packed_shape = pack([x], '* c h w')

        if exists(self.up_space):
            x = self.up_space(x)

        if is_video:
            x, = unpack(x, packed_shape, '* c h w')
            x = rearrange(x, 'b f c h w -> b c f h w')

        # Temporal upsampling only applies to videos, when configured and enabled.
        if is_video and exists(self.up_time) and enable_time:
            x = self.up_time(x)

        return x
AttentionBlock(1, 64) # 64 + cross_attention_U4 = AttentionBlock(1, 64) # 64 + + cross_attns_down = (cross_attention_D1, cross_attention_D2, cross_attention_D3, cross_attention_D4) + cross_attns_up = (cross_attention_U4, cross_attention_U3, cross_attention_U2, cross_attention_U1) + # layers + + self.downs = mlist([]) + self.ups = mlist([]) + + attn_kwargs = dict( + dim_head=attn_dim_head, + heads=attn_heads + ) + + mid_dim = dims[-1] + + self.mid_block1 = ResnetBlock(mid_dim, mid_dim, timestep_cond_dim=timestep_cond_dim) + self.mid_attn = SpatioTemporalAttention(dim=mid_dim) + self.mid_block2 = ResnetBlock(mid_dim, mid_dim, timestep_cond_dim=timestep_cond_dim) + for _, self_attend, (dim_in, dim_out), compress_time, resnet_block_depth, cross_attns_d, cross_attns_u in zip(range(num_layers), + self_attns, + dim_in_out, + temporal_compression, + resnet_block_depths, + cross_attns_down, + cross_attns_up): + assert resnet_block_depth >= 1 + self.downs.append(mlist([ + ResnetBlock(dim_in, dim_out, timestep_cond_dim=timestep_cond_dim), + mlist([ResnetBlock(dim_out, dim_out) for _ in range(resnet_block_depth)]), + SpatioTemporalAttention(dim=dim_out, **attn_kwargs) if self_attend else None, + Downsample(dim_out, downsample_time=compress_time), + cross_attns_d if exists(cross_attns_d) else None + ])) + self.ups.append(mlist([ + ResnetBlock(dim_out * 2, dim_in, timestep_cond_dim=timestep_cond_dim), + mlist( + [ResnetBlock(dim_in + (dim_out if ind == 0 else 0), dim_in) for ind in range(resnet_block_depth)]), + SpatioTemporalAttention(dim=dim_in, **attn_kwargs) if self_attend else None, + Upsample(dim_out, upsample_time=compress_time), + cross_attns_u if exists(cross_attns_u) else None + + ])) + self.skip_scale = 2 ** -0.5 # paper shows faster convergence + + self.conv_in = PseudoConv3d(dim=channels, dim_out=dim, kernel_size=7, temporal_kernel_size=3) + self.conv_out = PseudoConv3d(dim=dim, dim_out=channels, kernel_size=3, temporal_kernel_size=3) + + def forward( + self, + x, + 
clip_vae_embed, + timestep=None, + enable_time=True + ): + + assert not (exists(self.to_timestep_cond) ^ exists(timestep)) + is_video = x.ndim == 5 + + if enable_time and is_video: + frames = x.shape[2] + assert divisible_by(frames, + self.frame_multiple), f'number of frames on the video ({frames}) must be divisible by the frame multiple ({self.frame_multiple})' + + height, width = x.shape[-2:] + assert divisible_by(height, self.image_size_multiple) and divisible_by(width, + self.image_size_multiple), f'height and width of the image or video must be a multiple of {self.image_size_multiple}' + + # main logic + + t = self.to_timestep_cond(rearrange(timestep, '... -> (...)')) if exists(timestep) else None + x = self.conv_in(x, enable_time=enable_time) + + hiddens = [] + for init_block, blocks, maybe_attention, downsample, cross_attn in self.downs: + x = init_block(x, t, enable_time=enable_time) + hiddens.append(x.clone()) + for block in blocks: + x = block(x, enable_time=enable_time) + if exists(maybe_attention): + x = maybe_attention(x, enable_time=enable_time) # only happens in the last layer + hiddens.append(x.clone()) + x = downsample(x, enable_time=enable_time) + if exists(cross_attn): + x = cross_attn(x, clip_vae_embed) + + x = self.mid_block1(x, t, enable_time=enable_time) + x = self.mid_attn(x, enable_time=enable_time) + x = self.mid_block2(x, t, enable_time=enable_time) + + for init_block, blocks, maybe_attention, upsample, cross_attn in reversed(self.ups): + x = upsample(x, enable_time=enable_time) + x = torch.cat((hiddens.pop() * self.skip_scale, x), dim=1) + x = init_block(x, t, enable_time=enable_time) + x = torch.cat((hiddens.pop() * self.skip_scale, x), dim=1) + for block in blocks: + x = block(x, enable_time=enable_time) + if exists(maybe_attention): + x = maybe_attention(x, enable_time=enable_time) + if exists(cross_attn): + x = cross_attn(x, clip_vae_embed) + + x = self.conv_out(x, enable_time=enable_time) + return x + +if __name__ == '__main__': + 
def get_unet(pretrained_model_name_or_path, revision, resolution=256, n_poses=5):
    """Load the Stable Diffusion v1-4 UNet and widen its input conv for pose channels.

    The first convolution is replaced by one that accepts ``4 + 2 * n_poses``
    input channels.  The pretrained filters are copied into the first four
    (latent) channels and the filters of the added channels start at zero,
    so the widened layer initially ignores the extra input.

    Args:
        pretrained_model_name_or_path: currently unused; the checkpoint id is
            pinned below.
        revision: currently unused; the revision hash is pinned below.
        resolution: currently unused.
        n_poses: number of poses; each contributes two extra input channels.

    Returns:
        The modified ``UNet2DConditionModel``.
    """
    # Load pretrained UNet layers (checkpoint id and revision are pinned here;
    # the corresponding function arguments are not consulted).
    unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4",
                                                subfolder="unet",
                                                revision="ebb811dd71cdc38a204ecbdd6ac5d580f529fd8c",
                                                cache_dir="checkpoints/unet")

    # Widen the input layer by 2 * n_poses channels (noise latent + poses).
    weights = unet.conv_in.weight.clone()
    # NOTE(review): the replacement conv gets a freshly initialised bias; the
    # pretrained bias is not carried over.  Confirm whether that is intended.
    unet.conv_in = nn.Conv2d(4 + 2 * n_poses, weights.shape[0], kernel_size=3, padding=(1, 1))
    with torch.no_grad():
        unet.conv_in.weight[:, :4] = weights  # original weights
        # Zero only the NEW channels.  The slice used to start at 3, which
        # also wiped the fourth pretrained latent channel copied just above.
        unet.conv_in.weight[:, 4:] = 0.0

    return unet
class Embedding_Adapter(nn.Module):
    """Fuse CLIP and VAE embeddings into a CLIP-shaped token sequence.

    The VAE latent is max-pooled by 2, flattened per channel and projected
    from 1280 to 768 features, so each VAE channel becomes one token.  These
    tokens are appended to the CLIP tokens and the token axis is mixed by a
    linear layer from 54 to 50 tokens, initialised to a rectangular identity
    so the first 50 tokens pass through unchanged at the start.

    ``input_nc``, ``output_nc``, ``norm_layer`` and ``chkpt`` are accepted
    but not used by this implementation.
    """

    def __init__(self, input_nc=38, output_nc=4, norm_layer=nn.InstanceNorm2d, chkpt=None):
        super().__init__()

        self.save_method_name = "adapter"

        self.pool = nn.MaxPool2d(2)
        # Requires pooled h * w == 1280 (e.g. an 80 x 64 latent pooled to 40 x 32).
        self.vae2clip = nn.Linear(1280, 768)

        # Mixes along the token axis: 54 concatenated tokens -> 50 tokens.
        self.linear1 = nn.Linear(54, 50)

        # Start as a pass-through of the first 50 tokens.
        identity_init = torch.eye(50, 54)
        self.linear1.weight = nn.Parameter(identity_init)

    def forward(self, clip, vae):
        # assumes clip is (b, 50, 768) and vae is (b, 4, 80, 64) — TODO confirm against callers
        pooled = self.pool(vae)                              # e.g. 1 4 80 64 --> 1 4 40 32
        flat = rearrange(pooled, 'b c h w -> b c (h w)')     # e.g. 1 4 40 32 --> 1 4 1280
        vae_tokens = self.vae2clip(flat)                     # e.g. 1 4 768

        # Append the VAE tokens after the CLIP tokens.
        tokens = torch.cat((clip, vae_tokens), 1)

        # Put the token axis last so the linear layer mixes tokens, then restore the layout.
        mixed = self.linear1(rearrange(tokens, 'b c d -> b d c'))
        return rearrange(mixed, 'b d c -> b c d')
# ---------------------------------------------------------------------------
# Script body: for every video in the dataset directory, pick a random window
# of consecutive frames, save the first frame as "<name>_image.pt" and the
# VAE-encoded clip as "<name>.pt" in the output directory.
# ---------------------------------------------------------------------------
path = osp.join(args.dataset)
video_names = os.listdir(path)
transform = get_transform()

os.makedirs(args.output_dir, exist_ok=True)

clip_length = 70       # frames per saved clip
numberOfFrames = 241   # the start of the clip is drawn from the first 241 frames

for video_name in tqdm.tqdm(video_names):
    stem = osp.splitext(video_name)[0]
    cap = cv2.VideoCapture(osp.join(path, video_name))
    frames = []
    try:
        if not cap.isOpened():
            # Stray non-video files in the dataset directory end up here.
            tqdm.tqdm.write(f"Skipping {video_name}: cannot be opened as a video")
            continue

        # Never sample past the real end of a short video.  Some backends
        # report 0 frames; fall back to the nominal window in that case and
        # rely on the read check below.
        reported_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        available = min(numberOfFrames, reported_frames) if reported_frames > 0 else numberOfFrames
        if available < clip_length:
            tqdm.tqdm.write(f"Skipping {video_name}: only {reported_frames} frames, need {clip_length}")
            continue

        number = random.randint(0, available - clip_length)

        # Seek once, then decode sequentially instead of seeking per frame.
        cap.set(cv2.CAP_PROP_POS_FRAMES, number)
        for _ in range(clip_length):
            ok, frame = cap.read()
            if not ok:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(transform(Image.fromarray(frame)))
    finally:
        # Always free the decoder, including on the skip paths above.
        cap.release()

    if len(frames) < clip_length:
        tqdm.tqdm.write(f"Skipping {video_name}: decoding stopped after {len(frames)} frames")
        continue

    # The first frame of the window is the conditioning image.
    inputImage = frames[0]
    torch.save(inputImage, osp.join(args.output_dir, stem + "_image.pt"))

    # Stack once instead of growing the clip with repeated torch.cat calls.
    restOfVideo = torch.stack(frames, 0).to(device=device)
    vae_video = VAE_encode(restOfVideo).detach().cpu()[0]
    torch.save(vae_video, osp.join(args.output_dir, stem + ".pt"))
+idna==3.6 +importlib-metadata==7.0.1 +numpy==1.26.3 +opencv-python==4.9.0.80 +packaging==23.2 +pillow==10.2.0 +protobuf==4.25.2 +psutil==5.9.7 +PyYAML==6.0.1 +regex==2023.12.25 +requests==2.31.0 +safetensors==0.4.1 +tensorboardX==2.6.2.2 +tokenizers==0.15.0 +torch==1.11.0+cu113 +torchaudio==0.11.0+cu113 +torchvision==0.12.0+cu113 +tqdm==4.66.1 +transformers==4.36.2 +typing_extensions==4.9.0 +urllib3==2.1.0 +zipp==3.17.0 diff --git a/sample/blue.jpg b/sample/blue.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f5ce29e53c08cc8008129e4357f1e52a9a6fe81 Binary files /dev/null and b/sample/blue.jpg differ diff --git a/sample/green.jpg b/sample/green.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a530db242d8628a4ae659e4ad79eaa6094be19b0 Binary files /dev/null and b/sample/green.jpg differ diff --git a/sample/silver.jpg b/sample/silver.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bd34129121e2b7bba4446beac58df2641bfc214c Binary files /dev/null and b/sample/silver.jpg differ diff --git a/src/deps/__init__.py b/src/deps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/deps/facial_recognition/__init__.py b/src/deps/facial_recognition/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7480b272937eee204f6343b4654803d58aad1071 --- /dev/null +++ b/src/deps/facial_recognition/__init__.py @@ -0,0 +1,3 @@ +""" +Copy-pasted from https://github.com/orpatashnik/StyleCLIP/tree/main/models/facial_recognition/__init__.py +""" diff --git a/src/deps/facial_recognition/helpers.py b/src/deps/facial_recognition/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..9832f2031b853de88a92e4e1840405023a8fcbb7 --- /dev/null +++ b/src/deps/facial_recognition/helpers.py @@ -0,0 +1,123 @@ +""" +Copy-pasted from 
def l2_norm(input, axis=1):
    """Scale ``input`` to unit L2 norm along ``axis``.

    No epsilon is added, so an all-zero slice divides by zero.
    """
    magnitude = input.norm(p=2, dim=axis, keepdim=True)
    return input / magnitude
class SEModule(Module):
    """Squeeze-and-excitation gate.

    Each channel is globally average-pooled, passed through a bias-free
    1x1-conv bottleneck (``channels -> channels // reduction -> channels``)
    and a sigmoid; the result multiplies the input channel-wise.
    """

    def __init__(self, channels, reduction):
        super().__init__()
        squeezed = channels // reduction
        self.avg_pool = AdaptiveAvgPool2d(1)
        self.fc1 = Conv2d(channels, squeezed, kernel_size=1, padding=0, bias=False)
        self.relu = ReLU(inplace=True)
        self.fc2 = Conv2d(squeezed, channels, kernel_size=1, padding=0, bias=False)
        self.sigmoid = Sigmoid()

    def forward(self, x):
        # Per-channel gate computed from the pooled descriptor.
        gate = self.sigmoid(self.fc2(self.relu(self.fc1(self.avg_pool(x)))))
        return x * gate
class Backbone(Module):
    """IR / IR-SE residual trunk producing L2-normalised 512-dimensional outputs.

    The network is an input stem (3 -> 64 channels), a stack of bottleneck
    units described by ``get_blocks(num_layers)``, and an output head ending
    in ``Linear(..., 512)`` + ``BatchNorm1d``.
    """

    WEIGHTS_URL = "https://www.dropbox.com/s/n6xicva1lrghb5w/model_ir_se50.pth?dl=1"

    def __init__(self, input_size, num_layers, mode='ir', drop_ratio=0.4, affine=True):
        super().__init__()
        assert input_size in [112, 224], "input_size should be 112 or 224"
        assert num_layers in [50, 100, 152], "num_layers should be 50, 100 or 152"
        assert mode in ['ir', 'ir_se'], "mode should be ir or ir_se"

        stages = get_blocks(num_layers)
        if mode == 'ir':
            unit_module = bottleneck_IR
        elif mode == 'ir_se':
            unit_module = bottleneck_IR_SE

        self.input_layer = Sequential(
            Conv2d(3, 64, (3, 3), 1, 1, bias=False),
            BatchNorm2d(64),
            PReLU(64),
        )

        # The head's Linear layer is sized for a 7x7 feature map when
        # input_size is 112 and for a 14x14 map otherwise.
        side = 7 if input_size == 112 else 14
        self.output_layer = Sequential(
            BatchNorm2d(512),
            Dropout(drop_ratio),
            Flatten(),
            Linear(512 * side * side, 512),
            BatchNorm1d(512, affine=affine),
        )

        # One unit per Bottleneck spec, in stage order.
        self.body = Sequential(*[
            unit_module(spec.in_channel, spec.depth, spec.stride)
            for stage in stages
            for spec in stage
        ])

    def forward(self, x):
        features = self.body(self.input_layer(x))
        return l2_norm(self.output_layer(features))
def IR_101(input_size):
    """Build an 'ir'-mode Backbone with ``num_layers=100`` and a non-affine final BatchNorm1d."""
    return Backbone(input_size, num_layers=100, mode='ir', drop_ratio=0.4, affine=False)
class EasyDict(dict):
    """A ``dict`` whose items can also be read, written and deleted as attributes."""

    def __getattr__(self, name: str) -> Any:
        # Reached only when regular attribute lookup fails; fall back to the
        # mapping and report a missing key as a missing attribute.
        try:
            value = self[name]
        except KeyError:
            raise AttributeError(name)
        return value

    def __setattr__(self, name: str, value: Any) -> None:
        self.__setitem__(name, value)

    def __delattr__(self, name: str) -> None:
        self.__delitem__(name)

    def to_dict(self) -> Dict:
        """Return a plain ``dict`` copy, converting directly nested EasyDict values recursively."""
        plain = {}
        for key, value in self.items():
            plain[key] = value.to_dict() if isinstance(value, EasyDict) else value
        return plain
# Process-wide override for the cache root; None means "not overridden".
_dnnlib_cache_dir = None


def set_cache_dir(path: str) -> None:
    """Set the cache root that ``make_cache_dir_path`` uses from now on."""
    global _dnnlib_cache_dir
    _dnnlib_cache_dir = path


def make_cache_dir_path(*paths: str) -> str:
    """Join ``paths`` onto the cache root and return the result.

    The root is the first available of: the value given to
    ``set_cache_dir``; ``$DNNLIB_CACHE_DIR``; ``$HOME/.cache/dnnlib``;
    ``$USERPROFILE/.cache/dnnlib``; ``<tempdir>/.cache/dnnlib``.
    Nothing is created on disk.
    """
    if _dnnlib_cache_dir is not None:
        return os.path.join(_dnnlib_cache_dir, *paths)

    env = os.environ
    if 'DNNLIB_CACHE_DIR' in env:
        return os.path.join(env['DNNLIB_CACHE_DIR'], *paths)

    for home_var in ('HOME', 'USERPROFILE'):
        if home_var in env:
            return os.path.join(env[home_var], '.cache', 'dnnlib', *paths)

    return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths)
def is_pickleable(obj: Any) -> bool:
    """Return True if ``obj`` can be serialised with ``pickle.dump``, else False.

    The object is pickled into a throw-away in-memory buffer; nothing is
    written to disk.
    """
    try:
        with io.BytesIO() as stream:
            pickle.dump(obj, stream)
        return True
    except Exception:
        # Pickling failures surface as several exception types
        # (PicklingError, TypeError, AttributeError, ...), hence the broad
        # catch. KeyboardInterrupt / SystemExit are not Exception subclasses
        # and are intentionally allowed to propagate.
        return False
def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]:
    """Split a dotted object name into its module and the remaining attribute path.

    Every split point is tried, from the longest module prefix to the
    shortest; the first prefix that imports and on which the remaining
    attribute path resolves is returned.

    Returns:
        ``(module, local_obj_name)``. ``local_obj_name`` is ``''`` when the
        whole name is itself a module.

    Raises:
        ImportError: re-raised from a candidate import whose message does not
            start with ``No module named '<candidate>'``, or raised with
            ``obj_name`` when nothing could be resolved.
        AttributeError: when a prefix imports but the attribute path is
            missing on it.
    """
    # Allow the "np." / "tf." shorthands. The dot is escaped so that names
    # merely starting with "np"/"tf" (e.g. "npx.foo") are left untouched.
    obj_name = re.sub(r"^np\.", "numpy.", obj_name)
    obj_name = re.sub(r"^tf\.", "tensorflow.", obj_name)

    # List alternatives for (module_name, local_obj_name), longest module first.
    parts = obj_name.split(".")
    name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)]

    # Try each alternative in turn; any failure just moves on to the next one.
    for module_name, local_obj_name in name_pairs:
        try:
            module = importlib.import_module(module_name)  # may raise ImportError
            get_obj_from_module(module, local_obj_name)  # may raise AttributeError
            return module, local_obj_name
        except Exception:
            pass

    # Maybe some of the modules themselves contain errors? Surface those
    # instead of a generic "not found".
    for module_name, _local_obj_name in name_pairs:
        try:
            importlib.import_module(module_name)  # may raise ImportError
        except ImportError as err:
            if not str(err).startswith("No module named '" + module_name + "'"):
                raise

    # Maybe the requested attribute is missing? Let the AttributeError escape.
    for module_name, local_obj_name in name_pairs:
        try:
            module = importlib.import_module(module_name)  # may raise ImportError
            get_obj_from_module(module, local_obj_name)  # may raise AttributeError
        except ImportError:
            pass

    # We are out of luck, but we have no idea why.
    raise ImportError(obj_name)


def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
    """Follow the dotted ``obj_name`` from ``module`` and return the final object.

    An empty ``obj_name`` returns ``module`` itself. A missing attribute
    raises ``AttributeError`` (from ``getattr``).
    """
    if obj_name == '':
        return module
    obj = module
    for part in obj_name.split("."):
        obj = getattr(obj, part)
    return obj
def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
    """Copy every ``(src, dst)`` pair, creating missing destination directories.

    Args:
        files: sequence of ``(source_path, destination_path)`` entries.

    File contents are copied with ``shutil.copyfile``; all intermediate
    directories of each destination are created as needed.
    """
    for file in files:
        src, dst = file[0], file[1]
        target_dir_name = os.path.dirname(dst)

        # dirname is '' for a bare filename (destination in the current
        # directory) and os.makedirs('') raises FileNotFoundError, so only
        # create when there is a directory part. exist_ok=True replaces the
        # racy "exists? then create" check.
        if target_dir_name:
            os.makedirs(target_dir_name, exist_ok=True)

        shutil.copyfile(src, dst)
+ if url.startswith('file://'): + filename = urllib.parse.urlparse(url).path + if re.match(r'^/[a-zA-Z]:', filename): + filename = filename[1:] + return filename if return_filename else open(filename, "rb") + + assert is_url(url) + + # Lookup from cache. + if cache_dir is None: + cache_dir = make_cache_dir_path('downloads') + + url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest() + if cache: + cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*")) + if len(cache_files) == 1: + filename = cache_files[0] + return filename if return_filename else open(filename, "rb") + + # Download. + url_name = None + url_data = None + with requests.Session() as session: + if verbose: + print("Downloading %s ..." % url, end="", flush=True) + for attempts_left in reversed(range(num_attempts)): + try: + with session.get(url) as res: + res.raise_for_status() + if len(res.content) == 0: + raise IOError("No data received") + + if len(res.content) < 8192: + content_str = res.content.decode("utf-8") + if "download_warning" in res.headers.get("Set-Cookie", ""): + links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link] + if len(links) == 1: + url = requests.compat.urljoin(url, links[0]) + raise IOError("Google Drive virus checker nag") + if "Google Drive - Quota exceeded" in content_str: + raise IOError("Google Drive download quota exceeded -- please try again later") + + match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", "")) + url_name = match[1] if match else url + url_data = res.content + if verbose: + print(" done") + break + except KeyboardInterrupt: + raise + except: + if not attempts_left: + if verbose: + print(" failed") + raise + if verbose: + print(".", end="", flush=True) + + # Save to cache. 
+ if cache: + safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name) + cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name) + temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name) + os.makedirs(cache_dir, exist_ok=True) + with open(temp_file, "wb") as f: + f.write(url_data) + os.replace(temp_file, cache_file) # atomic + if return_filename: + return cache_file + + # Return data as file object. + assert not return_filename + return io.BytesIO(url_data) diff --git a/src/infra/__init__.py b/src/infra/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/infra/experiments.yaml b/src/infra/experiments.yaml new file mode 100644 index 0000000000000000000000000000000000000000..129793a50cc973571ad4bd41e3363c5733722172 --- /dev/null +++ b/src/infra/experiments.yaml @@ -0,0 +1,60 @@ +#---------------------------------------------------------------------------- +# Here, we keep the experiments HPs in case we want to do mass-launching via SLURM +#---------------------------------------------------------------------------- + +mocogan_sg2: + common_args: + model: mocogan + training.batch: 16 + dataset.max_num_frames: 32 + experiments: + b16_mnf16: + sampling: traditional_16 + dataset.max_num_frames: 16 + model.generator.motion.long_history: false + +#---------------------------------------------------------------------------- + +ffs: + common_args: + sampling.num_frames_per_video: 3 + experiments: + mnf1024_sfpm32_minperiod16: {} + mnf1024_sfpm32_minperiod32: + model.generator.time_enc.min_period_len: 32 + +#---------------------------------------------------------------------------- + +sky_timelapse: + common_args: + sampling.num_frames_per_video: 3 + experiments: + mnf1024_sfpm32_minperiod16: {} + mnf1024_sfpm256_minperiod256: + model.generator.motion.motion_z_distance: 256 + model.generator.time_enc.min_period_len: 256 + 
+#---------------------------------------------------------------------------- + +highres: + common_args: + training.metrics: \"fvd2048_16f,fvd2048_128f_subsample,fid50k_full\" + training.batch: 16 + sampling.num_frames_per_video: 2 + experiments: + mnf1024_sfpm32_minperiod16_batch16: {} + mnf32_sfpm32_minperiod16_batch16: + dataset.max_num_frames: 32 + +#---------------------------------------------------------------------------- + +cond_ablation_ffs: + common_args: + sampling.num_frames_per_video: 3 + experiments: + hyper_mod: + model.discriminator.hyper_type: hyper + without_proj_cond: + model.discriminator.dummy_c: true + +#---------------------------------------------------------------------------- diff --git a/src/infra/launch.py b/src/infra/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..611f2a5d83ee7555640fc6637f9a1b2a849db638 --- /dev/null +++ b/src/infra/launch.py @@ -0,0 +1,113 @@ +""" +Run a __reproducible__ experiment on __allocated__ resources +It submits a slurm job(s) with the given hyperparams which will then execute `slurm_job.py` +This is the main entry-point +""" + +import os +os.environ["HYDRA_FULL_ERROR"] = "1" + +import subprocess +import re + +import hydra +from omegaconf import DictConfig, OmegaConf +from pathlib import Path + +from utils import create_project_dir, recursive_instantiate + +#---------------------------------------------------------------------------- + +HYDRA_ARGS = "hydra.run.dir=. 
@hydra.main(config_path="../../configs", config_name="config.yaml")
def main(cfg: DictConfig):
    """Prepare the released project directory and start training.

    Builds the shell command that runs ``src/train.py``, snapshots the
    project and the config into ``cfg.project_release_dir`` (unless resuming
    an existing experiment or ``cfg.print_only`` is set), and then either
    submits a chain of ``sbatch`` jobs (``cfg.slurm``) or runs the command
    locally via ``os.system``.
    """
    recursive_instantiate(cfg)
    OmegaConf.set_struct(cfg, True)
    cfg.env.project_path = str(cfg.env.project_path) # This is needed to evaluate ${hydra:runtime.cwd}

    before_train_cmd = '\n'.join(cfg.env.before_train_commands)
    before_train_cmd = before_train_cmd + '\n' if len(before_train_cmd) > 0 else ''
    torch_extensions_dir = os.environ.get('TORCH_EXTENSIONS_DIR', cfg.env.torch_extensions_dir)
    # The VAR=value prefix must sit directly in front of the python command:
    # placed in front of `cd` it only applies to `cd`, and the training
    # process never receives TORCH_EXTENSIONS_DIR.
    training_cmd = f'{before_train_cmd}cd {cfg.project_release_dir} && TORCH_EXTENSIONS_DIR={torch_extensions_dir} {cfg.env.python_bin} src/train.py {HYDRA_ARGS}'
    quiet = cfg.get('quiet', False)
    training_cmd_save_path = os.path.join(cfg.project_release_dir, 'training_cmd.sh')
    cfg_save_path = os.path.join(cfg.project_release_dir, 'experiment_config.yaml')

    if not quiet:
        print('<=== TRAINING COMMAND START ===>')
        print(training_cmd)
        print('<=== TRAINING COMMAND END ===>')

    is_running_from_scratch = True

    # Resuming is only possible when a complete earlier release is on disk.
    if cfg.training.resume == "latest" and os.path.isdir(cfg.project_release_dir) and os.path.isfile(training_cmd_save_path) and os.path.isfile(cfg_save_path):
        is_running_from_scratch = False
        if not quiet:
            print("We are going to resume the training and the experiment already exists. " \
                "That's why the provided config/training_cmd are discarded and the project dir is not created.")

    if is_running_from_scratch and not cfg.print_only:
        create_project_dir(
            cfg.project_release_dir,
            cfg.env.objects_to_copy,
            cfg.env.symlinks_to_create,
            quiet=quiet,
            ignore_uncommited_changes=cfg.get('ignore_uncommited_changes', False),
            overwrite=cfg.get('overwrite', False))

        with open(training_cmd_save_path, 'w') as f:
            f.write(training_cmd + '\n')
            if not quiet:
                print(f'Saved training command in {training_cmd_save_path}')

        with open(cfg_save_path, 'w') as f:
            OmegaConf.save(config=cfg, f=f)
            if not quiet:
                print(f'Saved config in {cfg_save_path}')

    if not cfg.print_only:
        os.chdir(cfg.project_release_dir)

    if cfg.slurm:
        assert Path(cfg.dataset.path_for_slurm_job).exists()

        curr_job_id = None

        for i in range(cfg.job_sequence_length):
            # Every job after the first waits for the previous one to end.
            if i == 0:
                deps_args_str = ''
            else:
                deps_args_str = f'--dependency=afterany:{curr_job_id}'

            # Submitting the slurm job
            qos_arg_str = f'--account {os.environ["PRIORITY_BOOST_ACC"]}' if cfg.use_qos else ''
            output_file_arg_str = f'--output {cfg.project_release_dir}/slurm_{i}.log'
            submit_job_cmd = f'sbatch {cfg.sbatch_args_str} {output_file_arg_str} {qos_arg_str} --export=ALL,{cfg.env_args_str} {deps_args_str} src/infra/slurm_job_proxy.sh'

            if cfg.print_only:
                print(submit_job_cmd)
                curr_job_id = "DUMMY_JOB_ID"
            else:
                result = subprocess.run(submit_job_cmd, stdout=subprocess.PIPE, shell=True)
                output_str = result.stdout.decode("utf-8").strip("\n") # It has a format of "Submitted batch job 17033559"
                if not quiet or i == 0:
                    print(output_str)
                # Accept a job id of any length (previously only 5-8 digits).
                job_id_match = re.search(r"^Submitted batch job (\d+)$", output_str)
                assert job_id_match is not None, f"Bad output: `{output_str}`"
                curr_job_id = int(job_id_match.group(1))
    else:
        assert cfg.job_sequence_length == 1, "You can use a job sequence only when running via slurm."
        if cfg.print_only:
            print(training_cmd)
        else:
            os.system(training_cmd)
if __name__ == "__main__":
    # Command-line entry point: load one experiments series from
    # src/infra/experiments.yaml and launch (or print) every experiment in it
    # for every requested dataset.
    parser = argparse.ArgumentParser(description="Experiments launcher")
    parser.add_argument('-e', '--series_name', type=str, required=True, help="Which experiments series to launch?")
    parser.add_argument('-d', '--datasets', required=True, type=str, help='Comma-separate list of datasets')
    parser.add_argument('-p', '--print_only', action='store_true', help='Just print commands and exit?')
    parser.add_argument('-t', '--time', type=str, default='1-0', help='Which time to specify for the sbatch command?')
    parser.add_argument('-q', '--use_qos', action='store_true', help='Should we use QoS to launch jobs?')
    parser.add_argument('--experiments_list', type=str, help='Should we run only some specific experiments from this experiments series?')
    parser.add_argument('--other_args', type=str, default="", help='Additional arguments for the experiments')
    parser.add_argument('--suffix', type=str, default="", help='Additional suffix for the experiments')
    parser.add_argument('--num_gpus', type=int, default=4, help='Number of GPUs to use per each experiment')
    parser.add_argument('--project_dir', type=str, default=os.getcwd(), help='Project directory path')
    parser.add_argument('--project_dir_for_exps_cfg', type=str, help="Overwrite the project directory to use for experiments.yaml. Useful for debugging the config.")
    args = parser.parse_args()

    os.chdir(args.project_dir)
    user = os.environ.get('USER', 'unknown')
    python_bin = os.path.join(args.project_dir, 'env/bin/python')
    launcher = f"{python_bin} src/infra/launch.py {HYDRA_ARGS} +quiet=true slurm=true"
    experiments_dir = f'experiments/{user}/{args.series_name}'
    exps_cfg_path = os.path.join(args.project_dir if args.project_dir_for_exps_cfg is None else args.project_dir_for_exps_cfg, 'src/infra/experiments.yaml')
    all_exp_series = OmegaConf.load(exps_cfg_path)
    assert args.series_name in all_exp_series, f"Experiments series not found: {args.series_name}"
    cfg = all_exp_series[args.series_name]
    datasets = args.datasets.split(',')
    experiments_list = None if args.experiments_list is None else args.experiments_list.split(',')
    # Split each "key=value" entry on the FIRST '=' only, so values that
    # themselves contain '=' are kept instead of being silently dropped.
    # Entries without any '=' (including the empty default) are ignored.
    other_args = dict(kv.split('=', 1) for kv in args.other_args.split(',') if '=' in kv)

    batch_launch(
        launcher=launcher,
        experiments_dir=experiments_dir,
        cfg=cfg,
        datasets=datasets,
        print_only=args.print_only,
        time=args.time,
        use_qos=args.use_qos,
        experiments_list=experiments_list,
        other_args=other_args,
        suffix=args.suffix,
        num_gpus=args.num_gpus,
    )
@hydra.main(config_name=os.path.join(project_dir, 'experiment_config.yaml'))
def main(cfg: DictConfig):
    """Stage the dataset for this SLURM job and run the saved training command.

    Steps, all relative to `project_dir` (read from the environment at import time):
      1. make sure the directory that will hold `cfg.dataset.path` exists;
      2. copy `cfg.dataset.path_for_slurm_job` to `cfg.dataset.path`;
      3. read `training_cmd.sh` and execute its contents through the shell.
    """
    os.chdir(project_dir)

    target_data_dir_base = os.path.dirname(cfg.dataset.path)
    if os.path.islink(target_data_dir_base):
        # The data dir is a symlink: create the directory it points to rather
        # than the link itself.
        os.makedirs(os.readlink(target_data_dir_base), exist_ok=True)
    else:
        os.makedirs(target_data_dir_base, exist_ok=True)

    copyfile(cfg.dataset.path_for_slurm_job, cfg.dataset.path)
    print(f'Copied the data: {cfg.dataset.path_for_slurm_job} => {cfg.dataset.path}. Starting the training...')

    # Use a context manager so the file handle is closed before the
    # (potentially very long) training command starts.
    with open('training_cmd.sh') as f:
        training_cmd = f.read()
    print('<=== TRAINING COMMAND ===>')
    print(training_cmd)
    # NOTE(review): the exit status of the training command is ignored, so the
    # job finishes "successfully" even if training failed — confirm this is intended.
    os.system(training_cmd)
def copy_objects(target_dir: os.PathLike, objects_to_copy: List[os.PathLike]):
    """Copy files, directories and symlinks into `target_dir`.

    Each source is placed at `target_dir/<basename of source>`:
      - a symlink is re-created pointing at the same link target (not followed);
      - a regular file is copied with `shutil.copyfile`;
      - a directory is copied recursively, merging into an existing directory.

    Raises:
        NotImplementedError: if a path is none of the above (e.g. it does not exist).
    """
    for src_path in objects_to_copy:
        trg_path = os.path.join(target_dir, os.path.basename(src_path))

        # `islink` must be checked first: `isfile`/`isdir` follow symlinks.
        if os.path.islink(src_path):
            os.symlink(os.readlink(src_path), trg_path)
        elif os.path.isfile(src_path):
            copyfile(src_path, trg_path)
        elif os.path.isdir(src_path):
            # `shutil.copytree` instead of `distutils.dir_util.copy_tree`:
            # distutils is removed in Python 3.12, and `copy_tree` remembers
            # directories it created, which breaks re-copying after the target
            # has been deleted in the same process.
            # `dirs_exist_ok=True` keeps the merge-into-existing-dir behaviour.
            shutil.copytree(src_path, trg_path, dirs_exist_ok=True)
        else:
            raise NotImplementedError(f"Unknown object type: {src_path}")
def get_git_hash() -> Optional[str]:
    """Return the short hash of HEAD for the repo in the current working directory.

    Returns `None` when the working directory is not a git repository, or when
    the `git` command cannot be run or exits with an error.
    """
    if not is_git_repo(os.getcwd()):
        return None

    try:
        raw_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: git exited non-zero (e.g. no commits yet).
        # OSError: the git executable is missing or not runnable.
        # A bare `except:` would also hide KeyboardInterrupt / SystemExit.
        return None

    return raw_hash.decode("utf-8").strip()
def num_gpus_to_mem(num_gpus: int, mem_per_gpu: int=64) -> str:
    """Return a memory request string such as "256G" for `num_gpus` GPUs.

    Args:
        num_gpus: number of GPUs requested.
        mem_per_gpu: memory per GPU; the result is `num_gpus * mem_per_gpu`
            followed by "G". Defaults to 64 (the original signature wrote
            `mem_per_gpu: 64`, which made 64 an annotation and left the
            argument without a default).
    """
    # Doing it here since hydra config cannot do formatting for ${...}
    return f"{num_gpus * mem_per_gpu}G"
+ +"""Frechet Inception Distance (FID) from the paper +"GANs trained by a two time-scale update rule converge to a local Nash +equilibrium". Matches the original implementation by Heusel et al. at +https://github.com/bioinf-jku/TTUR/blob/master/fid.py""" + +import numpy as np +import scipy.linalg +from . import metric_utils + +NUM_FRAMES_IN_BATCH = {128: 32, 256: 32, 512: 8, 1024: 2} + +#---------------------------------------------------------------------------- + +def compute_fid(opts, max_real, num_gen): + # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz + detector_url = 'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/metrics/inception-2015-12-05.pkl' + detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer. + + batch_size = NUM_FRAMES_IN_BATCH[opts.dataset_kwargs.resolution] + + mu_real, sigma_real = metric_utils.compute_feature_stats_for_dataset( + opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, + rel_lo=0, rel_hi=0, capture_mean_cov=True, max_items=max_real, use_image_dataset=True).get_mean_cov() + + if opts.generator_as_dataset: + compute_gen_stats_fn = metric_utils.compute_feature_stats_for_dataset + gen_opts = metric_utils.rewrite_opts_for_gen_dataset(opts) + gen_kwargs = dict(use_image_dataset=True) + else: + compute_gen_stats_fn = metric_utils.compute_feature_stats_for_generator + gen_opts = opts + gen_kwargs = dict() + + mu_gen, sigma_gen = compute_gen_stats_fn( + opts=gen_opts, detector_url=detector_url, detector_kwargs=detector_kwargs, batch_size=batch_size, + rel_lo=0, rel_hi=1, capture_mean_cov=True, max_items=num_gen, **gen_kwargs).get_mean_cov() + + if opts.rank != 0: + return float('nan') + + m = np.square(mu_gen - mu_real).sum() + s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member + fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2)) + 
def compute_fvd(opts, max_real: int, num_gen: int, num_frames: int, subsample_factor: int=1):
    """Compute the Frechet Video Distance between real and generated clips.

    Args:
        opts: metric options; a deep copy is modified, the caller's object is untouched.
        max_real: maximum number of real clips to use.
        num_gen: number of generated clips to use.
        num_frames: number of frames per clip fed to the detector.
        subsample_factor: forwarded to the dataset kwargs / generator stats.

    Returns:
        The FVD as a float on rank 0, NaN on every other rank.
    """
    # Perfectly reproduced torchscript version of the I3D model, trained on Kinetics-400, used here:
    # https://github.com/google-research/google-research/blob/master/frechet_video_distance/frechet_video_distance.py
    # Note that the weights on tf.hub (used in the script above) differ from the original released weights
    detector_url = 'https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt?dl=1'
    detector_kwargs = dict(rescale=True, resize=True, return_features=True) # Return raw features before the softmax layer.

    opts = copy.deepcopy(opts)
    opts.dataset_kwargs.load_n_consecutive = num_frames
    opts.dataset_kwargs.subsample_factor = subsample_factor
    opts.dataset_kwargs.discard_short_videos = True
    # The table holds a per-resolution frame budget; convert it to clips per batch.
    # Clamp to 1: the integer division yields 0 when `num_frames` exceeds the
    # budget (e.g. 128 frames at resolution 512 or 1024), and a batch size of 0
    # is invalid downstream.
    batch_size = max(1, NUM_FRAMES_IN_BATCH[opts.dataset_kwargs.resolution] // num_frames)

    mu_real, sigma_real = metric_utils.compute_feature_stats_for_dataset(
        opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs, rel_lo=0, rel_hi=0,
        capture_mean_cov=True, max_items=max_real, temporal_detector=True, batch_size=batch_size).get_mean_cov()

    if opts.generator_as_dataset:
        # "Generated" samples come from a dataset of pre-rendered videos.
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_dataset
        gen_opts = metric_utils.rewrite_opts_for_gen_dataset(opts)
        gen_opts.dataset_kwargs.load_n_consecutive = num_frames
        gen_opts.dataset_kwargs.load_n_consecutive_random_offset = False
        gen_opts.dataset_kwargs.subsample_factor = subsample_factor
        gen_kwargs = dict()
    else:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_generator
        gen_opts = opts
        gen_kwargs = dict(num_video_frames=num_frames, subsample_factor=subsample_factor)

    mu_gen, sigma_gen = compute_gen_stats_fn(
        opts=gen_opts, detector_url=detector_url, detector_kwargs=detector_kwargs, rel_lo=0, rel_hi=1, capture_mean_cov=True,
        max_items=num_gen, temporal_detector=True, batch_size=batch_size, **gen_kwargs).get_mean_cov()

    # Every rank takes part in the feature computation above; only rank 0 reports a value.
    if opts.rank != 0:
        return float('nan')

    # Frechet distance between the two Gaussians (mu_real, sigma_real) and (mu_gen, sigma_gen).
    m = np.square(mu_gen - mu_real).sum()
    s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member
    fvd = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
    return float(fvd)
def compute_is(opts, num_gen, num_splits):
    """Compute the Inception Score over generated samples.

    Args:
        opts: metric options (generator or generated-dataset settings, rank, ...).
        num_gen: maximum number of generated samples to score.
        num_splits: number of contiguous splits the samples are divided into.

    Returns:
        `(mean, std)` of the per-split scores on rank 0; `(nan, nan)` on other ranks.
    """
    # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
    detector_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/inception-2015-12-05.pt'
    detector_kwargs = dict(no_output_bias=True) # Match the original implementation by not applying bias in the softmax layer.

    if opts.generator_as_dataset:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_dataset
        gen_opts = metric_utils.rewrite_opts_for_gen_dataset(opts)
        gen_kwargs = dict(use_image_dataset=True)
    else:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_generator
        gen_opts = opts
        gen_kwargs = dict()

    gen_probs = compute_gen_stats_fn(
        opts=gen_opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
        capture_all=True, max_items=num_gen, **gen_kwargs).get_all()

    if opts.rank != 0:
        return float('nan'), float('nan')

    # Split by the number of items actually captured, not the requested
    # `num_gen`: the stats may hold fewer rows (e.g. a generated dataset smaller
    # than `num_gen`), in which case splitting by `num_gen` leaves the trailing
    # splits empty and their score NaN. Identical when exactly `num_gen` rows exist.
    num_items = gen_probs.shape[0]
    scores = []
    for i in range(num_splits):
        part = gen_probs[i * num_items // num_splits : (i + 1) * num_items // num_splits]
        # exp(mean over samples of KL(p(y|x) || p(y))) within the split.
        kl = part * (np.log(part) - np.log(np.mean(part, axis=0, keepdims=True)))
        kl = np.mean(np.sum(kl, axis=1))
        scores.append(np.exp(kl))
    return float(np.mean(scores)), float(np.std(scores))
def compute_kid(opts, max_real, num_gen, num_subsets, max_subset_size):
    """Compute the Kernel Inception Distance (scaled by 1000).

    Args:
        opts: metric options (dataset, generator, rank, ...).
        max_real: maximum number of real samples to use.
        num_gen: number of generated samples to use.
        num_subsets: number of random subsets the estimate is averaged over.
        max_subset_size: upper bound on the size of each subset.

    Returns:
        KID * 1000 as a float on rank 0, NaN on every other rank.
    """
    # Direct TorchScript translation of http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
    detector_url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/inception-2015-12-05.pt'
    detector_kwargs = dict(return_features=True) # Return raw features before the softmax layer.

    real_features = metric_utils.compute_feature_stats_for_dataset(
        opts=opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
        rel_lo=0, rel_hi=0, capture_all=True, max_items=max_real, use_image_dataset=True).get_all()

    # Honour `opts.generator_as_dataset` the same way the FID / IS code in this
    # package does; without it this metric always required a live generator.
    # The default (False) path is unchanged.
    if opts.generator_as_dataset:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_dataset
        gen_opts = metric_utils.rewrite_opts_for_gen_dataset(opts)
        gen_kwargs = dict(use_image_dataset=True)
    else:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_generator
        gen_opts = opts
        gen_kwargs = dict()

    gen_features = compute_gen_stats_fn(
        opts=gen_opts, detector_url=detector_url, detector_kwargs=detector_kwargs,
        rel_lo=0, rel_hi=1, capture_all=True, max_items=num_gen, **gen_kwargs).get_all()

    if opts.rank != 0:
        return float('nan')

    n = real_features.shape[1] # feature dimensionality
    m = min(min(real_features.shape[0], gen_features.shape[0]), max_subset_size) # subset size
    t = 0
    for _subset_idx in range(num_subsets):
        x = gen_features[np.random.choice(gen_features.shape[0], m, replace=False)]
        y = real_features[np.random.choice(real_features.shape[0], m, replace=False)]
        # Cubic polynomial kernel k(u, v) = (u.v / n + 1)^3.
        a = (x @ x.T / n + 1) ** 3 + (y @ y.T / n + 1) ** 3
        b = (x @ y.T / n + 1) ** 3
        # Within-set terms exclude the diagonal (hence the division by m - 1).
        t += (a.sum() - np.diag(a).sum()) / (m - 1) - b.sum() * 2 / m
    kid = t / num_subsets / m
    return float(kid) * 1000.0
def calc_metric(metric, num_runs: int=1, **kwargs): # See metric_utils.MetricOptions for the full list of arguments.
    """Run a registered metric `num_runs` times and return its results with metadata.

    Args:
        metric: name of a metric registered via `register_metric`.
        num_runs: how many times the metric function is evaluated.
        **kwargs: forwarded to `metric_utils.MetricOptions`.

    Returns:
        A `dnnlib.EasyDict` with keys `results`, `metric`, `total_time`,
        `total_time_str` and `num_gpus`. With `num_runs > 1`, `results` holds
        one `<key>_runNN` entry per run plus `<key>_mean` / `<key>_std`;
        otherwise it is the single run's result dict.
    """
    assert is_valid_metric(metric)
    opts = metric_utils.MetricOptions(**kwargs)

    # Calculate.
    start_time = time.time()
    all_runs_results = [_metric_dict[metric](opts) for _ in range(num_runs)]
    total_time = time.time() - start_time

    # Broadcast results.
    # With several GPUs, every value is replaced by the one held on rank 0.
    for results in all_runs_results:
        for key, value in list(results.items()):
            if opts.num_gpus > 1:
                value = torch.as_tensor(value, dtype=torch.float64, device=opts.device)
                torch.distributed.broadcast(tensor=value, src=0)
                value = float(value.cpu())
            results[key] = value

    if num_runs > 1:
        # Flatten the per-run dicts into one dict, suffixing each key with its 1-based run index.
        results = {f'{key}_run{i+1:02d}': value for i, results in enumerate(all_runs_results) for key, value in results.items()}
        # Aggregate each key across runs (keys are taken from the first run).
        for key, value in all_runs_results[0].items():
            all_runs_values = [r[key] for r in all_runs_results]
            results[f'{key}_mean'] = np.mean(all_runs_values)
            results[f'{key}_std'] = np.std(all_runs_values)
    else:
        results = all_runs_results[0]

    # Decorate with metadata.
    return dnnlib.EasyDict(
        results = dnnlib.EasyDict(results),
        metric = metric,
        total_time = total_time,
        total_time_str = dnnlib.util.format_time(total_time),
        num_gpus = opts.num_gpus,
    )
@register_metric
def fvd2048_128f_subsample8f(opts):
    """Similar to `fvd2048_128f`, but we sample each 8-th frame

    Calls `compute_fvd` with 2048 real and 2048 generated clips,
    `num_frames=16` and `subsample_factor=8`.
    NOTE(review): presumably 16 frames at stride 8 span ~128 source frames,
    hence the "128f" in the name — confirm against the dataset implementation.
    """
    # Use the whole dataset and disable x-flip augmentation for evaluation.
    opts.dataset_kwargs.update(max_size=None, xflip=False)
    fvd = frechet_video_distance.compute_fvd(opts, max_real=2048, num_gen=2048, num_frames=16, subsample_factor=8)
    return dict(fvd2048_128f_subsample8f=fvd)
@register_metric
def fid50k(opts):
    """Legacy FID: at most 50000 real samples against 50000 generated samples.

    Unlike `fid50k_full`, this caps the real samples at 50000 and leaves the
    dataset's `xflip` setting as configured.
    """
    # Lift any dataset size cap so up to `max_real` items are available.
    opts.dataset_kwargs.update(max_size=None)
    fid = frechet_inception_distance.compute_fid(opts, max_real=50000, num_gen=50000)
    return dict(fid50k=fid)
class MetricOptions:
    """Bundle of settings shared by all metric computations.

    Args:
        G: generator network, or None.
        G_kwargs: extra keyword arguments for the generator call (defaults to empty).
        dataset_kwargs: keyword arguments used to construct the dataset (defaults to empty).
        num_gpus: total number of participating processes/GPUs.
        rank: index of this process, `0 <= rank < num_gpus`.
        device: torch device; defaults to `cuda:<rank>`.
        progress: parent `ProgressMonitor`; only rank 0 derives a sub-monitor
            from it, other ranks get a fresh default monitor.
        cache: whether dataset feature statistics may be cached on disk.
        gen_dataset_kwargs: dataset kwargs for pre-generated samples (defaults to empty).
        generator_as_dataset: read generated samples from a dataset instead of
            running `G`.
    """

    def __init__(self, G=None, G_kwargs=None, dataset_kwargs=None, num_gpus=1, rank=0, device=None,
                 progress=None, cache=True, gen_dataset_kwargs=None, generator_as_dataset=False):
        assert 0 <= rank < num_gpus
        # `None` replaces the former `{}` defaults: a mutable default is created
        # once and shared by every call, and `gen_dataset_kwargs` was stored
        # as-is, so all instances aliased the same dict.
        self.G = G
        self.G_kwargs = dnnlib.EasyDict({} if G_kwargs is None else G_kwargs)
        self.dataset_kwargs = dnnlib.EasyDict({} if dataset_kwargs is None else dataset_kwargs)
        self.num_gpus = num_gpus
        self.rank = rank
        self.device = device if device is not None else torch.device('cuda', rank)
        self.progress = progress.sub() if progress is not None and rank == 0 else ProgressMonitor()
        self.cache = cache
        # A dict passed by the caller is kept by reference, exactly as before.
        self.gen_dataset_kwargs = {} if gen_dataset_kwargs is None else gen_dataset_kwargs
        self.generator_as_dataset = generator_as_dataset
capture_all=False, capture_mean_cov=False, max_items=None): + self.capture_all = capture_all + self.capture_mean_cov = capture_mean_cov + self.max_items = max_items + self.num_items = 0 + self.num_features = None + self.all_features = None + self.raw_mean = None + self.raw_cov = None + + def set_num_features(self, num_features): + if self.num_features is not None: + assert num_features == self.num_features + else: + self.num_features = num_features + self.all_features = [] + self.raw_mean = np.zeros([num_features], dtype=np.float64) + self.raw_cov = np.zeros([num_features, num_features], dtype=np.float64) + + def is_full(self): + return (self.max_items is not None) and (self.num_items >= self.max_items) + + def append(self, x): + x = np.asarray(x, dtype=np.float32) + assert x.ndim == 2 + if (self.max_items is not None) and (self.num_items + x.shape[0] > self.max_items): + if self.num_items >= self.max_items: + return + x = x[:self.max_items - self.num_items] + + self.set_num_features(x.shape[1]) + self.num_items += x.shape[0] + if self.capture_all: + self.all_features.append(x) + if self.capture_mean_cov: + x64 = x.astype(np.float64) + self.raw_mean += x64.sum(axis=0) + self.raw_cov += x64.T @ x64 + + def append_torch(self, x, num_gpus=1, rank=0): + assert isinstance(x, torch.Tensor) and x.ndim == 2 + assert 0 <= rank < num_gpus + if num_gpus > 1: + ys = [] + for src in range(num_gpus): + y = x.clone() + torch.distributed.broadcast(y, src=src) + ys.append(y) + x = torch.stack(ys, dim=1).flatten(0, 1) # interleave samples + self.append(x.cpu().numpy()) + + def get_all(self): + assert self.capture_all + return np.concatenate(self.all_features, axis=0) + + def get_all_torch(self): + return torch.from_numpy(self.get_all()) + + def get_mean_cov(self): + assert self.capture_mean_cov + mean = self.raw_mean / self.num_items + cov = self.raw_cov / self.num_items + cov = cov - np.outer(mean, mean) + return mean, cov + + def save(self, pkl_file): + with open(pkl_file, 'wb') 
as f: + pickle.dump(self.__dict__, f) + + @staticmethod + def load(pkl_file): + with open(pkl_file, 'rb') as f: + s = dnnlib.EasyDict(pickle.load(f)) + obj = FeatureStats(capture_all=s.capture_all, max_items=s.max_items) + obj.__dict__.update(s) + return obj + +#---------------------------------------------------------------------------- + +class ProgressMonitor: + def __init__(self, tag=None, num_items=None, flush_interval=1000, verbose=False, progress_fn=None, pfn_lo=0, pfn_hi=1000, pfn_total=1000): + self.tag = tag + self.num_items = num_items + self.verbose = verbose + self.flush_interval = flush_interval + self.progress_fn = progress_fn + self.pfn_lo = pfn_lo + self.pfn_hi = pfn_hi + self.pfn_total = pfn_total + self.start_time = time.time() + self.batch_time = self.start_time + self.batch_items = 0 + if self.progress_fn is not None: + self.progress_fn(self.pfn_lo, self.pfn_total) + + def update(self, cur_items: int): + assert (self.num_items is None) or (cur_items <= self.num_items), f"Wrong `items` values: cur_items={cur_items}, self.num_items={self.num_items}" + if (cur_items < self.batch_items + self.flush_interval) and (self.num_items is None or cur_items < self.num_items): + return + cur_time = time.time() + total_time = cur_time - self.start_time + time_per_item = (cur_time - self.batch_time) / max(cur_items - self.batch_items, 1) + if (self.verbose) and (self.tag is not None): + print(f'{self.tag:<19s} items {cur_items:<7d} time {dnnlib.util.format_time(total_time):<12s} ms/item {time_per_item*1e3:.2f}') + self.batch_time = cur_time + self.batch_items = cur_items + + if (self.progress_fn is not None) and (self.num_items is not None): + self.progress_fn(self.pfn_lo + (self.pfn_hi - self.pfn_lo) * (cur_items / self.num_items), self.pfn_total) + + def sub(self, tag=None, num_items=None, flush_interval=1000, rel_lo=0, rel_hi=1): + return ProgressMonitor( + tag = tag, + num_items = num_items, + flush_interval = flush_interval, + verbose = self.verbose, + 
@torch.no_grad()
def compute_feature_stats_for_dataset(
    opts, detector_url, detector_kwargs, rel_lo=0, rel_hi=1, batch_size=64,
    data_loader_kwargs=None, max_items=None, temporal_detector=False, use_image_dataset=False,
    feature_stats_cls=FeatureStats, **stats_kwargs):
    """Run a feature detector over a dataset and accumulate feature statistics.

    Args:
        opts: metric options (dataset kwargs, device, rank, num_gpus, cache, progress).
        detector_url: location of the feature detector to load.
        detector_kwargs: keyword arguments passed to every detector call.
        rel_lo, rel_hi: relative sub-range of the parent progress monitor used here.
        batch_size: data loader batch size.
        data_loader_kwargs: extra `DataLoader` kwargs; a default set is used when None.
        max_items: optional cap on the number of dataset items processed.
        temporal_detector: if True, batches are permuted with `(0, 2, 1, 3, 4)`
            before the detector call; otherwise they are flattened to 4-D.
        use_image_dataset: if True, the dataset kwargs are first converted with
            `video_to_image_dataset_kwargs`.
        feature_stats_cls: class used to accumulate (and load cached) statistics.
        **stats_kwargs: forwarded to `feature_stats_cls(...)`.

    Returns:
        The populated statistics object, or the cached one when a cache file exists.
    """

    dataset_kwargs = video_to_image_dataset_kwargs(opts.dataset_kwargs) if use_image_dataset else opts.dataset_kwargs
    dataset = dnnlib.util.construct_class_by_name(**dataset_kwargs)

    if data_loader_kwargs is None:
        data_loader_kwargs = dict(pin_memory=True, num_workers=3, prefetch_factor=2)

    # Try to lookup from cache.
    cache_file = None
    if opts.cache:
        # Choose cache file name.
        # NOTE: the key is built from the unconverted `opts.dataset_kwargs` and does
        # not include `max_items`, `use_image_dataset` or `temporal_detector`.
        args = dict(dataset_kwargs=opts.dataset_kwargs, detector_url=detector_url, detector_kwargs=detector_kwargs,
            stats_kwargs=stats_kwargs, feature_stats_cls=feature_stats_cls.__name__)
        md5 = hashlib.md5(repr(sorted(args.items())).encode('utf-8'))
        cache_tag = f'{dataset.name}-{get_feature_detector_name(detector_url)}-{md5.hexdigest()}'
        cache_file = dnnlib.make_cache_dir_path('gan-metrics', cache_tag + '.pkl')

        # Check if the file exists (all processes must agree).
        # Only rank 0 looks at the file system; its answer is broadcast to the others.
        flag = os.path.isfile(cache_file) if opts.rank == 0 else False
        if opts.num_gpus > 1:
            flag = torch.as_tensor(flag, dtype=torch.float32, device=opts.device)
            torch.distributed.broadcast(tensor=flag, src=0)
            flag = (float(flag.cpu()) != 0)

        # Load.
        if flag:
            return feature_stats_cls.load(cache_file)

    # Initialize.
    num_items = len(dataset)
    if max_items is not None:
        num_items = min(num_items, max_items)
    stats = feature_stats_cls(max_items=num_items, **stats_kwargs)
    progress = opts.progress.sub(tag='dataset features', num_items=num_items, rel_lo=rel_lo, rel_hi=rel_hi)
    detector = get_feature_detector(url=detector_url, device=opts.device, num_gpus=opts.num_gpus, rank=opts.rank, verbose=progress.verbose)

    # Main loop.
    # Each rank visits indices rank, rank + num_gpus, ... (wrapped modulo num_items),
    # so all ranks iterate over the same number of items.
    item_subset = [(i * opts.num_gpus + opts.rank) % num_items for i in range((num_items - 1) // opts.num_gpus + 1)]
    for batch in torch.utils.data.DataLoader(dataset=dataset, sampler=item_subset, batch_size=batch_size, **data_loader_kwargs):
        images = batch['image']
        if temporal_detector:
            images = images.permute(0, 2, 1, 3, 4).contiguous() # [batch_size, c, t, h, w]

            # images = images.float() / 255
            # images = torch.nn.functional.interpolate(images, size=(images.shape[2], 128, 128), mode='trilinear', align_corners=False) # downsample
            # images = torch.nn.functional.interpolate(images, size=(images.shape[2], 256, 256), mode='trilinear', align_corners=False) # upsample
            # images = (images * 255).to(torch.uint8)
        else:
            images = images.view(-1, *images.shape[-3:]) # [-1, c, h, w]

        # Single-channel input: repeat dim 1 three times before the detector call.
        if images.shape[1] == 1:
            images = images.repeat([1, 3, *([1] * (images.ndim - 2))])
        features = detector(images.to(opts.device), **detector_kwargs)
        stats.append_torch(features, num_gpus=opts.num_gpus, rank=opts.rank)
        progress.update(stats.num_items)

    # Save to cache.
    # Written by rank 0 only, to a uniquely named temp file that is then moved into place.
    if cache_file is not None and opts.rank == 0:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        temp_file = cache_file + '.' + uuid.uuid4().hex
        stats.save(temp_file)
        os.replace(temp_file, cache_file) # atomic
    return stats
def rewrite_opts_for_gen_dataset(opts):
    """Return a deep copy of `opts` re-pointed at the generated-samples dataset.

    On the copy, `dataset_kwargs` is replaced by `gen_dataset_kwargs` and `cache` is
    turned off. The `opts` object passed in is not modified.
    """
    gen_opts = copy.deepcopy(opts)
    gen_opts.cache = False
    gen_opts.dataset_kwargs = gen_opts.gen_dataset_kwargs
    return gen_opts
def compute_isv(opts, num_gen: int, num_splits: int, backbone: str):
    """Compute the video Inception Score of generated clips.

    Class probabilities for `num_gen` clips are obtained either from a folder of
    generated videos (`opts.generator_as_dataset`) or by sampling the generator, then
    split into `num_splits` chunks; each chunk yields exp(mean KL(p(y|x) || p(y))).

    Args:
        opts: Metric options; this function reads `dataset_kwargs.resolution`,
            `generator_as_dataset` and `rank`.
        num_gen: Number of generated clips to score.
        num_splits: Number of chunks the probabilities are divided into.
        backbone: Name of the classifier; only 'c3d_ucf101' is supported.

    Returns:
        `(mean, std)` of the per-split scores as floats, or `(nan, nan)` on ranks other than 0.

    Raises:
        NotImplementedError: If `backbone` is not supported.
    """
    if backbone == 'c3d_ucf101':
        # Perfectly reproduced torchscript version of the original chainer checkpoint:
        # https://github.com/pfnet-research/tgan2/blob/f892bc432da315d4f6b6ae9448f69d046ef6fe01/tgan2/models/c3d/c3d_ucf101.py
        # It is a UCF-101-finetuned C3D model.
        detector_url = 'https://www.dropbox.com/s/jxpu7avzdc9n97q/c3d_ucf101.pt?dl=1'
    else:
        raise NotImplementedError(f'Backbone {backbone} is not supported.')

    num_frames = 16
    # NOTE(review): `batch_size` is computed but never forwarded to the stats function
    # below, so its only effect is a KeyError for resolutions missing from
    # NUM_FRAMES_IN_BATCH. Confirm whether it was meant to be passed as `batch_size=`.
    batch_size = NUM_FRAMES_IN_BATCH[opts.dataset_kwargs.resolution] // num_frames

    if opts.generator_as_dataset:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_dataset
        gen_opts = metric_utils.rewrite_opts_for_gen_dataset(opts)
        gen_opts.dataset_kwargs.load_n_consecutive = num_frames
        gen_opts.dataset_kwargs.load_n_consecutive_random_offset = False
        gen_opts.dataset_kwargs.subsample_factor = 1
        gen_kwargs = dict()
    else:
        compute_gen_stats_fn = metric_utils.compute_feature_stats_for_generator
        gen_opts = opts
        gen_kwargs = dict(num_video_frames=num_frames, subsample_factor=1)

    gen_probs = compute_gen_stats_fn(
        opts=gen_opts, detector_url=detector_url, detector_kwargs={},
        capture_all=True, max_items=num_gen, temporal_detector=True, **gen_kwargs).get_all() # [num_gen, num_classes]

    if opts.rank != 0:
        return float('nan'), float('nan')

    scores = []
    # Fixed seed so the assignment of clips to splits is reproducible.
    np.random.RandomState(42).shuffle(gen_probs)
    for i in range(num_splits):
        part = gen_probs[i * num_gen // num_splits : (i + 1) * num_gen // num_splits]
        marginal = np.mean(part, axis=0, keepdims=True)
        # A probability of exactly 0 contributes 0 to the KL sum (lim p->0 of p*log p),
        # but evaluates to 0 * -inf = NaN numerically and would poison the whole score.
        # Entries with part > 0 are computed exactly as before.
        with np.errstate(divide='ignore', invalid='ignore'):
            kl = part * (np.log(part) - np.log(marginal))
        kl = np.where(part > 0, kl, 0.0)
        kl = np.mean(np.sum(kl, axis=1))
        scores.append(np.exp(kl))
    return float(np.mean(scores)), float(np.std(scores))
+ if args.num_gpus > 1: + init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init')) + if os.name == 'nt': + init_method = 'file:///' + init_file.replace('\\', '/') + torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus) + else: + init_method = f'file://{init_file}' + torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus) + + # Init torch_utils. + sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None + training_stats.init_multiprocessing(rank=rank, sync_device=sync_device) + if rank != 0 or not args.verbose: + custom_ops.verbosity = 'none' + + # Print network summary. + device = torch.device('cuda', rank) + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + G = copy.deepcopy(args.G).eval().requires_grad_(False).to(device) + if rank == 0 and args.verbose: + z = torch.empty([8, G.z_dim], device=device) + c = torch.empty([8, G.c_dim], device=device) + t = torch.zeros([8, G.cfg.sampling.num_frames_per_video], device=device).long() + misc.print_module_summary(G, [z, c, t]) + + # Calculate each metric. + for metric in args.metrics: + if rank == 0 and args.verbose: + print(f'Calculating {metric}...') + progress = metric_utils.ProgressMonitor(verbose=args.verbose) + result_dict = metric_main.calc_metric( + metric=metric, + G=G, + dataset_kwargs=args.dataset_kwargs, + num_gpus=args.num_gpus, + rank=rank, + device=device, + progress=progress, + cache=args.use_cache, + num_runs=(1 if metric == 'fid50k_full' else args.num_runs), + ) + if rank == 0: + metric_main.report_metric(result_dict, run_dir=args.run_dir, snapshot_pkl=args.network_pkl) + if rank == 0 and args.verbose: + print() + + # Done. 
class CommaSeparatedList(click.ParamType):
    """Click parameter type that turns a comma-separated string into a list of strings."""
    name = 'list'

    def convert(self, value, param, ctx):
        """Convert a raw option value into a list.

        Args:
            value: The raw value; a string from the command line, None, or a value that
                has already been converted to a list/tuple.
            param: Unused.
            ctx: Unused.

        Returns:
            An empty list for None, '' or 'none' (case-insensitive); a copy of `value` if
            it is already a list/tuple; otherwise `value` split on commas.
        """
        _ = param, ctx
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            # Click may call convert() with a value that already has the target type;
            # calling .lower() on it would raise AttributeError.
            return list(value)
        if value.lower() == 'none' or value == '':
            return []
        return value.split(',')
+ + Examples: + + \b + # Previous training run: look up options automatically, save result to JSONL file. + python calc_metrics.py --metrics=pr50k3_full \\ + --network=~/training-runs/00000-ffhq10k-res64-auto1/network-snapshot-000000.pkl + + \b + # Pre-trained network pickle: specify dataset explicitly, print result to stdout. + python calc_metrics.py --metrics=fid50k_full --data=~/datasets/ffhq.zip --mirror=1 \\ + --network=https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/ffhq.pkl + + Available metrics: + + \b + ADA paper: + fid50k_full Frechet inception distance against the full dataset. + kid50k_full Kernel inception distance against the full dataset. + pr50k3_full Precision and recall againt the full dataset. + is50k Inception score for CIFAR-10. + + \b + StyleGAN and StyleGAN2 papers: + fid50k Frechet inception distance against 50k real images. + kid50k Kernel inception distance against 50k real images. + pr50k3 Precision and recall against 50k real images. + ppl2_wend Perceptual path length in W at path endpoints against full image. + ppl_zfull Perceptual path length in Z for full paths against cropped image. + ppl_wfull Perceptual path length in W for full paths against cropped image. + ppl_zend Perceptual path length in Z at path endpoints against cropped image. + ppl_wend Perceptual path length in W at path endpoints against cropped image. 
+ """ + dnnlib.util.Logger(should_flush=True) + + if network_pkl is None: + output_regex = "^network-snapshot-\d{6}.pkl$" + ckpt_regex = re.compile("^network-snapshot-\d{6}.pkl$") + # ckpts = sorted([f for f in os.listdir(networks_dir) if ckpt_regex.match(f)]) + # network_pkl = os.path.join(networks_dir, ckpts[-1]) + metrics_file = os.path.join(networks_dir, 'metric-fvd2048_16f.jsonl') + with open(metrics_file, 'r') as f: + snapshot_metrics_vals = [json.loads(line) for line in f.read().splitlines()] + best_snapshot = sorted(snapshot_metrics_vals, key=lambda m: m['results']['fvd2048_16f'])[0] + network_pkl = os.path.join(networks_dir, best_snapshot['snapshot_pkl']) + print(f'Using checkpoint: {network_pkl} with FVD16 of', best_snapshot['results']['fvd2048_16f']) + # Selecting a checkpoint with the best score + + # Validate arguments. + args = dnnlib.EasyDict(metrics=metrics, num_gpus=gpus, network_pkl=network_pkl, verbose=verbose) + if cfg_path == "auto": + # Assuming that `network_pkl` has the structure /path/to/experiment/output/network-X.pkl + output_path = os.path.dirname(network_pkl) + assert os.path.basename(output_path) == "output", f"Unknown path structure: {output_path}" + experiment_path = os.path.dirname(output_path) + cfg_path = os.path.join(experiment_path, 'experiment_config.yaml') + + cfg = OmegaConf.load(cfg_path) + if not all(metric_main.is_valid_metric(metric) for metric in args.metrics): + ctx.fail('\n'.join(['--metrics can only contain the following values:'] + metric_main.list_valid_metrics())) + if not args.num_gpus >= 1: + ctx.fail('--gpus must be at least 1') + + # Load network. 
+ if not dnnlib.util.is_url(network_pkl, allow_file_urls=True) and not os.path.isfile(network_pkl): + ctx.fail('--network must point to a file or URL') + if args.verbose: + print(f'Loading network from "{network_pkl}"...') + with dnnlib.util.open_url(network_pkl, verbose=args.verbose) as f: + network_dict = legacy.load_network_pkl(f) + args.G = network_dict['G_ema'] # subclass of torch.nn.Module + + from src.training.networks import Generator + G = args.G + G.cfg.z_dim = G.z_dim + G_new = Generator( + w_dim=G.cfg.w_dim, + mapping_kwargs=dnnlib.EasyDict(num_layers=G.cfg.get('mapping_net_n_layers', 2), cfg=G.cfg), + synthesis_kwargs=dnnlib.EasyDict( + channel_base=int(G.cfg.get('fmaps', 0.5) * 32768), + channel_max=G.cfg.get('channel_max', 512), + num_fp16_res=4, + conv_clamp=256, + ), + cfg=G.cfg, + img_resolution=256, + img_channels=3, + c_dim=G.cfg.c_dim, + ).eval() + G_new.load_state_dict(G.state_dict()) + args.G = G_new + + # Initialize dataset options. + if data is not None: + args.dataset_kwargs = dnnlib.EasyDict(class_name='training.dataset.VideoFramesFolderDataset', cfg=cfg.dataset, path=data) + elif network_dict['training_set_kwargs'] is not None: + args.dataset_kwargs = dnnlib.EasyDict(network_dict['training_set_kwargs']) + else: + ctx.fail('Could not look up dataset options; please specify --data') + + # Finalize dataset options. + args.dataset_kwargs.resolution = args.G.img_resolution + args.dataset_kwargs.use_labels = (args.G.c_dim != 0) + if mirror is not None: + args.dataset_kwargs.xflip = mirror + args.use_cache = use_cache + args.num_runs = num_runs + + # Print dataset options. + if args.verbose: + print('Dataset options:') + print(cfg.dataset) + + # Locate run dir. + args.run_dir = None + if os.path.isfile(network_pkl): + pkl_dir = os.path.dirname(network_pkl) + if os.path.isfile(os.path.join(pkl_dir, 'training_options.json')): + args.run_dir = pkl_dir + + # Launch processes. 
+ if args.verbose: + print('Launching processes...') + torch.multiprocessing.set_start_method('spawn') + with tempfile.TemporaryDirectory() as temp_dir: + if args.num_gpus == 1: + subprocess_fn(rank=0, args=args, temp_dir=temp_dir) + else: + torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + calc_metrics() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/src/scripts/calc_metrics_for_dataset.py b/src/scripts/calc_metrics_for_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..00e6b6ca5488cd2263be7f1e07e93048d8dd5ec9 --- /dev/null +++ b/src/scripts/calc_metrics_for_dataset.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +"""Calculate quality metrics for previous training run or pretrained network pickle.""" + +import sys; sys.path.extend(['.', 'src']) +import os +import click +import tempfile +import torch +from omegaconf import OmegaConf +from src import dnnlib + +from metrics import metric_main +from metrics import metric_utils +from src.torch_utils import training_stats +from src.torch_utils import custom_ops + +#---------------------------------------------------------------------------- + +def subprocess_fn(rank, args, temp_dir): + dnnlib.util.Logger(should_flush=True) + + # Init torch.distributed. 
+ if args.num_gpus > 1: + init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init')) + if os.name == 'nt': + init_method = 'file:///' + init_file.replace('\\', '/') + torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus) + else: + init_method = f'file://{init_file}' + torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus) + + # Init torch_utils. + sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None + training_stats.init_multiprocessing(rank=rank, sync_device=sync_device) + if rank != 0 or not args.verbose: + custom_ops.verbosity = 'none' + + # Print network summary. + device = torch.device('cuda', rank) + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + + # Calculate each metric. + for metric in args.metrics: + if rank == 0 and args.verbose: + print(f'Calculating {metric}...') + progress = metric_utils.ProgressMonitor(verbose=args.verbose) + result_dict = metric_main.calc_metric( + metric=metric, + dataset_kwargs=args.dataset_kwargs, + gen_dataset_kwargs=args.gen_dataset_kwargs, + generator_as_dataset=args.generator_as_dataset, + num_gpus=args.num_gpus, + rank=rank, + device=device, + progress=progress, + cache=args.use_cache, + num_runs=args.num_runs, + ) + + if rank == 0: + metric_main.report_metric(result_dict, run_dir=args.run_dir) + + if rank == 0 and args.verbose: + print() + + # Done. 
def calc_metrics_for_dataset(ctx, metrics, real_data_path, fake_data_path, mirror, resolution, gpus, verbose, use_cache: bool, num_runs: int):
    """Compute the requested metrics between a folder of real data and a folder of generated data.

    Builds the options object consumed by `subprocess_fn` (two `VideoFramesFolderDataset`
    configurations, with `generator_as_dataset` enabled), then runs `subprocess_fn`
    in-process for a single GPU or spawns one process per GPU otherwise.

    Args:
        ctx: Click context; `ctx.fail` is used to report invalid arguments.
        metrics: Metric names to compute.
        real_data_path: Location of the real dataset.
        fake_data_path: Location of the generated dataset.
        mirror: `xflip` setting for the real dataset (the generated one always uses False).
        resolution: Resolution passed to both datasets.
        gpus: Number of GPUs / worker processes; must be at least 1.
        verbose: Whether to print the dataset options and progress messages.
        use_cache: Stored as `args.use_cache` for the workers.
        num_runs: Stored as `args.num_runs` for the workers.
    """
    dnnlib.util.Logger(should_flush=True)

    # Gather the basic options and reject bad CLI values before doing any work.
    args = dnnlib.EasyDict(metrics=metrics, num_gpus=gpus, verbose=verbose)
    if any(not metric_main.is_valid_metric(metric_name) for metric_name in args.metrics):
        usage_lines = ['--metrics can only contain the following values:'] + metric_main.list_valid_metrics()
        ctx.fail('\n'.join(usage_lines))
    if not args.num_gpus >= 1:
        ctx.fail('--gpus must be at least 1')

    # Both datasets share the same placeholder dataset config.
    placeholder_cfg = OmegaConf.create({'max_num_frames': 10000})

    def folder_dataset_kwargs(path, xflip):
        # Same construction for real and generated data; only path and xflip differ.
        return dnnlib.EasyDict(
            class_name='training.dataset.VideoFramesFolderDataset',
            path=path,
            cfg=placeholder_cfg,
            xflip=xflip,
            resolution=resolution,
            use_labels=False,
        )

    args.dataset_kwargs = folder_dataset_kwargs(real_data_path, mirror)
    args.gen_dataset_kwargs = folder_dataset_kwargs(fake_data_path, False)
    args.generator_as_dataset = True

    if args.verbose:
        for heading, options in (('Real data options:', args.dataset_kwargs), ('Fake data options:', args.gen_dataset_kwargs)):
            print(heading)
            print(options)

    # There is no training run directory when evaluating plain folders.
    args.run_dir = None
    args.use_cache = use_cache
    args.num_runs = num_runs

    if args.verbose:
        print('Launching processes...')
    torch.multiprocessing.set_start_method('spawn')
    with tempfile.TemporaryDirectory() as temp_dir:
        if args.num_gpus != 1:
            torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus)
        else:
            subprocess_fn(rank=0, args=args, temp_dir=temp_dir)
def get_lr(t, initial_lr, rampdown=0.25, rampup=0.05):
    """Return the learning rate at normalized time `t` for a warm-up / cosine ramp-down schedule.

    The rate is `initial_lr` scaled by two factors: a linear warm-up `min(1, t / rampup)`
    and a cosine ramp-down driven by `min(1, (1 - t) / rampdown)`.
    """
    decay_phase = min(1, (1 - t) / rampdown)
    cosine_gain = 0.5 - 0.5 * math.cos(decay_phase * math.pi)
    warmup_gain = min(1, t / rampup)
    return initial_lr * (cosine_gain * warmup_gain)
def run_edit_optimization(
    _sentinel=None,
    G: nn.Module=None,
    w_orig: Tensor=None,
    descriptions: List[str]=None,
    lr: float=0.1,
    num_steps: int=40,
    l2_lambda: float=0.001,
    id_lambda: float=0.005,
    mask: Tensor=None,
    mask_lambda: float=0.0,
    verbose: bool=False,
) -> tuple:
    """Optimize latent codes so the synthesized images match text prompts under a CLIP loss.

    Every prompt is paired with every latent code: the codes are tiled `num_prompts`
    times and each prompt's tokens are repeated once per code, so row
    `p * num_images + i` pairs prompt `p` with code `i`.

    Args:
        _sentinel: Must stay None; forces all real arguments to be passed by keyword.
        G: Generator exposing `synthesis(ws=..., c=..., t=...)`.
        w_orig: Starting latent codes (indexed by image along dim 0).
        descriptions: Text prompts.
        lr: Peak learning rate; the per-step rate comes from `get_lr`.
        num_steps: Number of Adam steps.
        l2_lambda: Weight of the squared distance between edited and original codes.
        id_lambda: Weight of the identity loss; the loss is skipped when not positive.
        mask: Optional tensor multiplied into the generated images before the losses.
        mask_lambda: Weight of the masked perceptual loss; must not be positive because
            that loss is not implemented.
        verbose: Show a tqdm progress bar with the current loss.

    Returns:
        `(final_result, w)`: `final_result` stacks the original images and the images
        synthesized in the last iteration (i.e. before the final optimizer step);
        `w` is the optimized latent tensor.

    Raises:
        NotImplementedError: If `mask_lambda > 0`.
    """
    assert _sentinel is None
    if mask_lambda > 0:
        # This path used to reference an undefined `vgg16` and then hit a bare
        # `raise NotImplementedError` inside the loop; fail up front, before any model
        # is loaded.
        raise NotImplementedError('The masked perceptual loss (mask_lambda > 0) is not implemented.')

    num_prompts = len(descriptions)
    num_images = len(w_orig)
    device = w_orig.device

    text_inputs = clip.tokenize(descriptions).to(device) # [num_prompts, 77]
    text_inputs = text_inputs.repeat_interleave(num_images, dim=0) # [num_prompts * num_images, 77]

    c = torch.zeros(num_prompts * num_images, 0, device=device)
    ts = torch.zeros(num_prompts * num_images, 1, device=device)
    w_orig = w_orig.repeat(num_prompts, 1, 1) # [num_prompts * num_images, num_ws, w_dim]

    with torch.no_grad():
        img_orig = G.synthesis(ws=w_orig, c=c, t=ts)

    w = w_orig.detach().clone() # [num_prompts * num_images, num_ws, w_dim]
    w.requires_grad = True

    clip_loss = CLIPLoss()
    id_loss = IDLoss()
    optimizer = optim.Adam([w], lr=lr)

    pbar = tqdm(range(num_steps)) if verbose else range(num_steps)

    # With num_steps == 0 the loop body never runs; fall back to the unedited images
    # instead of failing on an unbound `img_gen` below.
    img_gen = img_orig
    for curr_iter in pbar:
        curr_lr = get_lr(curr_iter / num_steps, lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = curr_lr

        img_gen = G.synthesis(ws=w, c=c, t=ts)
        if mask is not None:
            img_gen = img_gen * mask.unsqueeze(0)

        c_loss = clip_loss(img_gen, text_inputs) # [num_prompts * num_images]
        i_loss = id_loss(img_gen, img_orig) if id_lambda > 0 else 0
        l2_loss = (w_orig - w) ** 2 # element-wise; summed below
        loss = c_loss.sum() + l2_lambda * l2_loss.sum() + id_lambda * i_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if verbose:
            pbar.set_description((f"loss: {loss.item():.4f};"))

    final_result = torch.stack([img_orig, img_gen])

    return final_result, w
truncation to use.', type=float, required=False, metavar='FLOAT', default=1.0) +@click.option('--num_w', help='If we use new w, how many to sample?', type=int, required=False, metavar='FLOAT', default=16) +@click.option('--prompts', help='A path to prompts or a string of prompts.', type=str, required=True, metavar='DIR') +@click.option('--seed', type=int, help='Random seed', default=42, metavar='DIR') +@click.option('--zero_periods', help='Zero-out periods predictor?', default=False, type=bool, metavar='BOOL') +@click.option('--num_weights_to_slice', help='Number of high-frequency coords to remove.', default=0, type=int, metavar='INT') +@click.option('--num_steps', help='Number of the optimization steps to perform.', default=40, type=int, metavar='INT') +@click.option('--stack_samples', help='When saving, should we stack samples together?', default=False, type=bool, metavar='BOOL') +# l2_lambda=0.001, +# id_lambda=0.005, +# l2_lambda=0.0005, +# id_lambda=0.0, +@click.option('--l2_lambda', help='L2 loss coef', default=0.001, type=float, metavar='FLOAT') +@click.option('--id_lambda', help='ID loss coef', default=0.005, type=float, metavar='FLOAT') +@click.option('--lr', help='Learning rate', default=0.1, type=float, metavar='FLOAT') +@click.option('--mask_lambda', help='If we use a mask, specify the loss coef', default=0.0, type=float, metavar='FLOAT') +@click.option('--use_id_lambda', help='Should we use id lambda in HPO?', default=False, type=bool, metavar='BOOL') + +def main( + ctx: click.Context, + network_pkl: str, + networks_dir: str, + seed: int, + w_dir: str, + results_dir: str, + truncation_psi: float, + num_w: int, + # save_as_mp4: bool, + # video_len: int, + # fps: int, + # as_grids: bool, + zero_periods: bool, + num_weights_to_slice: int, + num_steps: int, + stack_samples: bool, + l2_lambda: float, + id_lambda: float, + lr: float, + prompts: str, + mask_lambda: float, + use_id_lambda: bool, +): + if network_pkl is None: + output_regex = 
"^network-snapshot-\d{6}.pkl$" + ckpt_regex = re.compile("^network-snapshot-\d{6}.pkl$") + # ckpts = sorted([f for f in os.listdir(networks_dir) if ckpt_regex.match(f)]) + # network_pkl = os.path.join(networks_dir, ckpts[-1]) + metrics_file = os.path.join(networks_dir, 'metric-fvd2048_16f.jsonl') + with open(metrics_file, 'r') as f: + snapshot_metrics_vals = [json.loads(line) for line in f.read().splitlines()] + best_snapshot = sorted(snapshot_metrics_vals, key=lambda m: m['results']['fvd2048_16f'])[0] + network_pkl = os.path.join(networks_dir, best_snapshot['snapshot_pkl']) + print(f'Using checkpoint: {network_pkl} with FVD16 of', best_snapshot['results']['fvd2048_16f']) + # Selecting a checkpoint with the best score + else: + assert networks_dir is None, "Cant have both parameters: network_pkl and networks_dir" + + print('Loading networks from "%s"...' % network_pkl, end='') + device = torch.device('cuda') + with dnnlib.util.open_url(network_pkl) as f: + G = legacy.load_network_pkl(f)['G_ema'].to(device).eval() # type: ignore + print('Loaded!') + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + if zero_periods: + G.synthesis.motion_encoder.time_encoder.periods_predictor.weight.data.zero_() + + if num_weights_to_slice > 0: + G.synthesis.motion_encoder.time_encoder.weights[:, -num_weights_to_slice:] = 0.0 + + # description = "Bright sunny sky and mountains far away" + # experiment_type = 'edit' #@param ['edit', 'free_generation'] + # mask = torch.zeros(3, 256, 256, device=device) + # mask[:, :, 64+32 : 128+32] = 1.0 + # mask[:, :-128, :] = 1.0 + # mask[:, :, 128:] = 1.0 + + if w_dir is None: + print('Sampling new w') + z = torch.randn(num_w, G.z_dim, device=device) + c = torch.zeros(len(z), G.c_dim, device=device) + w_orig = G.mapping(z=z, c=c, truncation_psi=truncation_psi) + os.makedirs(results_dir, exist_ok=True) + torch.save(w_orig.cpu(), f'{results_dir}_w_orig.pt') + w_save_dir = os.path.join(results_dir, 'w_edit') + samples_save_dir = 
os.path.join(results_dir, 'edited_samples') + else: + w_paths = sorted([os.path.join(w_dir, f) for f in os.listdir(w_dir) if f.endswith('_w.pt')]) + w_names = [os.path.basename(f) for f in w_paths] + w_orig = [torch.load(f) for f in w_paths] + w_orig = torch.stack(w_orig).to(device) # [num_images, num_ws, w_dim] + w_save_dir = f'{w_dir}_edited_w' + samples_save_dir = f'{w_dir}_edited_samples' + + os.makedirs(w_save_dir, exist_ok=True) + os.makedirs(samples_save_dir, exist_ok=True) + + print(f'Loading prompts from file: {prompts}') + with open(prompts, 'r') as f: + descs_dict = yaml.load(f) + edit_names, descriptions = list(zip(*descs_dict.items())) + edit_names = edit_names + descriptions = descriptions + + del id_lambda, num_steps, l2_lambda + l2_lambdas = [1000000.0, 0.0025, 0.001, 0.00025, 0.0005, 0.0001] + if use_id_lambda: + id_lambdas = [0.005, 0.0025, 0.001, 0.00025, 0.0005, 0.0001, 0.0] + else: + id_lambdas = [0.0] + all_num_steps = [40] + + for curr_edit_name, curr_prompt in zip(edit_names, descriptions): + all_images = [] + all_w_edited = [] + + for l2_lambda, id_lambda, num_steps in tqdm(list(itertools.product(l2_lambdas, id_lambdas, all_num_steps)), desc=f'Performing HPO for {curr_edit_name}'): + final_image, w_edited = run_edit_optimization( + G=G, + w_orig=w_orig, + descriptions=[curr_prompt], + # ckpt="stylegan2-ffhq-config-f.pt", + lr=lr, + num_steps=num_steps, + l2_lambda=l2_lambda, + id_lambda=id_lambda, + mask_lambda=mask_lambda, + verbose=False, + # latent_path=latent_path, + # truncation=0.7, + # mask=None, + # mask_lambda=0.1, + ) + + all_images.extend((final_image[1].cpu() * 0.5 + 0.5).clamp(0, 1)) + all_w_edited.append({ + "w_edit": w_edited.cpu(), + "l2_lambda": l2_lambda, + "id_lambda": id_lambda, + "num_steps": num_steps, + "prompt": curr_prompt, + "edit_name": curr_edit_name, + }) + + # img_names = [f'{w_name}_{edit_name}' for edit_name in edit_names for w_name in w_names] + + # save_edited_w( + # G=G, + # w_outdir = f'{w_dir}_edited', + 
# samples_outdir = f'{w_dir}_projected_samples', + # img_names=img_names, + # stack_samples=stack_samples, + # all_w = w_edited, + # all_motion_z = None, + # stacked_samples_out_path = f'{w_dir}_edited_samples.png' + # ) + + torch.save(all_w_edited, f"{w_save_dir}/{curr_edit_name}_w.pt") + grid = utils.make_grid(torch.stack(all_images), nrow=len(w_orig)) + print('savig intp', f"{samples_save_dir}/{curr_edit_name}.png") + save_image(grid, f"{samples_save_dir}/{curr_edit_name}.png") + + print('Done!') + + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/src/scripts/construct_static_videos_dataset.py b/src/scripts/construct_static_videos_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..f40c24edda23d1eda25e9c77d5aa98d8bba6f888 --- /dev/null +++ b/src/scripts/construct_static_videos_dataset.py @@ -0,0 +1,46 @@ +""" +Takes a dataset directory and repeats the frames to include only a random frame from each video +This is needed to calculate same-frame FVD and DiFID +""" +import os +import random +import argparse +from typing import List +import shutil +from tqdm import tqdm + + +def construct_static_videos_dataset(videos_dir: os.PathLike, max_len: int=None, output_dir: os.PathLike=None, force_len: int=None): + output_dir = output_dir if not output_dir is None else f'{videos_dir}_freeze' + clips_paths = [os.path.join(videos_dir, d) for d in os.listdir(videos_dir)] + + print(f'Saving into {output_dir}') + + for video_idx, clip_path in enumerate(tqdm(clips_paths)): + frames_paths = os.listdir(clip_path) + frame_to_repeat = random.choice(frames_paths) + curr_output_dir = os.path.join(output_dir, f'{video_idx:05d}') + os.makedirs(curr_output_dir, exist_ok=True) + num_frames_to_create = force_len if not force_len is None else 
min(len(frames_paths), max_len) + + for i in range(num_frames_to_create): + ext = os.path.splitext(frame_to_repeat)[1].lower() + target_file_path = os.path.join(curr_output_dir, f'{i:06d}{ext}') + shutil.copy(os.path.join(clip_path, frame_to_repeat), target_file_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--directory', type=str, help='Directory with video frames') + parser.add_argument('-o', '--output_dir', type=None, help='Where to save the file?.') + parser.add_argument('-l', '--max_len', type=int, help='Max video length') + parser.add_argument('-fl', '--force_len', type=int, help='Force video length') + + args = parser.parse_args() + + construct_static_videos_dataset( + videos_dir=args.directory, + max_len=args.max_len, + output_dir=args.output_dir, + force_len=args.force_len, + ) diff --git a/src/scripts/convert_video_to_dataset.py b/src/scripts/convert_video_to_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2f8653f68934dee191beb07f2d5677c319dccba6 --- /dev/null +++ b/src/scripts/convert_video_to_dataset.py @@ -0,0 +1,87 @@ +""" +Converts a dataset of mp4 videos into a dataset of video frames +I.e. 
a directory of mp4 files becomes a directory of directories of frames +This speeds up loading during training because we do not need +""" +import os +from typing import List +import argparse +from pathlib import Path +from multiprocessing import Pool +from collections import Counter + +import numpy as np +from PIL import Image +import torchvision.transforms.functional as TVF +from moviepy.editor import VideoFileClip +from tqdm import tqdm + + +def convert_videos_into_dataset(video_path: os.PathLike, target_dir: os.PathLike, num_chunks: int, chunk_size: int, start_frame: int, target_size: int, force_fps: int): + assert (num_chunks is None) or (chunk_size is None), "Cant use both num_chunks and chunk_size" + + os.makedirs(target_dir, exist_ok=True) + clip = VideoFileClip(video_path) + fps = clip.fps if force_fps is None else force_fps + num_frames_total = int(np.floor(clip.duration * fps)) - start_frame + + if num_chunks is None: + num_chunks = num_frames_total // chunk_size + else: + chunk_size = num_frames_total // num_chunks + + num_frames_to_save = chunk_size * num_chunks + + print(f'Processing the video at {fps} fps. {num_frames_total} frames in total. 
We have {num_chunks} videos of {chunk_size} frames each.') + + current_chunk_idx = 0 + frame_idx = -start_frame + curr_chunk_dir = os.path.join(target_dir, f'{current_chunk_idx:06d}') + + for frame in tqdm(clip.iter_frames(fps=fps), total=num_frames_total + start_frame): + if frame_idx >= 0: + os.makedirs(curr_chunk_dir, exist_ok=True) + frame = Image.fromarray(frame) + frame = TVF.center_crop(frame, output_size=min(frame.size)) + frame = TVF.resize(frame, size=target_size, interpolation=Image.LANCZOS) + frame.save(os.path.join(curr_chunk_dir, f'{frame_idx % chunk_size:06d}.jpg'), q=95) + + frame_idx += 1 + if frame_idx % chunk_size == 0 and frame_idx > 0: + current_chunk_idx += 1 + curr_chunk_dir = os.path.join(target_dir, f'{current_chunk_idx:06d}') + + if frame_idx == num_frames_to_save: + # Stop here so not to have a partially-filled chunk + break + + chunk_sizes = [len(os.listdir(d)) for d in listdir_full_paths(target_dir)] + assert len(set(chunk_sizes)) == 1, f"Bad chunk sizes: {set(chunk_sizes)}" + + print('Finished successfully!') + + +def listdir_full_paths(d) -> List[os.PathLike]: + return sorted([os.path.join(d, x) for x in os.listdir(d)]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert a long video into a dataset of frame dirs') + parser.add_argument('-s', '--source_video_path', type=str, help='Path to the source video') + parser.add_argument('-t', '--target_dir', type=str, help='Where to save the new dataset') + parser.add_argument('-n', '--num_chunks', type=int, help='How many samples should there be in the dataset?') + parser.add_argument('-cs', '--chunk_size', type=int, help='Each video length. Should be used separately from num_chunks') + parser.add_argument('-sf', '--start_frame', type=int, default=0, help='Start frame idx. 
Should we skip several frames?') + parser.add_argument('--target_size', type=int, default=128, help='What size should we resize to?') + parser.add_argument('--force_fps', type=int, help='What fps should we run videos with?') + args = parser.parse_args() + + convert_videos_into_dataset( + video_path=args.source_video_path, + target_dir=args.target_dir, + num_chunks=args.num_chunks, + chunk_size=args.chunk_size, + start_frame=args.start_frame, + target_size=args.target_size, + force_fps=args.force_fps, + ) diff --git a/src/scripts/convert_videos_to_frames.py b/src/scripts/convert_videos_to_frames.py new file mode 100644 index 0000000000000000000000000000000000000000..be16d1c39144d3a3c759c0c7912cf17859bd5f56 --- /dev/null +++ b/src/scripts/convert_videos_to_frames.py @@ -0,0 +1,105 @@ +""" +Converts a dataset of mp4 videos into a dataset of video frames +I.e. a directory of mp4 files becomes a directory of directories of frames +This speeds up loading during training because we do not need +""" +import os +from typing import List +import argparse +from pathlib import Path +from multiprocessing import Pool +from collections import Counter + +from PIL import Image +import torchvision.transforms.functional as TVF +from moviepy.editor import VideoFileClip +from tqdm import tqdm + + +def convert_videos_to_frames(source_dir: os.PathLike, target_dir: os.PathLike, num_workers: int, video_ext: str, **process_video_kwargs): + broken_clips_dir = f'{target_dir}_broken_clips' + os.makedirs(target_dir, exist_ok=True) + os.makedirs(broken_clips_dir, exist_ok=True) + + clips_paths = [cp for cp in listdir_full_paths(source_dir) if cp.endswith(video_ext)] + clips_fps = [] + tasks_kwargs = [dict( + clip_path=cp, + target_dir=target_dir, + broken_clips_dir=broken_clips_dir, + **process_video_kwargs, + ) for cp in clips_paths] + pool = Pool(processes=num_workers) + + for fps in tqdm(pool.imap_unordered(task_proxy, tasks_kwargs), total=len(clips_paths)): + clips_fps.append(fps) + + 
print(f'All possible fps: {Counter(clips_fps).most_common()}') + + +def task_proxy(kwargs): + """I do not know, how to pass several arguments to a pool job...""" + return process_video(**kwargs) + + +def process_video( + clip_path: os.PathLike, target_dir: os.PathLike, force_fps: int=None, target_size: int=None, + broken_clips_dir: os.PathLike=None, compute_fps_only: bool=False) -> int: + + clip_name = os.path.basename(clip_path) + clip_name = clip_name[:clip_name.rfind('.')] + + try: + clip = VideoFileClip(clip_path) + except KeyboardInterrupt: + raise + except Exception as e: + print(f'Coudnt process clip: {clip_path}') + if not broken_clips_dir is None: + Path(os.path.join(broken_clips_dir, clip_name)).touch() + return 0 + + if compute_fps_only: + return clip.fps + + fps = clip.fps if force_fps is None else force_fps + clip_target_dir = os.path.join(target_dir, clip_name) + clip_target_dir = clip_target_dir.replace('#', '_') + os.makedirs(clip_target_dir, exist_ok=True) + + frame_idx = 0 + for frame in clip.iter_frames(fps=fps): + frame = Image.fromarray(frame) + if not target_size is None: + frame = TVF.resize(frame, size=target_size, interpolation=Image.LANCZOS) + frame = TVF.center_crop(frame, output_size=(target_size, target_size)) + frame.save(os.path.join(clip_target_dir, f'{frame_idx:06d}.jpg'), q=95) + frame_idx += 1 + + return clip.fps + + +def listdir_full_paths(d) -> List[os.PathLike]: + return sorted([os.path.join(d, x) for x in os.listdir(d)]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert a dataset of mp4 files into a dataset of individual frames') + parser.add_argument('-s', '--source_dir', type=str, help='Path to the source dataset') + parser.add_argument('-t', '--target_dir', type=str, help='Where to save the new dataset') + parser.add_argument('--video_ext', type=str, default='mp4', help='Video extension') + parser.add_argument('--target_size', type=int, default=128, help='What size should we resize 
to?') + parser.add_argument('--force_fps', type=int, help='What fps should we run videos with?') + parser.add_argument('--num_workers', type=int, default=8, help='Number of processes to launch') + parser.add_argument('--compute_fps_only', action='store_true', help='Should we just compute fps?') + args = parser.parse_args() + + convert_videos_to_frames( + source_dir=args.source_dir, + target_dir=args.target_dir, + target_size=args.target_size, + force_fps=args.force_fps, + num_workers=args.num_workers, + video_ext=args.video_ext, + compute_fps_only=args.compute_fps_only, + ) diff --git a/src/scripts/crop_video_dataset.py b/src/scripts/crop_video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..f24d349573efb3d0ed31ce390f2fb14b8ad3b58e --- /dev/null +++ b/src/scripts/crop_video_dataset.py @@ -0,0 +1,69 @@ +import os +import shutil +import argparse +from typing import List + +import numpy as np +from tqdm import tqdm +from PIL import Image + + +def crop_video_dataset(source_dir: str, max_num_frames: int=None, slice_n_left_frames: int=0, resize: int=None, target_dir: str=None): + dataset_name = os.path.basename(source_dir) + if target_dir is None: + max_num_frames_prefix = "" if max_num_frames is None else f"_cut{max_num_frames}" + slice_prefix = "" if slice_n_left_frames == 0 else f"_slice{slice_n_left_frames}" + new_dataset_name = f"{dataset_name}{max_num_frames_prefix}{slice_prefix}" + target_dir = os.path.join(os.path.dirname(source_dir), new_dataset_name) + all_clips_paths = listdir_full_paths(source_dir) + os.makedirs(target_dir, exist_ok=True) + slice_proportions = [] + + total_num_frames = 0 + + for source_clip_dir in tqdm(all_clips_paths, desc=f'Cropping the dataset into {target_dir}'): + all_frames = listdir_full_paths(source_clip_dir) + if len(all_frames) == 0: + print(f'{source_clip_dir} is empty. 
Skipping it.') + continue + target_clip_dir = os.path.join(target_dir, os.path.basename(source_clip_dir)) + os.makedirs(target_clip_dir, exist_ok=True) + total_num_frames += len(all_frames) + slice_proportions.append(slice_n_left_frames / len(all_frames)) + all_frames = all_frames[slice_n_left_frames:] + + if not max_num_frames is None: + all_frames = all_frames[:max_num_frames] + + for source_frame_path in all_frames: + target_frame_path = os.path.join(target_clip_dir, os.path.basename(source_frame_path)) + + if resize is None: + shutil.copy(source_frame_path, target_frame_path) + else: + assert target_frame_path.endswith('.jpg') + Image.open(source_frame_path).resize((resize, resize), resample=Image.LANCZOS).save(target_frame_path, q=95) + + print(f'Done! Sliced {np.mean(slice_proportions) * 100.0 : .02f}% on average. {len(all_clips_paths) * slice_n_left_frames / total_num_frames * 100.0 : .02f}% of total num frames.') + + +def listdir_full_paths(d) -> List[os.PathLike]: + return sorted([os.path.join(d, x) for x in os.listdir(d)]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Crops a video dataset temporally into several frames') + parser.add_argument('source_dir', type=str, help='Path to the dataset') + parser.add_argument('-n', '--max_num_frames', type=int, default=None, help='Number of frames to preserve') + parser.add_argument('--slice_n_left_frames', type=int, default=0, help='Number of frames to slice from the left') + parser.add_argument('--resize', type=int, default=None, help='Should we resize the dataset') + parser.add_argument('--target_dir', type=str, default=None, help='Should we resize the dataset') + args = parser.parse_args() + + crop_video_dataset( + source_dir=args.source_dir, + max_num_frames=args.max_num_frames, + slice_n_left_frames=args.slice_n_left_frames, + resize=args.resize, + target_dir=args.target_dir, + ) diff --git a/src/scripts/frames_to_video_grid.py b/src/scripts/frames_to_video_grid.py new file 
mode 100644 index 0000000000000000000000000000000000000000..4a9d44f4108b38c3a6bc0742ae6ce85277aa91fa --- /dev/null +++ b/src/scripts/frames_to_video_grid.py @@ -0,0 +1,78 @@ +""" +Converts a directory of video frames into an mp4-grid +""" +import sys; sys.path.extend(['.']) +import os +import argparse +import random + +import numpy as np +import torch +from torch import Tensor +import torchvision.transforms.functional as TVF +from torchvision import utils +from PIL import Image +from tqdm import tqdm +import torchvision + + +def frames_to_video_grid(videos_dir: os.PathLike, num_videos: int, length: int, fps: int, output_path: os.PathLike, select_random: bool=False, random_seed: int=None): + clips_paths = [os.path.join(videos_dir, d) for d in os.listdir(videos_dir)] + + # bad_idx = [0, 9, 11, 16] + # clips_paths = [c for i, c in enumerate(clips_paths) if not i in bad_idx] + + if select_random: + random.seed(random_seed) + clips_paths = random.sample(clips_paths, k=num_videos) + else: + clips_paths = clips_paths[:num_videos] + videos = [read_first_n_frames(d, length) for d in tqdm(clips_paths, desc='Reading data...')] # [num_videos, length, c, h, w] + videos = [fill_with_black_squares(v, length) for v in tqdm(videos, desc='Adding empty frames')] # [num_videos, length, c, h, w] + frame_grids = torch.stack(videos).permute(1, 0, 2, 3, 4) # [video_len, num_videos, c, h, w] + frame_grids = [utils.make_grid(fs, nrow=int(np.ceil(np.sqrt(num_videos)))) for fs in tqdm(frame_grids, desc='Making grids')] + + if os.path.dirname(output_path) != "": + os.makedirs(os.path.dirname(output_path), exist_ok=True) + frame_grids = (torch.stack(frame_grids) * 255).to(torch.uint8).permute(0, 2, 3, 1) # [T, H, W, C] + torchvision.io.write_video(output_path, frame_grids, fps=fps, video_codec='h264', options={'crf': '10'}) + + +def read_first_n_frames(d: os.PathLike, num_frames: int) -> Tensor: + images = [Image.open(os.path.join(d, f)) for f in sorted(os.listdir(d))[:num_frames]] + images = 
[TVF.to_tensor(x) for x in images] + + return torch.stack(images) + + +def fill_with_black_squares(video, desired_len: int) -> Tensor: + if len(video) >= desired_len: + return video + + return torch.cat([ + video, + torch.zeros_like(video[0]).unsqueeze(0).repeat(desired_len - len(video), 1, 1, 1), + ], dim=0) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-d', '--directory', type=str, help='Directory with video frames') + parser.add_argument('-n', '--num_videos', type=int, help='Number of videos to consider') + parser.add_argument('-l', '--length', type=int, help='Video length (in frames)') + parser.add_argument('--fps', type=int, default=25, help='FPS to save with.') + parser.add_argument('-o', '--output_path', type=str, help='Where to save the file?.') + parser.add_argument('--select_random', action='store_true', help='Select videos at random?') + parser.add_argument('--random_seed', type=int, default=None, help='Random seed when selecting videos at random') + + args = parser.parse_args() + + frames_to_video_grid( + videos_dir=args.directory, + num_videos=args.num_videos, + length=args.length, + fps=args.fps, + output_path=args.output_path, + select_random=args.select_random, + random_seed=args.random_seed, + ) diff --git a/src/scripts/generate.py b/src/scripts/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..1841eba18db463ff175e1ab5b781e3b7263d20e8 --- /dev/null +++ b/src/scripts/generate.py @@ -0,0 +1,148 @@ +"""Generates a dataset of images using pretrained network pickle.""" + +import sys; sys.path.extend(['.', 'src']) +import os +import json +import random +import warnings + +import click +from src import dnnlib +import numpy as np +import torch +from tqdm import tqdm +from omegaconf import OmegaConf + +import src.legacy as legacy +from src.training.logging import generate_videos, save_video_frames_as_mp4, save_video_frames_as_frames_parallel + +torch.set_grad_enabled(False) + + 
+#---------------------------------------------------------------------------- + +@click.command() +@click.pass_context +@click.option('--network_pkl', help='Network pickle filename', metavar='PATH') +@click.option('--networks_dir', help='Network pickles directory. Selects a checkpoint from it automatically based on the fvd2048_16f metric.', metavar='PATH') +@click.option('--truncation_psi', type=float, help='Truncation psi', default=1.0, show_default=True) +@click.option('--noise_mode', help='Noise mode', type=click.Choice(['const', 'random', 'none']), default='const', show_default=True) +@click.option('--num_videos', type=int, help='Number of images to generate', default=50000, show_default=True) +@click.option('--batch_size', type=int, help='Batch size to use for generation', default=32, show_default=True) +@click.option('--moco_decomposition', type=bool, help='Should we do content/motion decomposition (available only for `--as_grids 1` generation)?', default=False, show_default=True) +@click.option('--seed', type=int, help='Random seed', default=42, metavar='DIR') +@click.option('--outdir', help='Where to save the output images', type=str, required=True, metavar='DIR') +@click.option('--save_as_mp4', help='Should we save as independent frames or mp4?', type=bool, default=False, metavar='BOOL') +@click.option('--video_len', help='Number of frames to generate', type=int, default=16, metavar='INT') +@click.option('--fps', help='FPS for mp4 saving', type=int, default=25, metavar='INT') +@click.option('--as_grids', help='Save videos as grids', type=bool, default=False, metavar='BOOl') +@click.option('--time_offset', help='Additional time offset', default=0, type=int, metavar='INT') +@click.option('--dataset_path', help='Dataset path. 
In case we want to use the conditioning signal.', default="", type=str, metavar='PATH') +@click.option('--hydra_cfg_path', help='Config path', default="", type=str, metavar='PATH') +@click.option('--slowmo_coef', help='Increase this value if you want to produce slow-motion videos.', default=1, type=int, metavar='INT') +def generate( + ctx: click.Context, + network_pkl: str, + networks_dir: str, + truncation_psi: float, + noise_mode: str, + num_videos: int, + batch_size: int, + moco_decomposition: bool, + seed: int, + outdir: str, + save_as_mp4: bool, + video_len: int, + fps: int, + as_grids: bool, + time_offset: int, + dataset_path: os.PathLike, + hydra_cfg_path: os.PathLike, + slowmo_coef: int, +): + if network_pkl is None: + # output_regex = "^network-snapshot-\d{6}.pkl$" + # ckpt_regex = re.compile("^network-snapshot-\d{6}.pkl$") + # ckpts = sorted([f for f in os.listdir(networks_dir) if ckpt_regex.match(f)]) + # network_pkl = os.path.join(networks_dir, ckpts[-1]) + ckpt_select_metric = 'fvd2048_16f' + metrics_file = os.path.join(networks_dir, f'metric-{ckpt_select_metric}.jsonl') + with open(metrics_file, 'r') as f: + snapshot_metrics_vals = [json.loads(line) for line in f.read().splitlines()] + best_snapshot = sorted(snapshot_metrics_vals, key=lambda m: m['results'][ckpt_select_metric])[0] + network_pkl = os.path.join(networks_dir, best_snapshot['snapshot_pkl']) + print(f'Using checkpoint: {network_pkl} with FVD16 of', best_snapshot['results'][ckpt_select_metric]) + # Selecting a checkpoint with the best score + else: + assert networks_dir is None, "Cant have both parameters: network_pkl and networks_dir" + + if moco_decomposition: + assert as_grids, f"Content/motion decomposition is available only when we generate as grids." + assert batch_size == num_videos, "Same motion is supported only for batch_size == num_videos" + + print('Loading networks from "%s"...' 
% network_pkl) + device = torch.device('cuda') + with dnnlib.util.open_url(network_pkl) as f: + G = legacy.load_network_pkl(f)['G_ema'].to(device).eval() # type: ignore + + os.makedirs(outdir, exist_ok=True) + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + all_z = torch.randn(num_videos, G.z_dim, device=device) # [curr_batch_size, z_dim] + if dataset_path and G.c_dim > 0: + hydra_cfg_path = hydra_cfg_path or os.path.join(networks_dir, '..', "experiment_config.yaml") + hydra_cfg = OmegaConf.load(hydra_cfg_path) + training_set_kwargs = dnnlib.EasyDict( + class_name='training.dataset.VideoFramesFolderDataset', + path=dataset_path, cfg=hydra_cfg.dataset, use_labels=True, max_size=None, xflip=False) + training_set = dnnlib.util.construct_class_by_name(**training_set_kwargs) + all_c = [training_set.get_label(random.choice(range(len(training_set)))) for _ in range(num_videos)] # [num_videos, c_dim] + all_c = torch.from_numpy(np.array(all_c)).to(device) # [num_videos, c_dim] + elif G.c_dim > 0: + warnings.warn('Assuming that the conditioning is one-hot!') + c_idx = torch.randint(low=0, high=G.c_dim, size=(num_videos, 1), device=device) + all_c = torch.zeros(num_videos, G.c_dim, device=device) # [num_videos, c_dim] + all_c.scatter_(1, c_idx, 1) + else: + all_c = torch.zeros(num_videos, G.c_dim, device=device) # [num_videos, c_dim] + ts = time_offset + torch.arange(video_len, device=device).float().unsqueeze(0).repeat(batch_size, 1) / slowmo_coef # [batch_size, video_len] + if moco_decomposition: + num_rows = num_cols = int(np.sqrt(num_videos)) + motion_z = G.synthesis.motion_encoder(c=all_c[:num_rows], t=ts[:num_rows])['motion_z'] # [1, *motion_dims] + motion_z = motion_z.repeat_interleave(num_cols, dim=0) # [batch_size, *motion_dims] + + all_z = all_z[:num_cols].repeat(num_rows, 1) # [num_videos, z_dim] + all_c = all_c[:num_cols].repeat(num_rows, 1) # [num_videos, z_dim] + else: + motion_z = None + + # Generate images. 
+ for batch_idx in tqdm(range((num_videos + batch_size - 1) // batch_size), desc='Generating videos'): + curr_batch_size = batch_size if batch_size * (batch_idx + 1) <= num_videos else num_videos % batch_size + z = all_z[batch_idx * batch_size:batch_idx * batch_size + curr_batch_size] # [curr_batch_size, z_dim] + c = all_c[batch_idx * batch_size:batch_idx * batch_size + curr_batch_size] # [curr_batch_size, c_dim] + videos = generate_videos( + G, z, c, ts, motion_z=motion_z, noise_mode=noise_mode, + truncation_psi=truncation_psi, as_grids=as_grids, batch_size_num_frames=128) + + if as_grids: + videos = [videos] + + for video_idx, video in enumerate(videos): + if save_as_mp4: + save_path = os.path.join(outdir, f'{batch_idx * batch_size + video_idx:06d}.mp4') + save_video_frames_as_mp4(video, fps, save_path) + else: + save_dir = os.path.join(outdir, f'{batch_idx * batch_size + video_idx:06d}') + video = (video * 255).permute(0, 2, 3, 1).to(torch.uint8).numpy() # [video_len, h, w, c] + save_video_frames_as_frames_parallel(video, save_dir, time_offset=time_offset, num_processes=8) + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + generate() # pylint: disable=no-value-for-parameter + +#---------------------------------------------------------------------------- diff --git a/src/scripts/preprocess_ffs.py b/src/scripts/preprocess_ffs.py new file mode 100644 index 0000000000000000000000000000000000000000..19eaefac6d93863d00d40a69b6ed582dd846c321 --- /dev/null +++ b/src/scripts/preprocess_ffs.py @@ -0,0 +1,204 @@ +""" +This file preprocesses FaceForensics dataset by cropping it +Copied from https://github.com/pfnet-research/tgan2/blob/master/scripts/make_face_forensics.py +""" + +import argparse +import os +from typing import List +from multiprocessing import Pool +from PIL import Image + +import cv2 +# import h5py +import imageio +import numpy as np +import pandas +from tqdm import tqdm + + +def 
parse_videos(source_dir, splits: List[str], categories: List[dir]): + results = [] + for split in splits: + for category in categories: + target_dir = os.path.join(source_dir, split, category) + filenames = sorted(os.listdir(target_dir)) + for filename in filenames: + results.append({ + 'split': split, + 'category': category, + 'filename': filename, + 'filepath': os.path.join(split, category, filename), + }) + return pandas.DataFrame(results) + + +def crop(img, left, right, top, bottom, margin): + cols = right - left + rows = bottom - top + if cols < rows: + padding = rows - cols + left -= padding // 2 + right += (padding // 2) + (padding % 2) + cols = right - left + else: + padding = cols - rows + top -= padding // 2 + bottom += (padding // 2) + (padding % 2) + rows = bottom - top + assert(rows == cols) + return img[top:bottom, left:right] + + +def job_proxy(kwargs): + process_and_save_video(**kwargs) + + +def process_and_save_video(video_path: os.PathLike, mask_path: os.PathLike, img_size: int, wide_crop: bool, output_dir: os.PathLike): + try: + video = process_video(video_path, mask_path, img_size=img_size, wide_crop=wide_crop) + except KeyboardInterrupt: + raise + except: + print(f'Couldnt process {video_path}') + return + + os.makedirs(output_dir, exist_ok=True) + + # if os.path.isdir(output_dir) and len(os.listdir(output_dir)) > 0: + # return + + for i, frame in enumerate(video): + frame = frame.transpose(1, 2, 0) + Image.fromarray(frame).save(os.path.join(output_dir, f'{i:06d}.jpg'), q=95) + + +def process_video(video_path, mask_path, img_size, threshold=5, margin=0.02, wide_crop: bool=False): + video_reader = imageio.get_reader(video_path) + mask_reader = imageio.get_reader(mask_path) + assert(video_reader.get_length() == mask_reader.get_length()) + + # Searching for the widest crop which would work for the whole video + if wide_crop: + left_most = float('inf') + top_most = float('inf') + right_most = float('-inf') + bottom_most = float('-inf') + + for img, 
mask in zip(video_reader, mask_reader): + hist = (255 - mask).astype(np.float64).sum(axis=2) + horiz_hist = np.where(hist.mean(axis=0) > threshold)[0] + vert_hist = np.where(hist.mean(axis=1) > threshold)[0] + left, right = horiz_hist[0], horiz_hist[-1] + top, bottom = vert_hist[0], vert_hist[-1] + left_most = min(left_most, left) + top_most = min(top_most, top) + right_most = max(right_most, right) + bottom_most = max(bottom_most, bottom) + + video = [] + for img, mask in zip(video_reader, mask_reader): + if wide_crop: + left, right, top, bottom = left_most, right_most, top_most, bottom_most + else: + hist = (255 - mask).astype(np.float64).sum(axis=2) + horiz_hist = np.where(hist.mean(axis=0) > threshold)[0] + vert_hist = np.where(hist.mean(axis=1) > threshold)[0] + left, right = horiz_hist[0], horiz_hist[-1] + top, bottom = vert_hist[0], vert_hist[-1] + + dst_img = crop(img, left, right, top, bottom, margin) + + try: + dst_img = cv2.resize( + dst_img, (img_size, img_size), + interpolation=cv2.INTER_LANCZOS4).transpose(2, 0, 1) + video.append(dst_img) + except KeyboardInterrupt: + raise + except: + print(img.shape, dst_img.shape, left, right, top, bottom) + + T = len(video) + video = np.concatenate(video).reshape(T, 3, img_size, img_size) + return video + + +# def count_frames(path): +# reader = imageio.get_reader(path) +# n_frames = 0 +# while True: +# try: +# img = reader.get_next_data() +# except IndexError as e: +# break +# else: +# n_frames += 1 +# return n_frames + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--source_dir', type=str, default='data/FaceForensics_compressed') + parser.add_argument('--output_dir', type=str, default='data/ffs_processed') + parser.add_argument('--img_size', type=int, default=256) + parser.add_argument('--num_workers', type=int, default=8) + parser.add_argument('--wide_crop', action='store_true', help="Should we crop each frame independently (this makes a video shaking)?") + args = parser.parse_args() 
+ + # splits = ['train', 'val', 'test'] + # categories = ['original', 'mask', 'altered'] + splits = ['train'] + categories = ['original', 'mask'] + df = parse_videos(args.source_dir, splits, categories) + os.makedirs(args.output_dir, exist_ok=True) + + for split in splits: + target_frame = df[df['split'] == split] + filenames = target_frame['filename'].unique() + + # print('Count # of frames') + # rets = [] + # for i, filename in enumerate(filenames): + # fn_frame = target_frame[target_frame['filename'] == filename] + # video_path = os.path.join( + # args.source_dir, fn_frame[fn_frame['category'] == 'original'].iloc[0]['filepath']) + # rets.append(p.apply_async(count_frames, args=(video_path,))) + # n_frames = 0 + # for ret in tqdm(rets): + # n_frames += ret.get() + # print('# of frames: {}'.format(n_frames)) + + # h5file = h5py.File(os.path.join(args.output_dir, '{}.h5'.format(split)), 'w') + # dset = h5file.create_dataset('image', (n_frames, 3, args.img_size, args.img_size), dtype=np.uint8) + # conf = [] + # start = 0 + + pool = Pool(processes=args.num_workers) + job_kwargs_list = [] + + for i, filename in enumerate(filenames): + fn_frame = target_frame[target_frame['filename'] == filename] + video_path = os.path.join(args.source_dir, fn_frame[fn_frame['category'] == 'original'].iloc[0]['filepath']) + mask_path = os.path.join(args.source_dir, fn_frame[fn_frame['category'] == 'mask'].iloc[0]['filepath']) + + job_kwargs_list.append(dict( + video_path=video_path, + mask_path=mask_path, + img_size=args.img_size, + wide_crop=args.wide_crop, + output_dir=os.path.join(args.output_dir, filename[:filename.rfind('.')]), + )) + + for _ in tqdm(pool.imap_unordered(job_proxy, job_kwargs_list), desc=f'Processing {split}', total=len(job_kwargs_list)): + pass + # T = len(video) + #dset[start:(start + T)] = video + # conf.append({'start': start, 'end': (start + T)}) + # start += T + # conf = pandas.DataFrame(conf) + # conf.to_json(os.path.join(args.output_dir, 
'{}.json'.format(split)), orient='records') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/scripts/profile_model.py b/src/scripts/profile_model.py new file mode 100644 index 0000000000000000000000000000000000000000..62cdbf6631bac35b69d3330deddb853d12ba659c --- /dev/null +++ b/src/scripts/profile_model.py @@ -0,0 +1,104 @@ +""" +This script computes imgs/sec for a generator in the eval mode +for different batch sizes +""" +import sys; sys.path.extend(['..', '.', 'src']) +import time + +import numpy as np +import torch +import torch.nn as nn +import hydra +from hydra.experimental import initialize +from omegaconf import DictConfig, OmegaConf +from tqdm import tqdm +import torch.autograd.profiler as profiler + +from src import dnnlib +from src.infra.utils import recursive_instantiate + + +DEVICE = 'cuda' +BATCH_SIZES = [32] +NUM_WARMUP_ITERS = 5 +NUM_PROFILE_ITERS = 25 + + +def instantiate_G(cfg: DictConfig) -> nn.Module: + G_kwargs = dnnlib.EasyDict(class_name='training.networks.Generator', w_dim=512, mapping_kwargs=dnnlib.EasyDict(), synthesis_kwargs=dnnlib.EasyDict()) + G_kwargs.synthesis_kwargs.channel_base = int(cfg.model.generator.get('fmaps', 0.5) * 32768) + G_kwargs.synthesis_kwargs.channel_max = 512 + G_kwargs.mapping_kwargs.num_layers = cfg.model.generator.get('mapping_net_n_layers', 2) + if cfg.get('num_fp16_res', 0) > 0: + G_kwargs.synthesis_kwargs.num_fp16_res = cfg.num_fp16_res + G_kwargs.synthesis_kwargs.conv_clamp = 256 + G_kwargs.cfg = cfg.model.generator + G_kwargs.c_dim = 0 + G_kwargs.img_resolution = cfg.get('resolution', 256) + G_kwargs.img_channels = 3 + + G = dnnlib.util.construct_class_by_name(**G_kwargs).eval().requires_grad_(False).to(DEVICE) + + return G + + +@torch.no_grad() +def profile_for_batch_size(G: nn.Module, cfg: DictConfig, batch_size: int): + z = torch.randn(batch_size, G.z_dim, device=DEVICE) + c = torch.zeros(batch_size, G.c_dim, device=DEVICE) + t = torch.zeros(batch_size, 2, 
device=DEVICE) + times = [] + + for i in tqdm(range(NUM_WARMUP_ITERS), desc='Warming up'): + torch.cuda.synchronize() + fake_img = G(z, c=c, t=t).contiguous() + y = fake_img[0, 0, 0, 0].item() # sync + torch.cuda.synchronize() + + time.sleep(1) + + torch.cuda.reset_peak_memory_stats() + + with profiler.profile(record_shapes=True, use_cuda=True) as prof: + for i in tqdm(range(NUM_PROFILE_ITERS), desc='Profiling'): + torch.cuda.synchronize() + start_time = time.time() + with profiler.record_function("forward"): + fake_img = G(z, c=c, t=t).contiguous() + y = fake_img[0, 0, 0, 0].item() # sync + torch.cuda.synchronize() + times.append(time.time() - start_time) + + torch.cuda.empty_cache() + num_imgs_processed = len(times) * batch_size + total_time_spent = np.sum(times) + bandwidth = num_imgs_processed / total_time_spent + summary = prof.key_averages().table(sort_by="cpu_time_total", row_limit=10) + + print(f'[Batch size: {batch_size}] Mean: {np.mean(times):.05f}s/it. Std: {np.std(times):.05f}s') + print(f'[Batch size: {batch_size}] Imgs/sec: {bandwidth:.03f}') + print(f'[Batch size: {batch_size}] Max mem: {torch.cuda.max_memory_allocated(DEVICE) / 2**30:<6.2f} gb') + + return bandwidth, summary + + +@hydra.main(config_path="../../configs", config_name="config.yaml") +def profile(cfg: DictConfig): + recursive_instantiate(cfg) + G = instantiate_G(cfg) + bandwidths = [] + summaries = [] + print(f'Number of parameters: {sum(p.numel() for p in G.parameters())}') + + for batch_size in BATCH_SIZES: + bandwidth, summary = profile_for_batch_size(G, cfg, batch_size) + bandwidths.append(bandwidth) + summaries.append(summary) + + best_batch_size_idx = int(np.argmax(bandwidths)) + print(f'------------ Best batch size is {BATCH_SIZES[best_batch_size_idx]} ------------') + print(summaries[best_batch_size_idx]) + + +if __name__ == '__main__': + profile() diff --git a/src/scripts/project.py b/src/scripts/project.py new file mode 100644 index 
# 0000000000000000000000000000000000000000..496423f3a8abf468ad0c25b09bfe118a432a6921
# --- /dev/null
# +++ b/src/scripts/project.py
# @@ -0,0 +1,479 @@
"""
Given a dataset of images, it (optionally crops it) and embeds into the model
Also optionally generates random videos from the found w
"""

import sys; sys.path.extend(['.', 'src'])
import os
import re
import json
import random
from typing import List, Optional, Callable
from typing import List

from PIL import Image
import click
from src import dnnlib
import numpy as np
import torch
from tqdm import tqdm
from omegaconf import OmegaConf
import torch.nn as nn
import torch.nn.functional as F
from torchvision import utils
from torch import Tensor
import torchvision.transforms.functional as TVF
from torchvision.utils import save_image

import legacy
from src.training.logging import generate_videos, save_video_frames_as_mp4, save_video_frames_as_frames
from src.torch_utils import misc

#----------------------------------------------------------------------------

def project(
    _sentinel=None,
    G: Callable=None,
    vgg16: nn.Module=None,
    target_images: List[Tensor]=None,
    device: str='cuda',
    use_w_init: bool=False,
    use_motion_init: bool=False,
    w_avg_samples = 10000,
    num_steps = 1000,
    initial_learning_rate = 0.1,
    initial_noise_factor = 0.05,
    noise_ramp_length = 0.75,
    lr_rampdown_length = 0.25,
    lr_rampup_length = 0.05,
    regularize_noise_weight = 0.0001,
    motion_reg_type: str=None,
):
    """Optimize one `w` per target image so that `G.synthesis` reproduces it under LPIPS.

    Args:
        _sentinel: Must stay None; it only forces keyword-style calls.
        G: Generator exposing `mapping`, `synthesis`, `z_dim`, `c_dim` and `num_ws`.
        vgg16: LPIPS feature extractor. When None, the pretrained TorchScript VGG16
            is downloaded.
        target_images: One image tensor per video; values are multiplied by 255
            before being fed to VGG, so they are expected in [0, 1].
        device: Device used for the whole optimization.
        use_w_init: Start from the best of several random `w` candidates instead of
            the average `w`.
        use_motion_init: Start from the best of several sampled motion codes.
        w_avg_samples: Number of `z` samples used to estimate the `w` statistics.
        num_steps: Number of optimizer steps.
        initial_learning_rate, lr_rampdown_length, lr_rampup_length: Learning-rate
            schedule parameters.
        initial_noise_factor, noise_ramp_length: Schedule of the noise added to `w`.
        regularize_noise_weight: Weight of the regularization term.
        motion_reg_type: None, "norm", "dist" or "sg2".

    Returns:
        `(w_result, motion_z_opt)` where `w_result[step]` is the detached `w` after
        that step and `motion_z_opt` is the motion code used for synthesis.

    Raises:
        NotImplementedError: For an unknown `motion_reg_type`.
    """
    assert _sentinel is None
    num_videos = len(target_images)

    G = G.eval().requires_grad_(False).to(device) # type: ignore

    c = torch.zeros(num_videos, G.c_dim, device=device)
    ts = torch.zeros(num_videos, 1, device=device)

    # Compute w stats.
    z_samples = np.random.RandomState(123).randn(w_avg_samples, G.z_dim)
    w_samples = G.mapping(torch.from_numpy(z_samples).to(device), None) # [N, L, C]
    w_samples = w_samples[:, :1, :].cpu().numpy().astype(np.float32) # [N, 1, C]
    w_avg = np.mean(w_samples, axis=0, keepdims=True) # [1, 1, C]
    w_std = (np.sum((w_samples - w_avg) ** 2) / w_avg_samples) ** 0.5

    # Load the VGG16 feature detector only when the caller did not supply one
    # (the argument used to be silently overwritten).
    if vgg16 is None:
        url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
        with dnnlib.util.open_url(url) as f:
            vgg16 = torch.jit.load(f)
    vgg16 = vgg16.eval().to(device)

    # Features for target image.
    target_features = []
    for img in target_images:
        img = img.to(device).to(torch.float32).unsqueeze(0) * 255.0
        if img.shape[2] > 256:
            img = F.interpolate(img, size=(256, 256), mode='area')
        target_features.append(vgg16(img, resize_images=False, return_lpips=True).squeeze(0))
    target_features = torch.stack(target_features) # [num_images, lpips_dim]

    if use_w_init:
        w_opt = find_w_init(G, vgg16, target_features, c, ts, None) # [num_videos, num_ws, w_dim]
        w_opt = w_opt.detach().requires_grad_(True) # [num_videos, num_ws, w_dim]
    else:
        w_opt = torch.tensor(w_avg, dtype=torch.float32, device=device, requires_grad=True) # pylint: disable=not-callable
        w_opt = w_opt.repeat(num_videos, G.num_ws, 1).detach().requires_grad_(True) # [num_videos, num_ws, w_dim]

    # Trying a lot of motions to find which one works best
    if use_motion_init:
        motion_z_opt = find_motions_init(G, vgg16, target_features, c, ts, w_opt=w_opt.detach())
    else:
        motion_z_opt = G.synthesis.motion_encoder(c=c, t=ts)['motion_z']

    motion_z_opt.requires_grad_(True)

    # Only needed by the StyleGAN2-style noise regularizer.
    # NOTE(review): assumes the synthesis network registers `noise_const` buffers as
    # in StyleGAN2-ADA; with no such buffers the "sg2" term is simply zero.
    noise_bufs = {}
    if motion_reg_type == "sg2":
        noise_bufs = {name: buf for name, buf in G.synthesis.named_buffers() if 'noise_const' in name}

    w_result = torch.zeros([num_steps] + list(w_opt.shape), dtype=torch.float32, device=device)
    # Only `w_opt` is optimized; `motion_z_opt` is not handed to the optimizer.
    optimizer = torch.optim.Adam([w_opt], betas=(0.9, 0.999), lr=initial_learning_rate)

    for step in tqdm(range(num_steps)):
        # Learning rate schedule.
        t = step / num_steps
        w_noise_scale = w_std * initial_noise_factor * max(0.0, 1.0 - t / noise_ramp_length) ** 2
        lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length)
        lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
        lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length)
        lr = initial_learning_rate * lr_ramp

        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Synth images from opt_w.
        w_noise = torch.randn_like(w_opt) * w_noise_scale
        ws = w_opt + w_noise
        synth_images = G.synthesis(ws, c=c, t=ts, motion_z=motion_z_opt)

        # Downsample image to 256x256 if it's larger than that. VGG was built for 224x224 images.
        synth_images = (synth_images * 0.5 + 0.5) * 255.0
        if synth_images.shape[2] > 256:
            synth_images = F.interpolate(synth_images, size=(256, 256), mode='area')

        # Features for synth images.
        synth_features = vgg16(synth_images, resize_images=False, return_lpips=True)
        dist = (target_features - synth_features).square().sum()

        # Regularization.
        if motion_reg_type is None:
            reg_loss = 0.0
        elif motion_reg_type == "norm":
            reg_loss = motion_z_opt.norm(dim=2).mean()
        elif motion_reg_type == "dist":
            reg_loss = motion_z_opt.mean().pow(2) + (motion_z_opt.var() - 1).pow(2)
        elif motion_reg_type == "sg2":
            reg_loss = 0.0
            for v in noise_bufs.values():
                noise = v[None,None,:,:] # must be [1,1,H,W] for F.avg_pool2d()
                while True:
                    reg_loss += (noise*torch.roll(noise, shifts=1, dims=3)).mean()**2
                    reg_loss += (noise*torch.roll(noise, shifts=1, dims=2)).mean()**2
                    if noise.shape[2] <= 8:
                        break
                    noise = F.avg_pool2d(noise, kernel_size=2)
        else:
            raise NotImplementedError(f"Unknown motion_reg_type: {motion_reg_type}")

        loss = dist + reg_loss * regularize_noise_weight

        # Step
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Save projected W for each optimization step.
        w_result[step] = w_opt.detach()

    return w_result, motion_z_opt

#----------------------------------------------------------------------------

@torch.no_grad()
def find_motions_init(G: Callable, vgg16: nn.Module, target_features: Tensor, c: Tensor, t: Tensor, num_motions_to_try: int=128, w_opt: Tensor=None):
    """Pick, per video, the sampled motion code whose frame is closest to the target.

    Args:
        G: Generator whose `synthesis.motion_encoder` samples motion codes.
        vgg16: LPIPS feature extractor.
        target_features: LPIPS features of the targets, one row per video.
        c, t: Conditioning and timestamps, one row per video.
        num_motions_to_try: Number of motion candidates evaluated per video.
        w_opt: Latents used to render the candidates, one entry per video.

    Returns:
        The best motion code for each video, stacked along the first dimension.

    Raises:
        ValueError: If `w_opt` is not provided.
    """
    if w_opt is None:
        raise ValueError("find_motions_init() needs `w_opt` to render the motion candidates")
    num_videos = target_features.shape[0]

    motions = G.synthesis.motion_encoder(
        c=c.repeat_interleave(num_motions_to_try, dim=0),
        t=t.repeat_interleave(num_motions_to_try, dim=0))['motion_z'] # [num_videos * num_motions_to_try, ...]

    synth_images = G.synthesis(
        w_opt.repeat_interleave(num_motions_to_try, dim=0),
        c=c.repeat_interleave(num_motions_to_try, dim=0),
        t=t.repeat_interleave(num_motions_to_try, dim=0),
        motion_z=motions)

    if synth_images.shape[2] > 256:
        synth_images = F.interpolate(synth_images, size=(256, 256), mode='area')

    synth_images = (synth_images * 0.5 + 0.5) * 255.0
    synth_features = vgg16(synth_images, resize_images=False, return_lpips=True) # [num_videos * num_motions_to_try, ...]
    dist = (target_features.repeat_interleave(num_motions_to_try, dim=0) - synth_features).square().sum(dim=1) # [num_videos * num_motions_to_try]
    best_motions_idx = dist.view(num_videos, num_motions_to_try).argmin(dim=1) # [num_videos]
    # argmin is relative to each video's group; shift it to index the flat candidate batch.
    flat_idx = torch.arange(num_videos, device=dist.device) * num_motions_to_try + best_motions_idx
    motion_z_opt = motions[flat_idx] # [num_videos, ...]

    return motion_z_opt

#----------------------------------------------------------------------------

@torch.no_grad()
def find_w_init(G: Callable, vgg16: nn.Module, target_features: Tensor, c: Tensor, t: Tensor, l: Tensor, num_w_to_try: int=128):
    """Pick, per video, the random `w` whose rendered frame is closest to the target.

    Args:
        G: Generator exposing `mapping`, `synthesis` and `z_dim`.
        vgg16: LPIPS feature extractor.
        target_features: LPIPS features of the targets, one row per video.
        c, t: Conditioning and timestamps, one row per video.
        l: Unused; kept so existing positional calls keep working.
        num_w_to_try: Number of random candidates evaluated per video.

    Returns:
        Tensor of shape [num_videos, num_ws, w_dim] with the best candidate per video.
    """
    num_videos = target_features.shape[0]
    device = target_features.device

    z = torch.randn(num_videos * num_w_to_try, G.z_dim, device=device)
    w = G.mapping(z=z, c=None) # [N, L, C]

    synth_images = G.synthesis(
        ws=w,
        c=c.repeat_interleave(num_w_to_try, dim=0),
        t=t.repeat_interleave(num_w_to_try, dim=0))
    if synth_images.shape[2] > 256:
        synth_images = F.interpolate(synth_images, size=(256, 256), mode='area')
    synth_images = (synth_images * 0.5 + 0.5) * 255.0
    synth_features = vgg16(synth_images, resize_images=False, return_lpips=True) # [num_videos * num_w_to_try, ...]
    dist = (target_features.repeat_interleave(num_w_to_try, dim=0) - synth_features).square().sum(dim=1) # [num_videos * num_w_to_try]
    best_w_idx = dist.view(num_videos, num_w_to_try).argmin(dim=1) # [num_videos]
    # argmin is relative to each video's group; shift it to index the flat candidate batch.
    flat_idx = torch.arange(num_videos, device=device) * num_w_to_try + best_w_idx
    w_opt = w[flat_idx] # [num_videos, num_ws, w_dim]

    return w_opt

#----------------------------------------------------------------------------

@torch.no_grad()
def load_target_images(img_paths: List[os.PathLike], extract_faces: bool=False, ref_image: Tensor=None):
    """Load images from disk as 256x256 tensors, optionally aligning faces first.

    Args:
        img_paths: Paths of the images to load.
        extract_faces: When True, crop the faces via `extract_faces_from_images` and
            save the crops under `/tmp/data/faces_extracted/`.
        ref_image: Reference image passed on to the face extraction.

    Returns:
        A list with one resized image tensor per path.
    """
    images = [Image.open(f) for f in tqdm(img_paths, desc='Loading images')]

    if extract_faces:
        images = extract_faces_from_images(imgs=images, ref_image=ref_image)
        faces_dir = '/tmp/data/faces_extracted/'
        os.makedirs(faces_dir, exist_ok=True)
        for p, img in zip(img_paths, images):
            # Pillow reads the JPEG quality from `quality`; the former `q=95` was ignored.
            img.save(faces_dir + os.path.basename(p), quality=95)

    images = [TVF.to_tensor(x) for x in images]
    images = [TVF.resize(x, size=(256, 256)) for x in images]

    return images

#----------------------------------------------------------------------------

@torch.no_grad()
def extract_faces_from_images(_sentinel=None, imgs: List=None, ref_image: "Image"=None, device: str='cuda'):
    """Rescale and crop images so two landmarks line up with those of `ref_image`.

    Each image is resized so that the distance between the two selected landmarks
    matches the reference, then cropped to the reference size with the first
    landmark at the same position as in the reference.

    Args:
        _sentinel: Must stay None; it only forces keyword-style calls.
        imgs: PIL images to process.
        ref_image: PIL image providing the target landmark layout and output size.
        device: Device handed to `face_alignment`.

    Returns:
        A list of cropped PIL images, one per input image.

    Raises:
        ImportError: If the `face_alignment` package is not installed.
    """
    assert _sentinel is None
    try:
        import face_alignment
    except ImportError as err:
        raise ImportError("To project images with alignment, you need to install the `face_alignment` library.") from err

    SELECTED_LANDMARKS = [38, 44]
    fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False, device=device)

    # NOTE(review): `[0]` takes the first detection and fails when no face is found.
    ref_landmarks = fa.get_landmarks_from_image(np.array(ref_image))[0][SELECTED_LANDMARKS] # [2, 2]
    landmarks = [fa.get_landmarks_from_image(np.array(x))[0][SELECTED_LANDMARKS] for x in imgs] # [num_imgs, 2, 2]
    ref_dist = ((ref_landmarks[0] - ref_landmarks[1]) ** 2).sum() ** 0.5 # [1]
    dists = [((p[0] - p[1]) ** 2).sum() ** 0.5 for p in landmarks] # [num_imgs]
    resize_ratios = [ref_dist / d for d in dists] # [num_imgs]
    new_sizes = [(int(r * x.size[1]), int(r * x.size[0])) for r, x in zip(resize_ratios, imgs)]
    imgs_resized = [TVF.resize(x, size=s, interpolation=Image.LANCZOS) for x, s in zip(imgs, new_sizes)] # [num_imgs, Image]
    bbox_left = [p[0][0] * r - ref_landmarks[0][0] for p, r in zip(landmarks, resize_ratios)]
    bbox_top = [p[0][1] * r - ref_landmarks[0][1] for p, r in zip(landmarks, resize_ratios)]

    out = [x.crop(box=(l, t, l + ref_image.size[0], t + ref_image.size[1])) for x, l, t in zip(imgs_resized, bbox_left, bbox_top)]

    return out

#----------------------------------------------------------------------------

def pad_box_to_square(left, upper, right, lower):
    """Widen a box that is taller than wide until it is square.

    Returns:
        `(left, upper, right, lower)` of the squared box.

    Raises:
        NotImplementedError: If the box is wider than tall.
    """
    h = lower - upper
    w = right - left

    if h == w:
        return left, upper, right, lower
    if w > h:
        # Raised explicitly: an `assert` would be stripped under `python -O`.
        raise NotImplementedError("Padding a box that is wider than tall is not implemented")

    pad = h - w
    # Put the odd leftover on the right so the result is exactly square.
    return (left - pad // 2, upper, right + (pad - pad // 2), lower)

#----------------------------------------------------------------------------

def add_margins(box, margin, width: float=float('inf'), height: float=float('inf')):
    """Grow `box` by `margin` = (left, top, right, bottom), clipped to the image bounds."""
    left, upper, right, lower = box

    return (
        max(0, left - margin[0]),
        max(0, upper - margin[1]),
        min(width, right + margin[2]),
        min(height, lower + margin[3]),
    )

#----------------------------------------------------------------------------

def add_top_margin(box, margin_ratio: float=0.0):
    """Extend the top edge of `box` by `margin_ratio` of its height, not going below 0."""
    left, upper, right, lower = box
    height = lower - upper
    margin = int(height * margin_ratio)

    return (left, max(0, upper - margin), right, lower)

#----------------------------------------------------------------------------

def save_edited_w(
    _sentinel=None,
    G: Callable=None,
    w_outdir: os.PathLike=None,
    samples_outdir: os.PathLike=None,
    img_names: List[str]=None,
    stack_samples: bool=False,
    num_frames: int = 16,
    each_nth_frame: int = 3,
    all_w: Tensor=None,
    all_motion_z: Tensor=None,
    stacked_samples_out_path: os.PathLike=None,
    ):
    """Save the projected latents and a strip of frames rendered from each of them.

    Args:
        _sentinel: Must stay None; it only forces keyword-style calls.
        G: Generator used to render the sample frames.
        w_outdir: Directory receiving `<name>_w.pt` (and `<name>_motion.pt`).
        samples_outdir: Directory receiving one PNG per video when not stacking.
        img_names: Base names, one per entry of `all_w`.
        stack_samples: Write a single stacked PNG instead of one file per video.
        num_frames: Number of frames rendered per video.
        each_nth_frame: Controls the timestamp range of the rendered frames.
        all_w: Projected latents, one per video.
        all_motion_z: Projected motion codes, or None.
        stacked_samples_out_path: Output path of the stacked PNG.
    """
    assert _sentinel is None

    os.makedirs(w_outdir, exist_ok=True)
    num_videos = len(img_names)
    device = all_w.device

    if not stack_samples:
        os.makedirs(samples_outdir, exist_ok=True)
    else:
        all_samples = []

    # Generate samples from the given w and save them.
    with torch.no_grad():
        c = torch.zeros(num_videos, G.c_dim, device=device) # [num_videos, c_dim]

        for i, w in enumerate(all_w):
            torch.save(w.cpu(), os.path.join(w_outdir, f'{img_names[i]}_w.pt'))

            if all_motion_z is None:
                motion_z = None
            else:
                motion_z = all_motion_z[i] # [......]
                torch.save(motion_z.cpu(), os.path.join(w_outdir, f'{img_names[i]}_motion.pt'))
                motion_z = motion_z.unsqueeze(0).to(device) # [1, ......]
                # The preview is rendered with a random motion code of the same shape,
                # not the projected one.
                motion_z = torch.randn_like(motion_z)

            w = w.unsqueeze(0).to(device) # [1, num_ws, w_dim]
            t = torch.linspace(0, num_frames * (1 + each_nth_frame), num_frames, device=device).unsqueeze(0)
            imgs = G.synthesis(w, c=c[[i]], t=t, motion_z=motion_z)
            imgs = (imgs * 0.5 + 0.5).clamp(0, 1)
            grid = utils.make_grid(imgs, nrow=num_frames).cpu()

            if stack_samples:
                all_samples.append(grid)
            else:
                save_image(grid, os.path.join(samples_outdir, img_names[i]) + '.png')

    if stack_samples:
        main_grid = torch.stack(all_samples) # [num_videos, c, h, w * num_frames]
        main_grid = utils.make_grid(main_grid, nrow=1)
        save_image(main_grid, stacked_samples_out_path)

#----------------------------------------------------------------------------

@click.command()
@click.pass_context
@click.option('--network_pkl', help='Network pickle filename', metavar='PATH')
@click.option('--networks_dir', help='Network pickles directory', metavar='PATH')
@click.option('--seed', type=int, help='Random seed', default=42, metavar='INT')
@click.option('--images_dir', help='Directory with the target .jpg images to project', type=str, required=True, metavar='DIR')
@click.option('--zero_periods', help='Zero-out periods predictor?', default=False, type=bool, metavar='BOOL')
@click.option('--num_weights_to_slice', help='Number of high-frequency coords to remove.', default=0, type=int, metavar='INT')
@click.option('--use_w_init', help='Init w by LPIPS.', default=False, type=bool, metavar='BOOL')
@click.option('--use_motion_init', help='Init motions by LPIPS.', default=False, type=bool, metavar='BOOL')
@click.option('--motion_reg_type', help='Type of the regularization for motion', default=None, type=str, metavar='STR')
@click.option('--num_steps', help='Number of the optimization steps to perform.', default=1000, type=int, metavar='INT')
@click.option('--stack_samples', help='When saving, should we stack samples together?', default=False, type=bool, metavar='BOOL')
@click.option('--extract_faces', help='Use FaceNet to extract the face?', default=False, type=bool, metavar='BOOL')
def main(
    ctx: click.Context,
    network_pkl: str,
    networks_dir: str,
    seed: int,
    images_dir: str,
    zero_periods: bool,
    num_weights_to_slice: int,
    use_w_init: bool,
    use_motion_init: bool,
    motion_reg_type: str,
    num_steps: int,
    stack_samples: bool,
    extract_faces: bool,
):
    """Project every `.jpg` in `images_dir` into the generator and save the results.

    The checkpoint is either given directly via `--network_pkl` or chosen from
    `--networks_dir` as the snapshot with the lowest `fvd2048_16f` metric.
    """
    if network_pkl is None:
        if networks_dir is None:
            raise click.UsageError("Provide either --network_pkl or --networks_dir")
        # Selecting a checkpoint with the best score
        metrics_file = os.path.join(networks_dir, 'metric-fvd2048_16f.jsonl')
        with open(metrics_file, 'r') as f:
            snapshot_metrics_vals = [json.loads(line) for line in f.read().splitlines()]
        best_snapshot = min(snapshot_metrics_vals, key=lambda m: m['results']['fvd2048_16f'])
        network_pkl = os.path.join(networks_dir, best_snapshot['snapshot_pkl'])
        print(f'Using checkpoint: {network_pkl} with FVD16 of', best_snapshot['results']['fvd2048_16f'])
    elif networks_dir is not None:
        raise click.UsageError("Cant have both parameters: network_pkl and networks_dir")

    print('Loading networks from "%s"...' % network_pkl, end='')
    device = torch.device('cuda')
    with dnnlib.util.open_url(network_pkl) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device).eval() # type: ignore
    print('Loaded!')

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if zero_periods:
        G.synthesis.motion_encoder.time_encoder.periods_predictor.weight.data.zero_()

    if num_weights_to_slice > 0:
        # In-place edits of a tensor that may require grad must not be tracked by autograd.
        with torch.no_grad():
            G.synthesis.motion_encoder.time_encoder.weights[:, -num_weights_to_slice:] = 0.0

    img_paths = sorted([os.path.join(images_dir, p) for p in os.listdir(images_dir) if p.endswith('.jpg')])
    img_names = [n[:n.rfind('.')] for n in [os.path.basename(p) for p in img_paths]]
    # The reference image is only needed (and only has to exist) for face extraction.
    ref_image = Image.open('/tmp/data/mean.png') if extract_faces else None
    target_images = load_target_images(img_paths, extract_faces, ref_image=ref_image) # [b, c, h, w]

    assert G.c_dim == 0, "G.c_dim > 0 is not supported"

    w_all_iters, motion_z_final = project(
        G=G,
        target_images=target_images,
        num_steps=num_steps,
        device=device,
        use_w_init=use_w_init,
        use_motion_init=use_motion_init,
        motion_reg_type=motion_reg_type,
    ) # [num_steps, num_videos, num_ws, w_dim]

    save_edited_w(
        G=G,
        w_outdir = f'{images_dir}_projected',
        samples_outdir = f'{images_dir}_projected_samples',
        img_names=img_names,
        stack_samples=stack_samples,
        all_w = w_all_iters[-1],
        all_motion_z = motion_z_final,
        stacked_samples_out_path = f'{images_dir}.png'
    )

#----------------------------------------------------------------------------

if __name__ == "__main__":
    main() # pylint: disable=no-value-for-parameter

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/__init__.py b/src/torch_utils/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..ece0ea08fe2e939cc260a1dafc0ab5b391b773d9
# --- /dev/null
# +++ b/src/torch_utils/__init__.py
# @@ -0,0 +1,9 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +# empty diff --git a/src/torch_utils/custom_ops.py b/src/torch_utils/custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..4cc4e43fc6f6ce79f2bd68a44ba87990b9b8564e --- /dev/null +++ b/src/torch_utils/custom_ops.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import os +import glob +import torch +import torch.utils.cpp_extension +import importlib +import hashlib +import shutil +from pathlib import Path + +from torch.utils.file_baton import FileBaton + +#---------------------------------------------------------------------------- +# Global options. + +verbosity = 'brief' # Verbosity level: 'none', 'brief', 'full' + +#---------------------------------------------------------------------------- +# Internal helper funcs. 

def _find_compiler_bindir():
    """Return the newest matching MSVC compiler directory, or None if none is found.

    The known Visual Studio install locations are tried in order; within the first
    pattern that matches, the lexicographically last path is returned.
    """
    patterns = [
        'C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64',
        'C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64',
        'C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64',
        'C:/Program Files (x86)/Microsoft Visual Studio */vc/bin',
    ]
    for pattern in patterns:
        matches = sorted(glob.glob(pattern))
        if len(matches):
            return matches[-1]
    return None

#----------------------------------------------------------------------------
# Main entry point for compiling and loading C++/CUDA plugins.

# Maps module name -> loaded extension module, so each plugin is built once per process.
_cached_plugins = dict()

def get_plugin(module_name, sources, **build_kwargs):
    """Compile (if needed), load and cache a PyTorch C++/CUDA extension.

    Args:
        module_name: Name of the extension module; also the cache key.
        sources: Paths of the source files handed to `torch.utils.cpp_extension.load`.
        **build_kwargs: Extra keyword arguments forwarded to
            `torch.utils.cpp_extension.load`.

    Returns:
        The imported extension module.

    Raises:
        RuntimeError: On Windows, when `cl.exe` is not on PATH and no compiler
            directory can be located.
        Any exception raised while building or importing is re-raised unchanged.
    """
    assert verbosity in ['none', 'brief', 'full']

    # Already cached?
    if module_name in _cached_plugins:
        return _cached_plugins[module_name]

    # Print status.
    if verbosity == 'full':
        print(f'Setting up PyTorch plugin "{module_name}"...')
    elif verbosity == 'brief':
        print(f'Setting up PyTorch plugin "{module_name}"... ', end='', flush=True)

    try: # pylint: disable=too-many-nested-blocks
        # Make sure we can find the necessary compiler binaries.
        if os.name == 'nt' and os.system("where cl.exe >nul 2>nul") != 0:
            compiler_bindir = _find_compiler_bindir()
            if compiler_bindir is None:
                raise RuntimeError(f'Could not find MSVC/GCC/CLANG installation on this computer. Check _find_compiler_bindir() in "{__file__}".')
            os.environ['PATH'] += ';' + compiler_bindir

        # Compile and load.
        verbose_build = (verbosity == 'full')

        # Incremental build md5sum trickery.  Copies all the input source files
        # into a cached build directory under a combined md5 digest of the input
        # source files.  Copying is done only if the combined digest has changed.
        # This keeps input file timestamps and filenames the same as in previous
        # extension builds, allowing for fast incremental rebuilds.
        #
        # This optimization is done only in case all the source files reside in
        # a single directory (just for simplicity) and if the TORCH_EXTENSIONS_DIR
        # environment variable is set (we take this as a signal that the user
        # actually cares about this.)
        source_dirs_set = set(os.path.dirname(source) for source in sources)
        if len(source_dirs_set) == 1 and ('TORCH_EXTENSIONS_DIR' in os.environ):
            all_source_files = sorted(list(x for x in Path(list(source_dirs_set)[0]).iterdir() if x.is_file()))

            # Compute a combined hash digest for all source files in the same
            # custom op directory (usually .cu, .cpp, .py and .h files).
            hash_md5 = hashlib.md5()
            for src in all_source_files:
                with open(src, 'rb') as f:
                    hash_md5.update(f.read())
            build_dir = torch.utils.cpp_extension._get_build_directory(module_name, verbose=verbose_build) # pylint: disable=protected-access
            digest_build_dir = os.path.join(build_dir, hash_md5.hexdigest())

            # NOTE(review): a process that finds the directory already created skips
            # this whole branch, including the wait on the lock - confirm that it
            # cannot observe a half-copied source tree.
            if not os.path.isdir(digest_build_dir):
                os.makedirs(digest_build_dir, exist_ok=True)
                baton = FileBaton(os.path.join(digest_build_dir, 'lock'))
                if baton.try_acquire():
                    try:
                        for src in all_source_files:
                            shutil.copyfile(src, os.path.join(digest_build_dir, os.path.basename(src)))
                    finally:
                        baton.release()
                else:
                    # Someone else is copying source files under the digest dir,
                    # wait until done and continue.
                    baton.wait()
            # Build from the copies in the digest directory rather than the originals.
            digest_sources = [os.path.join(digest_build_dir, os.path.basename(x)) for x in sources]
            torch.utils.cpp_extension.load(name=module_name, build_directory=build_dir,
                verbose=verbose_build, sources=digest_sources, **build_kwargs)
        else:
            torch.utils.cpp_extension.load(name=module_name, verbose=verbose_build, sources=sources, **build_kwargs)
        module = importlib.import_module(module_name)

    except:
        # Finish the 'brief' status line before propagating the original error.
        if verbosity == 'brief':
            print('Failed!')
        raise

    # Print status and add to cache.
    if verbosity == 'full':
        print(f'Done setting up PyTorch plugin "{module_name}".')
    elif verbosity == 'brief':
        print('Done.')
    _cached_plugins[module_name] = module
    return module

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/misc.py b/src/torch_utils/misc.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0721a3f8fa1c3f6571724eea3bdcc2488a209bf6
# --- /dev/null
# +++ b/src/torch_utils/misc.py
# @@ -0,0 +1,274 @@
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import re
import contextlib
import numpy as np
import torch
import warnings
from src import dnnlib

#----------------------------------------------------------------------------
# Cached construction of constant tensors. Avoids CPU=>GPU copy when the
# same constant is used multiple times.
_constant_cache = dict()

def constant(value, shape=None, dtype=None, device=None, memory_format=None):
    """Return a cached constant tensor built from `value`.

    Args:
        value: Anything `np.asarray()` accepts.
        shape: Optional shape to broadcast the constant to.
        dtype: Tensor dtype; defaults to `torch.get_default_dtype()`.
        device: Target device; defaults to CPU.
        memory_format: Defaults to `torch.contiguous_format`.

    Returns:
        A tensor shared between all calls made with the same arguments
        (the very same object is returned on a cache hit).
    """
    array = np.asarray(value)
    target_shape = None if shape is None else tuple(shape)
    dtype = torch.get_default_dtype() if dtype is None else dtype
    device = torch.device('cpu') if device is None else device
    memory_format = torch.contiguous_format if memory_format is None else memory_format

    # The raw bytes are part of the key, so equal values share one entry.
    cache_key = (array.shape, array.dtype, array.tobytes(), target_shape, dtype, device, memory_format)
    cached = _constant_cache.get(cache_key)
    if cached is not None:
        return cached

    result = torch.as_tensor(array.copy(), dtype=dtype, device=device)
    if target_shape is not None:
        result = torch.broadcast_tensors(result, torch.empty(target_shape))[0]
    result = result.contiguous(memory_format=memory_format)
    _constant_cache[cache_key] = result
    return result

#----------------------------------------------------------------------------
# Replace NaN/Inf with specified numerical values.

if hasattr(torch, 'nan_to_num'):
    nan_to_num = torch.nan_to_num # 1.8.0a0
else:
    def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): # pylint: disable=redefined-builtin
        """Fallback for `torch.nan_to_num`; only `nan == 0` is supported."""
        assert isinstance(input, torch.Tensor)
        limits = torch.finfo(input.dtype)
        upper = limits.max if posinf is None else posinf
        lower = limits.min if neginf is None else neginf
        assert nan == 0
        # nansum over a fresh singleton axis turns NaN into 0 element-wise.
        without_nan = input.unsqueeze(0).nansum(0)
        return torch.clamp(without_nan, min=lower, max=upper, out=out)

#----------------------------------------------------------------------------
# Symbolic assert.

if hasattr(torch, '_assert'):
    symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access
else:
    symbolic_assert = torch.Assert # 1.7.0

#----------------------------------------------------------------------------
# Context manager to suppress known warnings in torch.jit.trace().
class suppress_tracer_warnings(warnings.catch_warnings):
    """`warnings.catch_warnings` variant that ignores `torch.jit.TracerWarning`."""

    def __enter__(self):
        super().__enter__()
        warnings.simplefilter('ignore', category=torch.jit.TracerWarning)
        return self

#----------------------------------------------------------------------------
# Assert that the shape of a tensor matches the given list of integers.
# None indicates that the size of a dimension is allowed to vary.
# Performs symbolic assertion when used in torch.jit.trace().

def assert_shape(tensor, ref_shape):
    """Check `tensor.shape` against `ref_shape`.

    Args:
        tensor: Tensor to check.
        ref_shape: One entry per dimension. `None` accepts any size; a
            tensor entry (or a tensor-valued actual size) is checked via
            `symbolic_assert`; anything else is compared with `!=`.

    Raises:
        AssertionError: On a rank mismatch or a plain size mismatch.
    """
    actual_rank = tensor.ndim
    expected_rank = len(ref_shape)
    where = f' for tensor of size {list(tensor.shape)}'
    if actual_rank != expected_rank:
        raise AssertionError(f'Wrong number of dimensions: got {actual_rank}, expected {expected_rank}{where}')

    for dim, (got, want) in enumerate(zip(tensor.shape, ref_shape)):
        if want is None:
            continue
        if isinstance(want, torch.Tensor):
            with suppress_tracer_warnings(): # as_tensor results are registered as constants
                symbolic_assert(torch.equal(torch.as_tensor(got), want), f'Wrong size for dimension {dim}{where}')
            continue
        if isinstance(got, torch.Tensor):
            with suppress_tracer_warnings(): # as_tensor results are registered as constants
                symbolic_assert(torch.equal(got, torch.as_tensor(want)), f'Wrong size for dimension {dim}: expected {want}{where}')
            continue
        if got != want:
            raise AssertionError(f'Wrong size for dimension {dim}: got {got}, expected {want}{where}')

#----------------------------------------------------------------------------
# Function decorator that calls torch.autograd.profiler.record_function().
def profiled_function(fn):
    """Decorate `fn` so every call runs inside a profiler `record_function` range.

    The range is labelled with `fn.__name__`. The wrapper is built with
    `functools.wraps`, so the decorated function keeps its `__name__`,
    `__qualname__`, `__doc__` and `__module__`, and exposes the original
    callable as `__wrapped__`.
    """
    import functools # local import: the file's import block is not touched by this edit

    @functools.wraps(fn)
    def decorator(*args, **kwargs):
        with torch.autograd.profiler.record_function(fn.__name__):
            return fn(*args, **kwargs)
    return decorator

#----------------------------------------------------------------------------
# Sampler for torch.utils.data.DataLoader that loops over the dataset
# indefinitely, shuffling items as it goes.

class InfiniteSampler(torch.utils.data.Sampler):
    """Endless index sampler with optional windowed re-shuffling.

    Args:
        dataset: Sized dataset; must be non-empty.
        rank: Index of this replica, `0 <= rank < num_replicas`.
        num_replicas: Number of replicas sharing the index stream.
        shuffle: Shuffle the order up front and keep perturbing it.
        seed: Seed of the `np.random.RandomState` used for shuffling.
        window_size: Fraction of the dataset (0..1) from which the swap
            partner of each visited position is drawn.
    """

    def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, window_size=0.5):
        assert len(dataset) > 0
        assert num_replicas > 0
        assert 0 <= rank < num_replicas
        assert 0 <= window_size <= 1
        super().__init__(dataset)
        self.dataset = dataset
        self.rank = rank
        self.num_replicas = num_replicas
        self.shuffle = shuffle
        self.seed = seed
        self.window_size = window_size

    def __iter__(self):
        order = np.arange(len(self.dataset))
        rnd = None
        window = 0
        if self.shuffle:
            rnd = np.random.RandomState(self.seed)
            rnd.shuffle(order)
            window = int(np.rint(order.size * self.window_size))

        idx = 0
        while True:
            i = idx % order.size
            # Every replica walks the same stream; each keeps every
            # num_replicas-th position, offset by its rank.
            if idx % self.num_replicas == self.rank:
                yield order[i]
            # Swap the visited slot with one up to `window - 1` positions
            # behind it, so the order keeps changing between passes.
            if window >= 2:
                j = (i - rnd.randint(window)) % order.size
                order[i], order[j] = order[j], order[i]
            idx += 1

#----------------------------------------------------------------------------
# Utilities for operating with torch.nn.Module parameters and buffers.
def params_and_buffers(module):
    """Return all parameters of `module` followed by all of its buffers."""
    assert isinstance(module, torch.nn.Module)
    return list(module.parameters()) + list(module.buffers())

def named_params_and_buffers(module):
    """Return `(name, tensor)` pairs for all parameters, then all buffers."""
    assert isinstance(module, torch.nn.Module)
    return list(module.named_parameters()) + list(module.named_buffers())

def copy_params_and_buffers(src_module, dst_module, require_all=False):
    """Copy same-named parameters and buffers from `src_module` into `dst_module`.

    Args:
        src_module: Module to read values from.
        dst_module: Module whose tensors are overwritten in place.
        require_all: When true, every tensor of `dst_module` must also
            exist in `src_module`.

    Raises:
        AssertionError: `require_all` is set and a destination tensor has
            no counterpart in the source.

    The destination tensors keep their `requires_grad` flags.
    """
    assert isinstance(src_module, torch.nn.Module)
    assert isinstance(dst_module, torch.nn.Module)
    src_tensors = dict(named_params_and_buffers(src_module))
    # In-place writes to leaf tensors that require grad are rejected by
    # autograd unless gradient tracking is off, so copy under no_grad().
    # This also leaves each destination's requires_grad flag untouched.
    with torch.no_grad():
        for name, tensor in named_params_and_buffers(dst_module):
            assert (name in src_tensors) or (not require_all)
            if name in src_tensors:
                tensor.copy_(src_tensors[name].detach())

#----------------------------------------------------------------------------
# Context manager for easily enabling/disabling DistributedDataParallel
# synchronization.

@contextlib.contextmanager
def ddp_sync(module, sync):
    """Run the `with` body with or without DDP gradient synchronization.

    Only when `sync` is false and `module` is a `DistributedDataParallel`
    instance is the body run inside `module.no_sync()`; otherwise this is
    a plain pass-through.
    """
    assert isinstance(module, torch.nn.Module)
    if sync or not isinstance(module, torch.nn.parallel.DistributedDataParallel):
        yield
    else:
        with module.no_sync():
            yield

#----------------------------------------------------------------------------
# Check DistributedDataParallel consistency across processes.

def check_ddp_consistency(module, ignore_regex=None):
    """Assert that every tensor of `module` equals the copy broadcast from rank 0.

    Args:
        module: Module whose parameters and buffers are compared.
        ignore_regex: Optional pattern; tensors whose full name
            (`<ModuleClass>.<tensor name>`) fully matches it are skipped.

    Raises:
        AssertionError: A tensor differs from the rank-0 value (NaN/Inf are
            normalized with `nan_to_num` before comparing).
    """
    assert isinstance(module, torch.nn.Module)
    for name, tensor in named_params_and_buffers(module):
        fullname = type(module).__name__ + '.' + name
        if ignore_regex is not None and re.fullmatch(ignore_regex, fullname):
            continue
        tensor = tensor.detach()
        other = tensor.clone()
        torch.distributed.broadcast(tensor=other, src=0)
        assert (nan_to_num(tensor) == nan_to_num(other)).all(), f'{fullname} is not DDP consistent'

#----------------------------------------------------------------------------
# Print summary table of module hierarchy.
def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True):
    """Run `module` once on `inputs` and print a per-submodule summary table.

    Args:
        module: A `torch.nn.Module` (not a `ScriptModule`).
        inputs: Tuple or list of positional arguments for the forward pass.
        max_nesting: Submodules nested deeper than this are not listed.
        skip_redundant: Drop rows that contribute no parameter, buffer or
            output tensor that was not already listed.

    Returns:
        Whatever `module(*inputs)` returned.

    The temporary forward hooks are removed even when the forward pass raises.
    """
    assert isinstance(module, torch.nn.Module)
    assert not isinstance(module, torch.jit.ScriptModule)
    assert isinstance(inputs, (tuple, list))

    # Register hooks.
    entries = []
    nesting = [0]
    def pre_hook(_mod, _inputs):
        nesting[0] += 1
    def post_hook(mod, module_inputs, outputs):
        nesting[0] -= 1
        if nesting[0] <= max_nesting:
            module_inputs = list(module_inputs) if isinstance(module_inputs, (tuple, list)) else [module_inputs]
            module_inputs = [t for t in module_inputs if isinstance(t, torch.Tensor)]
            if isinstance(outputs, (tuple, list)):
                outputs = list(outputs)
            elif isinstance(outputs, dict):
                outputs = list(outputs.values())
            else:
                outputs = [outputs]
            outputs = [t for t in outputs if isinstance(t, torch.Tensor)]
            entries.append(dnnlib.EasyDict(mod=mod, inputs=module_inputs, outputs=outputs))

    hooks = []
    try:
        hooks += [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()]
        hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()]

        # Run module.
        outputs = module(*inputs)
    finally:
        # Never leave the instrumentation attached to the caller's module.
        for hook in hooks:
            hook.remove()

    # Identify unique outputs, parameters, and buffers.
    tensors_seen = set()
    for e in entries:
        e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen]
        e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen]
        e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen]
        tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs}

    # Filter out redundant entries.
    if skip_redundant:
        entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)]

    # Construct table.
    rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Input Shape', 'Output shape', 'Datatype']]
    rows += [['---'] * len(rows[0])]
    param_total = 0
    buffer_total = 0
    submodule_names = {mod: name for name, mod in module.named_modules()}
    for e in entries:
        name = '' if e.mod is module else submodule_names[e.mod]
        param_size = sum(t.numel() for t in e.unique_params)
        buffer_size = sum(t.numel() for t in e.unique_buffers)
        input_shape_str = ' + '.join([str(list(t.shape)) for t in e.inputs])
        output_shapes = [str(list(t.shape)) for t in e.outputs]
        output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs]
        rows += [[
            name + (':0' if len(e.outputs) >= 2 else ''),
            str(param_size) if param_size else '-',
            str(buffer_size) if buffer_size else '-',
            input_shape_str if len(input_shape_str) > 0 else '-',
            (output_shapes + ['-'])[0],
            (output_dtypes + ['-'])[0],
        ]]
        # Additional outputs of the same module get their own ':<idx>' rows.
        for idx in range(1, len(e.outputs)):
            rows += [[name + f':{idx}', '-', '-', '-', output_shapes[idx], output_dtypes[idx]]]
        param_total += param_size
        buffer_total += buffer_size
    rows += [['---'] * len(rows[0])]
    rows += [['Total', str(param_total), str(buffer_total), '-', '-', '-']]
    row_lengths = [len(r) for r in rows]
    assert len(set(row_lengths)) == 1, f"Summary table contains rows of different lengths: {row_lengths}"

    # Print table.
    widths = [max(len(cell) for cell in column) for column in zip(*rows)]
    print()
    for row in rows:
        print('  '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths)))
    print()
    return outputs

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/ops/__init__.py b/src/torch_utils/ops/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..ece0ea08fe2e939cc260a1dafc0ab5b391b773d9
# --- /dev/null
# +++ b/src/torch_utils/ops/__init__.py
# @@ -0,0 +1,9 @@
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +# empty diff --git a/src/torch_utils/ops/bias_act.cpp b/src/torch_utils/ops/bias_act.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d2425d8054991a8e8b6f7a940fd0ff7fa0bb330 --- /dev/null +++ b/src/torch_utils/ops/bias_act.cpp @@ -0,0 +1,99 @@ +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include +#include +#include +#include "bias_act.h" + +//------------------------------------------------------------------------ + +static bool has_same_layout(torch::Tensor x, torch::Tensor y) +{ + if (x.dim() != y.dim()) + return false; + for (int64_t i = 0; i < x.dim(); i++) + { + if (x.size(i) != y.size(i)) + return false; + if (x.size(i) >= 2 && x.stride(i) != y.stride(i)) + return false; + } + return true; +} + +//------------------------------------------------------------------------ + +static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::Tensor xref, torch::Tensor yref, torch::Tensor dy, int grad, int dim, int act, float alpha, float gain, float clamp) +{ + // Validate arguments. 
+ TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device"); + TORCH_CHECK(b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()), "b must have the same dtype and device as x"); + TORCH_CHECK(xref.numel() == 0 || (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() && xref.device() == x.device()), "xref must have the same shape, dtype, and device as x"); + TORCH_CHECK(yref.numel() == 0 || (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() && yref.device() == x.device()), "yref must have the same shape, dtype, and device as x"); + TORCH_CHECK(dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() && dy.device() == x.device()), "dy must have the same dtype and device as x"); + TORCH_CHECK(x.numel() <= INT_MAX, "x is too large"); + TORCH_CHECK(b.dim() == 1, "b must have rank 1"); + TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()), "dim is out of bounds"); + TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim), "b has wrong number of elements"); + TORCH_CHECK(grad >= 0, "grad must be non-negative"); + + // Validate layout. + TORCH_CHECK(x.is_non_overlapping_and_dense(), "x must be non-overlapping and dense"); + TORCH_CHECK(b.is_contiguous(), "b must be contiguous"); + TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x), "xref must have the same layout as x"); + TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x), "yref must have the same layout as x"); + TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x), "dy must have the same layout as x"); + + // Create output tensor. + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); + torch::Tensor y = torch::empty_like(x); + TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x"); + + // Initialize CUDA kernel parameters. + bias_act_kernel_params p; + p.x = x.data_ptr(); + p.b = (b.numel()) ? b.data_ptr() : NULL; + p.xref = (xref.numel()) ? xref.data_ptr() : NULL; + p.yref = (yref.numel()) ? yref.data_ptr() : NULL; + p.dy = (dy.numel()) ? 
dy.data_ptr() : NULL; + p.y = y.data_ptr(); + p.grad = grad; + p.act = act; + p.alpha = alpha; + p.gain = gain; + p.clamp = clamp; + p.sizeX = (int)x.numel(); + p.sizeB = (int)b.numel(); + p.stepB = (b.numel()) ? (int)x.stride(dim) : 1; + + // Choose CUDA kernel. + void* kernel; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] + { + kernel = choose_bias_act_kernel(p); + }); + TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func"); + + // Launch CUDA kernel. + p.loopX = 4; + int blockSize = 4 * 32; + int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1; + void* args[] = {&p}; + AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream())); + return y; +} + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("bias_act", &bias_act); +} + +//------------------------------------------------------------------------ diff --git a/src/torch_utils/ops/bias_act.cu b/src/torch_utils/ops/bias_act.cu new file mode 100644 index 0000000000000000000000000000000000000000..dd8fc4756d7d94727f94af738665b68d9c518880 --- /dev/null +++ b/src/torch_utils/ops/bias_act.cu @@ -0,0 +1,173 @@ +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include +#include "bias_act.h" + +//------------------------------------------------------------------------ +// Helpers. 
// Maps the storage type T to the type used for arithmetic inside the kernel.
// NOTE(review): the template parameter lists and arguments in this section
// were missing from the extracted text and have been restored -- confirm
// against the upstream file.
template <class T> struct InternalType;
template <> struct InternalType<double>     { typedef double scalar_t; };
template <> struct InternalType<float>      { typedef float  scalar_t; };
template <> struct InternalType<c10::Half>  { typedef float  scalar_t; };

//------------------------------------------------------------------------
// CUDA kernel.

// T is the element storage type; A is the activation index tested below
// (1 linear, 2 relu, 3 lrelu, 4 tanh, 5 sigmoid, 6 elu, 7 selu, 8 softplus,
// 9 swish). p.grad selects which formula is evaluated: the G == 0 branches
// compute the activation itself, the G == 1 and G == 2 branches the
// corresponding first- and second-order gradient expressions.
template <class T, int A>
__global__ void bias_act_kernel(bias_act_kernel_params p)
{
    typedef typename InternalType<T>::scalar_t scalar_t;
    int G                 = p.grad;
    scalar_t alpha        = (scalar_t)p.alpha;
    scalar_t gain         = (scalar_t)p.gain;
    scalar_t clamp        = (scalar_t)p.clamp;
    scalar_t one          = (scalar_t)1;
    scalar_t two          = (scalar_t)2;
    scalar_t expRange     = (scalar_t)80;
    scalar_t halfExpRange = (scalar_t)40;
    scalar_t seluScale    = (scalar_t)1.0507009873554804934193349852946;
    scalar_t seluAlpha    = (scalar_t)1.6732632423543772848170429916717;

    // Loop over elements.
    // Each block covers loopX * blockDim.x consecutive elements; a thread
    // visits up to loopX of them, blockDim.x apart.
    int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
    for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x)
    {
        // Load.
        scalar_t x = (scalar_t)((const T*)p.x)[xi];
        scalar_t b = (p.b) ? (scalar_t)((const T*)p.b)[(xi / p.stepB) % p.sizeB] : 0;
        scalar_t xref = (p.xref) ? (scalar_t)((const T*)p.xref)[xi] : 0;
        scalar_t yref = (p.yref) ? (scalar_t)((const T*)p.yref)[xi] : 0;
        scalar_t dy = (p.dy) ? (scalar_t)((const T*)p.dy)[xi] : one;
        scalar_t yy = (gain != 0) ? yref / gain : 0;
        scalar_t y = 0;

        // Apply bias.
        // For G == 0 the bias is added to the input; otherwise to xref.
        ((G == 0) ? x : xref) += b;

        // linear
        if (A == 1)
        {
            if (G == 0) y = x;
            if (G == 1) y = x;
        }

        // relu
        if (A == 2)
        {
            if (G == 0) y = (x > 0) ? x : 0;
            if (G == 1) y = (yy > 0) ? x : 0;
        }

        // lrelu
        if (A == 3)
        {
            if (G == 0) y = (x > 0) ? x : x * alpha;
            if (G == 1) y = (yy > 0) ? x : x * alpha;
        }

        // tanh
        if (A == 4)
        {
            if (G == 0) { scalar_t c = exp(x); scalar_t d = one / c; y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d); }
            if (G == 1) y = x * (one - yy * yy);
            if (G == 2) y = x * (one - yy * yy) * (-two * yy);
        }

        // sigmoid
        if (A == 5)
        {
            if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
            if (G == 1) y = x * yy * (one - yy);
            if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
        }

        // elu
        if (A == 6)
        {
            if (G == 0) y = (x >= 0) ? x : exp(x) - one;
            if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
            if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
        }

        // selu
        if (A == 7)
        {
            if (G == 0) y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
            if (G == 1) y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
            if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
        }

        // softplus
        if (A == 8)
        {
            if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
            if (G == 1) y = x * (one - exp(-yy));
            if (G == 2) { scalar_t c = exp(-yy); y = x * c * (one - c); }
        }

        // swish
        if (A == 9)
        {
            if (G == 0)
                y = (x < -expRange) ? 0 : x / (exp(-x) + one);
            else
            {
                scalar_t c = exp(xref);
                scalar_t d = c + one;
                if (G == 1)
                    y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
                else
                    y = (xref > halfExpRange) ? 0 : x * c * (xref * (two - d) + two * d) / (d * d * d);
                // yref is recomputed from xref here so the clamp test below
                // has a forward value to look at.
                yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
            }
        }

        // Apply gain.
        y *= gain * dy;

        // Clamp.
        // A negative clamp disables clamping. For G == 0 the value itself is
        // clamped; otherwise the result is zeroed wherever yref lies outside
        // (-clamp, clamp).
        if (clamp >= 0)
        {
            if (G == 0)
                y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
            else
                y = (yref > -clamp & yref < clamp) ? y : 0;
        }

        // Store.
        ((T*)p.y)[xi] = (T)y;
    }
}

//------------------------------------------------------------------------
// CUDA kernel selection.
// Returns the bias_act_kernel instantiation for element type T and the
// activation index in p.act (1..9), or NULL for any other index.
// NOTE(review): the template parameter list and the <T, N> / <type>
// arguments below were missing from the extracted text and have been
// restored -- confirm against the upstream file.
template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p)
{
    if (p.act == 1) return (void*)bias_act_kernel<T, 1>;
    if (p.act == 2) return (void*)bias_act_kernel<T, 2>;
    if (p.act == 3) return (void*)bias_act_kernel<T, 3>;
    if (p.act == 4) return (void*)bias_act_kernel<T, 4>;
    if (p.act == 5) return (void*)bias_act_kernel<T, 5>;
    if (p.act == 6) return (void*)bias_act_kernel<T, 6>;
    if (p.act == 7) return (void*)bias_act_kernel<T, 7>;
    if (p.act == 8) return (void*)bias_act_kernel<T, 8>;
    if (p.act == 9) return (void*)bias_act_kernel<T, 9>;
    return NULL;
}

//------------------------------------------------------------------------
// Template specializations.

template void* choose_bias_act_kernel<double>       (const bias_act_kernel_params& p);
template void* choose_bias_act_kernel<float>        (const bias_act_kernel_params& p);
template void* choose_bias_act_kernel<c10::Half>    (const bias_act_kernel_params& p);

//------------------------------------------------------------------------
// diff --git a/src/torch_utils/ops/bias_act.h b/src/torch_utils/ops/bias_act.h
// new file mode 100644
// index 0000000000000000000000000000000000000000..a32187e1fb7e3bae509d4eceaf900866866875a4
// --- /dev/null
// +++ b/src/torch_utils/ops/bias_act.h
// @@ -0,0 +1,38 @@
// Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// CUDA kernel parameters.
+ +struct bias_act_kernel_params +{ + const void* x; // [sizeX] + const void* b; // [sizeB] or NULL + const void* xref; // [sizeX] or NULL + const void* yref; // [sizeX] or NULL + const void* dy; // [sizeX] or NULL + void* y; // [sizeX] + + int grad; + int act; + float alpha; + float gain; + float clamp; + + int sizeX; + int sizeB; + int stepB; + int loopX; +}; + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template void* choose_bias_act_kernel(const bias_act_kernel_params& p); + +//------------------------------------------------------------------------ diff --git a/src/torch_utils/ops/bias_act.py b/src/torch_utils/ops/bias_act.py new file mode 100644 index 0000000000000000000000000000000000000000..29a9e2933b2584580518235e7d1e869d70a5e0ef --- /dev/null +++ b/src/torch_utils/ops/bias_act.py @@ -0,0 +1,212 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +"""Custom PyTorch ops for efficient bias and activation.""" + +import os +import warnings +import numpy as np +import torch +from src import dnnlib +import traceback + +from .. import custom_ops +from .. 
# NOTE: in the collapsed diff this statement is split across two lines; the
# `from .. ` prefix also ends the preceding line.
from .. import misc

#----------------------------------------------------------------------------

activation_funcs = {
    'linear':   dnnlib.EasyDict(func=lambda x, **_:         x,                                          def_alpha=0,    def_gain=1,             cuda_idx=1, ref='',  has_2nd_grad=False),
    'relu':     dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.relu(x),                def_alpha=0,    def_gain=np.sqrt(2),    cuda_idx=2, ref='y', has_2nd_grad=False),
    'lrelu':    dnnlib.EasyDict(func=lambda x, alpha, **_:  torch.nn.functional.leaky_relu(x, alpha),   def_alpha=0.2,  def_gain=np.sqrt(2),    cuda_idx=3, ref='y', has_2nd_grad=False),
    'tanh':     dnnlib.EasyDict(func=lambda x, **_:         torch.tanh(x),                              def_alpha=0,    def_gain=1,             cuda_idx=4, ref='y', has_2nd_grad=True),
    'sigmoid':  dnnlib.EasyDict(func=lambda x, **_:         torch.sigmoid(x),                           def_alpha=0,    def_gain=1,             cuda_idx=5, ref='y', has_2nd_grad=True),
    'elu':      dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.elu(x),                 def_alpha=0,    def_gain=1,             cuda_idx=6, ref='y', has_2nd_grad=True),
    'selu':     dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.selu(x),                def_alpha=0,    def_gain=1,             cuda_idx=7, ref='y', has_2nd_grad=True),
    'softplus': dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.softplus(x),            def_alpha=0,    def_gain=1,             cuda_idx=8, ref='y', has_2nd_grad=True),
    'swish':    dnnlib.EasyDict(func=lambda x, **_:         torch.sigmoid(x) * x,                       def_alpha=0,    def_gain=np.sqrt(2),    cuda_idx=9, ref='x', has_2nd_grad=True),
}

#----------------------------------------------------------------------------

_inited = False
_plugin = None
_null_tensor = torch.empty([0])

def _init():
    """Build/load the CUDA plugin on first use; return whether it is available.

    The build is attempted only once per process. If it fails, a warning
    with the traceback is issued and every later call returns False, so
    callers fall back to the reference implementation.
    """
    global _inited, _plugin
    if not _inited:
        _inited = True
        sources = ['bias_act.cpp', 'bias_act.cu']
        sources = [os.path.join(os.path.dirname(__file__), s) for s in sources]
        try:
            _plugin = custom_ops.get_plugin('bias_act_plugin', sources=sources, extra_cuda_cflags=['--use_fast_math'])
        except Exception: # build errors only; KeyboardInterrupt/SystemExit must propagate
            warnings.warn('Failed to build CUDA kernels for bias_act. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
    return _plugin is not None

#----------------------------------------------------------------------------

def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None, impl='cuda'):
    r"""Fused bias and activation function.

    Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
    and scales the result by `gain`. Each of the steps is optional. In most cases,
    the fused op is considerably more efficient than performing the same calculation
    using standard PyTorch ops. It supports first and second order gradients,
    but not third order gradients.

    Args:
        x:      Input activation tensor. Can be of any shape.
        b:      Bias vector, or `None` to disable. Must be a 1D tensor of the same type
                as `x`. The shape must be known, and it must match the dimension of `x`
                corresponding to `dim`.
        dim:    The dimension in `x` corresponding to the elements of `b`.
                The value of `dim` is ignored if `b` is not specified.
        act:    Name of the activation function to evaluate, or `"linear"` to disable.
                Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
                See `activation_funcs` for a full list. `None` is not allowed.
        alpha:  Shape parameter for the activation function, or `None` to use the default.
        gain:   Scaling factor for the output tensor, or `None` to use default.
                See `activation_funcs` for the default scaling of each activation function.
                If unsure, consider specifying 1.
        clamp:  Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
                the clamping (default).
        impl:   Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).

    Returns:
        Tensor of the same shape and datatype as `x`.
    """
    assert isinstance(x, torch.Tensor)
    assert impl in ['ref', 'cuda']
    # The fused kernel is used only for CUDA tensors and only if the plugin built.
    if impl == 'cuda' and x.device.type == 'cuda' and _init():
        return _bias_act_cuda(dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(x, b)
    return _bias_act_ref(x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp)

#----------------------------------------------------------------------------

@misc.profiled_function
def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None):
    """Slow reference implementation of `bias_act()` using standard PyTorch ops.

    Takes the same `x`, `b`, `dim`, `act`, `alpha`, `gain` and `clamp`
    arguments as `bias_act()` and returns the resulting tensor.
    """
    assert isinstance(x, torch.Tensor)
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    clamp = float(clamp if clamp is not None else -1) # -1 means "no clamping"

    # Add bias.
    if b is not None:
        assert isinstance(b, torch.Tensor) and b.ndim == 1
        assert 0 <= dim < x.ndim
        assert b.shape[0] == x.shape[dim]
        # Reshape b to [1, ..., -1, ..., 1] so it broadcasts along `dim`.
        x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])

    # Evaluate activation function.
    x = spec.func(x, alpha=alpha)

    # Scale by gain.
    if gain != 1:
        x = x * gain

    # Clamp.
    if clamp >= 0:
        x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type
    return x

#----------------------------------------------------------------------------

_bias_act_cuda_cache = dict()

def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None):
    """Fast CUDA implementation of `bias_act()` using custom ops.

    Returns a `torch.autograd.Function` subclass specialized for the given
    arguments; call `.apply(x, b)` on it. One class is created per distinct
    `(dim, act, alpha, gain, clamp)` combination and then reused.
    """
    # Parse arguments.
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    clamp = float(clamp if clamp is not None else -1)

    # Lookup from cache.
    key = (dim, act, alpha, gain, clamp)
    if key in _bias_act_cuda_cache:
        return _bias_act_cuda_cache[key]

    # Forward op.
    class BiasActCuda(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, b): # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride()[1] == 1 else torch.contiguous_format
            x = x.contiguous(memory_format=ctx.memory_format)
            b = b.contiguous() if b is not None else _null_tensor
            y = x
            # Skip the kernel launch when the op would be an identity.
            if act != 'linear' or gain != 1 or clamp >= 0 or b is not _null_tensor:
                y = _plugin.bias_act(x, b, _null_tensor, _null_tensor, _null_tensor, 0, dim, spec.cuda_idx, alpha, gain, clamp)
            # Save only what the backward passes of this activation read.
            ctx.save_for_backward(
                x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
                b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
                y if 'y' in spec.ref else _null_tensor)
            return y

        @staticmethod
        def backward(ctx, dy): # pylint: disable=arguments-differ
            dy = dy.contiguous(memory_format=ctx.memory_format)
            x, b, y = ctx.saved_tensors
            dx = None
            db = None

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
                dx = dy
                if act != 'linear' or gain != 1 or clamp >= 0:
                    dx = BiasActCudaGrad.apply(dy, x, b, y)

            if ctx.needs_input_grad[1]:
                # Bias gradient: reduce dx over every dimension except `dim`.
                db = dx.sum([i for i in range(dx.ndim) if i != dim])

            return dx, db

    # Backward op.
    class BiasActCudaGrad(torch.autograd.Function):
        @staticmethod
        def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if dy.ndim > 2 and dy.stride()[1] == 1 else torch.contiguous_format
            dx = _plugin.bias_act(dy, b, x, y, _null_tensor, 1, dim, spec.cuda_idx, alpha, gain, clamp)
            ctx.save_for_backward(
                dy if spec.has_2nd_grad else _null_tensor,
                x, b, y)
            return dx

        @staticmethod
        def backward(ctx, d_dx): # pylint: disable=arguments-differ
            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
            dy, x, b, y = ctx.saved_tensors
            d_dy = None
            d_x = None
            d_b = None
            d_y = None

            if ctx.needs_input_grad[0]:
                d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)

            if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]):
                d_x = _plugin.bias_act(d_dx, b, x, y, dy, 2, dim, spec.cuda_idx, alpha, gain, clamp)

            if spec.has_2nd_grad and ctx.needs_input_grad[2]:
                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])

            return d_dy, d_x, d_b, d_y

    # Add to cache.
    _bias_act_cuda_cache[key] = BiasActCuda
    return BiasActCuda

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/ops/conv2d_gradfix.py b/src/torch_utils/ops/conv2d_gradfix.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..44ed80fbc9d4bd0d749f8ead245826c3bc6f5e4c
# --- /dev/null
# +++ b/src/torch_utils/ops/conv2d_gradfix.py
# @@ -0,0 +1,170 @@
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""

import warnings
import contextlib
import torch

# pylint: disable=redefined-builtin
# pylint: disable=arguments-differ
# pylint: disable=protected-access

#----------------------------------------------------------------------------

enabled = False                     # Enable the custom op by setting this to true.
weight_gradients_disabled = False   # Forcefully disable computation of gradients with respect to the weights.

@contextlib.contextmanager
def no_weight_gradients():
    """Context manager that sets `weight_gradients_disabled` for its body.

    While the flag is set, `Conv2d.backward()` below returns `None` for the
    weight gradient. The previous value of the flag is restored on exit,
    including when the body raises, so a failure inside the `with` block
    cannot leave weight gradients disabled for the rest of the process.
    """
    global weight_gradients_disabled
    old = weight_gradients_disabled
    weight_gradients_disabled = True
    try:
        yield
    finally:
        # Restore even if the body raised.
        weight_gradients_disabled = old

#----------------------------------------------------------------------------

def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
    """Drop-in replacement for `torch.nn.functional.conv2d()`.

    Dispatches to the cached custom autograd op when `_should_use_custom_op()`
    accepts `input`; otherwise calls `torch.nn.functional.conv2d()` directly
    with the same arguments.
    """
    if _should_use_custom_op(input):
        return _conv2d_gradfix(transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias)
    return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)

def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
    """Drop-in replacement for `torch.nn.functional.conv_transpose2d()`.

    Dispatches to the cached custom autograd op when `_should_use_custom_op()`
    accepts `input`; otherwise calls `torch.nn.functional.conv_transpose2d()`
    directly with the same arguments.
    """
    if _should_use_custom_op(input):
        return _conv2d_gradfix(transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias)
    return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation)

#----------------------------------------------------------------------------

def _should_use_custom_op(input):
    """Return True if the custom op should handle `input`.

    Requires the module-level `enabled` flag, cuDNN being enabled, a CUDA
    tensor, and a PyTorch version string starting with one of the listed
    prefixes. On any other version a warning is issued and False is returned.
    """
    assert isinstance(input, torch.Tensor)
    if (not enabled) or (not torch.backends.cudnn.enabled):
        return False
    if input.device.type != 'cuda':
        return False
    if any(torch.__version__.startswith(x) for x in ['1.7.', '1.8.', '1.9', '1.10']):
        return True
    warnings.warn(f'conv2d_gradfix not supported on PyTorch {torch.__version__}. Falling back to torch.nn.functional.conv2d().')
    return False

def _tuple_of_ints(xs, ndim):
    """Normalize `xs` to a tuple of `ndim` ints (a scalar is repeated `ndim` times)."""
    xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs,) * ndim
    assert len(xs) == ndim
    assert all(isinstance(x, int) for x in xs)
    return xs

#----------------------------------------------------------------------------

_conv2d_gradfix_cache = dict()  # Maps a configuration key to its generated autograd Function class.

def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups):
    """Return (and cache) an autograd Function class for one convolution configuration.

    Args:
        transpose:      False for regular convolution, True for transposed convolution.
        weight_shape:   Shape of the weight tensor; must have 4 entries.
        stride, padding, output_padding, dilation:
                        An int or a 2-element list/tuple of ints each.
        groups:         Number of groups (>= 1).

    Returns:
        A `torch.autograd.Function` subclass whose `apply(input, weight, bias)`
        runs the convolution. Its backward pass is built from the same family
        of Functions, which is what makes higher-order gradients available.
    """
    # Parse arguments.
    ndim = 2
    weight_shape = tuple(weight_shape)
    stride = _tuple_of_ints(stride, ndim)
    padding = _tuple_of_ints(padding, ndim)
    output_padding = _tuple_of_ints(output_padding, ndim)
    dilation = _tuple_of_ints(dilation, ndim)

    # Lookup from cache.
    key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups)
    if key in _conv2d_gradfix_cache:
        return _conv2d_gradfix_cache[key]

    # Validate arguments.
    assert groups >= 1
    assert len(weight_shape) == ndim + 2
    assert all(stride[i] >= 1 for i in range(ndim))
    assert all(padding[i] >= 0 for i in range(ndim))
    assert all(dilation[i] >= 1 for i in range(ndim)) # a dilation of 0 is not a valid convolution
    if not transpose:
        assert all(output_padding[i] == 0 for i in range(ndim))
    else: # transpose
        assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim))

    # Helpers.
    common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups)
    def calc_output_padding(input_shape, output_shape):
        # Output padding to pass to the opposite-direction op used for the
        # input gradient; always zero when this op is itself transposed.
        if transpose:
            return [0, 0]
        return [
            input_shape[i + 2]
            - (output_shape[i + 2] - 1) * stride[i]
            - (1 - 2 * padding[i])
            - dilation[i] * (weight_shape[i + 2] - 1)
            for i in range(ndim)
        ]

    # Forward & backward.
    class Conv2d(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input, weight, bias):
            assert weight.shape == weight_shape
            if not transpose:
                output = torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, **common_kwargs)
            else: # transpose
                output = torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs)
            ctx.save_for_backward(input, weight)
            return output

        @staticmethod
        def backward(ctx, grad_output):
            input, weight = ctx.saved_tensors
            grad_input = None
            grad_weight = None
            grad_bias = None

            if ctx.needs_input_grad[0]:
                # Input gradient = the opposite-direction convolution of grad_output.
                p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
                grad_input = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs).apply(grad_output, weight, None)
                assert grad_input.shape == input.shape

            if ctx.needs_input_grad[1] and not weight_gradients_disabled:
                grad_weight = Conv2dGradWeight.apply(grad_output, input)
                assert grad_weight.shape == weight_shape

            if ctx.needs_input_grad[2]:
                grad_bias = grad_output.sum([0, 2, 3])

            return grad_input, grad_weight, grad_bias

    # Gradient with respect to the weights.
    class Conv2dGradWeight(torch.autograd.Function):
        @staticmethod
        def forward(ctx, grad_output, input):
            op = torch._C._jit_get_operation('aten::cudnn_convolution_backward_weight' if not transpose else 'aten::cudnn_convolution_transpose_backward_weight')
            flags = [torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic, torch.backends.cudnn.allow_tf32]
            grad_weight = op(weight_shape, grad_output, input, padding, stride, dilation, groups, *flags)
            assert grad_weight.shape == weight_shape
            ctx.save_for_backward(grad_output, input)
            return grad_weight

        @staticmethod
        def backward(ctx, grad2_grad_weight):
            grad_output, input = ctx.saved_tensors
            grad2_grad_output = None
            grad2_input = None

            if ctx.needs_input_grad[0]:
                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None)
                assert grad2_grad_output.shape == grad_output.shape

            if ctx.needs_input_grad[1]:
                p = calc_output_padding(input_shape=input.shape, output_shape=grad_output.shape)
                grad2_input = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs).apply(grad_output, grad2_grad_weight, None)
                assert grad2_input.shape == input.shape

            return grad2_grad_output, grad2_input

    _conv2d_gradfix_cache[key] = Conv2d
    return Conv2d

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/ops/conv2d_resample.py b/src/torch_utils/ops/conv2d_resample.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..cd4750744c83354bab78704d4ef51ad1070fcc4a
# --- /dev/null
# +++ b/src/torch_utils/ops/conv2d_resample.py
# @@ -0,0 +1,156 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.
# Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""2D convolution with optional up/downsampling."""

import torch

from .. import misc
from . import conv2d_gradfix
from . import upfirdn2d
from .upfirdn2d import _parse_padding
from .upfirdn2d import _get_filter_size

#----------------------------------------------------------------------------

def _get_weight_shape(w):
    """Return the shape of `w` as a list of plain Python ints."""
    with misc.suppress_tracer_warnings(): # this value will be treated as a constant
        shape = [int(sz) for sz in w.shape]
    misc.assert_shape(w, shape)
    return shape

#----------------------------------------------------------------------------

def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True):
    """Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations.

    Args:
        x:           Input tensor.
        w:           Weight tensor; its four dimensions are read as
                     `[out_channels, in_channels_per_group, kh, kw]`.
        stride:      Forwarded to the underlying op (default: 1).
        padding:     Forwarded to the underlying op (default: 0).
        groups:      Forwarded to the underlying op (default: 1).
        transpose:   True = use `conv2d_gradfix.conv_transpose2d()`,
                     False = use `conv2d_gradfix.conv2d()` (default: False).
        flip_weight: False = flip `w` along its two spatial axes before the
                     call, True = use `w` as is (default: True).

    Returns:
        The result of the selected convolution.
    """
    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)

    # Flip weight if requested.
    if not flip_weight: # conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False).
        w = w.flip([2, 3])

    # Workaround performance pitfall in cuDNN 8.0.5, triggered when using
    # 1x1 kernel + memory_format=channels_last + less than 64 channels.
    if kw == 1 and kh == 1 and stride == 1 and padding in [0, [0, 0], (0, 0)] and not transpose:
        # A channel stride of 1 is how a channels_last tensor is recognized here.
        if x.stride()[1] == 1 and min(out_channels, in_channels_per_group) < 64:
            if out_channels <= 4 and groups == 1:
                # Very few output channels: evaluate the 1x1 convolution as a
                # batched matrix product over the flattened spatial positions.
                in_shape = x.shape
                x = w.squeeze(3).squeeze(2) @ x.reshape([in_shape[0], in_channels_per_group, -1])
                x = x.reshape([in_shape[0], out_channels, in_shape[2], in_shape[3]])
            else:
                # Otherwise run the convolution in contiguous (NCHW) layout.
                x = x.to(memory_format=torch.contiguous_format)
                w = w.to(memory_format=torch.contiguous_format)
                x = conv2d_gradfix.conv2d(x, w, groups=groups)
            # Both branches hand the result back in channels_last layout.
            return x.to(memory_format=torch.channels_last)

    # Otherwise => execute using conv2d_gradfix.
    op = conv2d_gradfix.conv_transpose2d if transpose else conv2d_gradfix.conv2d
    return op(x, w, stride=stride, padding=padding, groups=groups)

#----------------------------------------------------------------------------

@misc.profiled_function
def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False):
    r"""2D convolution with optional up/downsampling.

    Padding is performed only once at the beginning, not between the operations.

    Args:
        x:              Input tensor of shape
                        `[batch_size, in_channels, in_height, in_width]`.
        w:              Weight tensor of shape
                        `[out_channels, in_channels//groups, kernel_height, kernel_width]`.
        f:              Low-pass filter for up/downsampling. Must be prepared beforehand by
                        calling upfirdn2d.setup_filter(). None = identity (default).
        up:             Integer upsampling factor (default: 1).
        down:           Integer downsampling factor (default: 1).
        padding:        Padding with respect to the upsampled image. Can be a single number
                        or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
                        (default: 0).
        groups:         Split input channels into N groups (default: 1).
        flip_weight:    False = convolution, True = correlation (default: True).
        flip_filter:    False = convolution, True = correlation (default: False).

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
    """
    # Validate arguments.
    assert isinstance(x, torch.Tensor) and (x.ndim == 4)
    assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
    assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32)
    assert isinstance(up, int) and (up >= 1)
    assert isinstance(down, int) and (down >= 1)
    assert isinstance(groups, int) and (groups >= 1)
    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
    fw, fh = _get_filter_size(f)
    px0, px1, py0, py1 = _parse_padding(padding)

    # Adjust padding to account for up/downsampling.
    if up > 1:
        px0 += (fw + up - 1) // 2
        px1 += (fw - up) // 2
        py0 += (fh + up - 1) // 2
        py1 += (fh - up) // 2
    if down > 1:
        px0 += (fw - down + 1) // 2
        px1 += (fw - down) // 2
        py0 += (fh - down + 1) // 2
        py1 += (fh - down) // 2

    # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
    if kw == 1 and kh == 1 and (down > 1 and up == 1):
        x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
        return x

    # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
    if kw == 1 and kh == 1 and (up > 1 and down == 1):
        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
        x = upfirdn2d.upfirdn2d(x=x, f=f, up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
        return x

    # Fast path: downsampling only => use strided convolution.
    if down > 1 and up == 1:
        x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
        x = _conv2d_wrapper(x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight)
        return x

    # Fast path: upsampling with optional downsampling => use transpose strided convolution.
    if up > 1:
        # Swap the output/input channel axes of the weights (per group when
        # groups > 1) before handing them to the transposed convolution.
        if groups == 1:
            w = w.transpose(0, 1)
        else:
            w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
            w = w.transpose(1, 2)
            w = w.reshape(groups * in_channels_per_group, out_channels // groups, kh, kw)
        px0 -= kw - 1
        px1 -= kw - up
        py0 -= kh - 1
        py1 -= kh - up
        # Portion of the (now possibly negative) padding that is passed to the
        # transposed convolution itself; it is added back to the padding of
        # the upfirdn2d() call that follows.
        pxt = max(min(-px0, -px1), 0)
        pyt = max(min(-py0, -py1), 0)
        x = _conv2d_wrapper(x=x, w=w, stride=up, padding=[pyt,pxt], groups=groups, transpose=True, flip_weight=(not flip_weight))
        x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0+pxt,px1+pxt,py0+pyt,py1+pyt], gain=up**2, flip_filter=flip_filter)
        if down > 1:
            x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
        return x

    # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
    if up == 1 and down == 1:
        if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0:
            return _conv2d_wrapper(x=x, w=w, padding=[py0,px0], groups=groups, flip_weight=flip_weight)

    # Fallback: Generic reference implementation.
    x = upfirdn2d.upfirdn2d(x=x, f=(f if up > 1 else None), up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
    x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
    if down > 1:
        x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
    return x

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/ops/fma.py b/src/torch_utils/ops/fma.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..2eeac58a626c49231e04122b93e321ada954c5d3
# --- /dev/null
# +++ b/src/torch_utils/ops/fma.py
# @@ -0,0 +1,60 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.
# Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Fused multiply-add, with slightly faster gradients than `torch.addcmul()`."""

import torch

#----------------------------------------------------------------------------

def fma(a, b, c): # => a * b + c
    """Return `a * b + c` (with broadcasting) through a custom autograd Function."""
    return _FusedMultiplyAdd.apply(a, b, c)

#----------------------------------------------------------------------------

class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c
    """Autograd Function computing `a * b + c` via `torch.addcmul()`.

    Only `a`, `b` and the shape of `c` are kept for the backward pass.
    """

    @staticmethod
    def forward(ctx, a, b, c): # pylint: disable=arguments-differ
        out = torch.addcmul(c, a, b)
        ctx.save_for_backward(a, b)
        ctx.c_shape = c.shape
        return out

    @staticmethod
    def backward(ctx, dout): # pylint: disable=arguments-differ
        a, b = ctx.saved_tensors
        c_shape = ctx.c_shape
        da = None
        db = None
        dc = None

        # Each gradient is reduced back to the shape of its (possibly
        # broadcast) operand.
        if ctx.needs_input_grad[0]:
            da = _unbroadcast(dout * b, a.shape)

        if ctx.needs_input_grad[1]:
            db = _unbroadcast(dout * a, b.shape)

        if ctx.needs_input_grad[2]:
            dc = _unbroadcast(dout, c_shape)

        return da, db, dc

#----------------------------------------------------------------------------

def _unbroadcast(x, shape):
    """Sum `x` over its broadcast dimensions so that the result has `shape`.

    Args:
        x:     Tensor whose shape is `shape` after broadcasting.
        shape: Target shape; may have fewer dimensions than `x`, including
               none at all (a 0-dim operand).

    Returns:
        Tensor of shape `shape`.
    """
    shape = tuple(shape)
    extra_dims = x.ndim - len(shape)
    assert extra_dims >= 0
    # Reduce every leading extra dimension and every dimension where the
    # target has size 1 but `x` does not.
    dim = [i for i in range(x.ndim) if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1)]
    if len(dim):
        x = x.sum(dim=dim, keepdim=True)
    if extra_dims:
        # Drop the leading (now size-1) dimensions. Reshaping straight to the
        # target also covers a 0-dim target, which `reshape(-1, ...)` would
        # turn into shape (1,) instead of ().
        x = x.reshape(shape)
    assert tuple(x.shape) == shape
    return x

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/ops/grid_sample_gradfix.py b/src/torch_utils/ops/grid_sample_gradfix.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..ca6b3413ea72a734703c34382c023b84523601fd
# --- /dev/null
# +++ b/src/torch_utils/ops/grid_sample_gradfix.py
# @@ -0,0 +1,83 @@
# Copyright (c) 2021, NVIDIA
# CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Custom replacement for `torch.nn.functional.grid_sample` that
supports arbitrarily high order gradients between the input and output.
Only works on 2D images and assumes
`mode='bilinear'`, `padding_mode='zeros'`, `align_corners=False`."""

import warnings
import torch

# pylint: disable=redefined-builtin
# pylint: disable=arguments-differ
# pylint: disable=protected-access

#----------------------------------------------------------------------------

enabled = False  # Enable the custom op by setting this to true.

#----------------------------------------------------------------------------

def grid_sample(input, grid):
    """Sample `input` at `grid` (bilinear, zero padding, align_corners=False).

    Uses the custom autograd op when `_should_use_custom_op()` allows it and
    the stock `torch.nn.functional.grid_sample()` otherwise.
    """
    if not _should_use_custom_op():
        return torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)
    return _GridSample2dForward.apply(input, grid)

#----------------------------------------------------------------------------

def _should_use_custom_op():
    """Return True when the op is enabled and the PyTorch version prefix is supported.

    When enabled on any other version, a warning is issued and False is returned.
    """
    if enabled:
        if torch.__version__.startswith(('1.7.', '1.8.', '1.9')):
            return True
        warnings.warn(f'grid_sample_gradfix not supported on PyTorch {torch.__version__}. Falling back to torch.nn.functional.grid_sample().')
    return False

#----------------------------------------------------------------------------

class _GridSample2dForward(torch.autograd.Function):
    """Forward sampling op; its backward is itself an autograd Function."""

    @staticmethod
    def forward(ctx, input, grid):
        assert input.ndim == 4
        assert grid.ndim == 4
        ctx.save_for_backward(input, grid)
        return torch.nn.functional.grid_sample(input=input, grid=grid, mode='bilinear', padding_mode='zeros', align_corners=False)

    @staticmethod
    def backward(ctx, grad_output):
        saved_input, saved_grid = ctx.saved_tensors
        # apply() yields the pair (grad_input, grad_grid).
        return _GridSample2dBackward.apply(grad_output, saved_input, saved_grid)

#----------------------------------------------------------------------------

class _GridSample2dBackward(torch.autograd.Function):
    """First-order gradient of the sampling op.

    Its own backward only propagates to `grad_output`; a second-order
    gradient with respect to `grid` is rejected by an assertion.
    """

    @staticmethod
    def forward(ctx, grad_output, input, grid):
        backward_op = torch._C._jit_get_operation('aten::grid_sampler_2d_backward')
        grads = backward_op(grad_output, input, grid, 0, 0, False)
        grad_input, grad_grid = grads
        ctx.save_for_backward(grid)
        return grad_input, grad_grid

    @staticmethod
    def backward(ctx, grad2_grad_input, grad2_grad_grid):
        del grad2_grad_grid # unused
        (grid,) = ctx.saved_tensors

        grad2_grad_output = (
            _GridSample2dForward.apply(grad2_grad_input, grid)
            if ctx.needs_input_grad[0]
            else None
        )

        assert not ctx.needs_input_grad[2]
        # No gradient is produced for `input` or `grid` here.
        return grad2_grad_output, None, None

#----------------------------------------------------------------------------
# diff --git a/src/torch_utils/ops/upfirdn2d.cpp b/src/torch_utils/ops/upfirdn2d.cpp
# new file mode 100644
# index 0000000000000000000000000000000000000000..2d7177fc60040751d20e9a8da0301fa3ab64968a
# --- /dev/null
# +++ b/src/torch_utils/ops/upfirdn2d.cpp
# @@ -0,0 +1,103 @@
# // Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include +#include +#include +#include "upfirdn2d.h" + +//------------------------------------------------------------------------ + +static torch::Tensor upfirdn2d(torch::Tensor x, torch::Tensor f, int upx, int upy, int downx, int downy, int padx0, int padx1, int pady0, int pady1, bool flip, float gain) +{ + // Validate arguments. + TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device"); + TORCH_CHECK(f.device() == x.device(), "f must reside on the same device as x"); + TORCH_CHECK(f.dtype() == torch::kFloat, "f must be float32"); + TORCH_CHECK(x.numel() <= INT_MAX, "x is too large"); + TORCH_CHECK(f.numel() <= INT_MAX, "f is too large"); + TORCH_CHECK(x.dim() == 4, "x must be rank 4"); + TORCH_CHECK(f.dim() == 2, "f must be rank 2"); + TORCH_CHECK(f.size(0) >= 1 && f.size(1) >= 1, "f must be at least 1x1"); + TORCH_CHECK(upx >= 1 && upy >= 1, "upsampling factor must be at least 1"); + TORCH_CHECK(downx >= 1 && downy >= 1, "downsampling factor must be at least 1"); + + // Create output tensor. + const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); + int outW = ((int)x.size(3) * upx + padx0 + padx1 - (int)f.size(1) + downx) / downx; + int outH = ((int)x.size(2) * upy + pady0 + pady1 - (int)f.size(0) + downy) / downy; + TORCH_CHECK(outW >= 1 && outH >= 1, "output must be at least 1x1"); + torch::Tensor y = torch::empty({x.size(0), x.size(1), outH, outW}, x.options(), x.suggest_memory_format()); + TORCH_CHECK(y.numel() <= INT_MAX, "output is too large"); + + // Initialize CUDA kernel parameters. 
+ upfirdn2d_kernel_params p; + p.x = x.data_ptr(); + p.f = f.data_ptr(); + p.y = y.data_ptr(); + p.up = make_int2(upx, upy); + p.down = make_int2(downx, downy); + p.pad0 = make_int2(padx0, pady0); + p.flip = (flip) ? 1 : 0; + p.gain = gain; + p.inSize = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0)); + p.inStride = make_int4((int)x.stride(3), (int)x.stride(2), (int)x.stride(1), (int)x.stride(0)); + p.filterSize = make_int2((int)f.size(1), (int)f.size(0)); + p.filterStride = make_int2((int)f.stride(1), (int)f.stride(0)); + p.outSize = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0)); + p.outStride = make_int4((int)y.stride(3), (int)y.stride(2), (int)y.stride(1), (int)y.stride(0)); + p.sizeMajor = (p.inStride.z == 1) ? p.inSize.w : p.inSize.w * p.inSize.z; + p.sizeMinor = (p.inStride.z == 1) ? p.inSize.z : 1; + + // Choose CUDA kernel. + upfirdn2d_kernel_spec spec; + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] + { + spec = choose_upfirdn2d_kernel(p); + }); + + // Set looping options. + p.loopMajor = (p.sizeMajor - 1) / 16384 + 1; + p.loopMinor = spec.loopMinor; + p.loopX = spec.loopX; + p.launchMinor = (p.sizeMinor - 1) / p.loopMinor + 1; + p.launchMajor = (p.sizeMajor - 1) / p.loopMajor + 1; + + // Compute grid size. + dim3 blockSize, gridSize; + if (spec.tileOutW < 0) // large + { + blockSize = dim3(4, 32, 1); + gridSize = dim3( + ((p.outSize.y - 1) / blockSize.x + 1) * p.launchMinor, + (p.outSize.x - 1) / (blockSize.y * p.loopX) + 1, + p.launchMajor); + } + else // small + { + blockSize = dim3(256, 1, 1); + gridSize = dim3( + ((p.outSize.y - 1) / spec.tileOutH + 1) * p.launchMinor, + (p.outSize.x - 1) / (spec.tileOutW * p.loopX) + 1, + p.launchMajor); + } + + // Launch CUDA kernel. 
+ void* args[] = {&p}; + AT_CUDA_CHECK(cudaLaunchKernel(spec.kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream())); + return y; +} + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("upfirdn2d", &upfirdn2d); +} + +//------------------------------------------------------------------------ diff --git a/src/torch_utils/ops/upfirdn2d.cu b/src/torch_utils/ops/upfirdn2d.cu new file mode 100644 index 0000000000000000000000000000000000000000..ebdd9879f4bb16fc57a23cbc81f9de8ef54e4916 --- /dev/null +++ b/src/torch_utils/ops/upfirdn2d.cu @@ -0,0 +1,350 @@ +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include +#include "upfirdn2d.h" + +//------------------------------------------------------------------------ +// Helpers. + +template struct InternalType; +template <> struct InternalType { typedef double scalar_t; }; +template <> struct InternalType { typedef float scalar_t; }; +template <> struct InternalType { typedef float scalar_t; }; + +static __device__ __forceinline__ int floor_div(int a, int b) +{ + int t = 1 - a / b; + return (a + t * b) / b - t; +} + +//------------------------------------------------------------------------ +// Generic CUDA implementation for large filters. + +template static __global__ void upfirdn2d_kernel_large(upfirdn2d_kernel_params p) +{ + typedef typename InternalType::scalar_t scalar_t; + + // Calculate thread index. 
+ int minorBase = blockIdx.x * blockDim.x + threadIdx.x; + int outY = minorBase / p.launchMinor; + minorBase -= outY * p.launchMinor; + int outXBase = blockIdx.y * p.loopX * blockDim.y + threadIdx.y; + int majorBase = blockIdx.z * p.loopMajor; + if (outXBase >= p.outSize.x | outY >= p.outSize.y | majorBase >= p.sizeMajor) + return; + + // Setup Y receptive field. + int midY = outY * p.down.y + p.up.y - 1 - p.pad0.y; + int inY = min(max(floor_div(midY, p.up.y), 0), p.inSize.y); + int h = min(max(floor_div(midY + p.filterSize.y, p.up.y), 0), p.inSize.y) - inY; + int filterY = midY + p.filterSize.y - (inY + 1) * p.up.y; + if (p.flip) + filterY = p.filterSize.y - 1 - filterY; + + // Loop over major, minor, and X. + for (int majorIdx = 0, major = majorBase; majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) + for (int minorIdx = 0, minor = minorBase; minorIdx < p.loopMinor & minor < p.sizeMinor; minorIdx++, minor += p.launchMinor) + { + int nc = major * p.sizeMinor + minor; + int n = nc / p.inSize.z; + int c = nc - n * p.inSize.z; + for (int loopX = 0, outX = outXBase; loopX < p.loopX & outX < p.outSize.x; loopX++, outX += blockDim.y) + { + // Setup X receptive field. + int midX = outX * p.down.x + p.up.x - 1 - p.pad0.x; + int inX = min(max(floor_div(midX, p.up.x), 0), p.inSize.x); + int w = min(max(floor_div(midX + p.filterSize.x, p.up.x), 0), p.inSize.x) - inX; + int filterX = midX + p.filterSize.x - (inX + 1) * p.up.x; + if (p.flip) + filterX = p.filterSize.x - 1 - filterX; + + // Initialize pointers. + const T* xp = &((const T*)p.x)[inX * p.inStride.x + inY * p.inStride.y + c * p.inStride.z + n * p.inStride.w]; + const float* fp = &p.f[filterX * p.filterStride.x + filterY * p.filterStride.y]; + int filterStepX = ((p.flip) ? p.up.x : -p.up.x) * p.filterStride.x; + int filterStepY = ((p.flip) ? p.up.y : -p.up.y) * p.filterStride.y; + + // Inner loop. 
+ scalar_t v = 0; + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + v += (scalar_t)(*xp) * (scalar_t)(*fp); + xp += p.inStride.x; + fp += filterStepX; + } + xp += p.inStride.y - w * p.inStride.x; + fp += filterStepY - w * filterStepX; + } + + // Store result. + v *= p.gain; + ((T*)p.y)[outX * p.outStride.x + outY * p.outStride.y + c * p.outStride.z + n * p.outStride.w] = (T)v; + } + } +} + +//------------------------------------------------------------------------ +// Specialized CUDA implementation for small filters. + +template +static __global__ void upfirdn2d_kernel_small(upfirdn2d_kernel_params p) +{ + typedef typename InternalType::scalar_t scalar_t; + const int tileInW = ((tileOutW - 1) * downx + filterW - 1) / upx + 1; + const int tileInH = ((tileOutH - 1) * downy + filterH - 1) / upy + 1; + __shared__ volatile scalar_t sf[filterH][filterW]; + __shared__ volatile scalar_t sx[tileInH][tileInW][loopMinor]; + + // Calculate tile index. + int minorBase = blockIdx.x; + int tileOutY = minorBase / p.launchMinor; + minorBase -= tileOutY * p.launchMinor; + minorBase *= loopMinor; + tileOutY *= tileOutH; + int tileOutXBase = blockIdx.y * p.loopX * tileOutW; + int majorBase = blockIdx.z * p.loopMajor; + if (tileOutXBase >= p.outSize.x | tileOutY >= p.outSize.y | majorBase >= p.sizeMajor) + return; + + // Load filter (flipped). + for (int tapIdx = threadIdx.x; tapIdx < filterH * filterW; tapIdx += blockDim.x) + { + int fy = tapIdx / filterW; + int fx = tapIdx - fy * filterW; + scalar_t v = 0; + if (fx < p.filterSize.x & fy < p.filterSize.y) + { + int ffx = (p.flip) ? fx : p.filterSize.x - 1 - fx; + int ffy = (p.flip) ? fy : p.filterSize.y - 1 - fy; + v = (scalar_t)p.f[ffx * p.filterStride.x + ffy * p.filterStride.y]; + } + sf[fy][fx] = v; + } + + // Loop over major and X. 
+ for (int majorIdx = 0, major = majorBase; majorIdx < p.loopMajor & major < p.sizeMajor; majorIdx++, major++) + { + int baseNC = major * p.sizeMinor + minorBase; + int n = baseNC / p.inSize.z; + int baseC = baseNC - n * p.inSize.z; + for (int loopX = 0, tileOutX = tileOutXBase; loopX < p.loopX & tileOutX < p.outSize.x; loopX++, tileOutX += tileOutW) + { + // Load input pixels. + int tileMidX = tileOutX * downx + upx - 1 - p.pad0.x; + int tileMidY = tileOutY * downy + upy - 1 - p.pad0.y; + int tileInX = floor_div(tileMidX, upx); + int tileInY = floor_div(tileMidY, upy); + __syncthreads(); + for (int inIdx = threadIdx.x; inIdx < tileInH * tileInW * loopMinor; inIdx += blockDim.x) + { + int relC = inIdx; + int relInX = relC / loopMinor; + int relInY = relInX / tileInW; + relC -= relInX * loopMinor; + relInX -= relInY * tileInW; + int c = baseC + relC; + int inX = tileInX + relInX; + int inY = tileInY + relInY; + scalar_t v = 0; + if (inX >= 0 & inY >= 0 & inX < p.inSize.x & inY < p.inSize.y & c < p.inSize.z) + v = (scalar_t)((const T*)p.x)[inX * p.inStride.x + inY * p.inStride.y + c * p.inStride.z + n * p.inStride.w]; + sx[relInY][relInX][relC] = v; + } + + // Loop over output pixels. + __syncthreads(); + for (int outIdx = threadIdx.x; outIdx < tileOutH * tileOutW * loopMinor; outIdx += blockDim.x) + { + int relC = outIdx; + int relOutX = relC / loopMinor; + int relOutY = relOutX / tileOutW; + relC -= relOutX * loopMinor; + relOutX -= relOutY * tileOutW; + int c = baseC + relC; + int outX = tileOutX + relOutX; + int outY = tileOutY + relOutY; + + // Setup receptive field. + int midX = tileMidX + relOutX * downx; + int midY = tileMidY + relOutY * downy; + int inX = floor_div(midX, upx); + int inY = floor_div(midY, upy); + int relInX = inX - tileInX; + int relInY = inY - tileInY; + int filterX = (inX + 1) * upx - midX - 1; // flipped + int filterY = (inY + 1) * upy - midY - 1; // flipped + + // Inner loop. 
+ if (outX < p.outSize.x & outY < p.outSize.y & c < p.outSize.z) + { + scalar_t v = 0; + #pragma unroll + for (int y = 0; y < filterH / upy; y++) + #pragma unroll + for (int x = 0; x < filterW / upx; x++) + v += sx[relInY + y][relInX + x][relC] * sf[filterY + y * upy][filterX + x * upx]; + v *= p.gain; + ((T*)p.y)[outX * p.outStride.x + outY * p.outStride.y + c * p.outStride.z + n * p.outStride.w] = (T)v; + } + } + } + } +} + +//------------------------------------------------------------------------ +// CUDA kernel selection. + +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p) +{ + int s = p.inStride.z, fx = p.filterSize.x, fy = p.filterSize.y; + + upfirdn2d_kernel_spec spec = {(void*)upfirdn2d_kernel_large, -1,-1,1, 4}; // contiguous + if (s == 1) spec = {(void*)upfirdn2d_kernel_large, -1,-1,4, 1}; // channels_last + + if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // contiguous + { + if (fx <= 7 && fy <= 7 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 5 && fy <= 5 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 3 && fy <= 3 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 20) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + 
if (fx <= 1 && fy <= 12) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + } + if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // channels_last + { + if (fx <= 7 && fy <= 7 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 5 && fy <= 5 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 3 && fy <= 3 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 20) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 12) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + } + if (s != 1 && p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // contiguous + { + if (fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + if (fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 64,16,1, 1}; + } + if (s == 1 && p.up.x == 2 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // channels_last + { + if (fx <= 8 && fy <= 
8 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + if (fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 16,16,8, 1}; + } + if (s != 1 && p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // contiguous + { + if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + if (fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,8,1, 1}; + } + if (s == 1 && p.up.x == 2 && p.up.y == 1 && p.down.x == 1 && p.down.y == 1) // channels_last + { + if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + if (fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 128,1,16, 1}; + } + if (s != 1 && p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // contiguous + { + if (fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 20) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 12) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + if (fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,32,1, 1}; + } + if (s == 1 && p.up.x == 1 && p.up.y == 2 && p.down.x == 1 && p.down.y == 1) // channels_last + { + if (fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 
20) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 12) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + if (fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 1,128,16, 1}; + } + if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) // contiguous + { + if (fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + if (fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 32,8,1, 1}; + } + if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 2) // channels_last + { + if (fx <= 8 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + if (fx <= 6 && fy <= 6 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + if (fx <= 4 && fy <= 4 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + if (fx <= 2 && fy <= 2 ) spec = {(void*)upfirdn2d_kernel_small, 8,8,8, 1}; + } + if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) // contiguous + { + if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + if (fx <= 12 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + if (fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,8,1, 1}; + } + if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 2 && p.down.y == 1) // channels_last + { + if (fx <= 24 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + if (fx <= 20 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + if (fx <= 16 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + if (fx <= 12 && fy <= 1 ) spec = 
{(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + if (fx <= 8 && fy <= 1 ) spec = {(void*)upfirdn2d_kernel_small, 64,1,8, 1}; + } + if (s != 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) // contiguous + { + if (fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (fx <= 1 && fy <= 20) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (fx <= 1 && fy <= 12) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + if (fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 32,16,1, 1}; + } + if (s == 1 && p.up.x == 1 && p.up.y == 1 && p.down.x == 1 && p.down.y == 2) // channels_last + { + if (fx <= 1 && fy <= 24) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + if (fx <= 1 && fy <= 20) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + if (fx <= 1 && fy <= 16) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + if (fx <= 1 && fy <= 12) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + if (fx <= 1 && fy <= 8 ) spec = {(void*)upfirdn2d_kernel_small, 1,64,8, 1}; + } + return spec; +} + +//------------------------------------------------------------------------ +// Template specializations. + +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel (const upfirdn2d_kernel_params& p); +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel (const upfirdn2d_kernel_params& p); +template upfirdn2d_kernel_spec choose_upfirdn2d_kernel(const upfirdn2d_kernel_params& p); + +//------------------------------------------------------------------------ diff --git a/src/torch_utils/ops/upfirdn2d.h b/src/torch_utils/ops/upfirdn2d.h new file mode 100644 index 0000000000000000000000000000000000000000..c9e2032bcac9d2abde7a75eea4d812da348afadd --- /dev/null +++ b/src/torch_utils/ops/upfirdn2d.h @@ -0,0 +1,59 @@ +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
//------------------------------------------------------------------------
// CUDA kernel parameters.

// Plain parameter bundle passed by value/reference to the upfirdn2d kernels
// and to choose_upfirdn2d_kernel().
struct upfirdn2d_kernel_params
{
    const void* x;      // Input buffer (element type is erased here) — presumably the image data; confirm against the launcher.
    const float* f;     // Filter coefficients, always float.
    void* y;            // Output buffer; the kernel casts it to the element type T before storing.

    int2 up;            // Upsampling factors (.x, .y); read by the kernel selector.
    int2 down;          // Downsampling factors (.x, .y); read by the kernel selector.
    int2 pad0;          // NOTE(review): presumably the leading x/y padding — confirm in the launcher.
    int flip;           // NOTE(review): presumably the filter-flip flag — confirm in the launcher.
    float gain;         // Multiplied into each accumulated output value before it is stored.

    int4 inSize;        // [width, height, channel, batch]
    int4 inStride;      // Strides for inSize; the selector treats inStride.z == 1 as channels_last.
    int2 filterSize;    // [width, height]
    int2 filterStride;
    int4 outSize;       // [width, height, channel, batch]
    int4 outStride;     // .x/.y/.z/.w are applied to the output x, y, channel and batch indices.
    int sizeMinor;
    int sizeMajor;

    int loopMinor;
    int loopMajor;
    int loopX;
    int launchMinor;
    int launchMajor;
};

//------------------------------------------------------------------------
// CUDA kernel specialization.

// Result of choose_upfirdn2d_kernel(). The selector fills it with brace
// initializers in exactly this field order:
// {kernel, tileOutW, tileOutH, loopMinor, loopX}.
struct upfirdn2d_kernel_spec
{
    void* kernel;       // Type-erased pointer to the selected kernel instantiation.
    int tileOutW;       // Output tile width; the generic "large" fallback uses -1.
    int tileOutH;       // Output tile height; the generic "large" fallback uses -1.
    int loopMinor;
    int loopX;
};
_inited = False   # True once a plugin build has been attempted (successfully or not).
_plugin = None    # Compiled extension module, or None when unavailable.

def _init():
    """Build/load the upfirdn2d CUDA plugin once and report whether it is usable.

    The build is attempted at most one time per process. If it fails, a
    warning with the traceback is emitted once and every later call returns
    False immediately, so callers fall back to the reference implementation
    without paying for (and being warned about) a rebuild on each call.

    Returns:
        True if the compiled plugin is available, False otherwise.
    """
    global _inited, _plugin
    if not _inited:
        sources = ['upfirdn2d.cpp', 'upfirdn2d.cu']
        sources = [os.path.join(os.path.dirname(__file__), s) for s in sources]
        try:
            _plugin = custom_ops.get_plugin('upfirdn2d_plugin', sources=sources, extra_cuda_cflags=['--use_fast_math'])
        except Exception:
            # Best effort: any build/load failure degrades to the slow path.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            warnings.warn('Failed to build CUDA kernels for upfirdn2d. Falling back to slow reference implementation. Details:\n\n' + traceback.format_exc())
        # Only reached when the attempt finished (success or ordinary failure),
        # so an interrupted build can still be retried later.
        _inited = True
    return _plugin is not None

def _parse_scaling(scaling):
    """Normalize a scaling factor to an `(sx, sy)` pair.

    Args:
        scaling: A single int (used for both axes) or a list/tuple `[x, y]`
            of ints, each >= 1.

    Returns:
        Tuple `(sx, sy)`.
    """
    if isinstance(scaling, int):
        scaling = [scaling, scaling]
    assert isinstance(scaling, (list, tuple))
    assert all(isinstance(x, int) for x in scaling)
    sx, sy = scaling
    assert sx >= 1 and sy >= 1
    return sx, sy

def _parse_padding(padding):
    """Normalize a padding spec to `(padx0, padx1, pady0, pady1)`.

    Args:
        padding: A single int (all four sides), a list/tuple `[x, y]`
            (same amount before and after on each axis), or a list/tuple
            `[x_before, x_after, y_before, y_after]`. All entries are ints.

    Returns:
        Tuple `(padx0, padx1, pady0, pady1)`.
    """
    if isinstance(padding, int):
        padding = [padding, padding]
    assert isinstance(padding, (list, tuple))
    assert all(isinstance(x, int) for x in padding)
    if len(padding) == 2:
        padx, pady = padding
        padding = [padx, padx, pady, pady]
    padx0, padx1, pady0, pady1 = padding
    return padx0, padx1, pady0, pady1

def _get_filter_size(f):
    """Return the filter footprint as `(width, height)`.

    Args:
        f: `None` (treated as a 1x1 identity filter) or a 1-D/2-D tensor.
            For a 1-D filter both width and height equal the tap count.

    Returns:
        Tuple `(fw, fh)` of Python ints, each >= 1.
    """
    if f is None:
        return 1, 1
    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
    fw = f.shape[-1]
    fh = f.shape[0]
    with misc.suppress_tracer_warnings():
        fw = int(fw)
        fh = int(fh)
    misc.assert_shape(f, [fh, fw][:f.ndim])
    assert fw >= 1 and fh >= 1
    return fw, fh
def setup_filter(f, device=torch.device('cpu'), normalize=True, flip_filter=False, gain=1, separable=None):
    r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`.

    The input is never modified: normalization is done out of place, so a
    float32 tensor or NumPy array passed in by the caller keeps its values.

    Args:
        f:           Torch tensor, numpy array, or python list of the shape
                     `[filter_height, filter_width]` (non-separable),
                     `[filter_taps]` (separable),
                     `[]` (impulse), or
                     `None` (identity).
        device:      Result device (default: cpu).
        normalize:   Normalize the filter so that it retains the magnitude
                     for constant input signal (DC)? (default: True).
        flip_filter: Flip the filter? (default: False).
        gain:        Overall scaling factor for signal magnitude (default: 1).
        separable:   Return a separable filter? (default: select automatically;
                     a 1-D filter with at least 8 taps stays separable).

    Returns:
        Float32 tensor of the shape
        `[filter_height, filter_width]` (non-separable) or
        `[filter_taps]` (separable).
    """
    # Validate.
    if f is None:
        f = 1
    # NOTE: as_tensor() does not copy float32 tensors/arrays, so `f` may alias
    # the caller's data from here on; nothing below may write to it in place.
    f = torch.as_tensor(f, dtype=torch.float32)
    assert f.ndim in [0, 1, 2]
    assert f.numel() > 0
    if f.ndim == 0:
        f = f[np.newaxis]

    # Separable?
    if separable is None:
        separable = (f.ndim == 1 and f.numel() >= 8)
    if f.ndim == 1 and not separable:
        f = f.ger(f)  # outer product: expand the 1-D taps to a 2-D kernel
    assert f.ndim == (1 if separable else 2)

    # Apply normalize, flip, gain, and device.
    if normalize:
        f = f / f.sum()  # out of place on purpose (see NOTE above)
    if flip_filter:
        f = f.flip(list(range(f.ndim)))
    # A separable filter is applied once per axis, so each pass gets sqrt(gain).
    f = f * (gain ** (f.ndim / 2))
    f = f.to(device=device)
    return f
+ Negative padding corresponds to cropping the image. + + 3. Convolve the image with the specified 2D FIR filter (`f`), shrinking it + so that the footprint of all output pixels lies within the input image. + + 4. Downsample the image by keeping every Nth pixel (`down`). + + This sequence of operations bears close resemblance to scipy.signal.upfirdn(). + The fused op is considerably more efficient than performing the same calculation + using standard PyTorch ops. It supports gradients of arbitrary order. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + up: Integer upsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + down: Integer downsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the upsampled image. Can be a single number + or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. 
@misc.profiled_function
def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1):
    """Reference `upfirdn2d()` assembled from stock PyTorch ops (slow path).

    Steps, in order: zero-stuff upsampling, pad/crop, grouped convolution with
    the filter (one group per channel), then strided downsampling.

    Args:
        x:           4-D input tensor `[batch, channels, height, width]`.
        f:           Float32 filter tensor (1-D separable or 2-D), without
                     gradient tracking, or `None` for a 1x1 filter of ones.
        up, down:    Scaling specs accepted by `_parse_scaling()`.
        padding:     Padding spec accepted by `_parse_padding()`; negative
                     values crop.
        flip_filter: When False the filter is flipped before the convolution
                     call; when True it is used as given.
        gain:        Overall scale folded into the filter.

    Returns:
        The filtered and resampled tensor.
    """
    # Argument checks.
    assert isinstance(x, torch.Tensor) and x.ndim == 4
    if f is None:
        f = torch.ones([1, 1], dtype=torch.float32, device=x.device)
    assert isinstance(f, torch.Tensor) and f.ndim in [1, 2]
    assert f.dtype == torch.float32 and not f.requires_grad
    n, c, h, w = x.shape
    upx, upy = _parse_scaling(up)
    downx, downy = _parse_scaling(down)
    padx0, padx1, pady0, pady1 = _parse_padding(padding)
    zero_pad = torch.nn.functional.pad

    # Zero-stuffing: give every pixel a trailing unit axis, pad that axis with
    # zeros, then fold it back into the spatial dimensions.
    x = x.reshape([n, c, h, 1, w, 1])
    x = zero_pad(x, [0, upx - 1, 0, 0, 0, upy - 1])
    x = x.reshape([n, c, h * upy, w * upx])

    # Positive padding adds zeros; negative padding becomes a crop.
    x = zero_pad(x, [max(amount, 0) for amount in (padx0, padx1, pady0, pady1)])
    crop_x0, crop_x1, crop_y0, crop_y1 = (max(-amount, 0) for amount in (padx0, padx1, pady0, pady1))
    x = x[:, :, crop_y0 : x.shape[2] - crop_y1, crop_x0 : x.shape[3] - crop_x1]

    # Fold the gain into the filter (split evenly over the passes of a
    # separable filter) and match the input dtype.
    f = f * (gain ** (f.ndim / 2))
    f = f.to(x.dtype)
    if not flip_filter:
        f = f.flip(list(range(f.ndim)))

    # One filter copy per channel, applied as a grouped convolution.
    f = f[np.newaxis, np.newaxis].repeat([c, 1] + [1] * f.ndim)
    if f.ndim == 4:
        x = conv2d_gradfix.conv2d(input=x, weight=f, groups=c)
    else:
        # Separable filter: one pass along each spatial axis.
        x = conv2d_gradfix.conv2d(input=x, weight=f.unsqueeze(2), groups=c)
        x = conv2d_gradfix.conv2d(input=x, weight=f.unsqueeze(3), groups=c)

    # Keep every `down`-th pixel.
    return x[:, :, ::downy, ::downx]
def filter2d(x, f, padding=0, flip_filter=False, gain=1, impl='cuda'):
    r"""Apply a 2D FIR filter to a batch of images without resampling.

    On top of the user-supplied padding, `fw // 2` / `(fw - 1) // 2` pixels
    are added before/after along x (and likewise along y with `fh`), so that
    with `padding=0` the output has the same spatial size as the input.
    Negative user padding crops.

    Args:
        x:           Float32/float64/float16 input tensor of the shape
                     `[batch_size, num_channels, in_height, in_width]`.
        f:           Float32 FIR filter of the shape
                     `[filter_height, filter_width]` (non-separable),
                     `[filter_taps]` (separable), or
                     `None` (identity).
        padding:     Padding with respect to the output. A single int, a
                     list/tuple `[x, y]`, or
                     `[x_before, x_after, y_before, y_after]` (default: 0).
        flip_filter: False = convolution, True = correlation (default: False).
        gain:        Overall scaling factor for signal magnitude (default: 1).
        impl:        Implementation to use, `'ref'` or `'cuda'`
                     (default: `'cuda'`).

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
    """
    left, right, top, bottom = _parse_padding(padding)
    taps_w, taps_h = _get_filter_size(f)
    # Split the filter footprint around each pixel: the larger half goes in
    # front when the tap count is even.
    full_padding = [
        left + taps_w // 2,
        right + (taps_w - 1) // 2,
        top + taps_h // 2,
        bottom + (taps_h - 1) // 2,
    ]
    return upfirdn2d(x, f, padding=full_padding, flip_filter=flip_filter, gain=gain, impl=impl)
def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, impl='cuda'):
    r"""Filter a batch of 2D images and keep every `down`-th pixel.

    Padding is derived from the filter size and the downsampling factor and
    added on top of the user-supplied padding; negative user padding crops.

    Args:
        x:           Float32/float64/float16 input tensor of the shape
                     `[batch_size, num_channels, in_height, in_width]`.
        f:           Float32 FIR filter of the shape
                     `[filter_height, filter_width]` (non-separable),
                     `[filter_taps]` (separable), or
                     `None` (identity).
        down:        Integer downsampling factor. A single int or a
                     list/tuple `[x, y]` (default: 2).
        padding:     Padding with respect to the input. A single int, a
                     list/tuple `[x, y]`, or
                     `[x_before, x_after, y_before, y_after]` (default: 0).
        flip_filter: False = convolution, True = correlation (default: False).
        gain:        Overall scaling factor for signal magnitude (default: 1).
        impl:        Implementation to use, `'ref'` or `'cuda'`
                     (default: `'cuda'`).

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
    """
    factor_x, factor_y = _parse_scaling(down)
    left, right, top, bottom = _parse_padding(padding)
    taps_w, taps_h = _get_filter_size(f)
    # Filter taps not already covered by the stride, per axis; the odd
    # leftover tap (if any) goes on the leading side.
    slack_x = taps_w - factor_x
    slack_y = taps_h - factor_y
    full_padding = [
        left + (slack_x + 1) // 2,
        right + slack_x // 2,
        top + (slack_y + 1) // 2,
        bottom + slack_y // 2,
    ]
    return upfirdn2d(x, f, down=down, padding=full_padding, flip_filter=flip_filter, gain=gain, impl=impl)
def persistent_class(orig_class):
    r"""Class decorator that extends a given class to save its source code
    when pickled.

    Example:

        from src.torch_utils import persistence

        @persistence.persistent_class
        class MyNetwork(torch.nn.Module):
            def __init__(self, num_inputs, num_outputs):
                super().__init__()
                self.fc = MyLayer(num_inputs, num_outputs)
                ...

        @persistence.persistent_class
        class MyLayer(torch.nn.Module):
            ...

    When pickled, any instance of `MyNetwork` and `MyLayer` will save its
    source code alongside other internal state (e.g., parameters, buffers,
    and submodules). This way, any previously exported pickle will remain
    usable even if the class definitions have been modified or are no
    longer available.

    The decorator saves the source code of the entire Python module
    containing the decorated class. It does *not* save the source code of
    any imported modules. Thus, the imported modules must be available
    during unpickling, also including `torch_utils.persistence` itself.

    It is ok to call functions defined in the same module from the
    decorated class. However, if the decorated class depends on other
    classes defined in the same module, they must be decorated as well.
    This is illustrated in the above example in the case of `MyLayer`.

    It is also possible to employ the decorator just-in-time before
    calling the constructor. For example:

        cls = MyLayer
        if want_to_make_it_persistent:
            cls = persistence.persistent_class(cls)
        layer = cls(num_inputs, num_outputs)

    As an additional feature, the decorator also keeps track of the
    arguments that were used to construct each instance of the decorated
    class. The arguments can be queried via `obj.init_args` and
    `obj.init_kwargs`, and they are automatically pickled alongside other
    object state. A typical use case is to first unpickle a previous
    instance of a persistent class, and then upgrade it to use the latest
    version of the source code:

        with open('old_pickle.pkl', 'rb') as f:
            old_net = pickle.load(f)
        new_net = MyNetwork(*old_net.init_args, **old_net.init_kwargs)
        misc.copy_params_and_buffers(old_net, new_net, require_all=True)

    Args:
        orig_class: The class to wrap. Must be a class whose defining module
            is present in `sys.modules`.

    Returns:
        A subclass of `orig_class` carrying the same `__name__`, or
        `orig_class` itself if it is already persistent.
    """
    assert isinstance(orig_class, type)
    # Decorating twice is a no-op.
    if is_persistent(orig_class):
        return orig_class

    # Capture the source text of the whole module that defines the class.
    assert orig_class.__module__ in sys.modules
    orig_module = sys.modules[orig_class.__module__]
    orig_module_src = _module_to_src(orig_module)

    class Decorator(orig_class):
        # Stored on the class so that __reduce__ can embed them in the pickle.
        _orig_module_src = orig_module_src
        _orig_class_name = orig_class.__name__

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Deep copies, so later mutation of the caller's objects does not
            # change the recorded constructor arguments.
            self._init_args = copy.deepcopy(args)
            self._init_kwargs = copy.deepcopy(kwargs)
            # Unpickling looks the class up by name in the re-created module,
            # so it has to be reachable as a module-level name.
            assert orig_class.__name__ in orig_module.__dict__
            # Fail at construction time rather than at pickling time.
            _check_pickleable(self.__reduce__())

        @property
        def init_args(self):
            # Returns a copy; the stored arguments cannot be modified through it.
            return copy.deepcopy(self._init_args)

        @property
        def init_kwargs(self):
            # Returns a copy wrapped in an EasyDict.
            return dnnlib.EasyDict(copy.deepcopy(self._init_kwargs))

        def __reduce__(self):
            fields = list(super().__reduce__())
            # Make sure slots 0..2 (callable, args, state) exist.
            fields += [None] * max(3 - len(fields), 0)
            # Skip the rewrite if the parent's __reduce__ already routes
            # through _reconstruct_persistent_obj.
            if fields[0] is not _reconstruct_persistent_obj:
                meta = dict(type='class', version=_version, module_src=self._orig_module_src, class_name=self._orig_class_name, state=fields[2])
                fields[0] = _reconstruct_persistent_obj # reconstruct func
                fields[1] = (meta,) # reconstruct args
                fields[2] = None # state dict
            return tuple(fields)

    # Keep the original class name and register the wrapper so that
    # is_persistent() recognizes it.
    Decorator.__name__ = orig_class.__name__
    _decorators.add(Decorator)
    return Decorator
def import_hook(hook):
    r"""Register an import hook that is called whenever a persistent object
    is being unpickled. A typical use case is to patch the pickled source
    code to avoid errors and inconsistencies when the API of some imported
    module has changed.

    The hook should have the following signature:

        hook(meta) -> modified meta

    `meta` is an instance of `dnnlib.EasyDict` with the following fields:

        type:       Type of the persistent object, e.g. `'class'`.
        version:    Internal version number of `torch_utils.persistence`.
        module_src  Original source code of the Python module.
        class_name: Class name in the original Python module.
        state:      Internal state of the object.

    Example:

        @persistence.import_hook
        def wreck_my_network(meta):
            if meta.class_name == 'MyNetwork':
                print('MyNetwork is being imported. I will wreck it!')
                meta.module_src = meta.module_src.replace("True", "False")
            return meta

    Args:
        hook: Callable taking and returning `meta`.

    Returns:
        The same `hook`, so that the decorator form shown above leaves the
        decorated name bound to the function instead of `None`.
    """
    assert callable(hook)
    _import_hooks.append(hook)
    return hook
def _module_to_src(module):
    r"""Query the source code of a given Python module.

    The result is cached in both directions (module -> source and
    source -> module), so modules created by `_src_to_module()` return the
    source they were built from.
    """
    src = _module_to_src_dict.get(module, None)
    if src is None:
        src = inspect.getsource(module)
        _module_to_src_dict[module] = src
        _src_to_module_dict[src] = module
    return src

def _src_to_module(src):
    r"""Get or create a Python module for the given source code.

    A new module gets a unique `_imported_module_<hex>` name and is
    registered in `sys.modules` and in both caches *before* its source is
    executed, so that code running during execution can already look the
    module up. If execution fails, all of those registrations are undone and
    the exception propagates; a half-initialized module is never cached.

    NOTE(security): `src` is executed with `exec`. This function runs
    arbitrary code taken from its argument, so it must only be fed source
    from trusted origins.

    Args:
        src: Complete module source code as a string.

    Returns:
        The cached or newly created module object.
    """
    module = _src_to_module_dict.get(src, None)
    if module is None:
        module_name = "_imported_module_" + uuid.uuid4().hex
        module = types.ModuleType(module_name)
        sys.modules[module_name] = module
        _module_to_src_dict[module] = src
        _src_to_module_dict[src] = module
        try:
            exec(src, module.__dict__) # pylint: disable=exec-used
        except BaseException:
            # Roll back so a later call retries instead of silently returning
            # the partially executed module.
            sys.modules.pop(module_name, None)
            _module_to_src_dict.pop(module, None)
            _src_to_module_dict.pop(src, None)
            raise
    return module
+ if f'{type(obj).__module__}.{type(obj).__name__}' in ['numpy.ndarray', 'torch.Tensor']: + return None # NumPy arrays and PyTorch tensors are pickleable. + if is_persistent(obj): + return None # Persistent objects are pickleable, by virtue of the constructor check. + return obj + with io.BytesIO() as f: + pickle.dump(recurse(obj), f) + +#---------------------------------------------------------------------------- diff --git a/src/torch_utils/training_stats.py b/src/torch_utils/training_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..d953504bee5edd28bf48c7b7ac0c684efd01aca1 --- /dev/null +++ b/src/torch_utils/training_stats.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +"""Facilities for reporting and collecting training statistics across +multiple processes and devices. The interface is designed to minimize +synchronization overhead as well as the amount of boilerplate in user +code.""" + +import re +import numpy as np +import torch +from src import dnnlib + +from . import misc + +#---------------------------------------------------------------------------- + +_num_moments = 3 # [num_scalars, sum_of_scalars, sum_of_squares] +_reduce_dtype = torch.float32 # Data type to use for initial per-tensor reduction. +_counter_dtype = torch.float64 # Data type to use for the internal counters. +_rank = 0 # Rank of the current process. +_sync_device = None # Device to use for multiprocess communication. None = single-process. +_sync_called = False # Has _sync() been called yet? 
+_counters = dict() # Running counters on each device, updated by report(): name => device => torch.Tensor +_cumulative = dict() # Cumulative counters on the CPU, updated by _sync(): name => torch.Tensor + +#---------------------------------------------------------------------------- + +def init_multiprocessing(rank, sync_device): + r"""Initializes `torch_utils.training_stats` for collecting statistics + across multiple processes. + + This function must be called after + `torch.distributed.init_process_group()` and before `Collector.update()`. + The call is not necessary if multi-process collection is not needed. + + Args: + rank: Rank of the current process. + sync_device: PyTorch device to use for inter-process + communication, or None to disable multi-process + collection. Typically `torch.device('cuda', rank)`. + """ + global _rank, _sync_device + assert not _sync_called + _rank = rank + _sync_device = sync_device + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def report(name, value): + r"""Broadcasts the given set of scalars to all interested instances of + `Collector`, across device and process boundaries. + + This function is expected to be extremely cheap and can be safely + called from anywhere in the training loop, loss function, or inside a + `torch.nn.Module`. + + Warning: The current implementation expects the set of unique names to + be consistent across processes. Please make sure that `report()` is + called at least once for each unique name by each process, and in the + same order. If a given process has no scalars to broadcast, it can do + `report(name, [])` (empty list). + + Args: + name: Arbitrary string specifying the name of the statistic. + Averages are accumulated separately for each unique name. + value: Arbitrary set of scalars. Can be a list, tuple, + NumPy array, PyTorch tensor, or Python scalar. + + Returns: + The same `value` that was passed in. 
+ """ + if name not in _counters: + _counters[name] = dict() + + elems = torch.as_tensor(value) + if elems.numel() == 0: + return value + + elems = elems.detach().flatten().to(_reduce_dtype) + moments = torch.stack([ + torch.ones_like(elems).sum(), + elems.sum(), + elems.square().sum(), + ]) + assert moments.ndim == 1 and moments.shape[0] == _num_moments + moments = moments.to(_counter_dtype) + + device = moments.device + if device not in _counters[name]: + _counters[name][device] = torch.zeros_like(moments) + _counters[name][device].add_(moments) + return value + +#---------------------------------------------------------------------------- + +def report0(name, value): + r"""Broadcasts the given set of scalars by the first process (`rank = 0`), + but ignores any scalars provided by the other processes. + See `report()` for further details. + """ + report(name, value if _rank == 0 else []) + return value + +#---------------------------------------------------------------------------- + +class Collector: + r"""Collects the scalars broadcasted by `report()` and `report0()` and + computes their long-term averages (mean and standard deviation) over + user-defined periods of time. + + The averages are first collected into internal counters that are not + directly visible to the user. They are then copied to the user-visible + state as a result of calling `update()` and can then be queried using + `mean()`, `std()`, `as_dict()`, etc. Calling `update()` also resets the + internal counters for the next round, so that the user-visible state + effectively reflects averages collected between the last two calls to + `update()`. + + Args: + regex: Regular expression defining which statistics to + collect. The default is to collect everything. + keep_previous: Whether to retain the previous averages if no + scalars were collected on a given round + (default: True). 
+ """ + def __init__(self, regex='.*', keep_previous=True): + self._regex = re.compile(regex) + self._keep_previous = keep_previous + self._cumulative = dict() + self._moments = dict() + self.update() + self._moments.clear() + + def names(self): + r"""Returns the names of all statistics broadcasted so far that + match the regular expression specified at construction time. + """ + return [name for name in _counters if self._regex.fullmatch(name)] + + def update(self): + r"""Copies current values of the internal counters to the + user-visible state and resets them for the next round. + + If `keep_previous=True` was specified at construction time, the + operation is skipped for statistics that have received no scalars + since the last update, retaining their previous averages. + + This method performs a number of GPU-to-CPU transfers and one + `torch.distributed.all_reduce()`. It is intended to be called + periodically in the main training loop, typically once every + N training steps. + """ + if not self._keep_previous: + self._moments.clear() + for name, cumulative in _sync(self.names()): + if name not in self._cumulative: + self._cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + delta = cumulative - self._cumulative[name] + self._cumulative[name].copy_(cumulative) + if float(delta[0]) != 0: + self._moments[name] = delta + + def _get_delta(self, name): + r"""Returns the raw moments that were accumulated for the given + statistic between the last two calls to `update()`, or zero if + no scalars were collected. + """ + assert self._regex.fullmatch(name) + if name not in self._moments: + self._moments[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + return self._moments[name] + + def num(self, name): + r"""Returns the number of scalars that were accumulated for the given + statistic between the last two calls to `update()`, or zero if + no scalars were collected. 
+ """ + delta = self._get_delta(name) + return int(delta[0]) + + def mean(self, name): + r"""Returns the mean of the scalars that were accumulated for the + given statistic between the last two calls to `update()`, or NaN if + no scalars were collected. + """ + delta = self._get_delta(name) + if int(delta[0]) == 0: + return float('nan') + return float(delta[1] / delta[0]) + + def std(self, name): + r"""Returns the standard deviation of the scalars that were + accumulated for the given statistic between the last two calls to + `update()`, or NaN if no scalars were collected. + """ + delta = self._get_delta(name) + if int(delta[0]) == 0 or not np.isfinite(float(delta[1])): + return float('nan') + if int(delta[0]) == 1: + return float(0) + mean = float(delta[1] / delta[0]) + raw_var = float(delta[2] / delta[0]) + return np.sqrt(max(raw_var - np.square(mean), 0)) + + def as_dict(self): + r"""Returns the averages accumulated between the last two calls to + `update()` as an `dnnlib.EasyDict`. The contents are as follows: + + dnnlib.EasyDict( + NAME = dnnlib.EasyDict(num=FLOAT, mean=FLOAT, std=FLOAT), + ... + ) + """ + stats = dnnlib.EasyDict() + for name in self.names(): + stats[name] = dnnlib.EasyDict(num=self.num(name), mean=self.mean(name), std=self.std(name)) + return stats + + def __getitem__(self, name): + r"""Convenience getter. + `collector[name]` is a synonym for `collector.mean(name)`. + """ + return self.mean(name) + +#---------------------------------------------------------------------------- + +def _sync(names): + r"""Synchronize the global cumulative counters across devices and + processes. Called internally by `Collector.update()`. + """ + if len(names) == 0: + return [] + global _sync_called + _sync_called = True + + # Collect deltas within current rank. 
+ deltas = [] + device = _sync_device if _sync_device is not None else torch.device('cpu') + for name in names: + delta = torch.zeros([_num_moments], dtype=_counter_dtype, device=device) + for counter in _counters[name].values(): + delta.add_(counter.to(device)) + counter.copy_(torch.zeros_like(counter)) + deltas.append(delta) + deltas = torch.stack(deltas) + + # Sum deltas across ranks. + if _sync_device is not None: + torch.distributed.all_reduce(deltas) + + # Update cumulative values. + deltas = deltas.cpu() + for idx, name in enumerate(names): + if name not in _cumulative: + _cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + _cumulative[name].add_(deltas[idx]) + + # Return name-value pairs. + return [(name, _cumulative[name]) for name in names] + +#---------------------------------------------------------------------------- diff --git a/src/training/__init__.py b/src/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1e1a5ba99e56a56ecaa14f7d4fa41777789c0cf --- /dev/null +++ b/src/training/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +# empty diff --git a/src/training/augment.py b/src/training/augment.py new file mode 100644 index 0000000000000000000000000000000000000000..2bd72d5f0a26cbb28e4e1b79b5236533224be712 --- /dev/null +++ b/src/training/augment.py @@ -0,0 +1,436 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import scipy.signal +import torch +from src.torch_utils import persistence +from src.torch_utils import misc +from src.torch_utils.ops import upfirdn2d +from src.torch_utils.ops import grid_sample_gradfix +from src.torch_utils.ops import conv2d_gradfix + +#---------------------------------------------------------------------------- +# Coefficients of various wavelet decomposition low-pass filters. + +wavelets = { + 'haar': [0.7071067811865476, 0.7071067811865476], + 'db1': [0.7071067811865476, 0.7071067811865476], + 'db2': [-0.12940952255092145, 0.22414386804185735, 0.836516303737469, 0.48296291314469025], + 'db3': [0.035226291882100656, -0.08544127388224149, -0.13501102001039084, 0.4598775021193313, 0.8068915093133388, 0.3326705529509569], + 'db4': [-0.010597401784997278, 0.032883011666982945, 0.030841381835986965, -0.18703481171888114, -0.02798376941698385, 0.6308807679295904, 0.7148465705525415, 0.23037781330885523], + 'db5': [0.003335725285001549, -0.012580751999015526, -0.006241490213011705, 0.07757149384006515, -0.03224486958502952, -0.24229488706619015, 0.13842814590110342, 0.7243085284385744, 0.6038292697974729, 0.160102397974125], + 'db6': [-0.00107730108499558, 0.004777257511010651, 0.0005538422009938016, -0.031582039318031156, 0.02752286553001629, 0.09750160558707936, -0.12976686756709563, -0.22626469396516913, 0.3152503517092432, 0.7511339080215775, 0.4946238903983854, 0.11154074335008017], + 'db7': [0.0003537138000010399, -0.0018016407039998328, 0.00042957797300470274, 0.012550998556013784, -0.01657454163101562, -0.03802993693503463, 0.0806126091510659, 
0.07130921926705004, -0.22403618499416572, -0.14390600392910627, 0.4697822874053586, 0.7291320908465551, 0.39653931948230575, 0.07785205408506236], + 'db8': [-0.00011747678400228192, 0.0006754494059985568, -0.0003917403729959771, -0.00487035299301066, 0.008746094047015655, 0.013981027917015516, -0.04408825393106472, -0.01736930100202211, 0.128747426620186, 0.00047248457399797254, -0.2840155429624281, -0.015829105256023893, 0.5853546836548691, 0.6756307362980128, 0.3128715909144659, 0.05441584224308161], + 'sym2': [-0.12940952255092145, 0.22414386804185735, 0.836516303737469, 0.48296291314469025], + 'sym3': [0.035226291882100656, -0.08544127388224149, -0.13501102001039084, 0.4598775021193313, 0.8068915093133388, 0.3326705529509569], + 'sym4': [-0.07576571478927333, -0.02963552764599851, 0.49761866763201545, 0.8037387518059161, 0.29785779560527736, -0.09921954357684722, -0.012603967262037833, 0.0322231006040427], + 'sym5': [0.027333068345077982, 0.029519490925774643, -0.039134249302383094, 0.1993975339773936, 0.7234076904024206, 0.6339789634582119, 0.01660210576452232, -0.17532808990845047, -0.021101834024758855, 0.019538882735286728], + 'sym6': [0.015404109327027373, 0.0034907120842174702, -0.11799011114819057, -0.048311742585633, 0.4910559419267466, 0.787641141030194, 0.3379294217276218, -0.07263752278646252, -0.021060292512300564, 0.04472490177066578, 0.0017677118642428036, -0.007800708325034148], + 'sym7': [0.002681814568257878, -0.0010473848886829163, -0.01263630340325193, 0.03051551316596357, 0.0678926935013727, -0.049552834937127255, 0.017441255086855827, 0.5361019170917628, 0.767764317003164, 0.2886296317515146, -0.14004724044296152, -0.10780823770381774, 0.004010244871533663, 0.010268176708511255], + 'sym8': [-0.0033824159510061256, -0.0005421323317911481, 0.03169508781149298, 0.007607487324917605, -0.1432942383508097, -0.061273359067658524, 0.4813596512583722, 0.7771857517005235, 0.3644418948353314, -0.05194583810770904, -0.027219029917056003, 
0.049137179673607506, 0.003808752013890615, -0.01495225833704823, -0.0003029205147213668, 0.0018899503327594609], +} + +#---------------------------------------------------------------------------- +# Helpers for constructing transformation matrices. + +def matrix(*rows, device=None): + assert all(len(row) == len(rows[0]) for row in rows) + elems = [x for row in rows for x in row] + ref = [x for x in elems if isinstance(x, torch.Tensor)] + if len(ref) == 0: + return misc.constant(np.asarray(rows), device=device) + assert device is None or device == ref[0].device + elems = [x if isinstance(x, torch.Tensor) else misc.constant(x, shape=ref[0].shape, device=ref[0].device) for x in elems] + return torch.stack(elems, dim=-1).reshape(ref[0].shape + (len(rows), -1)) + +def translate2d(tx, ty, **kwargs): + return matrix( + [1, 0, tx], + [0, 1, ty], + [0, 0, 1], + **kwargs) + +def translate3d(tx, ty, tz, **kwargs): + return matrix( + [1, 0, 0, tx], + [0, 1, 0, ty], + [0, 0, 1, tz], + [0, 0, 0, 1], + **kwargs) + +def scale2d(sx, sy, **kwargs): + return matrix( + [sx, 0, 0], + [0, sy, 0], + [0, 0, 1], + **kwargs) + +def scale3d(sx, sy, sz, **kwargs): + return matrix( + [sx, 0, 0, 0], + [0, sy, 0, 0], + [0, 0, sz, 0], + [0, 0, 0, 1], + **kwargs) + +def rotate2d(theta, **kwargs): + return matrix( + [torch.cos(theta), torch.sin(-theta), 0], + [torch.sin(theta), torch.cos(theta), 0], + [0, 0, 1], + **kwargs) + +def rotate3d(v, theta, **kwargs): + vx = v[..., 0]; vy = v[..., 1]; vz = v[..., 2] + s = torch.sin(theta); c = torch.cos(theta); cc = 1 - c + return matrix( + [vx*vx*cc+c, vx*vy*cc-vz*s, vx*vz*cc+vy*s, 0], + [vy*vx*cc+vz*s, vy*vy*cc+c, vy*vz*cc-vx*s, 0], + [vz*vx*cc-vy*s, vz*vy*cc+vx*s, vz*vz*cc+c, 0], + [0, 0, 0, 1], + **kwargs) + +def translate2d_inv(tx, ty, **kwargs): + return translate2d(-tx, -ty, **kwargs) + +def scale2d_inv(sx, sy, **kwargs): + return scale2d(1 / sx, 1 / sy, **kwargs) + +def rotate2d_inv(theta, **kwargs): + return rotate2d(-theta, **kwargs) + 
+#---------------------------------------------------------------------------- +# Versatile image augmentation pipeline from the paper +# "Training Generative Adversarial Networks with Limited Data". +# +# All augmentations are disabled by default; individual augmentations can +# be enabled by setting their probability multipliers to 1. + +@persistence.persistent_class +class AugmentPipe(torch.nn.Module): + def __init__(self, + xflip=0, rotate90=0, xint=0, xint_max=0.125, + scale=0, rotate=0, aniso=0, xfrac=0, scale_std=0.2, rotate_max=1, aniso_std=0.2, xfrac_std=0.125, + brightness=0, contrast=0, lumaflip=0, hue=0, saturation=0, brightness_std=0.2, contrast_std=0.5, hue_max=1, saturation_std=1, + imgfilter=0, imgfilter_bands=[1,1,1,1], imgfilter_std=1, + noise=0, cutout=0, noise_std=0.1, cutout_size=0.5, + ): + super().__init__() + self.register_buffer('p', torch.ones([])) # Overall multiplier for augmentation probability. + + # Pixel blitting. + self.xflip = float(xflip) # Probability multiplier for x-flip. + self.rotate90 = float(rotate90) # Probability multiplier for 90 degree rotations. + self.xint = float(xint) # Probability multiplier for integer translation. + self.xint_max = float(xint_max) # Range of integer translation, relative to image dimensions. + + # General geometric transformations. + self.scale = float(scale) # Probability multiplier for isotropic scaling. + self.rotate = float(rotate) # Probability multiplier for arbitrary rotation. + self.aniso = float(aniso) # Probability multiplier for anisotropic scaling. + self.xfrac = float(xfrac) # Probability multiplier for fractional translation. + self.scale_std = float(scale_std) # Log2 standard deviation of isotropic scaling. + self.rotate_max = float(rotate_max) # Range of arbitrary rotation, 1 = full circle. + self.aniso_std = float(aniso_std) # Log2 standard deviation of anisotropic scaling. 
+ self.xfrac_std = float(xfrac_std) # Standard deviation of frational translation, relative to image dimensions. + + # Color transformations. + self.brightness = float(brightness) # Probability multiplier for brightness. + self.contrast = float(contrast) # Probability multiplier for contrast. + self.lumaflip = float(lumaflip) # Probability multiplier for luma flip. + self.hue = float(hue) # Probability multiplier for hue rotation. + self.saturation = float(saturation) # Probability multiplier for saturation. + self.brightness_std = float(brightness_std) # Standard deviation of brightness. + self.contrast_std = float(contrast_std) # Log2 standard deviation of contrast. + self.hue_max = float(hue_max) # Range of hue rotation, 1 = full circle. + self.saturation_std = float(saturation_std) # Log2 standard deviation of saturation. + + # Image-space filtering. + self.imgfilter = float(imgfilter) # Probability multiplier for image-space filtering. + self.imgfilter_bands = list(imgfilter_bands) # Probability multipliers for individual frequency bands. + self.imgfilter_std = float(imgfilter_std) # Log2 standard deviation of image-space filter amplification. + + # Image-space corruptions. + self.noise = float(noise) # Probability multiplier for additive RGB noise. + self.cutout = float(cutout) # Probability multiplier for cutout. + self.noise_std = float(noise_std) # Standard deviation of additive RGB noise. + self.cutout_size = float(cutout_size) # Size of the cutout rectangle, relative to image dimensions. + + # Setup orthogonal lowpass filter for geometric augmentations. + self.register_buffer('Hz_geom', upfirdn2d.setup_filter(wavelets['sym6'])) + + # Construct filter bank for image-space filtering. 
+ Hz_lo = np.asarray(wavelets['sym2']) # H(z) + Hz_hi = Hz_lo * ((-1) ** np.arange(Hz_lo.size)) # H(-z) + Hz_lo2 = np.convolve(Hz_lo, Hz_lo[::-1]) / 2 # H(z) * H(z^-1) / 2 + Hz_hi2 = np.convolve(Hz_hi, Hz_hi[::-1]) / 2 # H(-z) * H(-z^-1) / 2 + Hz_fbank = np.eye(4, 1) # Bandpass(H(z), b_i) + for i in range(1, Hz_fbank.shape[0]): + Hz_fbank = np.dstack([Hz_fbank, np.zeros_like(Hz_fbank)]).reshape(Hz_fbank.shape[0], -1)[:, :-1] + Hz_fbank = scipy.signal.convolve(Hz_fbank, [Hz_lo2]) + Hz_fbank[i, (Hz_fbank.shape[1] - Hz_hi2.size) // 2 : (Hz_fbank.shape[1] + Hz_hi2.size) // 2] += Hz_hi2 + self.register_buffer('Hz_fbank', torch.as_tensor(Hz_fbank, dtype=torch.float32)) + + def forward(self, images, debug_percentile=None): + assert isinstance(images, torch.Tensor) and images.ndim == 4 + batch_size, num_channels, height, width = images.shape + device = images.device + if debug_percentile is not None: + debug_percentile = torch.as_tensor(debug_percentile, dtype=torch.float32, device=device) + + # ------------------------------------- + # Select parameters for pixel blitting. + # ------------------------------------- + + # Initialize inverse homogeneous 2D transform: G_inv @ pixel_out ==> pixel_in + I_3 = torch.eye(3, device=device) + G_inv = I_3 + + # Apply x-flip with probability (xflip * strength). + if self.xflip > 0: + i = torch.floor(torch.rand([batch_size], device=device) * 2) + i = torch.where(torch.rand([batch_size], device=device) < self.xflip * self.p, i, torch.zeros_like(i)) + if debug_percentile is not None: + i = torch.full_like(i, torch.floor(debug_percentile * 2)) + G_inv = G_inv @ scale2d_inv(1 - 2 * i, 1) + + # Apply 90 degree rotations with probability (rotate90 * strength). 
+ if self.rotate90 > 0: + i = torch.floor(torch.rand([batch_size], device=device) * 4) + i = torch.where(torch.rand([batch_size], device=device) < self.rotate90 * self.p, i, torch.zeros_like(i)) + if debug_percentile is not None: + i = torch.full_like(i, torch.floor(debug_percentile * 4)) + G_inv = G_inv @ rotate2d_inv(-np.pi / 2 * i) + + # Apply integer translation with probability (xint * strength). + if self.xint > 0: + t = (torch.rand([batch_size, 2], device=device) * 2 - 1) * self.xint_max + t = torch.where(torch.rand([batch_size, 1], device=device) < self.xint * self.p, t, torch.zeros_like(t)) + if debug_percentile is not None: + t = torch.full_like(t, (debug_percentile * 2 - 1) * self.xint_max) + G_inv = G_inv @ translate2d_inv(torch.round(t[:,0] * width), torch.round(t[:,1] * height)) + + # -------------------------------------------------------- + # Select parameters for general geometric transformations. + # -------------------------------------------------------- + + # Apply isotropic scaling with probability (scale * strength). + if self.scale > 0: + s = torch.exp2(torch.randn([batch_size], device=device) * self.scale_std) + s = torch.where(torch.rand([batch_size], device=device) < self.scale * self.p, s, torch.ones_like(s)) + if debug_percentile is not None: + s = torch.full_like(s, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.scale_std)) + G_inv = G_inv @ scale2d_inv(s, s) + + # Apply pre-rotation with probability p_rot. + p_rot = 1 - torch.sqrt((1 - self.rotate * self.p).clamp(0, 1)) # P(pre OR post) = p + if self.rotate > 0: + theta = (torch.rand([batch_size], device=device) * 2 - 1) * np.pi * self.rotate_max + theta = torch.where(torch.rand([batch_size], device=device) < p_rot, theta, torch.zeros_like(theta)) + if debug_percentile is not None: + theta = torch.full_like(theta, (debug_percentile * 2 - 1) * np.pi * self.rotate_max) + G_inv = G_inv @ rotate2d_inv(-theta) # Before anisotropic scaling. 
+ + # Apply anisotropic scaling with probability (aniso * strength). + if self.aniso > 0: + s = torch.exp2(torch.randn([batch_size], device=device) * self.aniso_std) + s = torch.where(torch.rand([batch_size], device=device) < self.aniso * self.p, s, torch.ones_like(s)) + if debug_percentile is not None: + s = torch.full_like(s, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.aniso_std)) + G_inv = G_inv @ scale2d_inv(s, 1 / s) + + # Apply post-rotation with probability p_rot. + if self.rotate > 0: + theta = (torch.rand([batch_size], device=device) * 2 - 1) * np.pi * self.rotate_max + theta = torch.where(torch.rand([batch_size], device=device) < p_rot, theta, torch.zeros_like(theta)) + if debug_percentile is not None: + theta = torch.zeros_like(theta) + G_inv = G_inv @ rotate2d_inv(-theta) # After anisotropic scaling. + + # Apply fractional translation with probability (xfrac * strength). + if self.xfrac > 0: + t = torch.randn([batch_size, 2], device=device) * self.xfrac_std + t = torch.where(torch.rand([batch_size, 1], device=device) < self.xfrac * self.p, t, torch.zeros_like(t)) + if debug_percentile is not None: + t = torch.full_like(t, torch.erfinv(debug_percentile * 2 - 1) * self.xfrac_std) + G_inv = G_inv @ translate2d_inv(t[:,0] * width, t[:,1] * height) + + # ---------------------------------- + # Execute geometric transformations. + # ---------------------------------- + + # Execute if the transform is not identity. + if G_inv is not I_3: + # Calculate padding. 
+ cx = (width - 1) / 2 + cy = (height - 1) / 2 + cp = matrix([-cx, -cy, 1], [cx, -cy, 1], [cx, cy, 1], [-cx, cy, 1], device=device) # [idx, xyz] + cp = G_inv @ cp.t() # [batch, xyz, idx] + Hz_pad = self.Hz_geom.shape[0] // 4 + margin = cp[:, :2, :].permute(1, 0, 2).flatten(1) # [xy, batch * idx] + margin = torch.cat([-margin, margin]).max(dim=1).values # [x0, y0, x1, y1] + margin = margin + misc.constant([Hz_pad * 2 - cx, Hz_pad * 2 - cy] * 2, device=device) + margin = margin.max(misc.constant([0, 0] * 2, device=device)) + margin = margin.min(misc.constant([width-1, height-1] * 2, device=device)) + mx0, my0, mx1, my1 = margin.ceil().to(torch.int32) + + # Pad image and adjust origin. + images = torch.nn.functional.pad(input=images, pad=[mx0,mx1,my0,my1], mode='reflect') + G_inv = translate2d((mx0 - mx1) / 2, (my0 - my1) / 2) @ G_inv + + # Upsample. + images = upfirdn2d.upsample2d(x=images, f=self.Hz_geom, up=2) + G_inv = scale2d(2, 2, device=device) @ G_inv @ scale2d_inv(2, 2, device=device) + G_inv = translate2d(-0.5, -0.5, device=device) @ G_inv @ translate2d_inv(-0.5, -0.5, device=device) + + # Execute transformation. + shape = [batch_size, num_channels, (height + Hz_pad * 2) * 2, (width + Hz_pad * 2) * 2] + G_inv = scale2d(2 / images.shape[3], 2 / images.shape[2], device=device) @ G_inv @ scale2d_inv(2 / shape[3], 2 / shape[2], device=device) + grid = torch.nn.functional.affine_grid(theta=G_inv[:,:2,:], size=shape, align_corners=False) + images = grid_sample_gradfix.grid_sample(images, grid) + + # Downsample and crop. + images = upfirdn2d.downsample2d(x=images, f=self.Hz_geom, down=2, padding=-Hz_pad*2, flip_filter=True) + + # -------------------------------------------- + # Select parameters for color transformations. + # -------------------------------------------- + + # Initialize homogeneous 3D transformation matrix: C @ color_in ==> color_out + I_4 = torch.eye(4, device=device) + C = I_4 + + # Apply brightness with probability (brightness * strength). 
+ if self.brightness > 0: + b = torch.randn([batch_size], device=device) * self.brightness_std + b = torch.where(torch.rand([batch_size], device=device) < self.brightness * self.p, b, torch.zeros_like(b)) + if debug_percentile is not None: + b = torch.full_like(b, torch.erfinv(debug_percentile * 2 - 1) * self.brightness_std) + C = translate3d(b, b, b) @ C + + # Apply contrast with probability (contrast * strength). + if self.contrast > 0: + c = torch.exp2(torch.randn([batch_size], device=device) * self.contrast_std) + c = torch.where(torch.rand([batch_size], device=device) < self.contrast * self.p, c, torch.ones_like(c)) + if debug_percentile is not None: + c = torch.full_like(c, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.contrast_std)) + C = scale3d(c, c, c) @ C + + # Apply luma flip with probability (lumaflip * strength). + v = misc.constant(np.asarray([1, 1, 1, 0]) / np.sqrt(3), device=device) # Luma axis. + if self.lumaflip > 0: + i = torch.floor(torch.rand([batch_size, 1, 1], device=device) * 2) + i = torch.where(torch.rand([batch_size, 1, 1], device=device) < self.lumaflip * self.p, i, torch.zeros_like(i)) + if debug_percentile is not None: + i = torch.full_like(i, torch.floor(debug_percentile * 2)) + C = (I_4 - 2 * v.ger(v) * i) @ C # Householder reflection. + + # Apply hue rotation with probability (hue * strength). + if self.hue > 0 and num_channels > 1: + theta = (torch.rand([batch_size], device=device) * 2 - 1) * np.pi * self.hue_max + theta = torch.where(torch.rand([batch_size], device=device) < self.hue * self.p, theta, torch.zeros_like(theta)) + if debug_percentile is not None: + theta = torch.full_like(theta, (debug_percentile * 2 - 1) * np.pi * self.hue_max) + C = rotate3d(v, theta) @ C # Rotate around v. + + # Apply saturation with probability (saturation * strength). 
+ if self.saturation > 0 and num_channels > 1: + s = torch.exp2(torch.randn([batch_size, 1, 1], device=device) * self.saturation_std) + s = torch.where(torch.rand([batch_size, 1, 1], device=device) < self.saturation * self.p, s, torch.ones_like(s)) + if debug_percentile is not None: + s = torch.full_like(s, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.saturation_std)) + C = (v.ger(v) + (I_4 - v.ger(v)) * s) @ C + + # ------------------------------ + # Execute color transformations. + # ------------------------------ + + # Execute if the transform is not identity. + if C is not I_4: + images = images.reshape([batch_size, num_channels, height * width]) + + if num_channels > 3 and num_channels % 3 == 0: + num_frames = num_channels // 3 + images = images.reshape([batch_size * num_frames, 3, height * width]) + C = C.repeat_interleave(num_frames, dim=0) + + if num_channels % 3 == 0: + images = C[:, :3, :3] @ images + C[:, :3, 3:] + elif num_channels == 1: + C = C[:, :3, :].mean(dim=1, keepdims=True) + images = images * C[:, :, :3].sum(dim=2, keepdims=True) + C[:, :, 3:] + else: + raise ValueError('Image must be RGB (3 channels) or L (1 channel)') + images = images.reshape([batch_size, num_channels, height, width]) + + # ---------------------- + # Image-space filtering. + # ---------------------- + + if self.imgfilter > 0: + num_bands = self.Hz_fbank.shape[0] + assert len(self.imgfilter_bands) == num_bands + expected_power = misc.constant(np.array([10, 1, 1, 1]) / 13, device=device) # Expected power spectrum (1/f). + + # Apply amplification for each band with probability (imgfilter * strength * band_strength). + g = torch.ones([batch_size, num_bands], device=device) # Global gain vector (identity). 
+ for i, band_strength in enumerate(self.imgfilter_bands): + t_i = torch.exp2(torch.randn([batch_size], device=device) * self.imgfilter_std) + t_i = torch.where(torch.rand([batch_size], device=device) < self.imgfilter * self.p * band_strength, t_i, torch.ones_like(t_i)) + if debug_percentile is not None: + t_i = torch.full_like(t_i, torch.exp2(torch.erfinv(debug_percentile * 2 - 1) * self.imgfilter_std)) if band_strength > 0 else torch.ones_like(t_i) + t = torch.ones([batch_size, num_bands], device=device) # Temporary gain vector. + t[:, i] = t_i # Replace i'th element. + t = t / (expected_power * t.square()).sum(dim=-1, keepdims=True).sqrt() # Normalize power. + g = g * t # Accumulate into global gain. + + # Construct combined amplification filter. + Hz_prime = g @ self.Hz_fbank # [batch, tap] + Hz_prime = Hz_prime.unsqueeze(1).repeat([1, num_channels, 1]) # [batch, channels, tap] + Hz_prime = Hz_prime.reshape([batch_size * num_channels, 1, -1]) # [batch * channels, 1, tap] + + # Apply filter. + p = self.Hz_fbank.shape[1] // 2 + images = images.reshape([1, batch_size * num_channels, height, width]) + images = torch.nn.functional.pad(input=images, pad=[p,p,p,p], mode='reflect') + images = conv2d_gradfix.conv2d(input=images, weight=Hz_prime.unsqueeze(2), groups=batch_size*num_channels) + images = conv2d_gradfix.conv2d(input=images, weight=Hz_prime.unsqueeze(3), groups=batch_size*num_channels) + images = images.reshape([batch_size, num_channels, height, width]) + + # ------------------------ + # Image-space corruptions. + # ------------------------ + + # Apply additive RGB noise with probability (noise * strength). 
+ if self.noise > 0: + sigma = torch.randn([batch_size, 1, 1, 1], device=device).abs() * self.noise_std + sigma = torch.where(torch.rand([batch_size, 1, 1, 1], device=device) < self.noise * self.p, sigma, torch.zeros_like(sigma)) + if debug_percentile is not None: + sigma = torch.full_like(sigma, torch.erfinv(debug_percentile) * self.noise_std) + images = images + torch.randn([batch_size, num_channels, height, width], device=device) * sigma + + # Apply cutout with probability (cutout * strength). + if self.cutout > 0: + size = torch.full([batch_size, 2, 1, 1, 1], self.cutout_size, device=device) + size = torch.where(torch.rand([batch_size, 1, 1, 1, 1], device=device) < self.cutout * self.p, size, torch.zeros_like(size)) + center = torch.rand([batch_size, 2, 1, 1, 1], device=device) + if debug_percentile is not None: + size = torch.full_like(size, self.cutout_size) + center = torch.full_like(center, debug_percentile) + coord_x = torch.arange(width, device=device).reshape([1, 1, 1, -1]) + coord_y = torch.arange(height, device=device).reshape([1, 1, -1, 1]) + mask_x = (((coord_x + 0.5) / width - center[:, 0]).abs() >= size[:, 0] / 2) + mask_y = (((coord_y + 0.5) / height - center[:, 1]).abs() >= size[:, 1] / 2) + mask = torch.logical_or(mask_x, mask_y).to(torch.float32) + images = images * mask + + return images + +#---------------------------------------------------------------------------- diff --git a/src/training/dataset.py b/src/training/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..507de81557ffe1125cbb806bcef3b242728bbcba --- /dev/null +++ b/src/training/dataset.py @@ -0,0 +1,496 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import os +import copy +from typing import List, Dict +import zipfile +import json +import random +from typing import Tuple + +import numpy as np +import PIL.Image +import torch +from src import dnnlib +from omegaconf import DictConfig, OmegaConf + +from src.training.layers import sample_frames + +try: + import pyspng +except ImportError: + pyspng = None + +#---------------------------------------------------------------------------- + +NUMPY_INTEGER_TYPES = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64] +NUMPY_FLOAT_TYPES = [np.float16, np.float32, np.float64, np.single, np.double] + +#---------------------------------------------------------------------------- + +class Dataset(torch.utils.data.Dataset): + def __init__(self, + name, # Name of the dataset. + raw_shape, # Shape of the raw image data (NCHW). + max_size = None, # Artificially limit the size of the dataset. None = no limit. Applied before xflip. + use_labels = False, # Enable conditioning labels? False = label dimension is zero. + xflip = False, # Artificially double the size of the dataset via x-flips. Applied after max_size. + random_seed = 0, # Random seed to use when applying max_size. + ): + self._name = name + self._raw_shape = list(raw_shape) + self._use_labels = use_labels + self._raw_labels = None + self._label_shape = None + + # Apply max_size. + self._raw_idx = np.arange(self._raw_shape[0], dtype=np.int64) + if (max_size is not None) and (self._raw_idx.size > max_size): + np.random.RandomState(random_seed).shuffle(self._raw_idx) + self._raw_idx = np.sort(self._raw_idx[:max_size]) + + # Apply xflip. 
+ self._xflip = np.zeros(self._raw_idx.size, dtype=np.uint8) + if xflip: + self._raw_idx = np.tile(self._raw_idx, 2) + self._xflip = np.concatenate([self._xflip, np.ones_like(self._xflip)]) + + @staticmethod + def _file_ext(fname): + return os.path.splitext(fname)[1].lower() + + def _get_raw_labels(self): + if self._raw_labels is None: + self._raw_labels = self._load_raw_labels() if self._use_labels else None + if self._raw_labels is None: + self._raw_labels = np.zeros([self._raw_shape[0], 0], dtype=np.float32) + assert isinstance(self._raw_labels, np.ndarray) + assert self._raw_labels.shape[0] == self._raw_shape[0] + assert self._raw_labels.dtype in [np.float32, np.int64] + if self._raw_labels.dtype == np.int64: + assert np.all(self._raw_labels >= 0) + return self._raw_labels + + def close(self): # to be overridden by subclass + pass + + def _load_raw_image(self, raw_idx): # to be overridden by subclass + raise NotImplementedError + + def _load_raw_labels(self): # to be overridden by subclass + raise NotImplementedError + + def __getstate__(self): + return dict(self.__dict__, _raw_labels=None) + + def __del__(self): + try: + self.close() + except: + pass + + def __len__(self): + return self._raw_idx.size + + def __getitem__(self, idx): + image = self._load_raw_image(self._raw_idx[idx]) + assert isinstance(image, np.ndarray) + assert list(image.shape) == self.image_shape + assert image.dtype == np.uint8 + if self._xflip[idx]: + assert image.ndim == 3 # CHW + image = image[:, :, ::-1] + + return { + 'image': image.copy(), + 'label': self.get_label(idx), + } + + def get_label(self, idx): + label = self._get_raw_labels()[self._raw_idx[idx]] + if label.dtype == np.int64: + onehot = np.zeros(self.label_shape, dtype=np.float32) + onehot[label] = 1 + label = onehot + return label.copy() + + def get_details(self, idx): + d = dnnlib.EasyDict() + d.raw_idx = int(self._raw_idx[idx]) + d.xflip = (int(self._xflip[idx]) != 0) + d.raw_label = 
self._get_raw_labels()[d.raw_idx].copy() + return d + + @property + def name(self): + return self._name + + @property + def image_shape(self): + return list(self._raw_shape[1:]) + + @property + def num_channels(self): + assert len(self.image_shape) == 3 # CHW + return self.image_shape[0] + + @property + def resolution(self): + assert len(self.image_shape) == 3 # CHW + assert self.image_shape[1] == self.image_shape[2] + return self.image_shape[1] + + @property + def label_shape(self): + if self._label_shape is None: + raw_labels = self._get_raw_labels() + if raw_labels.dtype == np.int64: + self._label_shape = [int(np.max(raw_labels)) + 1] + else: + self._label_shape = raw_labels.shape[1:] + return list(self._label_shape) + + @property + def label_dim(self): + assert len(self.label_shape) == 1, f"Labels must be 1-dimensional: {self.label_shape} to use `.label_dim`" + return self.label_shape[0] + + @property + def has_labels(self): + return any(x != 0 for x in self.label_shape) + + @property + def has_onehot_labels(self): + return self._get_raw_labels().dtype == np.int64 + +#---------------------------------------------------------------------------- + +class ImageFolderDataset(Dataset): + def __init__(self, + path, # Path to directory or zip. + resolution = None, # Ensure specific resolution, None = highest available. + **super_kwargs, # Additional arguments for the Dataset base class. 
+ ): + self._path = path + self._zipfile = None + + if os.path.isdir(self._path): + self._type = 'dir' + self._all_fnames = {os.path.relpath(os.path.join(root, fname), start=self._path) for root, _dirs, files in os.walk(self._path) for fname in files} + elif self._file_ext(self._path) == '.zip': + self._type = 'zip' + self._all_fnames = set(self._get_zipfile().namelist()) + else: + raise IOError('Path must point to a directory or zip') + + PIL.Image.init() + self._image_fnames = sorted(fname for fname in self._all_fnames if self._file_ext(fname) in PIL.Image.EXTENSION) + if len(self._image_fnames) == 0: + raise IOError('No image files found in the specified path') + + name = os.path.splitext(os.path.basename(self._path))[0] + raw_shape = [len(self._image_fnames)] + list(self._load_raw_image(0).shape) + if resolution is not None and (raw_shape[2] != resolution or raw_shape[3] != resolution): + raise IOError(f'Image files do not match the specified resolution. Resolution is {resolution}, shape is {raw_shape}') + super().__init__(name=name, raw_shape=raw_shape, **super_kwargs) + + def _get_zipfile(self): + assert self._type == 'zip' + if self._zipfile is None: + self._zipfile = zipfile.ZipFile(self._path) + return self._zipfile + + def _open_file(self, fname): + if self._type == 'dir': + return open(os.path.join(self._path, fname), 'rb') + if self._type == 'zip': + return self._get_zipfile().open(fname, 'r') + return None + + def close(self): + try: + if self._zipfile is not None: + self._zipfile.close() + finally: + self._zipfile = None + + def __getstate__(self): + return dict(super().__getstate__(), _zipfile=None) + + def _load_raw_image(self, raw_idx): + fname = self._image_fnames[raw_idx] + + with self._open_file(fname) as f: + use_pyspng = pyspng is not None and self._file_ext(fname) == '.png' + image = load_image_from_buffer(f, use_pyspng=use_pyspng) + + return image + + def _load_raw_labels(self): + fname = 'dataset.json' + labels_files = [f for f in 
self._all_fnames if f.endswith(fname)] + if len(labels_files) == 0: + return None + assert len(labels_files) == 1, f"There can be only a single {fname} file" + with self._open_file(labels_files[0]) as f: + labels = json.load(f)['labels'] + if labels is None: + return None + labels = dict(labels) + labels = [labels[remove_root(fname, self._name).replace('\\', '/')] for fname in self._image_fnames] + labels = np.array(labels) + + if labels.dtype in NUMPY_INTEGER_TYPES: + labels = labels.astype(np.int64) + elif labels.dtype in NUMPY_FLOAT_TYPES: + labels = labels.astype(np.float32) + else: + raise NotImplementedError(f"Unsupported label dtype: {labels.dtype}") + + return labels + +#---------------------------------------------------------------------------- + +class VideoFramesFolderDataset(Dataset): + def __init__(self, + path, # Path to directory or zip. + cfg: DictConfig, # Config + resolution=None, # Unused arg for backward compatibility + load_n_consecutive: int=None, # Should we load first N frames for each video? + load_n_consecutive_random_offset: bool=True, # Should we use a random offset when loading consecutive frames? + subsample_factor: int=1, # Sampling factor, i.e. decreasing the temporal resolution + discard_short_videos: bool=False, # Should we discard videos that are shorter than `load_n_consecutive`? + **super_kwargs, # Additional arguments for the Dataset base class. 
+ ): + self.sampling_dict = OmegaConf.to_container(OmegaConf.create({**cfg.sampling})) if 'sampling' in cfg else None + self.max_num_frames = cfg.max_num_frames + self._path = path + self._zipfile = None + self.load_n_consecutive = load_n_consecutive + self.load_n_consecutive_random_offset = load_n_consecutive_random_offset + self.subsample_factor = subsample_factor + self.discard_short_videos = discard_short_videos + + if self.subsample_factor > 1 and self.load_n_consecutive is None: + raise NotImplementedError("Can do subsampling only when loading consecutive frames.") + + listdir_full_paths = lambda d: sorted([os.path.join(d, x) for x in os.listdir(d)]) + name = os.path.splitext(os.path.basename(self._path))[0] + + if os.path.isdir(self._path): + self._type = 'dir' + # We assume that the depth is 2 + self._all_objects = {o for d in listdir_full_paths(self._path) for o in (([d] + listdir_full_paths(d)) if os.path.isdir(d) else [d])} + self._all_objects = {os.path.relpath(o, start=os.path.dirname(self._path)) for o in {self._path}.union(self._all_objects)} + elif self._file_ext(self._path) == '.zip': + self._type = 'zip' + self._all_objects = set(self._get_zipfile().namelist()) + else: + raise IOError('Path must be either a directory or point to a zip archive') + + PIL.Image.init() + self._video_dir2frames = {} + objects = sorted([d for d in self._all_objects]) + root_path_depth = len(os.path.normpath(objects[0]).split(os.path.sep)) + curr_d = objects[1] # Root path is the first element + + for o in objects[1:]: + curr_obj_depth = len(os.path.normpath(o).split(os.path.sep)) + + if self._file_ext(o) in PIL.Image.EXTENSION: + assert o.startswith(curr_d), f"Object {o} is out of sync. 
It should lie inside {curr_d}" + assert curr_obj_depth == root_path_depth + 2, "Frame images should be inside directories" + if not curr_d in self._video_dir2frames: + self._video_dir2frames[curr_d] = [] + self._video_dir2frames[curr_d].append(o) + elif self._file_ext(o) == 'json': + assert curr_obj_depth == root_path_depth + 1, "Classes info file should be inside the root dir" + pass + else: + # We encountered a new directory + assert curr_obj_depth == root_path_depth + 1, f"Video directories should be inside the root dir. {o} is not." + if curr_d in self._video_dir2frames: + sorted_files = sorted(self._video_dir2frames[curr_d]) + self._video_dir2frames[curr_d] = sorted_files + curr_d = o + + if self.discard_short_videos: + self._video_dir2frames = {d: fs for d, fs in self._video_dir2frames.items() if len(fs) >= self.load_n_consecutive * self.subsample_factor} + + self._video_idx2frames = [frames for frames in self._video_dir2frames.values()] + + if len(self._video_idx2frames) == 0: + raise IOError('No videos found in the specified archive') + + raw_shape = [len(self._video_idx2frames)] + list(self._load_raw_frames(0, [0])[0][0].shape) + + super().__init__(name=name, raw_shape=raw_shape, **super_kwargs) + + def _get_zipfile(self): + assert self._type == 'zip' + if self._zipfile is None: + self._zipfile = zipfile.ZipFile(self._path) + return self._zipfile + + def _open_file(self, fname): + if self._type == 'dir': + return open(os.path.join(os.path.dirname(self._path), fname), 'rb') + if self._type == 'zip': + return self._get_zipfile().open(fname, 'r') + return None + + def close(self): + try: + if self._zipfile is not None: + self._zipfile.close() + finally: + self._zipfile = None + + def __getstate__(self): + return dict(super().__getstate__(), _zipfile=None) + + def _load_raw_labels(self): + """ + We leave the `dataset.json` file in the same format as in the original SG2-ADA repo: + it's `labels` field is a hashmap of filename-label pairs. 
+ """ + fname = 'dataset.json' + labels_files = [f for f in self._all_objects if f.endswith(fname)] + if len(labels_files) == 0: + return None + assert len(labels_files) == 1, f"There can be only a single {fname} file" + with self._open_file(labels_files[0]) as f: + labels = json.load(f)['labels'] + if labels is None: + return None + + labels = dict(labels) + # The `dataset.json` file defines a label for each image and + # For the video dataset, this is both inconvenient and redundant. + # So let's redefine this + video_labels = {} + for filename, label in labels.items(): + dirname = os.path.dirname(filename) + if dirname in video_labels: + assert video_labels[dirname] == label + else: + video_labels[dirname] = label + labels = video_labels + labels = [labels[os.path.normpath(dname).split(os.path.sep)[-1]] for dname in self._video_dir2frames] + labels = np.array(labels) + + if labels.dtype in NUMPY_INTEGER_TYPES: + labels = labels.astype(np.int64) + elif labels.dtype in NUMPY_FLOAT_TYPES: + labels = labels.astype(np.float32) + else: + raise NotImplementedError(f"Unsupported label dtype: {labels.dtype}") + + return labels + + def __getitem__(self, idx: int) -> Dict: + if self.load_n_consecutive: + num_frames_available = len(self._video_idx2frames[self._raw_idx[idx]]) + assert num_frames_available - self.load_n_consecutive * self.subsample_factor >= 0, f"We have only {num_frames_available} frames available, cannot load {self.load_n_consecutive} frames." 
+ + if self.load_n_consecutive_random_offset: + random_offset = random.randint(0, num_frames_available - self.load_n_consecutive * self.subsample_factor + self.subsample_factor - 1) + else: + random_offset = 0 + frames_idx = np.arange(0, self.load_n_consecutive * self.subsample_factor, self.subsample_factor) + random_offset + else: + frames_idx = None + + frames, times = self._load_raw_frames(self._raw_idx[idx], frames_idx=frames_idx) + + assert isinstance(frames, np.ndarray) + assert list(frames[0].shape) == self.image_shape + assert frames.dtype == np.uint8 + assert len(frames) == len(times) + + if self._xflip[idx]: + assert frames.ndim == 4 # TCHW + frames = frames[:, :, :, ::-1] + + return { + 'image': frames.copy(), + 'label': self.get_label(idx), + 'times': times, + 'video_len': self.get_video_len(idx), + } + + def get_video_len(self, idx: int) -> int: + return min(self.max_num_frames, len(self._video_idx2frames[self._raw_idx[idx]])) + + def _load_raw_frames(self, raw_idx: int, frames_idx: List[int]=None) -> Tuple[np.ndarray, np.ndarray]: + frame_paths = self._video_idx2frames[raw_idx] + total_len = len(frame_paths) + offset = 0 + images = [] + + if frames_idx is None: + assert not self.sampling_dict is None, f"The dataset was created without `cfg.sampling` config and cannot sample frames on its own." 
+ if total_len > self.max_num_frames: + offset = random.randint(0, total_len - self.max_num_frames) + frames_idx = sample_frames(self.sampling_dict, total_video_len=min(total_len, self.max_num_frames)) + offset + else: + frames_idx = np.array(frames_idx) + + for frame_idx in frames_idx: + with self._open_file(frame_paths[frame_idx]) as f: + images.append(load_image_from_buffer(f)) + + return np.array(images), frames_idx - offset + + def compute_max_num_frames(self) -> int: + return max(len(frames) for frames in self._video_idx2frames) + +#---------------------------------------------------------------------------- + +def load_image_from_buffer(f, use_pyspng: bool=False) -> np.ndarray: + if use_pyspng: + image = pyspng.load(f.read()) + else: + image = np.array(PIL.Image.open(f)) + if image.ndim == 2: + image = image[:, :, np.newaxis] # HW => HWC + image = image.transpose(2, 0, 1) # HWC => CHW + + return image + +#---------------------------------------------------------------------------- + +def video_to_image_dataset_kwargs(video_dataset_kwargs: dnnlib.EasyDict) -> dnnlib.EasyDict: + """Converts video dataset kwargs to image dataset kwargs""" + return dnnlib.EasyDict( + class_name='training.dataset.ImageFolderDataset', + path=video_dataset_kwargs.path, + use_labels=video_dataset_kwargs.use_labels, + xflip=video_dataset_kwargs.xflip, + resolution=video_dataset_kwargs.resolution, + random_seed=video_dataset_kwargs.get('random_seed'), + # Explicitly ignoring the max size, since we are now interested + # in the number of images instead of the number of videos + # max_size=video_dataset_kwargs.max_size, + ) + +#---------------------------------------------------------------------------- + +def remove_root(fname: os.PathLike, root_name: os.PathLike): + """`root_name` should NOT start with '/'""" + if fname == root_name or fname == ('/' + root_name): + return '' + elif fname.startswith(root_name + '/'): + return fname[len(root_name) + 1:] + elif fname.startswith('/' + 
root_name + '/'): + return fname[len(root_name) + 2:] + else: + return fname + +#---------------------------------------------------------------------------- diff --git a/src/training/layers.py b/src/training/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..92052a9b11f8ac0706b0f2cbb16f4de30d254cd4 --- /dev/null +++ b/src/training/layers.py @@ -0,0 +1,448 @@ +import random +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from omegaconf import DictConfig + +from src.torch_utils import persistence +from src.torch_utils.ops import bias_act, upfirdn2d, conv2d_resample +from src.torch_utils import misc + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def normalize_2nd_moment(x, dim=1, eps=1e-8): + return x * (x.square().mean(dim=dim, keepdim=True) + eps).rsqrt() + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class MappingNetwork(torch.nn.Module): + def __init__(self, + z_dim, # Input latent (Z) dimensionality, 0 = no latent. + c_dim, # Conditioning label (C) dimensionality, 0 = no label. + w_dim, # Intermediate latent (W) dimensionality. + num_ws, # Number of intermediate latents to output, None = do not broadcast. + num_layers = 2, # Number of mapping layers. + embed_features = None, # Label embedding dimensionality, None = same as w_dim. + layer_features = None, # Number of intermediate features in the mapping layers, None = same as w_dim. + activation = 'lrelu', # Activation function: 'relu', 'lrelu', etc. + lr_multiplier = 0.01, # Learning rate multiplier for the mapping layers. + w_avg_beta = 0.995, # Decay for tracking the moving average of W during training, None = do not track. 
+ cfg = {}, # Additional config + ): + super().__init__() + + self.cfg = cfg + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.num_ws = num_ws + self.num_layers = num_layers + self.w_avg_beta = w_avg_beta + + if embed_features is None: + embed_features = w_dim + if c_dim == 0: + embed_features = 0 + if layer_features is None: + layer_features = w_dim + + features_list = [z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim] + + if c_dim > 0: + self.embed = FullyConnectedLayer(c_dim, embed_features) + + for idx in range(num_layers): + in_features = features_list[idx] + out_features = features_list[idx + 1] + layer = FullyConnectedLayer(in_features, out_features, activation=activation, lr_multiplier=lr_multiplier) + setattr(self, f'fc{idx}', layer) + + if num_ws is not None and w_avg_beta is not None: + self.register_buffer('w_avg', torch.zeros([w_dim])) + + def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False): + # Embed, normalize, and concat inputs. + x = None + with torch.autograd.profiler.record_function('input'): + if self.z_dim > 0: + misc.assert_shape(z, [None, self.z_dim]) + x = normalize_2nd_moment(z.to(torch.float32)) + + if self.c_dim > 0: + misc.assert_shape(c, [None, self.c_dim]) + y = normalize_2nd_moment(self.embed(c.to(torch.float32))) + x = torch.cat([x, y], dim=1) if x is not None else y + + # Main layers. + for idx in range(self.num_layers): + layer = getattr(self, f'fc{idx}') + x = layer(x) + + # Update moving average of W. + if self.w_avg_beta is not None and self.training and not skip_w_avg_update: + with torch.autograd.profiler.record_function('update_w_avg'): + self.w_avg.copy_(x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta)) + + # Broadcast. + if self.num_ws is not None: + with torch.autograd.profiler.record_function('broadcast'): + x = x.unsqueeze(1).repeat([1, self.num_ws, 1]) + + # Apply truncation. 
+ if truncation_psi != 1: + with torch.autograd.profiler.record_function('truncate'): + assert self.w_avg_beta is not None + if self.num_ws is None or truncation_cutoff is None: + x = self.w_avg.lerp(x, truncation_psi) + else: + x[:, :truncation_cutoff] = self.w_avg.lerp(x[:, :truncation_cutoff], truncation_psi) + return x + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class FullyConnectedLayer(torch.nn.Module): + def __init__(self, + in_features, # Number of input features. + out_features, # Number of output features. + bias = True, # Apply additive bias before the activation function? + activation = 'linear', # Activation function: 'relu', 'lrelu', etc. + lr_multiplier = 1, # Learning rate multiplier. + bias_init = 0, # Initial value for the additive bias. + ): + super().__init__() + self.activation = activation + self.weight = torch.nn.Parameter(torch.randn([out_features, in_features]) / lr_multiplier) + self.bias = torch.nn.Parameter(torch.full([out_features], float(bias_init))) if bias else None + self.weight_gain = lr_multiplier / np.sqrt(in_features) + self.bias_gain = lr_multiplier + + def forward(self, x): + w = self.weight.to(x.dtype) * self.weight_gain + b = self.bias + if b is not None: + b = b.to(x.dtype) + if self.bias_gain != 1: + b = b * self.bias_gain + + if self.activation == 'linear' and b is not None: + x = torch.addmm(b.unsqueeze(0), x, w.t()) + else: + x = x.matmul(w.t()) + x = bias_act.bias_act(x, b, act=self.activation) + return x + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class Conv2dLayer(torch.nn.Module): + def __init__(self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. + kernel_size, # Width and height of the convolution kernel. + bias = True, # Apply additive bias before the activation function? 
+ activation = 'linear', # Activation function: 'relu', 'lrelu', etc. + up = 1, # Integer upsampling factor. + down = 1, # Integer downsampling factor. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = None, # Clamp the output to +-X, None = disable clamping. + channels_last = False, # Expect the input to have memory_format=channels_last? + trainable = True, # Update the weights of this layer during training? + instance_norm = False, # Should we apply instance normalization to y? + lr_multiplier = 1.0, # Learning rate multiplier. + ): + super().__init__() + self.activation = activation + self.up = up + self.down = down + self.conv_clamp = conv_clamp + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.padding = kernel_size // 2 + self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2)) + self.act_gain = bias_act.activation_funcs[activation].def_gain + self.instance_norm = instance_norm + self.lr_multiplier = lr_multiplier + + memory_format = torch.channels_last if channels_last else torch.contiguous_format + weight = torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format) + bias = torch.zeros([out_channels]) if bias else None + if trainable: + self.weight = torch.nn.Parameter(weight) + self.bias = torch.nn.Parameter(bias) if bias is not None else None + else: + self.register_buffer('weight', weight) + if bias is not None: + self.register_buffer('bias', bias) + else: + self.bias = None + + def forward(self, x, gain=1): + w = self.weight * (self.weight_gain * self.lr_multiplier) + b = (self.bias.to(x.dtype) * self.lr_multiplier) if self.bias is not None else None + flip_weight = (self.up == 1) # slightly faster + x = conv2d_resample.conv2d_resample(x=x, w=w.to(x.dtype), f=self.resample_filter, up=self.up, down=self.down, padding=self.padding, flip_weight=flip_weight) + + act_gain = self.act_gain * gain + act_clamp = 
self.conv_clamp * gain if self.conv_clamp is not None else None + x = bias_act.bias_act(x, b, act=self.activation, gain=act_gain, clamp=act_clamp) + + if self.instance_norm: + x = (x - x.mean(dim=(2,3), keepdim=True)) / (x.std(dim=(2,3), keepdim=True) + 1e-8) # [batch_size, c, h, w] + + return x + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class GenInput(nn.Module): + def __init__(self, cfg: DictConfig, channel_dim: int, motion_v_dim: int=None): + super().__init__() + + self.cfg = cfg + + if self.cfg.input.type == 'const': + self.input = torch.nn.Parameter(torch.randn([channel_dim, 4, 4])) + self.total_dim = channel_dim + elif self.cfg.input.type == 'temporal': + self.input = TemporalInput(self.cfg, channel_dim, motion_v_dim=motion_v_dim) + self.total_dim = self.input.get_dim() + else: + raise NotImplementedError(f'Unkown input type: {self.cfg.input.type}') + + def forward(self, batch_size: int, motion_v: Optional[torch.Tensor]=None, dtype=None, memory_format=None) -> torch.Tensor: + if self.cfg.input.type == 'const': + x = self.input.to(dtype=dtype, memory_format=memory_format) + x = x.unsqueeze(0).repeat([batch_size, 1, 1, 1]) + elif self.cfg.input.type == 'temporal': + x = self.input(motion_v=motion_v) # [batch_size, d, h, w] + else: + raise NotImplementedError(f'Unkown input type: {self.cfg.input.type}') + + return x + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class TemporalInput(nn.Module): + def __init__(self, cfg: DictConfig, channel_dim: int, motion_v_dim: int): + super().__init__() + + self.cfg = cfg + self.motion_v_dim = motion_v_dim + self.const = nn.Parameter(torch.randn(1, channel_dim, 4, 4)) + + def get_dim(self): + return self.motion_v_dim + self.const.shape[1] + + def forward(self, motion_v: torch.Tensor) -> torch.Tensor: + """ + motion_v: [batch_size, motion_v_dim] + """ + out = torch.cat([ + 
self.const.repeat(len(motion_v), 1, 1, 1), + motion_v.unsqueeze(2).unsqueeze(3).repeat(1, 1, *self.const.shape[2:]), + ], dim=1) # [batch_size, channel_dim + num_fourier_feats * 2] + + return out + +#---------------------------------------------------------------------------- + +class TemporalDifferenceEncoder(nn.Module): + def __init__(self, cfg: DictConfig): + super().__init__() + + self.cfg = cfg + + if self.cfg.sampling.num_frames_per_video > 1: + self.d = 256 + self.const_embed = nn.Embedding(self.cfg.sampling.max_num_frames, self.d) + self.time_encoder = FixedTimeEncoder( + self.cfg.sampling.max_num_frames, + skip_small_t_freqs=self.cfg.get('skip_small_t_freqs', 0)) + + def get_dim(self) -> int: + if self.cfg.sampling.num_frames_per_video == 1: + return 1 + else: + if self.cfg.sampling.type == 'uniform': + return self.d + self.time_encoder.get_dim() + else: + return (self.d + self.time_encoder.get_dim()) * (self.cfg.sampling.num_frames_per_video - 1) + + def forward(self, t: torch.Tensor) -> torch.Tensor: + misc.assert_shape(t, [None, self.cfg.sampling.num_frames_per_video]) + + batch_size = t.shape[0] + + if self.cfg.sampling.num_frames_per_video == 1: + out = torch.zeros(len(t), 1, device=t.device) + else: + if self.cfg.sampling.type == 'uniform': + num_diffs_to_use = 1 + t_diffs = t[:, 1] - t[:, 0] # [batch_size] + else: + num_diffs_to_use = self.cfg.sampling.num_frames_per_video - 1 + t_diffs = (t[:, 1:] - t[:, :-1]).view(-1) # [batch_size * (num_frames - 1)] + # Note: float => round => long is necessary when it's originally long + const_embs = self.const_embed(t_diffs.float().round().long()) # [batch_size * num_diffs_to_use, d] + fourier_embs = self.time_encoder(t_diffs.unsqueeze(1)) # [batch_size * num_diffs_to_use, num_fourier_feats] + out = torch.cat([const_embs, fourier_embs], dim=1) # [batch_size * num_diffs_to_use, d + num_fourier_feats] + out = out.view(batch_size, num_diffs_to_use, -1).view(batch_size, -1) # [batch_size, num_diffs_to_use * (d + 
num_fourier_feats)] + + return out + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class FixedTimeEncoder(nn.Module): + def __init__(self, + max_num_frames: int, # Maximum T size + skip_small_t_freqs: int=0, # How many high frequencies we should skip + ): + super().__init__() + + assert max_num_frames >= 1, f"Wrong max_num_frames: {max_num_frames}" + fourier_coefs = construct_log_spaced_freqs(max_num_frames, skip_small_t_freqs=skip_small_t_freqs) + self.register_buffer('fourier_coefs', fourier_coefs) # [1, num_fourier_feats] + + def get_dim(self) -> int: + return self.fourier_coefs.shape[1] * 2 + + def forward(self, t: torch.Tensor) -> torch.Tensor: + assert t.ndim == 2, f"Wrong shape: {t.shape}" + + t = t.view(-1).float() # [batch_size * num_frames] + fourier_raw_embs = self.fourier_coefs * t.unsqueeze(1) # [bf, num_fourier_feats] + + fourier_embs = torch.cat([ + fourier_raw_embs.sin(), + fourier_raw_embs.cos(), + ], dim=1) # [bf, num_fourier_feats * 2] + + return fourier_embs + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class EqLRConv1d(nn.Module): + def __init__(self, + in_features: int, + out_features: int, + kernel_size: int, + padding: int=0, + stride: int=1, + activation: str='linear', + lr_multiplier: float=1.0, + bias=True, + bias_init=0.0, + ): + super().__init__() + + self.activation = activation + self.weight = torch.nn.Parameter(torch.randn([out_features, in_features, kernel_size]) / lr_multiplier) + self.bias = torch.nn.Parameter(torch.full([out_features], float(bias_init))) if bias else None + self.weight_gain = lr_multiplier / np.sqrt(in_features * kernel_size) + self.bias_gain = lr_multiplier + self.padding = padding + self.stride = stride + + assert self.activation in ['lrelu', 'linear'] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + assert x.ndim == 3, f"Wrong shape: {x.shape}" + + w = 
def random_frame_sampling(cfg: Dict, total_video_len: int, use_fractional_t: bool=False) -> np.ndarray:
    """Sample sorted frame positions with a random total span and random interior frames.

    Arguments:
        - cfg: mapping with `num_frames_per_video` and, optionally, `max_dist`
          (upper bound on the first-to-last distance) and `total_dists`
          (explicit list/tuple of allowed first-to-last distances).
        - total_video_len: number of frames in the video.
        - use_fractional_t: draw a real-valued start offset instead of an integer one.

    Returns:
        Array of `num_frames_per_video` positions in ascending order.

    Raises:
        IndexError: (from `random.choice`) when no admissible distance exists.
    """
    num_frames = cfg["num_frames_per_video"]
    min_time_diff = num_frames - 1
    max_time_diff = min(total_video_len - 1, cfg.get('max_dist', float('inf')))

    if isinstance(cfg.get('total_dists'), (list, tuple)):
        time_diff_range = [d for d in cfg['total_dists'] if min_time_diff <= d <= max_time_diff]
    else:
        # The upper bound is inclusive, matching the `total_dists` branch above.
        # An exclusive bound left the range empty for a video that has exactly
        # `num_frames_per_video` frames and never used the largest valid span.
        time_diff_range = range(min_time_diff, int(max_time_diff) + 1)

    time_diff: int = random.choice(time_diff_range)
    if use_fractional_t:
        offset = random.random() * (total_video_len - time_diff - 1)
    else:
        offset = random.randint(0, total_video_len - time_diff - 1)
    frames_idx = [offset]

    if num_frames > 1:
        # The last frame sits exactly `time_diff` after the first one.
        frames_idx.append(offset + time_diff)

    if num_frames > 2:
        # Interior frames are distinct positions strictly between the two ends.
        frames_idx.extend(offset + t for t in random.sample(range(1, time_diff), k=num_frames - 2))

    return np.array(sorted(frames_idx))
def uniform_frame_sampling(cfg: Dict, total_video_len: int, use_fractional_t: bool=False) -> np.ndarray:
    """Sample equally spaced frame positions from a video.

    Arguments:
        - cfg: mapping with `num_frames_per_video` and, optionally,
          `dists_between_frames` (list/tuple of allowed spacings),
          `max_dist_between_frames` (upper bound applied to that list) and
          `max_dist` (upper bound used when no list is given).
        - total_video_len: number of frames in the video.
        - use_fractional_t: draw a real-valued start offset instead of an integer one.

    Returns:
        Array of `num_frames_per_video` positions `offset + k * d`.

    Raises:
        IndexError: (from `random.choice`) when no listed spacing fits.
        ValueError: (from `random.randint`) when the video is shorter than
            `num_frames_per_video` in the fallback branch.
    """
    num_frames = cfg['num_frames_per_video']

    # Step 1: Select the distance between frames
    if isinstance(cfg.get('dists_between_frames'), (list, tuple)):
        # The bound must come from the config. Comparing against the literal
        # list `['max_dist_between_frames']` raised TypeError on every call.
        max_allowed = cfg.get('max_dist_between_frames', float('inf'))
        valid_dists = [d for d in cfg['dists_between_frames'] if d <= max_allowed]
        # Keep only spacings whose full span fits inside the video.
        valid_dists = [d for d in valid_dists if (d * num_frames - d + 1) <= total_video_len]
        d = random.choice(valid_dists)
    else:
        max_dist = min(cfg.get('max_dist', float('inf')), total_video_len // num_frames)
        d = random.randint(1, max_dist)

    # Number of frames covered from the first to the last sampled position.
    d_total = d * num_frames - d + 1

    # Step 2: Sample.
    if use_fractional_t:
        offset = random.random() * (total_video_len - d_total)
    else:
        offset = random.randint(0, total_video_len - d_total)

    frames_idx = offset + np.arange(num_frames) * d

    return frames_idx
@torch.no_grad()
def generate_videos(
    G: Callable, z: Tensor, c: Tensor, ts: Tensor, motion_z: Optional[Tensor]=None,
    noise_mode='const', truncation_psi=1.0, verbose: bool=False, as_grids: bool=False, batch_size_num_frames: int=100) -> Tensor:
    """Render one video per latent code, a chunk of frames at a time.

    Arguments:
        - G: generator; called directly, and via `G.mapping` / `G.synthesis`
          when class-conditional truncation is applied.
        - z: latent codes, one row per video.
        - c: conditioning, one row per video (`c.shape[1] == 0` means unconditional).
        - ts: frame times of shape [num_videos, video_len].
        - motion_z: optional motion noise; when omitted it is taken from
          `G.synthesis.motion_encoder` if the generator has one.
        - noise_mode, truncation_psi: forwarded to the generator.
        - verbose: show a progress bar over videos.
        - as_grids: return one image grid per time step instead of separate videos.
        - batch_size_num_frames: number of frames rendered per generator call.

    Returns:
        [num_videos, video_len, c, h, w] in [0, 1] on the CPU, or
        [video_len, 3, grid_h, grid_w] when `as_grids` is set.

    Note: `G.eval()` is called and the previous mode is not restored.
    """
    assert len(ts) == len(z) == len(c), f"Wrong shape: {ts.shape}, {z.shape}, {c.shape}"
    assert ts.ndim == 2, f"Wrong shape: {ts.shape}"

    G.eval()
    videos = []
    use_class_truncation = c.shape[1] > 0 and truncation_psi < 1

    if use_class_truncation:
        # Estimate a per-row average `w` by mapping many random `z` under each row of `c`.
        num_ws_to_average = 1000
        c_for_avg = c.repeat_interleave(num_ws_to_average, dim=0) # [len(c) * num_ws_to_average, c_dim]
        z_for_avg = torch.randn(c_for_avg.shape[0], G.z_dim, device=z.device) # [len(c) * num_ws_to_average, z_dim]
        w = G.mapping(z_for_avg, c=c_for_avg)[:, 0] # [len(c) * num_ws_to_average, w_dim]
        w_avg = w.view(-1, num_ws_to_average, G.w_dim).mean(dim=1) # [len(c), w_dim]

    iters = range(len(z))
    iters = tqdm(iters, desc='Generating videos') if verbose else iters

    if motion_z is None and G.synthesis.motion_encoder is not None:
        motion_z = G.synthesis.motion_encoder(c=c, t=ts)['motion_z'] # [...any...]

    for video_idx in iters:
        curr_video = []
        curr_z = z[[video_idx]] # [1, z_dim]
        curr_c = c[[video_idx]] # [1, c_dim]
        # `motion_z` stays None when none was given and there is no motion encoder.
        curr_motion_z = None if motion_z is None else motion_z[[video_idx]]

        for curr_ts in ts[[video_idx]].split(batch_size_num_frames, dim=1):
            if use_class_truncation:
                curr_w = G.mapping(curr_z, c=curr_c, truncation_psi=1) # [1, num_ws, w_dim]
                # Blend with this video's own average only. Using the whole
                # `w_avg` table broadcast the result to [len(c), num_ws, w_dim].
                curr_w_avg = w_avg[[video_idx]].unsqueeze(1) # [1, 1, w_dim]
                curr_w = truncation_psi * curr_w + (1 - truncation_psi) * curr_w_avg # [1, num_ws, w_dim]
                out = G.synthesis(
                    ws=curr_w,
                    c=curr_c,
                    t=curr_ts,
                    motion_z=curr_motion_z,
                    noise_mode=noise_mode) # [1 * curr_num_frames, 3, h, w]
            else:
                out = G(
                    z=curr_z,
                    c=curr_c,
                    t=curr_ts,
                    motion_z=curr_motion_z,
                    truncation_psi=truncation_psi,
                    noise_mode=noise_mode) # [1 * curr_num_frames, 3, h, w]

            # Map from [-1, 1] to [0, 1] and move off the device chunk by chunk.
            out = (out * 0.5 + 0.5).clamp(0, 1).cpu() # [1 * curr_num_frames, 3, h, w]
            curr_video.append(out)

        videos.append(torch.cat(curr_video, dim=0))

    videos = torch.stack(videos) # [len(z), video_len, c, h, w]

    if as_grids:
        frame_grids = videos.permute(1, 0, 2, 3, 4) # [video_len, len(z), c, h, w]
        frame_grids = [utils.make_grid(fs, nrow=int(np.sqrt(len(z)))) for fs in frame_grids] # [video_len, 3, grid_h, grid_w]

        return torch.stack(frame_grids)
    else:
        return videos
def save_video_frames_as_frames(frames: List[Tensor], save_dir: os.PathLike, time_offset: int=0):
    """Write each frame to `save_dir` as a zero-padded, numbered JPEG.

    Arguments:
        - frames: image tensors accepted by `TVF.to_pil_image`.
        - save_dir: output directory; created if missing.
        - time_offset: number given to the first frame; the rest count up from it.
    """
    os.makedirs(save_dir, exist_ok=True)

    for frame_num, frame in enumerate(frames, start=time_offset):
        save_path = os.path.join(save_dir, f'{frame_num:06d}.jpg')
        # Pillow's JPEG writer reads `quality`; the former `q=95` was silently
        # ignored, so frames were written at the default quality.
        TVF.to_pil_image(frame).save(save_path, quality=95)
def save_jpg(x: np.ndarray, save_path: os.PathLike):
    """Save the image array `x` to `save_path` as a JPEG at quality 95."""
    # Pillow's JPEG writer reads `quality`; the former `q=95` was silently
    # ignored, so images were written at the default quality.
    Image.fromarray(x).save(save_path, quality=95)
class StyleGAN2Loss(Loss):
    """Non-saturating GAN loss with path-length and R1 regularization.

    The discriminator is expected to return a dict with `image_logits` and,
    optionally, `video_logits`; the video terms are added whenever that key
    is present.
    """
    def __init__(self, cfg, device, G_mapping, G_synthesis, D, augment_pipe=None, G_motion_encoder=None,
        style_mixing_prob=0.9, r1_gamma=10, pl_batch_shrink=2, pl_decay=0.01, pl_weight=2):
        super().__init__()

        self.cfg = cfg
        self.device = device
        self.G_mapping = G_mapping
        self.G_synthesis = G_synthesis
        self.D = D
        self.augment_pipe = augment_pipe
        self.style_mixing_prob = style_mixing_prob
        self.r1_gamma = r1_gamma
        self.pl_batch_shrink = pl_batch_shrink
        self.pl_decay = pl_decay
        self.pl_weight = pl_weight
        # Running mean of path lengths, updated in the path-length regularization step.
        self.pl_mean = torch.zeros([], device=device)
        self.G_motion_encoder = G_motion_encoder

    def run_G(self, z, c, t, sync):
        """Map `z` to `ws` (with optional style mixing) and synthesize frames.

        Returns the pair `(frames, ws)`.
        """
        with misc.ddp_sync(self.G_mapping, sync):
            ws = self.G_mapping(z, c)
            if self.style_mixing_prob > 0:
                with torch.autograd.profiler.record_function('style_mixing'):
                    # Pick a random layer cutoff; with probability
                    # (1 - style_mixing_prob) the cutoff is past the last layer,
                    # which leaves `ws` unchanged.
                    cutoff = torch.empty([], dtype=torch.int64, device=ws.device).random_(1, ws.shape[1])
                    cutoff = torch.where(torch.rand([], device=ws.device) < self.style_mixing_prob, cutoff, torch.full_like(cutoff, ws.shape[1]))
                    ws[:, cutoff:] = self.G_mapping(torch.randn_like(z), c, skip_w_avg_update=True)[:, cutoff:]
        with misc.ddp_sync(self.G_synthesis, sync):
            out = self.G_synthesis(ws, t=t, c=c)
        return out, ws

    def run_D(self, img, c, t, sync):
        """Augment `img` (if an augmentation pipe is set) and run the discriminator."""
        if self.augment_pipe is not None:
            video_consistent_aug = self.cfg.model.loss_kwargs.get('video_consistent_aug', False)

            if video_consistent_aug:
                # Fold the frames of each video into the channel axis so that
                # one augmentation is applied to all of them.
                nf, ch, h, w = img.shape
                f = self.cfg.sampling.num_frames_per_video
                n = nf // f
                img = img.view(n, f * ch, h, w) # [n, f * ch, h, w]

            img = self.augment_pipe(img) # [n, f * ch, h, w]

            if video_consistent_aug:
                img = img.view(n * f, ch, h, w) # [n * f, ch, h, w]

        with misc.ddp_sync(self.D, sync):
            outputs = self.D(img, c, t)

        return outputs

    def accumulate_gradients(self, phase, real_img, real_c, real_t, gen_z, gen_c, gen_t, sync, gain):
        """Run the forward/backward passes for one training phase.

        `phase` selects which of the four terms are computed: generator main
        loss, generator path-length regularization, discriminator main loss and
        discriminator R1 regularization. Gradients are accumulated via
        `.backward()`; nothing is returned.
        """
        assert phase in ['Gmain', 'Greg', 'Gboth', 'Dmain', 'Dreg', 'Dboth']
        do_Gmain = (phase in ['Gmain', 'Gboth'])
        do_Dmain = (phase in ['Dmain', 'Dboth'])
        do_Gpl   = (phase in ['Greg', 'Gboth']) and (self.pl_weight != 0)
        do_Dr1   = (phase in ['Dreg', 'Dboth']) and (self.r1_gamma != 0)

        real_img = real_img.view(-1, *real_img.shape[2:]) # [batch_size * num_frames, c, h, w]

        # Gmain: Maximize logits for generated images.
        if do_Gmain:
            with torch.autograd.profiler.record_function('Gmain_forward'):
                gen_img, _gen_ws = self.run_G(gen_z, gen_c, gen_t, sync=(sync and not do_Gpl)) # [batch_size * num_frames, c, h, w]
                D_out_gen = self.run_D(gen_img, gen_c, gen_t, sync=False) # [batch_size]
                training_stats.report('Loss/scores/fake', D_out_gen['image_logits'])
                training_stats.report('Loss/signs/fake', D_out_gen['image_logits'].sign())
                loss_Gmain = F.softplus(-D_out_gen['image_logits']) # -log(sigmoid(y))
                if 'video_logits' in D_out_gen:
                    loss_Gmain_video = F.softplus(-D_out_gen['video_logits']).mean() # -log(sigmoid(y)) # [1]
                    training_stats.report('Loss/scores/fake_video', D_out_gen['video_logits'])
                    training_stats.report('Loss/G/loss_video', loss_Gmain_video)
                else:
                    loss_Gmain_video = 0.0 # [1]
                training_stats.report('Loss/G/loss', loss_Gmain)
            with torch.autograd.profiler.record_function('Gmain_backward'):
                (loss_Gmain + loss_Gmain_video).mean().mul(gain).backward()

        # Gpl: Apply path length regularization.
        if do_Gpl:
            with torch.autograd.profiler.record_function('Gpl_forward'):
                batch_size = gen_z.shape[0] // self.pl_batch_shrink
                gen_img, gen_ws = self.run_G(gen_z[:batch_size], gen_c[:batch_size], gen_t[:batch_size], sync=sync) # [batch_size * num_frames, c, h, w]
                pl_noise = torch.randn_like(gen_img) / np.sqrt(gen_img.shape[2] * gen_img.shape[3])
                with torch.autograd.profiler.record_function('pl_grads'), conv2d_gradfix.no_weight_gradients():
                    pl_grads = torch.autograd.grad(outputs=[(gen_img * pl_noise).sum()], inputs=[gen_ws], create_graph=True, only_inputs=True)[0]
                pl_lengths = pl_grads.square().sum(2).mean(1).sqrt()
                pl_mean = self.pl_mean.lerp(pl_lengths.mean(), self.pl_decay)
                self.pl_mean.copy_(pl_mean.detach())
                pl_penalty = (pl_lengths - pl_mean).square()
                training_stats.report('Loss/pl_penalty', pl_penalty)
                loss_Gpl = pl_penalty * self.pl_weight
                training_stats.report('Loss/G/reg', loss_Gpl)
            with torch.autograd.profiler.record_function('Gpl_backward'):
                # The zero-weighted `gen_img` term keeps the synthesis output in the graph.
                (gen_img[:, 0, 0, 0] * 0 + loss_Gpl).mean().mul(gain).backward()

        # Dmain: Minimize logits for generated images.
        loss_Dgen = 0
        if do_Dmain:
            with torch.autograd.profiler.record_function('Dgen_forward'):
                with torch.no_grad():
                    gen_img, _gen_ws = self.run_G(gen_z, gen_c, gen_t, sync=False) # [batch_size * num_frames, c, h, w]
                D_out_gen = self.run_D(gen_img, gen_c, gen_t, sync=False) # Gets synced by loss_Dreal.
                training_stats.report('Loss/scores/fake', D_out_gen['image_logits'])
                training_stats.report('Loss/signs/fake', D_out_gen['image_logits'].sign())
                loss_Dgen = F.softplus(D_out_gen['image_logits']) # -log(1 - sigmoid(y))

                if 'video_logits' in D_out_gen:
                    loss_Dgen_video = F.softplus(D_out_gen['video_logits']).mean() # [1]
                    training_stats.report('Loss/scores/fake_video', D_out_gen['video_logits'])
                else:
                    loss_Dgen_video = 0.0 # [1]

            with torch.autograd.profiler.record_function('Dgen_backward'):
                (loss_Dgen + loss_Dgen_video).mean().mul(gain).backward()

        # Dmain: Maximize logits for real images.
        # Dr1: Apply R1 regularization.
        if do_Dmain or do_Dr1:
            name = 'Dreal_Dr1' if do_Dmain and do_Dr1 else 'Dreal' if do_Dmain else 'Dr1'
            with torch.autograd.profiler.record_function(name + '_forward'):
                # R1 needs gradients with respect to the real images.
                real_img_tmp = real_img.detach().requires_grad_(do_Dr1)
                D_out_real = self.run_D(real_img_tmp, real_c, real_t, sync=sync)
                training_stats.report('Loss/scores/real', D_out_real['image_logits'])
                training_stats.report('Loss/signs/real', D_out_real['image_logits'].sign())

                loss_Dreal = 0
                loss_Dreal_dist_preds = 0
                loss_Dreal_video = 0.0 # [1]
                if do_Dmain:
                    loss_Dreal = F.softplus(-D_out_real['image_logits']) # -log(sigmoid(y))
                    training_stats.report('Loss/D/loss', loss_Dgen + loss_Dreal)

                    # Guard on the dict that is actually indexed below
                    # (previously the check looked at `D_out_gen`).
                    if 'video_logits' in D_out_real:
                        loss_Dreal_video = F.softplus(-D_out_real['video_logits']).mean() # [1]
                        training_stats.report('Loss/scores/real_video', D_out_real['video_logits'])
                        training_stats.report('Loss/D/loss_video', loss_Dgen_video + loss_Dreal_video)

                loss_Dr1 = 0
                if do_Dr1:
                    with torch.autograd.profiler.record_function('r1_grads'), conv2d_gradfix.no_weight_gradients():
                        r1_grads = torch.autograd.grad(outputs=[D_out_real['image_logits'].sum()], inputs=[real_img_tmp], create_graph=True, only_inputs=True)[0]
                    r1_penalty = r1_grads.square().sum([1,2,3])
                    loss_Dr1 = r1_penalty * (self.r1_gamma / 2) # [batch_size * num_frames_per_video]
                    # Average the per-frame penalty so there is one value per image logit.
                    loss_Dr1 = loss_Dr1.view(-1, len(real_img_tmp) // len(D_out_real['image_logits'])).mean(dim=1) # [batch_size]
                    training_stats.report('Loss/r1_penalty', r1_penalty)
                    training_stats.report('Loss/D/reg', loss_Dr1)

            # Zero-weighted terms keep both discriminator outputs in the graph
            # even when their losses are not part of this phase.
            dummy_video_logits = (D_out_real["video_logits"].sum() * 0.0) if "video_logits" in D_out_real else 0.0
            with torch.autograd.profiler.record_function(name + '_backward'):
                (D_out_real["image_logits"] * 0 + dummy_video_logits + loss_Dreal + loss_Dreal_video + loss_Dr1 + loss_Dreal_dist_preds).mean().mul(gain).backward()
def weights_init(m):
    """Re-initialize one module in place; intended for `nn.Module.apply`.

    Modules whose class name contains 'Conv' and that have a `weight` get
    N(0, 0.02) weights. Modules whose class name contains 'BatchNorm3d' get
    N(1, 0.02) weights and a zero bias. Everything else is left untouched.
    """
    layer_kind = m.__class__.__name__

    if 'Conv' in layer_kind and hasattr(m, 'weight'):
        m.weight.data.normal_(0.0, 0.02)
        return

    if 'BatchNorm3d' in layer_kind:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)
@persistence.persistent_class
class VideoDiscriminator(nn.Module):
    """Several `SubVideoDiscriminator`s applied to progressively downsampled video.

    The sub-discriminators' layers are re-registered directly on this module
    (as `scale{i}_layer{j}`, or `layer{i}` when intermediate features are not
    requested) and looked up by name in `forward`.
    """
    def __init__(self,
        num_input_channels,
        ndf=64,
        n_layers=3,
        n_frames_per_sample=16,
        norm_layer=nn.InstanceNorm3d,
        num_sub_discrs=2,
        get_intermediate_feat=True):

        super().__init__()
        self.num_sub_discrs = num_sub_discrs
        self.n_layers = n_layers
        self.get_intermediate_feat = get_intermediate_feat
        ndf_max = 64

        for scale_idx in range(num_sub_discrs):
            sub_discr = SubVideoDiscriminator(
                num_input_channels=num_input_channels,
                ndf=min(ndf_max, ndf * (2 ** (num_sub_discrs - 1 - scale_idx))),
                n_layers=n_layers,
                norm_layer=norm_layer,
                get_intermediate_feat=get_intermediate_feat)

            if not get_intermediate_feat:
                setattr(self, f'layer{scale_idx}', sub_discr.model)
                continue

            # Register every stage separately so intermediate outputs can be collected.
            for layer_idx in range(n_layers + 2):
                stage = getattr(sub_discr, f'model{layer_idx}')
                setattr(self, f'scale{scale_idx}_layer{layer_idx}', stage)

        # Clips of at most 16 frames are only downsampled spatially.
        if n_frames_per_sample > 16:
            stride = 2
        else:
            stride = [1, 2, 2]
        self.downsample = nn.AvgPool3d(
            3,
            stride=stride,
            padding=[1, 1, 1],
            count_include_pad=False
        )

    def singleD_forward(self, model, input):
        """Run one sub-discriminator; return a list of its stage outputs."""
        if not self.get_intermediate_feat:
            return [model(input)]

        features = []
        current = input
        for stage in (model[i] for i in range(len(model))):
            current = stage(current)
            features.append(current)
        return features

    def forward(self, x):
        """Return one list of outputs per sub-discriminator, finest input first."""
        outputs = []
        last_block = self.num_sub_discrs - 1

        for block_idx in range(self.num_sub_discrs):
            # Sub-discriminators are visited in reverse registration order.
            scale_idx = last_block - block_idx
            if self.get_intermediate_feat:
                model = [getattr(self, f'scale{scale_idx}_layer{j}') for j in range(self.n_layers + 2)]
            else:
                model = getattr(self, f'layer{scale_idx}')
            outputs.append(self.singleD_forward(model, x))

            if block_idx != last_block:
                x = self.downsample(x)

        return outputs
class MoCoGANVideoDiscriminator(nn.Module):
    """Stack of 3D convolutions that scores a whole clip.

    Arguments:
        - n_channels: number of input image channels.
        - n_output_neurons: number of output channels of the final convolution.
        - bn_use_gamma: stored on the module; not otherwise used in this class.
        - use_noise, noise_sigma: forwarded to the `Noise` layers.
        - ndf: base number of feature maps.
        - image_size: two extra stride-1 convolutions are added when it is 256.
        - num_t_paddings: how many of the leading convolutions pad the time axis.
    """
    def __init__(self, n_channels, n_output_neurons=1, bn_use_gamma=True, use_noise=False, noise_sigma=None, ndf=64, image_size: int=64, num_t_paddings: int=0):
        super(MoCoGANVideoDiscriminator, self).__init__()

        self.n_channels = n_channels
        self.n_output_neurons = n_output_neurons
        self.use_noise = use_noise
        self.bn_use_gamma = bn_use_gamma

        layers = [
            Noise(use_noise, sigma=noise_sigma),
            nn.Conv3d(n_channels, ndf, 4, stride=(1, 2, 2), padding=(2 if num_t_paddings > 0 else 0, 1, 1), bias=False),
            nn.LeakyReLU(0.2, inplace=True),

            Noise(use_noise, sigma=noise_sigma),
            nn.Conv3d(ndf, ndf * 2, 4, stride=(1, 2, 2), padding=(2 if num_t_paddings > 1 else 0, 1, 1), bias=False),
            nn.BatchNorm3d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),

            Noise(use_noise, sigma=noise_sigma),
            nn.Conv3d(ndf * 2, ndf * 4, 4, stride=(1, 2, 2), padding=(2 if num_t_paddings > 2 else 0, 1, 1), bias=False),
            nn.BatchNorm3d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),

            Noise(use_noise, sigma=noise_sigma),
            nn.Conv3d(ndf * 4, ndf * 8, 4, stride=(1, 2, 2), padding=(2 if num_t_paddings > 3 else 0, 1, 1), bias=False),
            nn.BatchNorm3d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        if image_size == 256:
            layers.extend([
                Noise(use_noise, sigma=noise_sigma),
                nn.Conv3d(ndf * 8, ndf * 8, 3, stride=(1, 1, 1), padding=(1 + (1 if num_t_paddings > 4 else 0), 1, 1), bias=False),
                nn.BatchNorm3d(ndf * 8),
                nn.LeakyReLU(0.2, inplace=True),

                Noise(use_noise, sigma=noise_sigma),
                nn.Conv3d(ndf * 8, ndf * 8, 3, stride=(1, 1, 1), padding=(1 + (1 if num_t_paddings > 5 else 0), 1, 1), bias=False),
                nn.BatchNorm3d(ndf * 8),
                nn.LeakyReLU(0.2, inplace=True),
            ])

        # NOTE(review): this threshold (> 5) repeats the one of the preceding
        # 256-only layer -- confirm that is intended.
        layers.extend([
            nn.Conv3d(ndf * 8, n_output_neurons, kernel_size=4, stride=1, padding=(2 if num_t_paddings > 5 else 0, 0, 0), bias=False),
        ])

        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Score clips of shape [batch_size, c, t, h, w].

        Singleton axes of the result are squeezed away, except that the batch
        axis is always kept, so a single-clip batch yields a leading axis of 1.
        """
        raw = self.main(input)
        out = raw.squeeze()
        if raw.shape[0] == 1:
            # A bare `squeeze()` also removes a batch axis of size 1; restore it.
            out = out.unsqueeze(0)
        return out

#----------------------------------------------------------------------------

class Noise(nn.Module):
    """Adds zero-mean Gaussian noise scaled by `sigma` when `use_noise` is set."""
    def __init__(self, use_noise, sigma=0.2):
        super(Noise, self).__init__()

        self.use_noise = use_noise
        self.sigma = sigma

    def forward(self, x):
        if self.use_noise:
            return x + self.sigma * torch.randn_like(x)
        return x
0000000000000000000000000000000000000000..3ced3c8967a077a878c4b134f15a18641d42da28 --- /dev/null +++ b/src/training/motion.py @@ -0,0 +1,224 @@ +from typing import Dict + +import numpy as np +import torch +import torch.nn as nn +from omegaconf import DictConfig + +from src.torch_utils import misc +from src.torch_utils import persistence +from src.training.layers import ( + MappingNetwork, + EqLRConv1d, + FullyConnectedLayer, +) + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class MotionMappingNetwork(torch.nn.Module): + def __init__(self, cfg: DictConfig): + super().__init__() + self.cfg = cfg + + assert self.cfg.motion.gen_strategy in ["autoregressive", "conv"], f"Unknown generation strategy: {self.cfg.motion.gen_strategy}" + + if self.cfg.motion.fourier: + self.time_encoder = AlignedTimeEncoder( + cfg=self.cfg, + latent_dim=self.cfg.motion.v_dim + ) + else: + self.mapping = MappingNetwork( + z_dim=self.cfg.motion.z_dim, + c_dim=self.cfg.c_dim, + w_dim=self.cfg.motion.v_dim, + num_ws=None, + num_layers=2, + activation='lrelu', + w_avg_beta=None, + cfg=self.cfg, + ) + + if self.cfg.motion.gen_strategy == 'autoregressive': + self.rnn = nn.LSTM( + input_size=self.cfg.motion.z_dim + self.cfg.c_dim, + hidden_size=self.cfg.motion.z_dim, + bidirectional=False, + batch_first=True) + self._parameters_flattened = False + self.num_additional_codes = 0 + elif self.cfg.motion.gen_strategy == 'conv': + # Using Conv1d without paddings instead of LSTM makes the generations good for any time in t \in (0, +\infty), + # while LSTM would diverge for large `t` + # Also, this allows us to use equalized learning rates + self.conv = nn.Sequential( + EqLRConv1d(self.cfg.motion.z_dim + self.cfg.c_dim, self.cfg.motion.z_dim, self.cfg.motion.kernel_size, padding=0, activation='lrelu', lr_multiplier=0.01), + EqLRConv1d(self.cfg.motion.z_dim, self.cfg.motion.v_dim, self.cfg.motion.kernel_size, padding=0, activation='lrelu', 
lr_multiplier=0.01), + ) + self.num_additional_codes = (self.cfg.motion.kernel_size - 1) * 2 + else: + raise NotImplementedError(f'Unknown generation strategy: {self.cfg.motion.gen_strategy}') + + def get_max_traj_len(self, t: torch.Tensor) -> int: + max_t = max(self.cfg.sampling.max_num_frames - 1, t.max().item()) # [1] + max_traj_len = np.ceil(max_t / self.cfg.motion.motion_z_distance).astype(int).item() + 2 # [1] + return max_traj_len + + def generate_motion_u_codes(self, c: torch.Tensor, t: torch.Tensor, motion_z: torch.Tensor=None) -> Dict: + """ + Arguments: + - c of shape [batch_size, c_dim] + - t of shape [batch_size, num_frames] + - w of shape [batch_size, w_dim] + - motion_z of shape [batch_size, max_traj_len, motion_z_dim] --- in case we want to reuse some existing motion noise + """ + out = {} + batch_size, num_frames = t.shape + + # Consutruct trajectories (from code idx for now) + max_traj_len = self.get_max_traj_len(t) + self.num_additional_codes # [1] + + if motion_z is None: + motion_z = torch.randn(batch_size, max_traj_len, self.cfg.motion.z_dim, device=c.device) # [batch_size, max_traj_len, motion.z_dim] + + # Input motion trajectories are just random noise + input_trajs = motion_z[:batch_size, :max_traj_len, :self.cfg.motion.z_dim].to(c.device) # [batch_size, max_traj_len, motion.z_dim] + + if self.cfg.c_dim > 0: + # Different classes might have different motions, so it should be useful to condition on c + misc.assert_shape(c, [batch_size, None]) + input_trajs = torch.cat([input_trajs, c.unsqueeze(1).repeat(1, max_traj_len, 1)], dim=2) # [batch_size, max_traj_len, motion.z_dim + cond_dim] + + if self.cfg.motion.gen_strategy == 'autoregressive': + # Somehow, RNN parameters do not get flattened on their own and we get a lot of warnings... 
+ if not self._parameters_flattened: + self.rnn.flatten_parameters() + self._parameters_flattened = True + trajs, _ = self.rnn(input_trajs) # [batch_size, max_traj_len, motion.z_dim] + elif self.cfg.motion.gen_strategy == 'conv': + trajs = self.conv(input_trajs.permute(0, 2, 1)).permute(0, 2, 1) # [batch_size, max_traj_len, motion.v_dim] + else: + raise NotImplementedError(f'Unknown generation strategy: {self.cfg.motion.gen_strategy}') + + # Now, we should select neighbouring codes for each frame + left_idx = (t / self.cfg.motion.motion_z_distance).floor().long() # [batch_size, num_frames] + batch_idx = torch.arange(batch_size, device=c.device).unsqueeze(1).repeat(1, num_frames) # [batch_size, num_frames] + motion_u_left = trajs[batch_idx, left_idx] # [batch_size, num_frames, motion.z_dim] + motion_u_right = trajs[batch_idx, left_idx + 1] # [batch_size, num_frames, motion.z_dim] + + # Compute `u` codes as the interpolation between `u_left` and `u_right` + t_left = t - t % self.cfg.motion.motion_z_distance # [batch_size, num_frames] + t_right = t_left + self.cfg.motion.motion_z_distance # [batch_size, num_frames] + # Compute interpolation weights `alpha` (we'll use them later) + interp_weights = ((t % self.cfg.motion.motion_z_distance) / self.cfg.motion.motion_z_distance).unsqueeze(2).to(torch.float32) # [batch_size, num_frames, 1] + motion_u = motion_u_left * (1 - interp_weights) + motion_u_right * interp_weights # [batch_size, num_frames, motion.z_dim] + motion_u = motion_u.view(batch_size * num_frames, motion_u.shape[2]).to(torch.float32) # [batch_size * num_frames, motion.z_dim] + + # Save the results into the output dict + out['motion_u_left'] = motion_u_left # [batch_size, num_frames, motion.z_dim] + out['motion_u_right'] = motion_u_right # [batch_size, num_frames, motion.z_dim] + out['t_left'] = t_left # [batch_size, num_frames] + out['t_right'] = t_right # [batch_size, num_frames] + out['interp_weights'] = interp_weights # [batch_size, num_frames, 1] + 
def forward(self, c: torch.Tensor, t: torch.Tensor, motion_z: torch.Tensor=None) -> Dict:
    """
    Produce the per-frame motion codes `motion_v` for the given labels and timesteps.

    Arguments:
        - c of shape [batch_size, c_dim]
        - t of shape [batch_size, num_frames]
        - motion_z: optional motion noise, passed through unchanged to
          `generate_motion_u_codes` (which annotates it as a tensor)

    Returns a dict with:
        - 'motion_v': output of `self.time_encoder` when `cfg.motion.fourier` is set,
          otherwise output of `self.mapping`
        - 'motion_z': the `motion_z` entry returned by `generate_motion_u_codes`

    Raises:
        AssertionError: if `c` and `t` differ in length or `t` is not 2-dimensional.
    """
    assert len(c) == len(t), f"Wrong shape: {c.shape}, {t.shape}"
    assert t.ndim == 2, f"Wrong shape: {t.shape}"

    motion_u_info: Dict = self.generate_motion_u_codes(c, t, motion_z=motion_z) # Dict of tensors

    # Compute the `v` motion codes
    if self.cfg.motion.fourier:
        motion_v = self.time_encoder(
            t=t,
            motion_u_left=motion_u_info['motion_u_left'],
            motion_u_right=motion_u_info['motion_u_right'],
            t_left=motion_u_info['t_left'],
            t_right=motion_u_info['t_right'],
            interp_weights=motion_u_info['interp_weights'],
        ) # [batch_size * num_frames, motion_v_dim]
    else:
        # The flattened `u` codes are only consumed by the mapping network,
        # so they are reshaped here rather than unconditionally.
        batch_size, num_frames = t.shape
        motion_u = motion_u_info['motion_u'].view(batch_size * num_frames, -1) # [batch_size * num_frames, motion.z_dim]
        motion_v = self.mapping(z=motion_u, c=c.repeat_interleave(num_frames, dim=0)) # [batch_size * num_frames, motion.v_dim]

    return {
        'motion_v': motion_v, # [batch_size * num_frames, motion.v_dim]
        'motion_z': motion_u_info['motion_z'], # (Any shape)
    }
FullyConnectedLayer(latent_dim, freqs.shape[1], activation='linear', bias=False) + self.phase_predictor = FullyConnectedLayer(latent_dim, freqs.shape[1], activation='linear', bias=False) + period_lens = 2 * np.pi / self.freqs # [1, num_fourier_feats] + phase_scales = self.cfg.time_enc.max_period_len / period_lens # [1, num_fourier_feats] + self.register_buffer('phase_scales', phase_scales) + + self.aligners_predictor = FullyConnectedLayer(latent_dim, self.freqs.shape[1] * 2, activation='linear', bias=False) + + def get_dim(self) -> int: + return self.freqs.shape[1] * 2 + + def forward(self, t: torch.Tensor, motion_u_left: torch.Tensor, motion_u_right: torch.Tensor, interp_weights: torch.Tensor, t_left: torch.Tensor, t_right: torch.Tensor): + batch_size, num_frames, motion_u_dim = motion_u_left.shape # [1], [1], [1] + + misc.assert_shape(t, [batch_size, num_frames]) + misc.assert_shape(motion_u_left, [batch_size, num_frames, None]) + misc.assert_shape(motion_u_right, [batch_size, num_frames, None]) + misc.assert_shape(interp_weights, [batch_size, num_frames, 1]) + assert t.shape == t_left.shape == t_right.shape, f"Wrong shape: {t.shape} vs {t_left.shape} vs {t_right.shape}" + + motion_u_left = motion_u_left.view(batch_size * num_frames, motion_u_dim) # [batch_size * num_frames, motion_u_dim] + motion_u_right = motion_u_right.view(batch_size * num_frames, motion_u_dim) # [batch_size * num_frames, motion_u_dim] + periods = self.periods_predictor(motion_u_left).tanh() + 1 # [batch_size * num_frames, feat_dim] + phases = self.phase_predictor(motion_u_left) # [batch_size * num_frames, feat_dim] + aligners_left = self.aligners_predictor(motion_u_left) # [batch_size * num_frames, out_dim] + aligners_right = self.aligners_predictor(motion_u_right) # [batch_size * num_frames, out_dim] + + raw_pos_embs = self.freqs * periods * t.view(-1).float().unsqueeze(1) + phases * self.phase_scales # [bf, feat_dim] + raw_pos_embs_left = self.freqs * periods * 
def construct_linspaced_frequencies(num_freqs: int, min_period_len: int, max_period_len: int) -> torch.Tensor:
    """
    Build angular frequencies whose period lengths are evenly spaced in log2-space.

    Arguments:
        - num_freqs: number of frequencies to generate
        - min_period_len: period length at one end of the range
        - max_period_len: period length at the other end of the range

    Returns:
        float32 tensor of shape [1, num_freqs] holding `2 * pi / period_len`,
        stored in reverse order of the generated periods (i.e. the frequency of
        `max_period_len` comes first).
    """
    log2_first = np.log2(min_period_len)
    log2_last = np.log2(max_period_len)
    period_lens = 2 ** np.linspace(log2_first, log2_last, num_freqs) # [num_freqs]
    angular_freqs = 2 * np.pi / period_lens # [num_freqs]

    # Reversing yields a negatively-strided view; materialise it as a contiguous
    # float32 array before handing it to torch.
    reversed_freqs = np.ascontiguousarray(angular_freqs[::-1], dtype=np.float32) # [num_freqs]

    return torch.from_numpy(reversed_freqs).unsqueeze(0) # [1, num_freqs]
@misc.profiled_function
def modulated_conv2d(
    x,                          # Input tensor of shape [batch_size, in_channels, in_height, in_width].
    weight,                     # Weight tensor of shape [out_channels, in_channels, kernel_height, kernel_width].
    styles,                     # Modulation coefficients of shape [batch_size, in_channels].
    noise           = None,     # Optional noise tensor to add to the output activations.
    up              = 1,        # Integer upsampling factor.
    down            = 1,        # Integer downsampling factor.
    padding         = 0,        # Padding with respect to the upsampled image.
    resample_filter = None,     # Low-pass filter to apply when resampling activations. Must be prepared beforehand by calling upfirdn2d.setup_filter().
    demodulate      = True,     # Apply weight demodulation?
    flip_weight     = True,     # False = convolution, True = correlation (matches torch.nn.functional.conv2d).
    fused_modconv   = True,     # Perform modulation, convolution, and demodulation as a single fused operation?
):
    """
    Convolve `x` with `weight` after scaling the weight's input channels per sample by `styles`.

    Two execution paths produce the result:
        - fused (`fused_modconv=True`): per-sample weights are built and applied as one
          grouped convolution with `groups=batch_size`;
        - unfused: the activations are scaled by `styles` before the convolution and by
          the demodulation coefficients after it.
    `noise`, when given, is added to the convolution output in both paths.
    """
    num_samples = x.shape[0]
    out_channels, in_channels, kh, kw = weight.shape
    misc.assert_shape(weight, [out_channels, in_channels, kh, kw]) # [OIkk]
    misc.assert_shape(x, [num_samples, in_channels, None, None]) # [NIHW]
    misc.assert_shape(styles, [num_samples, in_channels]) # [NI]

    # Pre-normalize inputs to avoid FP16 overflow.
    if demodulate and x.dtype == torch.float16:
        weight_max = weight.norm(float('inf'), dim=[1,2,3], keepdim=True) # max_Ikk
        weight = weight * (1 / np.sqrt(in_channels * kh * kw) / weight_max)
        styles = styles / styles.norm(float('inf'), dim=1, keepdim=True) # max_I

    # Per-sample weights and demodulation coefficients.
    per_sample_w = None
    demod_coefs = None
    if demodulate or fused_modconv:
        per_sample_w = weight.unsqueeze(0) * styles.reshape(num_samples, 1, -1, 1, 1) # [NOIkk]
    if demodulate:
        demod_coefs = (per_sample_w.square().sum(dim=[2,3,4]) + 1e-8).rsqrt() # [NO]
        if fused_modconv:
            per_sample_w = per_sample_w * demod_coefs.reshape(num_samples, -1, 1, 1, 1) # [NOIkk]

    if fused_modconv:
        # One fused op: fold the batch into the channel axis and use a grouped convolution.
        with misc.suppress_tracer_warnings(): # this value will be treated as a constant
            num_samples = int(num_samples)
        misc.assert_shape(x, [num_samples, in_channels, None, None])
        x = x.reshape(1, -1, *x.shape[2:])
        per_sample_w = per_sample_w.reshape(-1, in_channels, kh, kw)
        x = conv2d_resample.conv2d_resample(x=x, w=per_sample_w.to(x.dtype), f=resample_filter, up=up, down=down, padding=padding, groups=num_samples, flip_weight=flip_weight)
        x = x.reshape(num_samples, -1, *x.shape[2:])
        if noise is not None:
            x = x.add_(noise)
        return x

    # Unfused: scale the activations before and after a shared-weight convolution.
    x = x * styles.to(x.dtype).reshape(num_samples, -1, 1, 1)
    x = conv2d_resample.conv2d_resample(x=x, w=weight.to(x.dtype), f=resample_filter, up=up, down=down, padding=padding, flip_weight=flip_weight)
    if demodulate:
        out_scale = demod_coefs.to(x.dtype).reshape(num_samples, -1, 1, 1)
        if noise is not None:
            x = fma.fma(x, out_scale, noise.to(x.dtype))
        else:
            x = x * out_scale
    elif noise is not None:
        x = x.add_(noise.to(x.dtype))
    return x
+ out_channels, # Number of output channels. + w_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this layer. + kernel_size = 3, # Convolution kernel size. + up = 1, # Integer upsampling factor. + activation = 'lrelu', # Activation function: 'relu', 'lrelu', etc. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = None, # Clamp the output of convolution layers to +-X, None = disable clamping. + channels_last = False, # Use channels_last format for the weights? + cfg = {}, # Additional config + ): + super().__init__() + + self.cfg = cfg + self.resolution = resolution + self.up = up + self.activation = activation + self.conv_clamp = conv_clamp + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.padding = kernel_size // 2 + self.act_gain = bias_act.activation_funcs[activation].def_gain + + self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1) + memory_format = torch.channels_last if channels_last else torch.contiguous_format + self.weight = torch.nn.Parameter(torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format)) + if self.cfg.use_noise: + self.register_buffer('noise_const', torch.randn([resolution, resolution])) + self.noise_strength = torch.nn.Parameter(torch.zeros([])) + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + + def forward(self, x, w, noise_mode='random', fused_modconv=True, gain=1): + assert noise_mode in ['random', 'const', 'none'] + in_resolution = self.resolution // self.up + misc.assert_shape(x, [None, self.weight.shape[1], in_resolution, in_resolution]) + styles = self.affine(w) + + noise = None + if self.cfg.use_noise and noise_mode == 'random': + noise = torch.randn([x.shape[0], 1, self.resolution, self.resolution], device=x.device) * self.noise_strength + if self.cfg.use_noise and noise_mode == 'const': + noise = self.noise_const * self.noise_strength + + 
@persistence.persistent_class
class ToRGBLayer(torch.nn.Module):
    """
    Style-modulated convolution (1x1 by default) without demodulation, followed by
    a bias add with optional clamping.
    """

    def __init__(self, in_channels, out_channels, w_dim, kernel_size=1, conv_clamp=None, channels_last=False):
        super().__init__()
        self.conv_clamp = conv_clamp

        # Maps a latent `w` to one modulation coefficient per input channel.
        self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1)

        if channels_last:
            memory_format = torch.channels_last
        else:
            memory_format = torch.contiguous_format
        weight_shape = [out_channels, in_channels, kernel_size, kernel_size]
        self.weight = torch.nn.Parameter(torch.randn(weight_shape).to(memory_format=memory_format))
        self.bias = torch.nn.Parameter(torch.zeros([out_channels]))

        # Applied to the styles in `forward`: 1 / sqrt(fan_in).
        fan_in = in_channels * (kernel_size ** 2)
        self.weight_gain = 1 / np.sqrt(fan_in)

    def forward(self, x, w, fused_modconv=True):
        """Modulate by `w`, convolve `x`, then add the bias (clamped to `conv_clamp` if set)."""
        styles = self.affine(w) * self.weight_gain
        out = modulated_conv2d(x=x, weight=self.weight, styles=styles, demodulate=False, fused_modconv=fused_modconv)
        return bias_act.bias_act(out, self.bias.to(out.dtype), clamp=self.conv_clamp)
+ architecture = 'skip', # Architecture: 'orig', 'skip', 'resnet'. + resample_filter = [1,3,3,1], # Low-pass filter to apply when resampling activations. + conv_clamp = 256, # Clamp the output of convolution layers to +-X, None = disable clamping. + use_fp16 = True, # Use FP16 for this block? + fp16_channels_last = False, # Use channels-last memory format with FP16? + cfg = {}, # Additional config + **layer_kwargs, # Arguments for SynthesisLayer. + ): + assert architecture in ['orig', 'skip', 'resnet'] + super().__init__() + + self.cfg = cfg + self.in_channels = in_channels + self.w_dim = w_dim + self.resolution = resolution + self.img_channels = img_channels + self.is_last = is_last + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = (use_fp16 and fp16_channels_last) + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + self.num_conv = 0 + self.num_torgb = 0 + + if in_channels == 0: + self.input = GenInput(self.cfg, out_channels, motion_v_dim=motion_v_dim) + conv1_in_channels = self.input.total_dim + else: + self.conv0 = SynthesisLayer(in_channels, out_channels, w_dim=w_dim, resolution=self.resolution, up=2, + resample_filter=resample_filter, conv_clamp=conv_clamp, channels_last=self.channels_last, + kernel_size=3, cfg=cfg, **layer_kwargs) + self.num_conv += 1 + conv1_in_channels = out_channels + + self.conv1 = SynthesisLayer(conv1_in_channels, out_channels, w_dim=w_dim, resolution=self.resolution, + conv_clamp=conv_clamp, channels_last=self.channels_last, kernel_size=3, cfg=cfg, **layer_kwargs) + self.num_conv += 1 + + if is_last or architecture == 'skip': + self.torgb = ToRGBLayer(out_channels, img_channels, w_dim=w_dim, + conv_clamp=conv_clamp, channels_last=self.channels_last) + self.num_torgb += 1 + + if in_channels != 0 and architecture == 'resnet': + self.skip = Conv2dLayer(in_channels, out_channels, kernel_size=1, bias=False, up=2, + resample_filter=resample_filter, 
channels_last=self.channels_last) + + def forward(self, x, img, ws, motion_v=None, force_fp32=False, fused_modconv=None, **layer_kwargs): + misc.assert_shape(ws, [None, self.num_conv + self.num_torgb, self.w_dim]) + w_iter = iter(ws.unbind(dim=1)) + dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + memory_format = torch.channels_last if self.channels_last and not force_fp32 else torch.contiguous_format + + if fused_modconv is None: + with misc.suppress_tracer_warnings(): # this value will be treated as a constant + fused_modconv = (not self.training) and (dtype == torch.float32 or (isinstance(x, Tensor) and int(x.shape[0]) == 1)) + + # Input. + if self.in_channels == 0: + x = self.input(ws.shape[0], motion_v=motion_v, dtype=dtype, memory_format=memory_format) + else: + misc.assert_shape(x, [None, self.in_channels, self.resolution // 2, self.resolution // 2]) + x = x.to(dtype=dtype, memory_format=memory_format) + + # Main layers. + if self.in_channels == 0: + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + elif self.architecture == 'resnet': + y = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, gain=np.sqrt(0.5), **layer_kwargs) + x = y.add_(x) + else: + conv0_w = next(w_iter) + x = self.conv0(x, conv0_w, fused_modconv=fused_modconv, **layer_kwargs) + x = self.conv1(x, next(w_iter), fused_modconv=fused_modconv, **layer_kwargs) + + # ToRGB. 
+ if img is not None: + misc.assert_shape(img, [None, self.img_channels, self.resolution // 2, self.resolution // 2]) + img = upfirdn2d.upsample2d(img, self.resample_filter) + + if self.is_last or self.architecture == 'skip': + y = self.torgb(x, next(w_iter), fused_modconv=fused_modconv) + y = y.to(dtype=torch.float32, memory_format=torch.contiguous_format) + img = img.add_(y) if img is not None else y + + assert x.dtype == dtype + assert img is None or img.dtype == torch.float32 + return x, img + +#---------------------------------------------------------------------------- + +@persistence.persistent_class +class SynthesisNetwork(torch.nn.Module): + def __init__(self, + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output image resolution. + img_channels, # Number of color channels. + channel_base = 32768, # Overall multiplier for the number of channels. + channel_max = 512, # Maximum number of channels in any layer. + num_fp16_res = 0, # Use FP16 for the N highest resolutions. + cfg = {}, # Additional config + **block_kwargs, # Arguments for SynthesisBlock. 
+ ): + assert img_resolution >= 4 and img_resolution & (img_resolution - 1) == 0 + super().__init__() + + self.w_dim = w_dim + self.cfg = cfg + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [2 ** i for i in range(2, self.img_resolution_log2 + 1)] + #channels_dict = {4: 512, 8: 512, 16: 512, 32: 512, 64: 256, 128: 128, 256: 64, 512: 64} + channels_dict = {4: 512, 8: 512, 16: 512, 32: 512, 64: 256, 128: 128, 256: 64} + #fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + if self.cfg.motion.v_dim > 0: + self.motion_encoder = MotionMappingNetwork(self.cfg) + self.motion_v_dim = self.motion_encoder.get_dim() + else: + self.motion_encoder = None + self.motion_v_dim = 0 + + self.num_ws = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res // 2] if res > 4 else 0 + out_channels = channels_dict[res] + use_fp16 = (res > 16) + is_last = (res == self.img_resolution) + block = SynthesisBlock( + in_channels, + out_channels, + w_dim=self.w_dim + (self.motion_v_dim if self.cfg.time_enc.cond_type == 'concat_w' else 0), + motion_v_dim=self.motion_v_dim, + resolution=res, + img_channels=img_channels, + is_last=is_last, + use_fp16=use_fp16, + cfg=cfg, + **block_kwargs) + self.num_ws += block.num_conv + + if is_last: + self.num_ws += block.num_torgb + setattr(self, f'b{res}', block) + + def forward(self, ws, t=None, c=None, motion_z=None, motion_v=None, **block_kwargs): + assert len(ws) == len(c) == len(t), f"Wrong shape: {ws.shape}, {c.shape}, {t.shape}" + assert t.ndim == 2, f"Wrong shape: {t.shape}" + + misc.assert_shape(ws, [None, self.num_ws, self.w_dim]) + block_ws = [] + + if self.motion_encoder is None: + ws = ws.repeat_interleave(t.shape[1], dim=0) # [batch_size * num_frames, num_ws, w_dim] + motion_v = None + else: + if motion_v is None: + motion_info = self.motion_encoder(c, t, motion_z=motion_z) # [batch_size * 
@persistence.persistent_class
class Generator(torch.nn.Module):
    """
    Full generator: a `MappingNetwork` turning (z, c) into `ws`, followed by a
    `SynthesisNetwork` rendering frames from `ws`, `c` and the timesteps `t`.
    """

    def __init__(self,
        c_dim,                      # Conditioning label (C) dimensionality.
        w_dim,                      # Intermediate latent (W) dimensionality.
        img_resolution,             # Output resolution.
        img_channels,               # Number of output color channels.
        mapping_kwargs      = {},   # Arguments for MappingNetwork.
        synthesis_kwargs    = {},   # Arguments for SynthesisNetwork.
        cfg                 = {},   # Config
    ):
        super().__init__()

        self.cfg = cfg
        # Plain-container copy of the sampling section of the config.
        sampling_cfg = OmegaConf.create({**self.cfg.sampling})
        self.sampling_dict = OmegaConf.to_container(sampling_cfg)

        self.z_dim = self.cfg.z_dim
        self.c_dim = c_dim
        self.w_dim = w_dim
        self.img_resolution = img_resolution
        self.img_channels = img_channels

        # The synthesis network is built first because the mapping network
        # needs to know how many `w` vectors it has to emit.
        self.synthesis = SynthesisNetwork(
            w_dim=w_dim,
            img_resolution=img_resolution,
            img_channels=img_channels,
            cfg=cfg,
            channel_base=16384,
            channel_max=512,
            num_fp16_res=4,
            conv_clamp=256,
            **synthesis_kwargs)
        self.num_ws = self.synthesis.num_ws
        self.mapping = MappingNetwork(
            z_dim=self.z_dim,
            c_dim=c_dim,
            w_dim=w_dim,
            num_ws=self.num_ws,
            cfg=cfg,
            **mapping_kwargs)

    def forward(self, z, c, t, truncation_psi=1, truncation_cutoff=None, **synthesis_kwargs):
        """
        Generate frames for latents `z`, labels `c` and timesteps `t`.

        `z`, `c` and `t` must have the same length and `t` must be 2-dimensional;
        extra keyword arguments are forwarded to the synthesis network.
        """
        assert len(z) == len(c) == len(t), f"Wrong shape: {z.shape}, {c.shape}, {t.shape}"
        assert t.ndim == 2, f"Wrong shape: {t.shape}"

        ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff) # [batch_size, num_ws, w_dim]
        return self.synthesis(ws, t=t, c=c, **synthesis_kwargs) # [batch_size * num_frames, c, h, w]
+ fp16_channels_last = False, # Use channels-last memory format with FP16? + freeze_layers = 0, # Freeze-D: Number of layers to freeze. + cfg = {}, # Main config. + ): + assert architecture in ['orig', 'skip', 'resnet'] + super().__init__() + + self.cfg = cfg + self.in_channels = in_channels + self.resolution = resolution + self.img_channels = img_channels + self.first_layer_idx = first_layer_idx + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = (use_fp16 and fp16_channels_last) + self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter)) + + self.num_layers = 0 + def trainable_gen(): + while True: + layer_idx = self.first_layer_idx + self.num_layers + trainable = (layer_idx >= freeze_layers) + self.num_layers += 1 + yield trainable + trainable_iter = trainable_gen() + conv0_in_channels = in_channels if in_channels > 0 else tmp_channels + + if in_channels == 0 or architecture == 'skip': + self.fromrgb = Conv2dLayer(img_channels, tmp_channels, kernel_size=1, activation=activation, + trainable=next(trainable_iter), conv_clamp=conv_clamp, channels_last=self.channels_last) + + self.conv0 = Conv2dLayer(conv0_in_channels, tmp_channels, kernel_size=3, activation=activation, + trainable=next(trainable_iter), conv_clamp=conv_clamp, channels_last=self.channels_last) + + self.conv1 = Conv2dLayer(tmp_channels, out_channels, kernel_size=3, activation=activation, down=2, + trainable=next(trainable_iter), resample_filter=resample_filter, conv_clamp=conv_clamp, channels_last=self.channels_last) + + if architecture == 'resnet': + self.skip = Conv2dLayer(conv0_in_channels, out_channels, kernel_size=1, bias=False, down=2, + trainable=next(trainable_iter), resample_filter=resample_filter, channels_last=self.channels_last) + + def forward(self, x, img, force_fp32=False): + dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + memory_format = torch.channels_last if self.channels_last and not force_fp32 
@persistence.persistent_class
class MinibatchStdLayer(torch.nn.Module):
    """
    Appends `num_channels` feature maps holding the standard deviation of the
    activations, computed across groups of samples within the minibatch.
    """

    def __init__(self, group_size, num_channels=1):
        super().__init__()
        self.group_size = group_size # None = use the entire minibatch as one group.
        self.num_channels = num_channels

    def forward(self, x):
        batch, channels, height, width = x.shape
        with misc.suppress_tracer_warnings(): # as_tensor results are registered as constants
            if self.group_size is None:
                group = batch
            else:
                group = torch.min(torch.as_tensor(self.group_size), torch.as_tensor(batch))
        num_stats = self.num_channels
        chans_per_stat = channels // num_stats

        # [GnFcHW] Split the minibatch into n groups of size G, and the channels into F groups of size c.
        stats = x.reshape(group, -1, num_stats, chans_per_stat, height, width)
        stats = stats - stats.mean(dim=0)               # [GnFcHW] Center within each group.
        stats = stats.square().mean(dim=0)              # [nFcHW]  Variance over the group.
        stats = (stats + 1e-8).sqrt()                   # [nFcHW]  Stddev over the group.
        stats = stats.mean(dim=[2,3,4])                 # [nF]     Average over channels and pixels.
        stats = stats.reshape(-1, num_stats, 1, 1)      # [nF11]   Restore the spatial dimensions.
        stats = stats.repeat(group, 1, height, width)   # [NFHW]   Replicate over group and pixels.

        return torch.cat([x, stats], dim=1)             # [N(C+F)HW] Append as new channels.
@persistence.persistent_class
class Discriminator(torch.nn.Module):
    """Video discriminator built from per-resolution blocks and a 4x4 epilogue.

    One `DiscriminatorBlock` is registered as `b{res}` for every resolution
    from `img_resolution` down to 8, followed by a `DiscriminatorEpilogue`
    stored as `b4`. At `cfg.concat_res` the feature maps of all frames of a
    video are folded into the channel axis. When more than one frame per video
    is sampled, a `TemporalDifferenceEncoder` embeds the frame times and the
    embedding is concatenated to the conditioning vector `c`.
    """

    # NOTE: the dict defaults below are only stored or unpacked with `**` in
    # this class, never mutated. The default `cfg = {}` cannot actually be
    # used, because `cfg.sampling` is read unconditionally in `__init__`.
    def __init__(self,
        c_dim,                          # Conditioning label (C) dimensionality.
        img_resolution,                 # Input resolution.
        img_channels,                   # Number of input color channels.
        architecture        = 'resnet', # Architecture: 'orig', 'skip', 'resnet'.
        channel_base        = 32768,    # Overall multiplier for the number of channels.
        channel_max         = 512,      # Maximum number of channels in any layer.
        num_fp16_res        = 0,        # Use FP16 for the N highest resolutions.
        conv_clamp          = None,     # Clamp the output of convolution layers to +-X, None = disable clamping.
        cmap_dim            = None,     # Dimensionality of mapped conditioning label, None = default.
        block_kwargs        = {},       # Arguments for DiscriminatorBlock.
        mapping_kwargs      = {},       # Arguments for MappingNetwork.
        epilogue_kwargs     = {},       # Arguments for DiscriminatorEpilogue.
        cfg                 = {},       # Additional config.
    ):
        super().__init__()

        self.cfg = cfg
        self.c_dim = c_dim
        self.img_resolution = img_resolution
        self.img_resolution_log2 = int(np.log2(img_resolution))
        self.img_channels = img_channels
        # Resolutions handled by regular blocks: img_resolution, ..., 16, 8.
        self.block_resolutions = [2 ** i for i in range(self.img_resolution_log2, 2, -1)]
        channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions + [4]}
        fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)

        if cmap_dim is None:
            cmap_dim = channels_dict[4]

        # Frame times only need to be encoded when a video has several frames.
        if self.cfg.sampling.num_frames_per_video > 1:
            self.time_encoder = TemporalDifferenceEncoder(self.cfg)
            assert self.time_encoder.get_dim() > 0
        else:
            self.time_encoder = None

        # Without labels and without time embeddings there is nothing to condition on.
        if self.c_dim == 0 and self.time_encoder is None:
            cmap_dim = 0

        common_kwargs = dict(img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp)
        total_c_dim = c_dim + (0 if self.time_encoder is None else self.time_encoder.get_dim())
        cur_layer_idx = 0

        for res in self.block_resolutions:
            in_channels = channels_dict[res] if res < img_resolution else 0
            tmp_channels = channels_dict[res]
            out_channels = channels_dict[res // 2]

            # The block feeding `concat_res` emits fewer channels per frame so
            # that the channel-wise concatenation of all frames stays bounded.
            if res // 2 == self.cfg.concat_res:
                out_channels = out_channels // self.cfg.num_frames_div_factor
            if res == self.cfg.concat_res:
                in_channels = (in_channels // self.cfg.num_frames_div_factor) * self.cfg.sampling.num_frames_per_video

            use_fp16 = (res >= fp16_resolution)
            block = DiscriminatorBlock(in_channels, tmp_channels, out_channels, resolution=res,
                first_layer_idx=cur_layer_idx, use_fp16=use_fp16, cfg=self.cfg, **block_kwargs, **common_kwargs)
            setattr(self, f'b{res}', block)
            cur_layer_idx += block.num_layers

        if self.c_dim > 0 or self.time_encoder is not None:
            self.mapping = MappingNetwork(z_dim=0, c_dim=total_c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None, **mapping_kwargs)
        self.b4 = DiscriminatorEpilogue(channels_dict[4], cmap_dim=cmap_dim, resolution=4, cfg=self.cfg, **epilogue_kwargs, **common_kwargs)

    def forward(self, img, c, t, **block_kwargs):
        """Score a batch of frames.

        Args:
            img: Frames of all videos stacked along dim 0; its length must
                equal `t.shape[0] * t.shape[1]`.
            c: Conditioning labels, one row per video.
            t: 2-D tensor of frame times, one row per video.
            **block_kwargs: Forwarded to every `DiscriminatorBlock`.

        Returns:
            A dict with key 'image_logits' holding the epilogue output with
            dim 1 squeezed out.
        """
        # Validate the rank first: `t.shape[1]` below would otherwise raise a
        # bare IndexError for a 1-D `t` instead of this explicit message.
        assert t.ndim == 2, f"Wrong shape: {t.shape}"
        assert len(img) == t.shape[0] * t.shape[1], f"Wrong shape: {img.shape}, {t.shape}"

        if self.time_encoder is not None:
            # Encoding the time distances
            t_embs = self.time_encoder(t.view(-1, self.cfg.sampling.num_frames_per_video)) # [batch_size, t_dim]

            # Concatenate `c` and time embeddings
            c = torch.cat([c, t_embs], dim=1) # [batch_size, c_dim + t_dim]
            c = (c * 0.0) if self.cfg.dummy_c else c # [batch_size, c_dim + t_dim]

        x = None
        for res in self.block_resolutions:
            block = getattr(self, f'b{res}')
            if res == self.cfg.concat_res:
                # Concatenating the frames
                x = x.view(-1, self.cfg.sampling.num_frames_per_video, *x.shape[1:]) # [batch_size, num_frames, c, h, w]
                x = x.view(x.shape[0], -1, *x.shape[3:]) # [batch_size, num_frames * c, h, w]
            x, img = block(x, img, **block_kwargs)

        cmap = None
        # `self.mapping` only exists under this same condition (see __init__).
        if self.c_dim > 0 or self.time_encoder is not None:
            assert c.shape[1] > 0
            if c.shape[1] > 0:
                cmap = self.mapping(None, c)
        x = self.b4(x, img, cmap)
        x = x.squeeze(1) # [batch_size]

        return {'image_logits': x}
def setup_snapshot_image_grid(training_set, random_seed=0):
    """Choose and load the samples displayed in the snapshot grid.

    The grid width/height are derived from a 3840x2160 canvas divided by the
    sample width/height and clamped to [7, 32] and [4, 32] respectively.
    Without labels the grid cycles through one random permutation of the
    dataset; with labels every grid row shows samples of a single label.

    Returns:
        ((gw, gh), images, labels, times) where the last three are the
        stacked 'image', 'label' and 'times' entries of the chosen samples.
    """
    rng = np.random.RandomState(random_seed)
    gw = np.clip(3840 // training_set.image_shape[2], 7, 32)
    gh = np.clip(2160 // training_set.image_shape[1], 4, 32)

    if training_set.has_labels:
        # Bucket the sample indices by their reversed, flattened raw label.
        buckets = {}
        for sample_idx in range(len(training_set)):
            raw_label = training_set.get_details(sample_idx).raw_label
            buckets.setdefault(tuple(raw_label.flat[::-1]), []).append(sample_idx)

        # Shuffle every bucket, visiting the labels in sorted order.
        ordered_labels = sorted(buckets)
        for key in ordered_labels:
            rng.shuffle(buckets[key])

        # One label per row; rotate the bucket afterwards so that the next
        # row using the same label continues where this one stopped.
        grid_indices = []
        for row in range(gh):
            key = ordered_labels[row % len(ordered_labels)]
            members = buckets[key]
            count = len(members)
            grid_indices.extend(members[col % count] for col in range(gw))
            shift = gw % count
            buckets[key] = members[shift:] + members[:shift]
    else:
        # No labels => cycle through a random permutation of all samples.
        pool = list(range(len(training_set)))
        rng.shuffle(pool)
        grid_indices = [pool[slot % len(pool)] for slot in range(gw * gh)]

    # Load the chosen samples once and stack each field separately.
    samples = [training_set[i] for i in grid_indices]
    images, labels, t = (np.stack([s[field] for s in samples]) for field in ('image', 'label', 'times'))

    return (gw, gh), images, labels, t
+ ema_rampup = None, # EMA ramp-up coefficient. + G_reg_interval = 4, # How often to perform regularization for G? None = disable lazy regularization. + D_reg_interval = 16, # How often to perform regularization for D? None = disable lazy regularization. + augment_p = 0, # Initial value of augmentation probability. + ada_target = None, # ADA target value. None = fixed p. + ada_interval = 4, # How often to perform ADA adjustment? + ada_kimg = 500, # ADA adjustment speed, measured in how many kimg it takes for p to increase/decrease by one unit. + total_kimg = 25000, # Total length of the training, measured in thousands of real images. + kimg_per_tick = 5, # Progress snapshot interval. + image_snapshot_ticks = 50, # How often to save image snapshots? None = disable. + network_snapshot_ticks = 50, # How often to save network snapshots? None = disable. + resume_pkl = None, # Network pickle to resume training from. + resume_whole_state = False, # Should we resume the whole state or only the G/D/G_ema checkpoints? + cudnn_benchmark = True, # Enable torch.backends.cudnn.benchmark? + allow_tf32 = False, # Enable torch.backends.cuda.matmul.allow_tf32 and torch.backends.cudnn.allow_tf32? + abort_fn = None, # Callback function for determining whether to abort training. Must return consistent results across ranks. + progress_fn = None, # Callback function for updating training progress. Called for all ranks. +): + # Initialize. + experiment_name = os.path.basename(os.path.dirname(run_dir)) + start_time = time.time() + device = torch.device('cuda', rank) + random.seed(random_seed * num_gpus + rank) + np.random.seed(random_seed * num_gpus + rank) + torch.manual_seed(random_seed * num_gpus + rank) + torch.backends.cudnn.benchmark = cudnn_benchmark # Improves training speed. 
+ torch.backends.cuda.matmul.allow_tf32 = allow_tf32 # Allow PyTorch to internally use tf32 for matmul + torch.backends.cudnn.allow_tf32 = allow_tf32 # Allow PyTorch to internally use tf32 for convolutions + conv2d_gradfix.enabled = True # Improves training speed. + grid_sample_gradfix.enabled = True # Avoids errors with the augmentation pipe. + + # Load training set. + if rank == 0: + print('Loading training set...') + training_set = dnnlib.util.construct_class_by_name(**training_set_kwargs) # subclass of training.dataset.Dataset + training_set_sampler = misc.InfiniteSampler(dataset=training_set, rank=rank, num_replicas=num_gpus, seed=random_seed) + training_set_iterator = iter(torch.utils.data.DataLoader(dataset=training_set, sampler=training_set_sampler, batch_size=batch_size//num_gpus, **data_loader_kwargs)) + if rank == 0: + print() + print('Num videos: ', len(training_set)) + print('Image shape:', training_set.image_shape) + print('Label shape:', training_set.label_shape) + print() + + # Construct networks. + if rank == 0: + print('Constructing networks...') + common_kwargs = dict(c_dim=training_set.label_dim, img_resolution=training_set.resolution, img_channels=training_set.num_channels) + G = dnnlib.util.construct_class_by_name(**G_kwargs, **common_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module + D = dnnlib.util.construct_class_by_name(**D_kwargs, **common_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module + G_ema = copy.deepcopy(G).eval() + + # Resume from existing pickle. 
+ if (resume_pkl is not None): + if rank == 0: + print(f'Resuming from "{resume_pkl}"') + with dnnlib.util.open_url(resume_pkl) as f: + resume_data = src.legacy.load_network_pkl(f) + + if rank == 0: + for name, module in [('G', G), ('D', D), ('G_ema', G_ema)]: + misc.copy_params_and_buffers(resume_data[name], module, require_all=False) + else: + resume_data = None + + cur_nimg = 0 if not resume_whole_state else resume_data['stats']['cur_nimg'] + cur_tick = 0 if not resume_whole_state else resume_data['stats']['cur_tick'] + batch_idx = 0 if not resume_whole_state else resume_data['stats']['batch_idx'] + tick_start_nimg = cur_nimg + + # Print network summary tables. + if rank == 0 and not resume_whole_state: + z = torch.empty([batch_gpu, G.z_dim], device=device) # [bf, z_dim] + c = torch.empty([batch_gpu, G.c_dim], device=device) # [b, c_dim] + t = torch.zeros([batch_gpu, cfg.sampling.num_frames_per_video], device=device).long() # [b, f] + img = misc.print_module_summary(G, [z, c, t]) # [bf, c, h, w] + misc.print_module_summary(D, [img, c, t]) + + # Setup augmentation. + if rank == 0: + print('Setting up augmentation...') + + if (augment_kwargs is not None) and (augment_p > 0 or ada_target is not None): + augment_pipe = dnnlib.util.construct_class_by_name(**augment_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module + augment_pipe.p.copy_(torch.as_tensor(augment_p)) + + if ada_target is not None: + ada_stats = training_stats.Collector(regex='Loss/signs/real') + else: + ada_stats = None + + if resume_whole_state: + misc.copy_params_and_buffers(resume_data['augment_pipe'], augment_pipe, require_all=False) + else: + augment_pipe = None + ada_stats = None + + # Distribute across GPUs. 
+ if rank == 0: + print(f'Distributing across {num_gpus} GPUs...') + ddp_modules = dict() + modules = [ + ('G_mapping', G.mapping), + ('G_synthesis', G.synthesis), + ('D', D), + (None, G_ema), + ('augment_pipe', augment_pipe), + ] + if cfg.model.loss_kwargs.motion_reg.coef > 0.0: + modules.append(('G_motion_encoder', G.synthesis.motion_encoder)) + + for name, module in modules: + if (num_gpus > 1) and (module is not None) and len(list(module.parameters())) != 0: + module.requires_grad_(True) + module = torch.nn.parallel.DistributedDataParallel(module, device_ids=[device], broadcast_buffers=False) + module.requires_grad_(False) + if name is not None: + ddp_modules[name] = module + + # Setup training phases. + if rank == 0: + print('Setting up training phases...') + loss = dnnlib.util.construct_class_by_name(device=device, **ddp_modules, **loss_kwargs) # subclass of training.loss.Loss + phases = [] + for name, module, opt_kwargs, reg_interval in [('G', G, G_opt_kwargs, G_reg_interval), ('D', D, D_opt_kwargs, D_reg_interval)]: + if reg_interval is None: + params = module.params_with_lr(opt_kwargs.lr) if hasattr(module, 'params_with_lr') else module.parameters() + opt = dnnlib.util.construct_class_by_name(params=params, **opt_kwargs) # subclass of torch.optim.Optimizer + phases += [dnnlib.EasyDict(name=name+'both', module=module, opt=opt, interval=1)] + else: # Lazy regularization. 
+ mb_ratio = reg_interval / (reg_interval + 1) + opt_kwargs = dnnlib.EasyDict(opt_kwargs) + opt_kwargs.lr = opt_kwargs.lr * mb_ratio + opt_kwargs.betas = [beta ** mb_ratio for beta in opt_kwargs.betas] + params = module.params_with_lr(opt_kwargs.lr) if hasattr(module, 'params_with_lr') else module.parameters() + opt = dnnlib.util.construct_class_by_name(params=params, **opt_kwargs) # subclass of torch.optim.Optimizer + phases += [dnnlib.EasyDict(name=name+'main', module=module, opt=opt, interval=1)] + phases += [dnnlib.EasyDict(name=name+'reg', module=module, opt=opt, interval=reg_interval)] + + for phase in phases: + if rank == 0: + phase.start_event = torch.cuda.Event(enable_timing=True) + phase.end_event = torch.cuda.Event(enable_timing=True) + else: + phase.start_event = None + phase.end_event = None + phase.start_event_recorded = False + phase.end_event_recorded = False + + # Ok, we need to extract G_opt and D_opt back from phases since we want to save them... + G_opt = next(p.opt for p in phases if p.name in {'Gboth', 'Gmain', 'Greg'}) + D_opt = next(p.opt for p in phases if p.name in {'Dboth', 'Dmain', 'Dreg'}) + + if resume_whole_state: + G_opt.load_state_dict(resume_data['G_opt'].state_dict()) + D_opt.load_state_dict(resume_data['D_opt'].state_dict()) + + # Export sample images. 
+ if rank == 0: + if not resume_whole_state: + vis = dnnlib.EasyDict(num_videos={128: 36, 256: 25, 512: 9, 1024: 1}[training_set.resolution]) + print('Exporting sample images...') + vis.grid_size, images, vis.labels, vis.frames_idx = setup_snapshot_image_grid(training_set=training_set) + save_image_grid(images[:, 0], os.path.join(run_dir, 'reals.jpg'), drange=[0,255], grid_size=vis.grid_size) + vis.grid_z = torch.randn([vis.labels.shape[0], G.z_dim], device=device).split(batch_gpu) # (num_batches, [batch_size, z_dim]) + vis.grid_c = torch.from_numpy(vis.labels).to(device).split(batch_gpu) # (num_batches, [batch_size, c_dim]) + vis.grid_t = torch.from_numpy(vis.frames_idx).to(device).split(batch_gpu) # (num_batches, [batch_size, num_frames]) + images = torch.cat([G_ema(z=z, c=c, t=t[:, [0]], noise_mode='const').cpu() for z, c, t in zip(vis.grid_z, vis.grid_c, vis.grid_t)]).numpy() + save_image_grid(images, os.path.join(run_dir, 'fakes_init.jpg'), drange=[-1,1], grid_size=vis.grid_size) + + # Generating data for videos + assert len(vis.labels) >= vis.num_videos + vis.video_len = {128: 150, 256: 150, 512: 32, 1024: 4}[training_set.resolution] + vis.vgrid_z = torch.randn(vis.num_videos, G_ema.z_dim, device=device) # [batch_size, z_dim] + vis.vgrid_c = torch.from_numpy(vis.labels)[:vis.num_videos].to(device) # [batch_size, c_dim] + vis.ts = torch.arange(vis.video_len, device=device).float().unsqueeze(0).repeat(vis.num_videos, 1) # [batch_size, video_len] + else: + vis = dnnlib.EasyDict(**resume_data['vis']) + for k in vis: + if isinstance(vis[k], torch.Tensor): + vis[k] = vis[k].to(device) + images = torch.cat([G_ema(z=z, c=c, t=t[:, [0]], noise_mode='const').cpu() for z, c, t in zip(vis.grid_z, vis.grid_c, vis.grid_t)]).numpy() + save_image_grid(images, os.path.join(run_dir, f'fakes_resume_{cur_nimg}.jpg'), drange=[-1,1], grid_size=vis.grid_size) + else: + vis = dnnlib.EasyDict() + + # Initialize logs. 
+ if rank == 0: + print('Initializing logs...') + stats_collector = training_stats.Collector(regex='.*') + stats_metrics = dict() + stats_jsonl = None + stats_tfevents = None + if rank == 0: + stats_jsonl = open(os.path.join(run_dir, 'stats.jsonl'), 'wt') + try: + stats_tfevents = SummaryWriter(run_dir) + if not resume_whole_state: + config_yaml = OmegaConf.to_yaml(cfg) + stats_tfevents.add_text(f'config', text_to_markdown(config_yaml), global_step=0, walltime=time.time()) + except ImportError as err: + print('Skipping tfevents export:', err) + + # Train. + if rank == 0: + print(f'Training for {total_kimg} kimg...') + print() + tick_start_time = time.time() + maintenance_time = tick_start_time - start_time + if progress_fn is not None: + progress_fn(cur_nimg, total_kimg) + + # Convert to bool since hydra has a very slow access time... + use_fractional_t_for_G = bool(cfg.model.generator.motion.use_fractional_t) + + while True: + # Fetch training data. + with torch.autograd.profiler.record_function('data_fetch'): + batch = next(training_set_iterator) + phase_real_img, phase_real_c, phase_real_t, phase_real_l = batch['image'], batch['label'], batch['times'], batch['video_len'] + phase_real_img = (phase_real_img.to(device).to(torch.float32) / 127.5 - 1).split(batch_gpu) + phase_real_c = phase_real_c.to(device).split(batch_gpu) # [batch_gpu, batch_size, c_dim] + phase_real_t = phase_real_t.to(device).split(batch_gpu) # [batch_gpu, batch_size, c_dim] + all_gen_z = torch.randn([len(phases) * batch_size, G.z_dim], device=device) + all_gen_z = [phase_gen_z.split(batch_gpu) for phase_gen_z in all_gen_z.split(batch_size)] + + gen_cond_sample_idx = [np.random.randint(len(training_set)) for _ in range(len(phases) * batch_size)] + all_gen_c = [training_set.get_label(i) for i in gen_cond_sample_idx] + all_gen_c = torch.from_numpy(np.stack(all_gen_c)).pin_memory().to(device) + all_gen_c = [phase_gen_c.split(batch_gpu) for phase_gen_c in all_gen_c.split(batch_size)] + all_gen_l = 
[min(training_set.get_video_len(i), G.sampling_dict['max_num_frames']) for i in gen_cond_sample_idx] + all_gen_t = [sample_frames(G.sampling_dict, use_fractional_t=use_fractional_t_for_G, total_video_len=l) for l in all_gen_l] + all_gen_t = torch.from_numpy(np.stack(all_gen_t)).pin_memory().to(device) + all_gen_t = [phase_gen_t.split(batch_gpu) for phase_gen_t in all_gen_t.split(batch_size)] + + # Execute training phases. + for phase, phase_gen_z, phase_gen_c, phase_gen_t in zip(phases, all_gen_z, all_gen_c, all_gen_t): + if batch_idx % phase.interval != 0: + continue + + # Initialize gradient accumulation. + if phase.start_event is not None: + phase.start_event.record(torch.cuda.current_stream(device)) + phase.start_event_recorded = True + phase.opt.zero_grad(set_to_none=True) + phase.module.requires_grad_(True) + phase.module.train() + + # Accumulate gradients over multiple rounds. + curr_data = zip(phase_real_img, phase_real_c, phase_real_t, phase_gen_z, phase_gen_c, phase_gen_t) + for round_idx, (real_img, real_c, real_t, gen_z, gen_c, gen_t) in enumerate(curr_data): + sync = (round_idx == batch_size // (batch_gpu * num_gpus) - 1) + gain = phase.interval + + loss.accumulate_gradients( + phase=phase.name, + real_img=real_img, + real_c=real_c, + real_t=real_t, + gen_z=gen_z, + gen_c=gen_c, + gen_t=gen_t, + sync=sync, + gain=gain) + + # Update weights. + phase.module.requires_grad_(False) + with torch.autograd.profiler.record_function(phase.name + '_opt'): + for param in phase.module.parameters(): + if param.grad is not None: + misc.nan_to_num(param.grad, nan=0, posinf=1e5, neginf=-1e5, out=param.grad) + phase.opt.step() + if phase.end_event is not None: + phase.end_event.record(torch.cuda.current_stream(device)) + phase.end_event_recorded = True + + # Update G_ema. 
+ with torch.autograd.profiler.record_function('Gema'): + ema_nimg = ema_kimg * 1000 + if ema_rampup is not None: + ema_nimg = min(ema_nimg, cur_nimg * ema_rampup) + ema_beta = 0.5 ** (batch_size / max(ema_nimg, 1e-8)) + for p_ema, p in zip(G_ema.parameters(), G.parameters()): + p_ema.copy_(p.lerp(p_ema, ema_beta)) + for b_ema, b in zip(G_ema.buffers(), G.buffers()): + b_ema.copy_(b) + + # Update state. + cur_nimg += batch_size * cfg.sampling.num_frames_per_video + batch_idx += 1 + + # Execute ADA heuristic. + if (ada_stats is not None) and (batch_idx % ada_interval == 0): + ada_stats.update() + adjust = np.sign(ada_stats['Loss/signs/real'] - ada_target) * (batch_size * ada_interval) / (ada_kimg * 1000) + augment_pipe.p.copy_((augment_pipe.p + adjust).max(misc.constant(0, device=device))) + + # Perform maintenance tasks once per tick. + done = (cur_nimg >= total_kimg * 1000) + if (not done) and (cur_tick != 0) and (cur_nimg < tick_start_nimg + kimg_per_tick * 1000): + continue + + # Print status line, accumulating the same information in stats_collector. 
+ tick_end_time = time.time() + fields = [] + fields += [f"tick {training_stats.report0('Progress/tick', cur_tick):<5d}"] + fields += [f"kimg {training_stats.report0('Progress/kimg', cur_nimg / 1e3):<8.1f}"] + fields += [f"time {dnnlib.util.format_time(training_stats.report0('Timing/total_sec', tick_end_time - start_time)):<12s}"] + fields += [f"sec/tick {training_stats.report0('Timing/sec_per_tick', tick_end_time - tick_start_time):<7.1f}"] + fields += [f"sec/kimg {training_stats.report0('Timing/sec_per_kimg', (tick_end_time - tick_start_time) / (cur_nimg - tick_start_nimg) * 1e3):<7.2f}"] + fields += [f"maintenance {training_stats.report0('Timing/maintenance_sec', maintenance_time):<6.1f}"] + #fields += [f"cpumem {training_stats.report0('Resources/cpu_mem_gb', psutil.Process(os.getpid()).memory_info().rss / 2**30):<6.2f}"] + fields += [f"gpumem {training_stats.report0('Resources/peak_gpu_mem_gb', torch.cuda.max_memory_allocated(device) / 2**30):<6.2f}"] + torch.cuda.reset_peak_memory_stats() + fields += [f"augment {training_stats.report0('Progress/augment', float(augment_pipe.p.cpu()) if augment_pipe is not None else 0):.3f}"] + training_stats.report0('Timing/total_hours', (tick_end_time - start_time) / (60 * 60)) + training_stats.report0('Timing/total_days', (tick_end_time - start_time) / (24 * 60 * 60)) + if rank == 0: + print(' '.join(fields)) + + # Check for abort. + if (not done) and (abort_fn is not None) and abort_fn(): + done = True + if rank == 0: + print() + print('Aborting...') + + # Save image snapshot. 
+ if (rank == 0) and (image_snapshot_ticks is not None) and (done or cur_tick % image_snapshot_ticks == 0): + images = torch.cat([G_ema(z=z, c=c, t=t[:, [0]], noise_mode='const').cpu() for z, c, t in zip(vis.grid_z, vis.grid_c, vis.grid_t)]).numpy() + save_image_grid(images, os.path.join(run_dir, f'fakes{cur_nimg//1000:06d}.jpg'), drange=[-1,1], grid_size=vis.grid_size) + + # Saving videos + videos_diff_motion = generate_videos(G_ema, vis.vgrid_z, vis.vgrid_c, vis.ts, as_grids=True) # [video_len, 3, h, w] + if not G_ema.synthesis.motion_encoder is None: + with torch.no_grad(): + motion_z = G_ema.synthesis.motion_encoder(c=vis.vgrid_c[[0]], t=vis.ts[[0]])['motion_z'] # [1, *motion_dims] + motion_z = motion_z.repeat_interleave(len(vis.ts), dim=0) # [batch_size, *motion_dims] + videos_same_motion = generate_videos(G_ema, vis.vgrid_z, vis.vgrid_c, vis.ts, motion_z=motion_z, as_grids=True) # [video_len, 3, h, w] + + assert videos_diff_motion.shape == videos_same_motion.shape, f"Wrong shape: {videos_diff_motion.shape} != {videos_same_motion.shape}" + + pad_size = 64 + videos_to_save = torch.cat([ + videos_diff_motion, + torch.ones_like(videos_diff_motion[:, :, :, :pad_size]), # Some padding between the videos + videos_same_motion, + ], dim=3) # [video_len, 3, h, w + pad_size + w] + else: + videos_to_save = videos_diff_motion + + videos_to_save = (videos_to_save * 255).to(torch.uint8).permute(0, 2, 3, 1) # [T, H, W, C] + torchvision.io.write_video(os.path.join(run_dir, f'{experiment_name}_videos_{cur_nimg//1000:06d}.mp4'), videos_to_save, fps=cfg.dataset.fps, video_codec='h264', options={'crf': '10'}) + # save_video_frames_as_mp4(videos_to_save, cfg.dataset.fps, os.path.join(run_dir, f'{experiment_name}_videos_{cur_nimg//1000:06d}.mp4')) + # if not stats_tfevents is None: + # stats_tfevents.add_video('videos', videos_to_save.unsqueeze(0), global_step=int(cur_nimg / 1e3), walltime=time.time() - start_time) + + # Save network snapshot. 
+ snapshot_pkl = None + snapshot_data = None + snapshot_modules = [ + ('G', G), + ('D', D), + ('G_ema', G_ema), + ('augment_pipe', augment_pipe), + ('G_opt', G_opt), + ('D_opt', D_opt), + ('vis', {k: (v.to('cpu') if isinstance(v, torch.Tensor) else v) for k, v in vis.items()}), + ('stats', {'cur_nimg': cur_nimg, 'cur_tick': cur_tick, 'batch_idx': batch_idx}), + ] + if (network_snapshot_ticks is not None) and (done or cur_tick % network_snapshot_ticks == 0): + snapshot_data = dict(training_set_kwargs=dict(training_set_kwargs)) + DDP_CONSISTENCY_IGNORE_REGEX = r'.*\.(w_avg|p|rnn\..*|embeds.*\.weight|num_batches_tracked|running_mean|running_var)' + for name, module in snapshot_modules: + if module is not None: + if isinstance(module, torch.nn.Module): + if num_gpus > 1: + misc.check_ddp_consistency(module, ignore_regex=DDP_CONSISTENCY_IGNORE_REGEX) + module = copy.deepcopy(module).eval().requires_grad_(False).cpu() + else: + module = copy.deepcopy(module) + snapshot_data[name] = module + del module # conserve memory + snapshot_pkl = os.path.join(run_dir, f'network-snapshot-{cur_nimg//1000:06d}.pkl') + if rank == 0: + with open(snapshot_pkl, 'wb') as f: + pickle.dump(snapshot_data, f) + + # Evaluate metrics. + if (snapshot_data is not None) and (len(metrics) > 0): + if rank == 0: + print(f'Evaluating metrics for {experiment_name} ...') + for metric in metrics: + result_dict = metric_main.calc_metric( + metric=metric, + G=snapshot_data['G_ema'], + dataset_kwargs=training_set_kwargs, + num_gpus=num_gpus, + rank=rank, + device=device) + if rank == 0: + metric_main.report_metric(result_dict, run_dir=run_dir, snapshot_pkl=snapshot_pkl) + stats_metrics.update(result_dict.results) + del snapshot_data # conserve memory + + # Collect statistics. 
+ for phase in phases: + value = [] + if (phase.start_event is not None) and (phase.end_event is not None) and phase.start_event_recorded and phase.end_event_recorded: + phase.end_event.synchronize() + value = phase.start_event.elapsed_time(phase.end_event) + training_stats.report0('Timing/' + phase.name, value) + stats_collector.update() + stats_dict = stats_collector.as_dict() + + # Update logs. + timestamp = time.time() + if stats_jsonl is not None: + fields = dict(stats_dict, timestamp=timestamp) + stats_jsonl.write(json.dumps(fields) + '\n') + stats_jsonl.flush() + if stats_tfevents is not None: + global_step = int(cur_nimg / 1e3) + for name, value in stats_dict.items(): + stats_tfevents.add_scalar(name, value.mean, global_step=global_step, walltime=timestamp) + for name, value in stats_metrics.items(): + stats_tfevents.add_scalar(f'Metrics/{name}', value, global_step=global_step, walltime=timestamp) + stats_tfevents.flush() + if progress_fn is not None: + progress_fn(cur_nimg // 1000, total_kimg) + + # Update state. + cur_tick += 1 + tick_start_nimg = cur_nimg + tick_start_time = time.time() + maintenance_time = tick_start_time - tick_end_time + if done: + break + + # Done. + if rank == 0: + print() + print('Exiting...') + +#---------------------------------------------------------------------------- + +def text_to_markdown(text: str) -> str: + """ + Converts an arbitrarily text into a text that would be well-displayed in TensorBoard. + TensorBoard uses markdown to render the text that's why it strips spaces and line breaks. + This function fixes that. + """ + text = text.replace(' ', '  ') # Because markdown does not support text indentation normally... 
def cosine_beta_schedule(timesteps, start=0.0001, end=0.02):
    """Return a cosine-shaped noise schedule rising from `start` to `end`.

    Entry k equals
    `start + 0.5 * (end - start) * (1 + cos(pi * (1 - k / (timesteps - 1))))`,
    so the first entry is `start` and the last is `end`.

    Args:
        timesteps: Number of diffusion steps. Values <= 0 give an empty tensor.
        start: Beta of the first step.
        end: Beta of the last step.

    Returns:
        1-D float tensor (default dtype of `torch.Tensor`) of length `timesteps`.
    """
    if timesteps == 1:
        # A single step has no span to interpolate over; the general formula
        # would divide by zero. Use the value the schedule starts from.
        return torch.Tensor([start])

    # Hoisted out of the loop, and no longer named `T` so it cannot be
    # confused with the module-level step count of the same name.
    last_step = timesteps - 1
    betas = [
        start + 0.5 * (end - start) * (1 + np.cos((i / last_step) * np.pi))
        for i in reversed(range(timesteps))
    ]
    return torch.Tensor(betas)
class VideoFrameDataset(data.Dataset):
    """Dataset of pre-computed latent tensors, one entry per video file.

    The video file names found in `args.dataset` are used only as keys: for a
    video `<stem>.<ext>` the tensors `<stem>_image.pt` (conditioning image)
    and `<stem>.pt` (video) are loaded from `args.dataset_vae`.
    """

    def __init__(self):
        super().__init__()
        self.path = args.dataset
        self.vae_path = args.dataset_vae
        # os.listdir returns entries in arbitrary order. Sort them so that an
        # index refers to the same video in every process, which the
        # distributed sampler used with this dataset relies on.
        self.video_names = sorted(os.listdir(self.path))
        self.transform = get_transform()

    def __getitem__(self, index):
        """Return {'image': ..., 'video': ...} loaded onto the CPU."""
        # NOTE(review): dropping the last four characters assumes a
        # three-letter extension such as ".mp4" -- confirm against the script
        # that wrote the tensors before changing this.
        stem = self.video_names[index][:-4]
        inputImage = torch.load(osp.join(self.vae_path, stem + "_image.pt"), map_location='cpu')
        restOfVideo = torch.load(osp.join(self.vae_path, stem + ".pt"), map_location='cpu')
        return {'image': inputImage, 'video': restOfVideo}

    def __len__(self):
        return len(self.video_names)
output_device=args.local_rank, + broadcast_buffers=False) + +adapter = nn.parallel.DistributedDataParallel( + adapter, + device_ids=[args.local_rank], + output_device=args.local_rank, + broadcast_buffers=False) + +def data_sampler(dataset, shuffle, distributed): + if distributed: + return data.distributed.DistributedSampler(dataset) + if shuffle: + return data.RandomSampler(dataset) + else: + return data.SequentialSampler(dataset) + +train_dataset = VideoFrameDataset() +sampler = data_sampler(train_dataset, shuffle=True, distributed=True) +batch = 2 +train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=batch, + sampler=sampler, + num_workers=1, + drop_last=True) + +def save_video_frames_as_mp4(frames, fps, save_path): + frame_h, frame_w = frames[0].shape[2:] + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') + video = cv2.VideoWriter(save_path, fourcc, fps, (frame_w, frame_h)) + frames = frames[0] + for frame in frames: + frame = np.array(TVF.to_pil_image(frame)) + video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) + video.release() + +mseloss = torch.nn.MSELoss(reduction="mean") + +def get_loss(input_image, latent_video): + timesteps = torch.randint(0, T, (batch,), device=device) + timesteps = timesteps.long() + initial_frame_latent_video = latent_video[:, 0:1].clone().detach() # [b, f, c, h, w] + x_noisy, noise = forward_diffusion_sample(latent_video, timesteps) + x_noisy[:, 0:1] = initial_frame_latent_video + noise[:, 0:1] = torch.zeros(initial_frame_latent_video.shape) + x_noisy = x_noisy.permute(0, 2, 1, 3, 4) + + inputs = clip_processor(images=list(input_image), return_tensors="pt") + inputs = {k: v.to(device) for k, v in inputs.items()} + clip_hidden_states = clip_encoder(**inputs).last_hidden_state.to(device) + vae_hidden_states = vae.encode(input_image).latent_dist.sample() * 0.18215 + encoder_hidden_states = adapter(clip_hidden_states, vae_hidden_states) + + noise_pred = Net(x_noisy, encoder_hidden_states, 
timestep=timesteps.float()) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4) + loss = 0.0 + for i in range(frameLimit): + loss += mseloss(noise_pred[:, i, :, :, :], noise[:, i, :, :, :]) + return loss + +@torch.no_grad() +def VAE_decode(video): + decoded_video = None + for i in range(video.shape[1]): + image = video[:, i, :, :, :] + image = 1 / 0.18215 * image + if i == 0: + image = vae.decode(image).sample + image = (image / 2 + 0.5).clamp(0, 1) + decoded_video = image.unsqueeze(1) + else: + image = vae.decode(image).sample + image = (image / 2 + 0.5).clamp(0, 1) + decoded_video = torch.cat([decoded_video, image.unsqueeze(1)], 1) + return decoded_video + + +@torch.no_grad() +def sample_timestep(x, image, t): + betas_t = get_index_from_list(betas, t, x.shape) + sqrt_one_minus_alphas_cumprod_t = get_index_from_list( + sqrt_one_minus_alphas_cumprod, t, x.shape + ) + sqrt_recip_alphas_t = get_index_from_list(sqrt_recip_alphas, t, x.shape) + + # Call model (current image - noise prediction) + with torch.cuda.amp.autocast(): + sample_output = Net(x.permute(0, 2, 1, 3, 4), image, timestep=t.float()) + sample_output = sample_output.permute(0, 2, 1, 3, 4) + model_mean = sqrt_recip_alphas_t * ( + x - betas_t * sample_output / sqrt_one_minus_alphas_cumprod_t + ) + if t.item() == 0: + return model_mean + else: + noise = torch.randn_like(x) + posterior_variance_t = get_index_from_list(posterior_variance, t, x.shape) + return model_mean + torch.sqrt(posterior_variance_t) * noise + + +@torch.no_grad() +def get_image_embedding(input_image): + inputs = clip_processor(images=list(input_image), return_tensors="pt") + inputs = {k: v.to(device) for k, v in inputs.items()} + clip_hidden_states = clip_encoder(**inputs).last_hidden_state.to(device) + vae_hidden_states = vae.encode(input_image).latent_dist.sample() * 0.18215 + encoder_hidden_states = adapter(clip_hidden_states, vae_hidden_states) + return encoder_hidden_states + +if not os.path.exists(args.output_dir): + 
os.makedirs(args.output_dir) + +if not os.path.exists('training_sample'): + os.makedirs('training_sample') + +step = 0 +for epoch in range(2500): + Net.train() + adapter.train() + for data in train_dataloader: + step += 1 + vae_video = data['video'].to(device=device) # [b, f, c, h, w] + image = data['image'].to(device=device) + + loss = get_loss(input_image=image, latent_video=vae_video) + optimizerG.zero_grad() + loss.backward() + optimizerG.step() + + if get_rank() == 0 and epoch % 40 == 0: + writer.add_scalar('loss', loss, step) + if get_rank() == 0 and epoch % 100 == 0: + torch.save( + { + 'net': Net.module.state_dict(), + 'adapter': adapter.module.state_dict(), + 'opt': optimizerG.state_dict() + }, args.output_dir + "/model_" + str(epoch) + "_" + str(step) + ".pth") + if get_rank() == 0 and epoch % 100 == 0: + noise_video = torch.randn([1, frameLimit, 4, 80, 64]).to(device) + encoder_hidden_states = get_image_embedding(input_image=image[0].unsqueeze(0)) + encoded_image = VAE_encode(image[0].unsqueeze(0)) + noise_video[:, 0:1] = encoded_image + with torch.no_grad(): + for i in range(0, T)[::-1]: + t = torch.full((1,), i, device=device).long() + noise_video = sample_timestep(noise_video, encoder_hidden_states, t) + noise_video[:, 0:1] = encoded_image + final_video = VAE_decode(noise_video) + writer.add_image('input image', image[0], step) + writer.add_video('video', final_video, step) + save_video_frames_as_mp4(final_video, 25, "training_sample/video"+str(epoch)+".mp4") + +if get_rank() == 0: + torch.save({ + 'net': Net.module.state_dict(), + 'adapter': adapter.module.state_dict() + }, args.output_dir + "/vae_clip_e100.pth") \ No newline at end of file