Spaces:

lucky0146
/

CodeFormer

Runtime error

App Files Community

lucky0146 committed on Mar 10

Commit

cf32411

verified ·

1 Parent(s): 862663e

Update codeformer_arch.py

Browse files

Files changed (1) hide show

codeformer_arch.py +44 -88

codeformer_arch.py CHANGED Viewed

@@ -5,18 +5,12 @@ from torch import nn, Tensor
 import torch.nn.functional as F
 from typing import Optional, List
-from vqgan_arch import *
 from basicsr.utils import get_root_logger
 from basicsr.utils.registry import ARCH_REGISTRY
 def calc_mean_std(feat, eps=1e-5):
-    """Calculate mean and std for adaptive_instance_normalization.
-    Args:
-        feat (Tensor): 4D tensor.
-        eps (float): A small value added to the variance to avoid
-            divide-by-zero. Default: 1e-5.
-    """
     size = feat.size()
     assert len(size) == 4, 'The input feature should be 4D tensor.'
     b, c = size[:2]
@@ -25,30 +19,15 @@ def calc_mean_std(feat, eps=1e-5):
     feat_mean = feat.view(b, c, -1).mean(dim=2).view(b, c, 1, 1)
     return feat_mean, feat_std
 def adaptive_instance_normalization(content_feat, style_feat):
-    """Adaptive instance normalization.
-    Adjust the reference features to have the similar color and illuminations
-    as those in the degradate features.
-    Args:
-        content_feat (Tensor): The reference feature.
-        style_feat (Tensor): The degradate features.
-    """
     size = content_feat.size()
     style_mean, style_std = calc_mean_std(style_feat)
     content_mean, content_std = calc_mean_std(content_feat)
     normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
     return normalized_feat * style_std.expand(size) + style_mean.expand(size)
 class PositionEmbeddingSine(nn.Module):
-    """
-    This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
-    """
     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
         super().__init__()
         self.num_pos_feats = num_pos_feats
@@ -93,14 +72,12 @@ def _get_activation_fn(activation):
         return F.gelu
     if activation == "glu":
         return F.glu
-    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
 class TransformerSALayer(nn.Module):
     def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
         super().__init__()
         self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
-        # Implementation of Feedforward model - MLP
         self.linear1 = nn.Linear(embed_dim, dim_mlp)
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Linear(dim_mlp, embed_dim)
@@ -119,15 +96,11 @@ class TransformerSALayer(nn.Module):
                 tgt_mask: Optional[Tensor] = None,
                 tgt_key_padding_mask: Optional[Tensor] = None,
                 query_pos: Optional[Tensor] = None):
-        # self attention
         tgt2 = self.norm1(tgt)
         q = k = self.with_pos_embed(tgt2, query_pos)
         tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                               key_padding_mask=tgt_key_padding_mask)[0]
         tgt = tgt + self.dropout1(tgt2)
-        # ffn
         tgt2 = self.norm2(tgt)
         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
         tgt = tgt + self.dropout2(tgt2)
@@ -136,17 +109,21 @@ class TransformerSALayer(nn.Module):
 class Fuse_sft_block(nn.Module):
     def __init__(self, in_ch, out_ch):
         super().__init__()
-        self.encode_enc = ResBlock(2*in_ch, out_ch)
         self.scale = nn.Sequential(
-                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
-                    nn.LeakyReLU(0.2, True),
-                    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
         self.shift = nn.Sequential(
-                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
-                    nn.LeakyReLU(0.2, True),
-                    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
     def forward(self, enc_feat, dec_feat, w=1):
         enc_feat = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
@@ -156,18 +133,18 @@ class Fuse_sft_block(nn.Module):
         out = dec_feat + residual
         return out
 @ARCH_REGISTRY.register()
 class CodeFormer(VQAutoEncoder):
     def __init__(self, dim_embd=512, n_head=8, n_layers=9,
-                codebook_size=1024, latent_size=256,
-                connect_list=['32', '64', '128', '256'],
-                fix_modules=['quantize','generator'], vqgan_path=None):
-        super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest',2, [16], codebook_size)
         if vqgan_path is not None:
-            self.load_state_dict(
-                torch.load(vqgan_path, map_location='cpu')['params_ema'])
         if fix_modules is not None:
             for module in fix_modules:
@@ -177,19 +154,18 @@ class CodeFormer(VQAutoEncoder):
         self.connect_list = connect_list
         self.n_layers = n_layers
         self.dim_embd = dim_embd
-        self.dim_mlp = dim_embd*2
         self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
         self.feat_emb = nn.Linear(256, self.dim_embd)
-        # transformer
         self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
-                                    for _ in range(self.n_layers)])
-        # logits_predict head
         self.idx_pred_layer = nn.Sequential(
             nn.LayerNorm(dim_embd),
-            nn.Linear(dim_embd, codebook_size, bias=False))
         self.channels = {
             '16': 512,
@@ -200,12 +176,9 @@ class CodeFormer(VQAutoEncoder):
             '512': 64,
         }
-        # after second residual block for > 16, before attn layer for ==16
-        self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
-        # after first residual block for > 16, before attn layer for ==16
-        self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}
-        # fuse_convs_dict
         self.fuse_convs_dict = nn.ModuleDict()
         for f_size in self.connect_list:
             in_ch = self.channels[f_size]
@@ -221,60 +194,43 @@ class CodeFormer(VQAutoEncoder):
             module.weight.data.fill_(1.0)
     def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
-        # ################### Encoder #####################
         enc_feat_dict = {}
         out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
-        for i, block in enumerate(self.encoder.blocks):
-            x = block(x)
             if i in out_list:
                 enc_feat_dict[str(x.shape[-1])] = x.clone()
         lq_feat = x
-        # ################# Transformer ###################
-        # quant_feat, codebook_loss, quant_stats = self.quantize(lq_feat)
-        pos_emb = self.position_emb.unsqueeze(1).repeat(1,x.shape[0],1)
-        # BCHW -> BC(HW) -> (HW)BC
-        feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2,0,1))
         query_emb = feat_emb
-        # Transformer encoder
         for layer in self.ft_layers:
             query_emb = layer(query_emb, query_pos=pos_emb)
-        # output logits
-        logits = self.idx_pred_layer(query_emb) # (hw)bn
-        logits = logits.permute(1,0,2) # (hw)bn -> b(hw)n
-        if code_only: # for training stage II
-          # logits doesn't need softmax before cross_entropy loss
             return logits, lq_feat
-        # ################# Quantization ###################
-        # if self.training:
-        #     quant_feat = torch.einsum('btn,nc->btc', [soft_one_hot, self.quantize.embedding.weight])
-        #     # b(hw)c -> bc(hw) -> bchw
-        #     quant_feat = quant_feat.permute(0,2,1).view(lq_feat.shape)
-        # ------------
         soft_one_hot = F.softmax(logits, dim=2)
         _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
-        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[x.shape[0],16,16,256])
-        # preserve gradients
-        # quant_feat = lq_feat + (quant_feat - lq_feat).detach()
         if detach_16:
-            quant_feat = quant_feat.detach() # for training stage III
         if adain:
             quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)
-        # ################## Generator ####################
         x = quant_feat
         fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
-        for i, block in enumerate(self.generator.blocks):
-            x = block(x)
-            if i in fuse_list: # fuse after i-th block
                 f_size = str(x.shape[-1])
-                if w>0:
                     x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
         out = x
-        # logits doesn't need softmax before cross_entropy loss
         return out, logits, lq_feat

 import torch.nn.functional as F
 from typing import Optional, List
+from vqgan_arch import *  # Custom import from root
 from basicsr.utils import get_root_logger
 from basicsr.utils.registry import ARCH_REGISTRY
 def calc_mean_std(feat, eps=1e-5):
+    """Calculate mean and std for adaptive_instance_normalization."""
     size = feat.size()
     assert len(size) == 4, 'The input feature should be 4D tensor.'
     b, c = size[:2]
     feat_mean = feat.view(b, c, -1).mean(dim=2).view(b, c, 1, 1)
     return feat_mean, feat_std
 def adaptive_instance_normalization(content_feat, style_feat):
+    """Adaptive instance normalization."""
     size = content_feat.size()
     style_mean, style_std = calc_mean_std(style_feat)
     content_mean, content_std = calc_mean_std(content_feat)
     normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
     return normalized_feat * style_std.expand(size) + style_mean.expand(size)
 class PositionEmbeddingSine(nn.Module):
     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
         super().__init__()
         self.num_pos_feats = num_pos_feats
         return F.gelu
     if activation == "glu":
         return F.glu
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
 class TransformerSALayer(nn.Module):
     def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
         super().__init__()
         self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
         self.linear1 = nn.Linear(embed_dim, dim_mlp)
         self.dropout = nn.Dropout(dropout)
         self.linear2 = nn.Linear(dim_mlp, embed_dim)
                 tgt_mask: Optional[Tensor] = None,
                 tgt_key_padding_mask: Optional[Tensor] = None,
                 query_pos: Optional[Tensor] = None):
         tgt2 = self.norm1(tgt)
         q = k = self.with_pos_embed(tgt2, query_pos)
         tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
                               key_padding_mask=tgt_key_padding_mask)[0]
         tgt = tgt + self.dropout1(tgt2)
         tgt2 = self.norm2(tgt)
         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
         tgt = tgt + self.dropout2(tgt2)
 class Fuse_sft_block(nn.Module):
     def __init__(self, in_ch, out_ch):
         super().__init__()
+        self.encode_enc = nn.Sequential(
+            nn.Conv2d(2 * in_ch, out_ch, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(out_ch),
+            nn.LeakyReLU(0.2, inplace=True)
+        )
         self.scale = nn.Sequential(
+            nn.Conv2d(in_ch, out_ch, 3, 1, 1),
+            nn.LeakyReLU(0.2, True),
+            nn.Conv2d(out_ch, out_ch, 3, 1, 1)
+        )
         self.shift = nn.Sequential(
+            nn.Conv2d(in_ch, out_ch, 3, 1, 1),
+            nn.LeakyReLU(0.2, True),
+            nn.Conv2d(out_ch, out_ch, 3, 1, 1)
+        )
     def forward(self, enc_feat, dec_feat, w=1):
         enc_feat = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
         out = dec_feat + residual
         return out
 @ARCH_REGISTRY.register()
 class CodeFormer(VQAutoEncoder):
     def __init__(self, dim_embd=512, n_head=8, n_layers=9,
+                 codebook_size=1024, latent_size=256,
+                 connect_list=['32', '64', '128', '256'],
+                 fix_modules=['quantize', 'generator'], vqgan_path=None):
+        # Adjust down_factor to ensure it works with channel scaling
+        down_factor = [1, 2, 2, 4, 4, 8]  # Ensure this matches the number of steps
+        super().__init__(512, 64, down_factor, 'nearest', len(down_factor) - 1, 16, codebook_size)
         if vqgan_path is not None:
+            self.load_state_dict(torch.load(vqgan_path, map_location='cpu')['params_ema'])
         if fix_modules is not None:
             for module in fix_modules:
         self.connect_list = connect_list
         self.n_layers = n_layers
         self.dim_embd = dim_embd
+        self.dim_mlp = dim_embd * 2
         self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
         self.feat_emb = nn.Linear(256, self.dim_embd)
         self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
+                                        for _ in range(self.n_layers)])
         self.idx_pred_layer = nn.Sequential(
             nn.LayerNorm(dim_embd),
+            nn.Linear(dim_embd, codebook_size, bias=False)
+        )
         self.channels = {
             '16': 512,
             '512': 64,
         }
+        self.fuse_encoder_block = {'512': 2, '256': 5, '128': 8, '64': 11, '32': 14, '16': 18}
+        self.fuse_generator_block = {'16': 6, '32': 9, '64': 12, '128': 15, '256': 18, '512': 21}
         self.fuse_convs_dict = nn.ModuleDict()
         for f_size in self.connect_list:
             in_ch = self.channels[f_size]
             module.weight.data.fill_(1.0)
     def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
         enc_feat_dict = {}
         out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
+        for i, block in enumerate(self.encoder):
+            x = block(x)
             if i in out_list:
                 enc_feat_dict[str(x.shape[-1])] = x.clone()
         lq_feat = x
+        pos_emb = self.position_emb.unsqueeze(1).repeat(1, x.shape[0], 1)
+        feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2, 0, 1))
         query_emb = feat_emb
         for layer in self.ft_layers:
             query_emb = layer(query_emb, query_pos=pos_emb)
+        logits = self.idx_pred_layer(query_emb)
+        logits = logits.permute(1, 0, 2)
+        if code_only:
             return logits, lq_feat
         soft_one_hot = F.softmax(logits, dim=2)
         _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
+        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[x.shape[0], 16, 16, 256])
         if detach_16:
+            quant_feat = quant_feat.detach()
         if adain:
             quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)
         x = quant_feat
         fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
+        for i, block in enumerate(self.decoder):
+            x = block(x)
+            if i in fuse_list:
                 f_size = str(x.shape[-1])
+                if w > 0:
                     x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
         out = x
         return out, logits, lq_feat