import torch

from .sd_unet import ResnetBlock, UpSampler
from .sd_vae_decoder import VAEAttentionBlock, SDVAEDecoderStateDictConverter
from .tiler import TileWorker


class SD3VAEDecoder(torch.nn.Module):
    """Convolutional decoder mapping a 16-channel input to a 3-channel output.

    The input is first rescaled as ``sample / scaling_factor + shift_factor``,
    lifted to 512 channels by ``conv_in``, passed through ``blocks`` in order,
    and finally normalised, activated and projected to 3 channels.

    NOTE(review): judging by the class name this is the VAE decoder used with
    Stable Diffusion 3 (latents -> image); confirm against the checkpoint this
    module is loaded from.
    """

    def __init__(self) -> None:
        super().__init__()
        # Constants applied to the input at the start of ``forward``.
        self.scaling_factor = 1.5305
        self.shift_factor = 0.0609
        self.conv_in = torch.nn.Conv2d(16, 512, kernel_size=3, padding=1)

        # The order of this list matters twice: ``forward`` runs the blocks
        # sequentially, and ModuleList indices become parameter-name prefixes.
        self.blocks = torch.nn.ModuleList([
            # Stage 0: resnet -> attention -> resnet at 512 channels.
            ResnetBlock(512, 512, eps=1e-6),
            # NOTE(review): positional arguments kept exactly as in the
            # original call; their meaning is defined by VAEAttentionBlock.
            VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            # Stage 1: three resnets at 512 channels, then UpSampler.
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            UpSampler(512),
            # Stage 2: three resnets at 512 channels, then UpSampler.
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            UpSampler(512),
            # Stage 3: 512 -> 256 channels, then UpSampler.
            ResnetBlock(512, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            UpSampler(256),
            # Stage 4: 256 -> 128 channels, no UpSampler afterwards.
            ResnetBlock(256, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
        ])

        # Output head: GroupNorm -> SiLU -> 3-channel convolution.
        self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6)
        self.conv_act = torch.nn.SiLU()
        self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)

    def tiled_forward(self, sample: torch.Tensor, tile_size: int = 64, tile_stride: int = 32):
        """Run ``forward`` through ``TileWorker().tiled_forward``.

        Args:
            sample: Input tensor; its device and dtype are forwarded as
                ``tile_device`` / ``tile_dtype``.
            tile_size: Passed through to ``TileWorker.tiled_forward``.
            tile_stride: Passed through to ``TileWorker.tiled_forward``.

        Returns:
            Whatever ``TileWorker.tiled_forward`` returns.
        """
        hidden_states = TileWorker().tiled_forward(
            # Each call goes through the non-tiled path (``tiled`` defaults
            # to False), so there is no recursion back into this method.
            lambda x: self.forward(x),
            sample,
            tile_size,
            tile_stride,
            tile_device=sample.device,
            tile_dtype=sample.dtype,
        )
        return hidden_states

    def forward(self, sample: torch.Tensor, tiled: bool = False, tile_size: int = 64,
                tile_stride: int = 32, **kwargs):
        """Decode ``sample``.

        Args:
            sample: Input tensor fed to ``conv_in`` (a 16-channel Conv2d).
            tiled: When true, delegate to ``tiled_forward`` and return its
                result.
            tile_size: Forwarded to ``tiled_forward`` when ``tiled`` is true.
            tile_stride: Forwarded to ``tiled_forward`` when ``tiled`` is true.
            **kwargs: Accepted and ignored.

        Returns:
            The output of ``conv_out`` (3 channels), or the tiled result.
        """
        if tiled:
            return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)

        # 1. Rescale the input and lift it to the working channel width.
        hidden_states = sample / self.scaling_factor + self.shift_factor
        hidden_states = self.conv_in(hidden_states)

        # The blocks take and return a 4-tuple; the three auxiliary slots
        # start as None here and are simply threaded through.
        time_emb = None
        text_emb = None
        res_stack = None

        # 2. Run every block in order.
        for block in self.blocks:
            hidden_states, time_emb, text_emb, res_stack = block(
                hidden_states, time_emb, text_emb, res_stack
            )

        # 3. Output head.
        hidden_states = self.conv_norm_out(hidden_states)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)

        return hidden_states

    @staticmethod
    def state_dict_converter():
        """Return a new ``SDVAEDecoderStateDictConverter`` instance."""
        return SDVAEDecoderStateDictConverter()