lty2226262 committed on
Commit
d0eab23
·
verified ·
1 Parent(s): b934a64

Upload folder using huggingface_hub

Browse files
model_index.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "GSRefinerPipeline",
3
+ "_diffusers_version": "0.25.1",
4
+ "feature_extractor": [
5
+ null,
6
+ null
7
+ ],
8
+ "image_encoder": [
9
+ null,
10
+ null
11
+ ],
12
+ "requires_safety_checker": true,
13
+ "safety_checker": [
14
+ null,
15
+ null
16
+ ],
17
+ "scheduler": [
18
+ "diffusers",
19
+ "DDPMScheduler"
20
+ ],
21
+ "text_encoder": [
22
+ "transformers",
23
+ "CLIPTextModel"
24
+ ],
25
+ "tokenizer": [
26
+ "transformers",
27
+ "CLIPTokenizerFast"
28
+ ],
29
+ "unet": [
30
+ "diffusers",
31
+ "UNet2DConditionModel"
32
+ ],
33
+ "vae": [
34
+ "autoencoder_kl",
35
+ "AutoencoderKL"
36
+ ]
37
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDPMScheduler",
3
+ "_diffusers_version": "0.25.1",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "interpolation_type": "linear",
11
+ "num_train_timesteps": 1000,
12
+ "prediction_type": "epsilon",
13
+ "rescale_betas_zero_snr": false,
14
+ "sample_max_value": 1.0,
15
+ "set_alpha_to_one": false,
16
+ "sigma_max": null,
17
+ "sigma_min": null,
18
+ "skip_prk_steps": true,
19
+ "steps_offset": 1,
20
+ "thresholding": false,
21
+ "timestep_spacing": "trailing",
22
+ "timestep_type": "discrete",
23
+ "trained_betas": null,
24
+ "use_karras_sigmas": false,
25
+ "variance_type": "fixed_small"
26
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/sd-turbo",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1024,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 23,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 512,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.38.0",
24
+ "vocab_size": 49408
25
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67e013543d4fac905c882e2993d86a2d454ee69dc9e8f37c0c23d33a48959d15
3
+ size 1361596304
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "!",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "!",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49406": {
13
+ "content": "<|startoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "49407": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "bos_token": "<|startoftext|>",
30
+ "clean_up_tokenization_spaces": true,
31
+ "do_lower_case": true,
32
+ "eos_token": "<|endoftext|>",
33
+ "errors": "replace",
34
+ "model_max_length": 77,
35
+ "pad_token": "!",
36
+ "tokenizer_class": "CLIPTokenizer",
37
+ "unk_token": "<|endoftext|>"
38
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.25.1",
4
+ "_name_or_path": "stabilityai/sd-turbo",
5
+ "act_fn": "silu",
6
+ "addition_embed_type": null,
7
+ "addition_embed_type_num_heads": 64,
8
+ "addition_time_embed_dim": null,
9
+ "attention_head_dim": [
10
+ 5,
11
+ 10,
12
+ 20,
13
+ 20
14
+ ],
15
+ "attention_type": "default",
16
+ "block_out_channels": [
17
+ 320,
18
+ 640,
19
+ 1280,
20
+ 1280
21
+ ],
22
+ "center_input_sample": false,
23
+ "class_embed_type": null,
24
+ "class_embeddings_concat": false,
25
+ "conv_in_kernel": 3,
26
+ "conv_out_kernel": 3,
27
+ "cross_attention_dim": 1024,
28
+ "cross_attention_norm": null,
29
+ "down_block_types": [
30
+ "CrossAttnDownBlock2D",
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "DownBlock2D"
34
+ ],
35
+ "downsample_padding": 1,
36
+ "dropout": 0.0,
37
+ "dual_cross_attention": false,
38
+ "encoder_hid_dim": null,
39
+ "encoder_hid_dim_type": null,
40
+ "flip_sin_to_cos": true,
41
+ "freq_shift": 0,
42
+ "in_channels": 4,
43
+ "layers_per_block": 2,
44
+ "mid_block_only_cross_attention": null,
45
+ "mid_block_scale_factor": 1,
46
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
47
+ "norm_eps": 1e-05,
48
+ "norm_num_groups": 32,
49
+ "num_attention_heads": null,
50
+ "num_class_embeds": null,
51
+ "only_cross_attention": false,
52
+ "out_channels": 4,
53
+ "projection_class_embeddings_input_dim": null,
54
+ "resnet_out_scale_factor": 1.0,
55
+ "resnet_skip_time_act": false,
56
+ "resnet_time_scale_shift": "default",
57
+ "reverse_transformer_layers_per_block": null,
58
+ "sample_size": 64,
59
+ "time_cond_proj_dim": null,
60
+ "time_embedding_act_fn": null,
61
+ "time_embedding_dim": null,
62
+ "time_embedding_type": "positional",
63
+ "timestep_post_act": null,
64
+ "transformer_layers_per_block": 1,
65
+ "up_block_types": [
66
+ "UpBlock2D",
67
+ "CrossAttnUpBlock2D",
68
+ "CrossAttnUpBlock2D",
69
+ "CrossAttnUpBlock2D"
70
+ ],
71
+ "upcast_attention": null,
72
+ "use_linear_projection": true
73
+ }
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c68c294957aba4385da9604dceda7f00000a11dcb1cb62c08364a4ccdf39f3af
3
+ size 3463726504
vae/autoencoder_kl.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Dict, Optional, Tuple, Union
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from peft import LoraConfig
19
+
20
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
21
+ from diffusers.loaders import FromOriginalVAEMixin
22
+ from diffusers.utils.accelerate_utils import apply_forward_hook
23
+ from diffusers.models.attention_processor import (
24
+ ADDED_KV_ATTENTION_PROCESSORS,
25
+ CROSS_ATTENTION_PROCESSORS,
26
+ Attention,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.autoencoders.vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
34
+
35
+
36
def my_vae_encoder_fwd(self, sample):
    """Replacement ``forward`` for the VAE encoder that also records skip activations.

    The input goes through ``conv_in``, every block in ``down_blocks``, ``mid_block``
    and finally ``conv_norm_out`` / ``conv_act`` / ``conv_out``. The tensor that is fed
    *into* each down block is collected, in order, and stored on the module as
    ``self.current_down_blocks`` once the pass has completed.

    Args:
        sample: Input passed to ``self.conv_in``.

    Returns:
        The output of ``self.conv_out``.
    """
    hidden = self.conv_in(sample)

    # down: remember what each block receives before running it
    skip_acts = []
    for block in self.down_blocks:
        skip_acts.append(hidden)
        hidden = block(hidden)

    # middle + output head
    hidden = self.mid_block(hidden)
    hidden = self.conv_out(self.conv_act(self.conv_norm_out(hidden)))

    # published only after the whole pass succeeded, as in the original ordering
    self.current_down_blocks = skip_acts
    return hidden
50
+
51
+
52
def my_vae_decoder_fwd(self, sample, latent_embeds=None):
    """Replacement ``forward`` for the VAE decoder that can add encoder skip activations.

    The input goes through ``conv_in`` and ``mid_block``, is cast to the dtype of the
    first ``up_blocks`` parameter, then through every up block, and finally through
    ``conv_norm_out`` / ``conv_act`` / ``conv_out``.

    Unless ``self.ignore_skip`` is truthy, before up block ``idx`` runs the value
    ``skip_conv_{idx + 1}(self.incoming_skip_acts[::-1][idx] * self.gamma)`` is added
    to the running sample. ``self.incoming_skip_acts`` must therefore be assigned by
    the caller before this function is invoked with skips enabled.

    Args:
        sample: Input passed to ``self.conv_in``.
        latent_embeds: Optional extra input forwarded to ``mid_block``, every up block
            and, when not ``None``, to ``conv_norm_out``.

    Returns:
        The output of ``self.conv_out``.

    Raises:
        AttributeError: If skips are enabled but ``incoming_skip_acts`` is not set.
    """
    sample = self.conv_in(sample)
    upscale_dtype = next(iter(self.up_blocks.parameters())).dtype

    # middle
    sample = self.mid_block(sample, latent_embeds)
    sample = sample.to(upscale_dtype)

    if not self.ignore_skip:
        skip_convs = [self.skip_conv_1, self.skip_conv_2, self.skip_conv_3, self.skip_conv_4]

        skip_acts = getattr(self, "incoming_skip_acts", None)
        if skip_acts is None:
            # Same exception type as the bare attribute lookup, but with an actionable message.
            raise AttributeError(
                "`incoming_skip_acts` is not set on the decoder; assign the encoder's "
                "`current_down_blocks` to it before decoding, or enable `ignore_skip`."
            )

        # Reverse once instead of re-slicing the list on every iteration.
        reversed_acts = skip_acts[::-1]

        # up
        for idx, up_block in enumerate(self.up_blocks):
            skip_in = skip_convs[idx](reversed_acts[idx] * self.gamma)
            # add skip
            sample = sample + skip_in
            sample = up_block(sample, latent_embeds)
    else:
        for up_block in self.up_blocks:
            sample = up_block(sample, latent_embeds)

    # post-process
    if latent_embeds is None:
        sample = self.conv_norm_out(sample)
    else:
        sample = self.conv_norm_out(sample, latent_embeds)
    sample = self.conv_act(sample)
    sample = self.conv_out(sample)
    return sample
77
+
78
+
79
class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
    r"""
    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.

    On top of the stock encoder/decoder pair, this variant:

    * replaces `encoder.forward` / `decoder.forward` with `my_vae_encoder_fwd` / `my_vae_decoder_fwd`, so the
      encoder records the input of every down block and the decoder can add them back as skip connections;
    * registers four bias-free 1x1 convolutions (`skip_conv_1` .. `skip_conv_4`) on the decoder;
    * attaches a LoRA adapter named `"vae_skip"` to the matching decoder sub-modules.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
            Tuple of downsample block types.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            Tuple of upsample block types.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
            Tuple of block output channels.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
        scaling_factor (`float`, *optional*, defaults to 0.18215):
            The component-wise standard deviation of the trained latent space computed using the first batch of the
            training set. This is used to scale the latent space to have unit variance when training the diffusion
            model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
            diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
            / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
            Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
        force_upcast (`bool`, *optional*, defaults to `True`):
            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
            can be fine-tuned / trained to a lower range without losing too much precision in which case
            `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
        lora_rank (`int`, *optional*, defaults to 4):
            Rank `r` passed to the `LoraConfig` of the `"vae_skip"` adapter.
        gamma (`float`, *optional*, defaults to 1.0):
            Factor the decoder multiplies each incoming skip activation by before its skip convolution.
        ignore_skip (`bool`, *optional*, defaults to `False`):
            When `True` the decoder runs its up blocks without adding any skip activations.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
        block_out_channels: Tuple[int] = (64,),
        layers_per_block: int = 1,
        act_fn: str = "silu",
        latent_channels: int = 4,
        norm_num_groups: int = 32,
        sample_size: int = 32,
        scaling_factor: float = 0.18215,
        force_upcast: bool = True,
        lora_rank: int = 4,
        gamma: float = 1.0,
        ignore_skip: bool = False,
    ):
        super().__init__()

        # pass init params to Encoder
        self.encoder = Encoder(
            in_channels=in_channels,
            out_channels=latent_channels,
            down_block_types=down_block_types,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            act_fn=act_fn,
            norm_num_groups=norm_num_groups,
            double_z=True,
        )

        # pass init params to Decoder
        self.decoder = Decoder(
            in_channels=latent_channels,
            out_channels=out_channels,
            up_block_types=up_block_types,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            norm_num_groups=norm_num_groups,
            act_fn=act_fn,
        )

        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
        self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1)

        self.use_slicing = False
        self.use_tiling = False

        # only relevant if vae tiling is enabled
        self.tile_sample_min_size = self.config.sample_size
        sample_size = (
            self.config.sample_size[0]
            if isinstance(self.config.sample_size, (list, tuple))
            else self.config.sample_size
        )
        self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
        self.tile_overlap_factor = 0.25

        # Bind the skip-aware forward functions as instance-level `forward` methods.
        self.encoder.forward = my_vae_encoder_fwd.__get__(self.encoder, self.encoder.__class__)
        self.decoder.forward = my_vae_decoder_fwd.__get__(self.decoder, self.decoder.__class__)
        # add the skip connection convs
        # NOTE(review): the channel counts are hard-coded and appear to assume
        # block_out_channels == (128, 256, 512, 512); other layouts will likely mismatch - confirm.
        self.decoder.skip_conv_1 = torch.nn.Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
        self.decoder.skip_conv_2 = torch.nn.Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
        self.decoder.skip_conv_3 = torch.nn.Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
        self.decoder.skip_conv_4 = torch.nn.Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        self.decoder.ignore_skip = ignore_skip
        self.decoder.gamma = gamma

        # Module-name suffixes that should receive LoRA weights.
        target_suffixes = [
            "conv1", "conv2", "conv_in", "conv_shortcut", "conv", "conv_out",
            "skip_conv_1", "skip_conv_2", "skip_conv_3", "skip_conv_4",
            "to_k", "to_q", "to_v", "to_out.0",
        ]
        # Resolve the suffixes to fully-qualified names of modules whose name contains "decoder".
        target_modules_vae = [
            name
            for name, _ in self.named_modules()
            if "decoder" in name and any(name.endswith(suffix) for suffix in target_suffixes)
        ]

        vae_lora_config = LoraConfig(r=lora_rank, init_lora_weights="gaussian", target_modules=target_modules_vae)
        self.add_adapter(vae_lora_config, adapter_name="vae_skip")

    def _set_gradient_checkpointing(self, module, value=False):
        """Set `gradient_checkpointing` on `module` when it is an `Encoder` or `Decoder`."""
        if isinstance(module, (Encoder, Decoder)):
            module.gradient_checkpointing = value

    def enable_tiling(self, use_tiling: bool = True):
        r"""
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
        self.use_tiling = use_tiling

    def disable_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.enable_tiling(False)

    def enable_slicing(self):
        r"""
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
        self.use_slicing = True

    def disable_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        """
        self.use_slicing = False

    @property
    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(
        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
    ):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        Raises:
            ValueError: If a dict is passed whose size differs from the number of attention layers.
        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor, _remove_lora=_remove_lora)
                else:
                    # note: entries are popped, so a passed-in dict is consumed
                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.

        Raises:
            ValueError: If the current processors are neither all added-KV nor all cross-attention processors.
        """
        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnAddedKVProcessor()
        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor, _remove_lora=True)

    @apply_forward_hook
    def encode(
        self, x: torch.FloatTensor, return_dict: bool = True
    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
        """
        Encode a batch of images into latents.

        Args:
            x (`torch.FloatTensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        """
        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
            return self.tiled_encode(x, return_dict=return_dict)

        if self.use_slicing and x.shape[0] > 1:
            # Each slice overwrites `encoder.current_down_blocks`, so only the last slice's
            # skip activations remain available afterwards.
            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
            h = torch.cat(encoded_slices)
        else:
            h = self.encoder(x)

        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)

        if not return_dict:
            return (posterior,)

        return AutoencoderKLOutput(latent_dist=posterior)

    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
        """Decode `z` in one step, or delegate to `tiled_decode` when tiling is on and `z` exceeds the latent tile size."""
        if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
            return self.tiled_decode(z, return_dict=return_dict)

        z = self.post_quant_conv(z)
        dec = self.decoder(z)

        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)

    @apply_forward_hook
    def decode(
        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
    ) -> Union[DecoderOutput, torch.FloatTensor]:
        """
        Decode a batch of images.

        Args:
            z (`torch.FloatTensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
            generator: Accepted for interface compatibility; not used by this method.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.

        """
        if self.use_slicing and z.shape[0] > 1:
            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
            decoded = self._decode(z).sample

        if not return_dict:
            return (decoded,)

        return DecoderOutput(sample=decoded)

    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        """Linearly blend the last rows of `a` into the first rows of `b` (dim 2); `b` is modified in place and returned."""
        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
        for y in range(blend_extent):
            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
        return b

    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        """Linearly blend the last columns of `a` into the first columns of `b` (dim 3); `b` is modified in place and returned."""
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
        return b

    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
        r"""Encode a batch of images using a tiled encoder.

        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
        different from non-tiled encoding because each tile is encoded independently of the others. To avoid tiling
        artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized
        changes in the output, but they should be much less noticeable.

        Args:
            x (`torch.FloatTensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
                `tuple` is returned.
        """
        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
        row_limit = self.tile_latent_min_size - blend_extent

        # Split the image into overlapping tiles of side `tile_sample_min_size` and encode them separately.
        rows = []
        for i in range(0, x.shape[2], overlap_size):
            row = []
            for j in range(0, x.shape[3], overlap_size):
                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
                tile = self.encoder(tile)
                tile = self.quant_conv(tile)
                row.append(tile)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=3))

        moments = torch.cat(result_rows, dim=2)
        posterior = DiagonalGaussianDistribution(moments)

        if not return_dict:
            return (posterior,)

        return AutoencoderKLOutput(latent_dist=posterior)

    def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
        r"""
        Decode a batch of images using a tiled decoder.

        Args:
            z (`torch.FloatTensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.DecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
                returned.
        """
        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
        row_limit = self.tile_sample_min_size - blend_extent

        # Split z into overlapping tiles of side `tile_latent_min_size` and decode them separately.
        # The tiles have an overlap to avoid seams between tiles.
        rows = []
        for i in range(0, z.shape[2], overlap_size):
            row = []
            for j in range(0, z.shape[3], overlap_size):
                tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
                tile = self.post_quant_conv(tile)
                decoded = self.decoder(tile)
                row.append(decoded)
            rows.append(row)
        result_rows = []
        for i, row in enumerate(rows):
            result_row = []
            for j, tile in enumerate(row):
                # blend the above tile and the left tile
                # to the current tile and add the current tile to the result row
                if i > 0:
                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
                if j > 0:
                    tile = self.blend_h(row[j - 1], tile, blend_extent)
                result_row.append(tile[:, :, :row_limit, :row_limit])
            result_rows.append(torch.cat(result_row, dim=3))

        dec = torch.cat(result_rows, dim=2)
        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)

    def forward(
        self,
        sample: torch.FloatTensor,
        sample_posterior: bool = False,
        return_dict: bool = True,
        generator: Optional[torch.Generator] = None,
    ) -> Union[DecoderOutput, torch.FloatTensor]:
        r"""
        Encode `sample`, pick a latent from the posterior and decode it again.

        Args:
            sample (`torch.FloatTensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
            generator (`torch.Generator`, *optional*):
                Generator forwarded to `posterior.sample` when `sample_posterior` is `True`.
        """
        x = sample
        posterior = self.encode(x).latent_dist
        if not self.decoder.ignore_skip:
            # The patched decoder reads `incoming_skip_acts`; nothing else in this class assigns it,
            # so hand over the activations the encoder just recorded for this input.
            self.decoder.incoming_skip_acts = self.encoder.current_down_blocks
        if sample_posterior:
            z = posterior.sample(generator=generator)
        else:
            z = posterior.mode()
        dec = self.decode(z).sample

        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        # remembered so that `unfuse_qkv_projections` can restore them
        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        Does nothing when `fuse_qkv_projections` has not been called on this instance.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        """
        # `original_attn_processors` only exists after `fuse_qkv_projections` has run.
        original_processors = getattr(self, "original_attn_processors", None)
        if original_processors is not None:
            self.set_attn_processor(original_processors)
vae/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.25.1",
4
+ "_name_or_path": "stabilityai/sd-turbo",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "layers_per_block": 2,
22
+ "norm_num_groups": 32,
23
+ "out_channels": 3,
24
+ "sample_size": 768,
25
+ "scaling_factor": 0.18215,
26
+ "up_block_types": [
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D",
30
+ "UpDecoderBlock2D"
31
+ ]
32
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee246bd99ac761d52ff6b2775decd0532b92276beeabc09fee1b9d7e12978d3
3
+ size 338717612