---
# Model configuration for simgen's cascade UniControlNet, run with
# `mode: local`.
#
# Every `target` is a dotted import path. The sibling `params` mapping is
# presumably handed to that class as constructor keyword arguments
# (NOTE(review): confirm against the config loader).
#
# NOTE(review): the nesting in this file was reconstructed from a copy whose
# indentation had been lost; section boundaries follow the repeated
# target/params pattern. Diff against a known-good copy before relying on it.
model:
  target: simgen.models.cascade_controlnet.UniControlNet
  params:
    # Scalar settings that sit alongside the component configs below.
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    mode: local
    parameterization: "v"

    local_control_config:
      target: simgen.models.local_adapter.LocalAdapter
      params:
        in_channels: 4
        model_channels: 320
        # Value history per the original note: 21, then 6 for 2 conditions,
        # then 15 for 5 conditions.
        # NOTE(review): that note ended with "now 15 for 5" but the value
        # here is 6 -- confirm it matches the condition maps actually used.
        local_channels: 6
        inject_channels: [192, 256, 384, 512]
        inject_layers: [1, 4, 7, 10]
        num_res_blocks: 2
        attention_resolutions: [4, 2, 1]
        channel_mult: [1, 2, 4, 4]
        use_checkpoint: true
        # num_heads: 8
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024  # 768
        legacy: false

    unet_config:
      target: simgen.models.local_adapter.LocalControlUNetModel
      params:
        image_size: 32
        in_channels: 4
        model_channels: 320
        out_channels: 4
        num_res_blocks: 2
        attention_resolutions: [4, 2, 1]
        channel_mult: [1, 2, 4, 4]
        use_checkpoint: true
        # num_heads: 8
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: true
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024  # 768
        legacy: false

    first_stage_config:
      target: simgen.ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: simgen.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: true
        layer: "penultimate"
      # target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

    # NOTE(review): placed under model.params on the assumption that
    # UniControlNet receives it like the other *_config entries -- confirm.
    first_cond_config:
      target: simgen.models.t2i_model.T2IModel
      params:
        linear_start: 0.00085
        linear_end: 0.0120
        num_timesteps_cond: 1
        log_every_t: 200
        timesteps: 1000
        first_stage_key: "jpg"
        cond_stage_key: "txt"
        image_size: 64
        channels: 4
        cond_stage_trainable: false
        conditioning_key: crossattn
        monitor: val/loss_simple_ema
        scale_factor: 0.18215
        use_ema: false
        parameterization: "v"

        # Same params as model.params.unet_config; only the target differs.
        unet_config:
          target: simgen.ldm.modules.diffusionmodules.openaimodel.UNetModel
          params:
            image_size: 32
            in_channels: 4
            model_channels: 320
            out_channels: 4
            num_res_blocks: 2
            attention_resolutions: [4, 2, 1]
            channel_mult: [1, 2, 4, 4]
            use_checkpoint: true
            # num_heads: 8
            num_head_channels: 64  # need to fix for flash-attn
            use_spatial_transformer: true
            use_linear_in_transformer: true
            transformer_depth: 1
            context_dim: 1024  # 768
            legacy: false

        # Identical to model.params.first_stage_config.
        first_stage_config:
          target: simgen.ldm.models.autoencoder.AutoencoderKL
          params:
            embed_dim: 4
            monitor: val/rec_loss
            ddconfig:
              double_z: true
              z_channels: 4
              resolution: 256
              in_channels: 3
              out_ch: 3
              ch: 128
              ch_mult: [1, 2, 4, 4]
              num_res_blocks: 2
              attn_resolutions: []
              dropout: 0.0
            lossconfig:
              target: torch.nn.Identity

        # Identical to model.params.cond_stage_config.
        cond_stage_config:
          target: simgen.ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
          params:
            freeze: true
            layer: "penultimate"
          # target: ldm.modules.encoders.modules.FrozenCLIPEmbedder