diff --git a/cldm_v15.yaml b/cldm_v15.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/cldm_v15.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/cldm_v21.yaml b/cldm_v21.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc65193647e476e108fce5977f11250d55919106 --- /dev/null +++ b/cldm_v21.yaml @@ -0,0 +1,85 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + use_checkpoint: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" diff --git a/control_sd15_canny.yaml b/control_sd15_canny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_canny.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_depth.yaml b/control_sd15_depth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_depth.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_hed.yaml b/control_sd15_hed.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_hed.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_mlsd.yaml b/control_sd15_mlsd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_mlsd.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_normal.yaml b/control_sd15_normal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_normal.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_openpose.yaml b/control_sd15_openpose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_openpose.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_scribble.yaml b/control_sd15_scribble.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_scribble.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_sd15_seg.yaml b/control_sd15_seg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_sd15_seg.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11e_sd15_ip2p.pth b/control_v11e_sd15_ip2p.pth new file mode 100644 index 0000000000000000000000000000000000000000..b88b48157076d2e8230763b49f600618ec38526f --- /dev/null +++ b/control_v11e_sd15_ip2p.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8d77429b111b25ffad4adb4c686a19c73c905d638dd8acb1e0b1d1da300192c +size 1445234339 diff --git a/control_v11e_sd15_ip2p.yaml b/control_v11e_sd15_ip2p.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11e_sd15_ip2p.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11e_sd15_shuffle.pth b/control_v11e_sd15_shuffle.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c3655d7a335ae8e23b250205ce923716545d2f0 --- /dev/null +++ b/control_v11e_sd15_shuffle.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:657f632c35176c467e71c471b55b1a0e94ae1a49d739e1b705624f454e638e1d +size 1445235365 diff --git a/control_v11e_sd15_shuffle.yaml b/control_v11e_sd15_shuffle.yaml new file mode 100644 index 0000000000000000000000000000000000000000..862304b0090bf65984473c30ab0ebc30a4858400 --- /dev/null +++ b/control_v11e_sd15_shuffle.yaml @@ -0,0 +1,80 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + global_average_pooling: True + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11f1e_sd15_tile.pth b/control_v11f1e_sd15_tile.pth new file mode 100644 index 0000000000000000000000000000000000000000..f0c50aea7aac578a1bb6f908ff7236ed2f26ee73 --- /dev/null +++ b/control_v11f1e_sd15_tile.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aa69b8d391e72c2fced6a650268137dc0ed594cafe8a2c0f8b994799a21979b +size 1445235023 diff --git a/control_v11f1e_sd15_tile.yaml b/control_v11f1e_sd15_tile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11f1e_sd15_tile.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11f1p_sd15_depth.pth b/control_v11f1p_sd15_depth.pth new file mode 100644 index 0000000000000000000000000000000000000000..e20ec5a5140ed678f1db1bd0efe02e74a757fd91 --- /dev/null +++ b/control_v11f1p_sd15_depth.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761077ffe369fe8cf16ae353f8226bd4ca29805b161052f82c0170c7b50f1d99 +size 1445235365 diff --git a/control_v11f1p_sd15_depth.yaml b/control_v11f1p_sd15_depth.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11f1p_sd15_depth.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_canny.pth b/control_v11p_sd15_canny.pth new file mode 100644 index 0000000000000000000000000000000000000000..a39717042be81190d25eba173a94f3bf35606090 --- /dev/null +++ b/control_v11p_sd15_canny.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99cfe4c70910e38e3fece9918a4979ed7d3dcf9b81cee293e1755363af5406a +size 1445234681 diff --git a/control_v11p_sd15_canny.yaml b/control_v11p_sd15_canny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_canny.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_inpaint.pth b/control_v11p_sd15_inpaint.pth new file mode 100644 index 0000000000000000000000000000000000000000..6edfc3664c3a104b9f01fda95464159dedcf6b10 --- /dev/null +++ b/control_v11p_sd15_inpaint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a0ff91882a0fbacbded5edeca7babdb6d1bd8f770880a91c177bd44cd58633 +size 1445235365 diff --git a/control_v11p_sd15_inpaint.yaml b/control_v11p_sd15_inpaint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_inpaint.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_lineart.pth b/control_v11p_sd15_lineart.pth new file mode 100644 index 0000000000000000000000000000000000000000..36362b436291818e2838f04d4c9b8e5e38082c90 --- /dev/null +++ b/control_v11p_sd15_lineart.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb5fb912696830da948e5056b50adfa3fcc68ff760c43e4e7a8499853af43c61 +size 1445235365 diff --git a/control_v11p_sd15_lineart.yaml b/control_v11p_sd15_lineart.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_lineart.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_mlsd.pth b/control_v11p_sd15_mlsd.pth new file mode 100644 index 0000000000000000000000000000000000000000..403300e5af4fe31de57afa06bf85264b254d96f8 --- /dev/null +++ b/control_v11p_sd15_mlsd.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22dab177f4c4566ac1c8f68dc2c429b023114e15a644e8c7c0e8f956afe92acf +size 1445234339 diff --git a/control_v11p_sd15_mlsd.yaml b/control_v11p_sd15_mlsd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_mlsd.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_normalbae.pth b/control_v11p_sd15_normalbae.pth new file mode 100644 index 0000000000000000000000000000000000000000..d57f40e120d924cecbde229d49709111812422fc --- /dev/null +++ b/control_v11p_sd15_normalbae.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9608158c204261259f4b5a7815ced8f5c15e5de4e30a9403d07f68f29f81e941 +size 1445236049 diff --git a/control_v11p_sd15_normalbae.yaml b/control_v11p_sd15_normalbae.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_normalbae.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_openpose.pth b/control_v11p_sd15_openpose.pth new file mode 100644 index 0000000000000000000000000000000000000000..56b8d5bad6d1c47d5ff9c7878c3848d1962a29ff --- /dev/null +++ b/control_v11p_sd15_openpose.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db97becd92cd19aff71352a60e93c2508decba3dee64f01f686727b9b406a9dd +size 1445235707 diff --git a/control_v11p_sd15_openpose.yaml b/control_v11p_sd15_openpose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_openpose.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_scribble.pth b/control_v11p_sd15_scribble.pth new file mode 100644 index 0000000000000000000000000000000000000000..3959453782ccca5b138b4943937447eba140b118 --- /dev/null +++ b/control_v11p_sd15_scribble.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d33e8bc14ba679595d976789e4eedc36a3d2d21866ab5896355a7e6677749f +size 1445235707 diff --git a/control_v11p_sd15_scribble.yaml b/control_v11p_sd15_scribble.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_scribble.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_seg.pth b/control_v11p_sd15_seg.pth new file mode 100644 index 0000000000000000000000000000000000000000..faec95dfcc9292b453db25702f62e5a07e81da87 --- /dev/null +++ b/control_v11p_sd15_seg.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bad6d753886c0edc978e79dfb06083737ab1bd9702e5b7c0f81b1e16a5ed12cc +size 1445233933 diff --git a/control_v11p_sd15_seg.yaml b/control_v11p_sd15_seg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_seg.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15_softedge.pth b/control_v11p_sd15_softedge.pth new file mode 100644 index 0000000000000000000000000000000000000000..65f19b20d0f0cf9e17987a33e2a54b931a056f41 --- /dev/null +++ b/control_v11p_sd15_softedge.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20759ef71a109c4796e637d86eea8db492ec9772b6d5cb7a9afd077847b403f3 +size 1445235707 diff --git a/control_v11p_sd15_softedge.yaml b/control_v11p_sd15_softedge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15_softedge.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v11p_sd15s2_lineart_anime.pth b/control_v11p_sd15s2_lineart_anime.pth new file mode 100644 index 0000000000000000000000000000000000000000..227a215b358f669b0c8c9326898743f8963f5ae1 --- /dev/null +++ b/control_v11p_sd15s2_lineart_anime.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59169ab86e8ae15ad50208cc5832f1cc598b95391d4482ba742b344c8f7f6904 +size 1445238101 diff --git a/control_v11p_sd15s2_lineart_anime.yaml b/control_v11p_sd15s2_lineart_anime.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb --- /dev/null +++ b/control_v11p_sd15s2_lineart_anime.yaml @@ -0,0 +1,79 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/control_v1p_sd15_brightness.safetensors b/control_v1p_sd15_brightness.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e917df15e3dc3fd102668000c2a5516548e41caf --- /dev/null +++ b/control_v1p_sd15_brightness.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9161c3825216e6baa45806fe9763df13ee7c60f0e12e693b7d4a00f039b1ba86 +size 1445154814 diff --git a/control_v1p_sd15_illumination.safetensors b/control_v1p_sd15_illumination.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1523718bb6e97613cbec1c735d6b96ded370d8cc --- /dev/null +++ b/control_v1p_sd15_illumination.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0524d0cca37e98a5402c22c729ee86da76cd4e44449cf6a90b0619b1f8b3b23 +size 1445154814 diff --git a/control_v1p_sd15_qrcode_monster.ckpt b/control_v1p_sd15_qrcode_monster.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..d2993b0f8083c06c62b1798e59f05add8ce377a9 --- /dev/null +++ b/control_v1p_sd15_qrcode_monster.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5d69da6d00efd8eb8d1f4ba56152ae1b420d6fd55c65813bdb5773c487d4dd +size 1445200597 diff --git a/control_v1p_sd15_qrcode_monster.yaml b/control_v1p_sd15_qrcode_monster.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05dc29ec08445c5701104c66a676df763ec066a5 --- /dev/null +++ b/control_v1p_sd15_qrcode_monster.yaml @@ -0,0 +1,80 @@ +model: + target: cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + control_key: "hint" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + + control_stage_config: + target: cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + hint_channels: 3 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder + diff --git a/diffusion_pytorch_model.safetensors b/diffusion_pytorch_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c07fe4109a7afe72477f6884df8073882bf49658 --- /dev/null +++ b/diffusion_pytorch_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0696ebf86ab5c87ce553d87f4b62a48dba1add4ee314b6f7ee98e778145f7f +size 1445157124 diff --git a/image_adapter_v14.yaml b/image_adapter_v14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/image_adapter_v14.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/lightingBasedPicture_v10.safetensors b/lightingBasedPicture_v10.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1523718bb6e97613cbec1c735d6b96ded370d8cc --- /dev/null +++ b/lightingBasedPicture_v10.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0524d0cca37e98a5402c22c729ee86da76cd4e44449cf6a90b0619b1f8b3b23 +size 1445154814 diff --git a/sketch_adapter_v14.yaml b/sketch_adapter_v14.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/sketch_adapter_v14.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_canny_sd14v1.yaml b/t2iadapter_canny_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_canny_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_canny_sd15v2.yaml b/t2iadapter_canny_sd15v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_canny_sd15v2.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_color_sd14v1.yaml b/t2iadapter_color_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6780dd94ca6abfe58b4e3dcce1206b902fc3d540 --- /dev/null +++ b/t2iadapter_color_sd14v1.yaml @@ -0,0 +1,6 @@ +model: + target: scripts.adapter.Adapter_light + params: + channels: [320, 640, 1280, 1280] + nums_rb: 4 + cin: 192 \ No newline at end of file diff --git a/t2iadapter_depth_sd14v1.yaml b/t2iadapter_depth_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_depth_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_depth_sd15v2.yaml b/t2iadapter_depth_sd15v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_depth_sd15v2.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_keypose_sd14v1.yaml b/t2iadapter_keypose_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_keypose_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_openpose_sd14v1.yaml b/t2iadapter_openpose_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_openpose_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_seg_sd14v1.yaml b/t2iadapter_seg_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_seg_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_sketch_sd14v1.yaml b/t2iadapter_sketch_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_sketch_sd14v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_sketch_sd15v2.yaml b/t2iadapter_sketch_sd15v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3 --- /dev/null +++ b/t2iadapter_sketch_sd15v2.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 64 + use_conv: false \ No newline at end of file diff --git a/t2iadapter_style_sd14v1.yaml b/t2iadapter_style_sd14v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f634fbe7e46b9e4057298af395e0a28ac1516cf --- /dev/null +++ b/t2iadapter_style_sd14v1.yaml @@ -0,0 +1,8 @@ +model: + target: scripts.adapter.StyleAdapter + params: + width: 1024 + context_dim: 768 + num_head: 8 + n_layes: 3 + num_token: 8 \ No newline at end of file diff --git a/t2iadapter_zoedepth_sd15v1.yaml b/t2iadapter_zoedepth_sd15v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5 --- /dev/null +++ b/t2iadapter_zoedepth_sd15v1.yaml @@ -0,0 +1,9 @@ +model: + target: tencentarc.t21_adapter + params: + channels: [320, 640, 1280, 1280] + nums_rb: 2 + ksize: 1 + sk: true + cin: 192 + use_conv: false \ No newline at end of file