diff --git a/audio_encoders/put_audio_encoder_models_here b/audio_encoders/put_audio_encoder_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/checkpoints/put_checkpoints_here b/checkpoints/put_checkpoints_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/clip/put_clip_or_text_encoder_models_here b/clip/put_clip_or_text_encoder_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/clip/umt5_xxl_fp16.safetensors b/clip/umt5_xxl_fp16.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9aa640a3a4ba42fcb2baa7c929a0e0ec9cdc9968
--- /dev/null
+++ b/clip/umt5_xxl_fp16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b8850f1961e1cf8a77cca4c964a358d303f490833c6c087d0cff4b2f99db2af
+size 11366399385
diff --git a/clip_vision/clip_vision_h.safetensors b/clip_vision/clip_vision_h.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0451a9fda3873d0db8725fcea14e5660e1793f83
--- /dev/null
+++ b/clip_vision/clip_vision_h.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64a7ef761bfccbadbaa3da77366aac4185a6c58fa5de5f589b42a65bcc21f161
+size 1264219396
diff --git a/clip_vision/put_clip_vision_models_here b/clip_vision/put_clip_vision_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
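Every binary model in this commit is stored as a Git LFS pointer like the ones above: the repository tracks only the version line, the sha256 oid, and the byte size, while the weights themselves live in LFS storage. A minimal sketch (Python; the local path is illustrative) for checking that a fetched file actually matches its pointer:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so multi-GB checkpoints don't need to fit in RAM.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid taken from the clip/umt5_xxl_fp16.safetensors pointer above
expected = "7b8850f1961e1cf8a77cca4c964a358d303f490833c6c087d0cff4b2f99db2af"
actual = sha256_of("clip/umt5_xxl_fp16.safetensors")
assert actual == expected, f"hash mismatch: {actual}"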
diff --git a/configs/anything_v3.yaml b/configs/anything_v3.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8bcfe584ae73d60e2c7a6f89b3f7befbd487ea34
--- /dev/null
+++ b/configs/anything_v3.yaml
@@ -0,0 +1,73 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        layer: "hidden"
+        layer_idx: -2
diff --git a/configs/v1-inference.yaml b/configs/v1-inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d4effe569e897369918625f9d8be5603a0e6a0d6
--- /dev/null
+++ b/configs/v1-inference.yaml
@@ -0,0 +1,70 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
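These YAML files follow the CompVis/Stability ldm convention: every target: names a class and the sibling params: block holds its constructor arguments. A rough sketch of how such a config becomes a model, assuming omegaconf and the ldm package are importable (this mirrors ldm.util.instantiate_from_config):

import importlib

from omegaconf import OmegaConf

def instantiate_from_config(config):
    # "a.b.ClassName" -> import a.b, then call ClassName(**params)
    module, cls = config["target"].rsplit(".", 1)
    return getattr(importlib.import_module(module), cls)(**config.get("params", dict()))

config = OmegaConf.load("configs/v1-inference.yaml")
model = instantiate_from_config(config.model)  # LatentDiffusion, randomly initialized
# weights are then loaded from a checkpoint via model.load_state_dict(...)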
diff --git a/configs/v1-inference_clip_skip_2.yaml b/configs/v1-inference_clip_skip_2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8bcfe584ae73d60e2c7a6f89b3f7befbd487ea34
--- /dev/null
+++ b/configs/v1-inference_clip_skip_2.yaml
@@ -0,0 +1,73 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        layer: "hidden"
+        layer_idx: -2
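The only difference from v1-inference.yaml is the cond_stage_config params: layer: "hidden" with layer_idx: -2 makes the frozen CLIP encoder return its penultimate hidden state, i.e. "clip skip 2". Sketched against the Hugging Face transformers CLIP text model (model id and prompt are placeholders):

import torch
from transformers import CLIPTextModel, CLIPTokenizer

name = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(name)
text_model = CLIPTextModel.from_pretrained(name).eval()

tokens = tokenizer(["a photo of a cat"], padding="max_length", max_length=77,
                   truncation=True, return_tensors="pt")
with torch.no_grad():
    out = text_model(input_ids=tokens.input_ids, output_hidden_states=True)

cond = out.hidden_states[-2]  # layer_idx: -2 -> penultimate layer, shape [1, 77, 768]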
diff --git a/configs/v1-inference_clip_skip_2_fp16.yaml b/configs/v1-inference_clip_skip_2_fp16.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7eca31c7b5e571c2b1348e94ed9d69978ebd2d52
--- /dev/null
+++ b/configs/v1-inference_clip_skip_2_fp16.yaml
@@ -0,0 +1,74 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      params:
+        layer: "hidden"
+        layer_idx: -2
diff --git a/configs/v1-inference_fp16.yaml b/configs/v1-inference_fp16.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..147f42b17b835cc839338156f99e8f971df5c1aa
--- /dev/null
+++ b/configs/v1-inference_fp16.yaml
@@ -0,0 +1,71 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
diff --git a/configs/v1-inpainting-inference.yaml b/configs/v1-inpainting-inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..45f3f82d461cd8c6109f26ec3b1da75366eda0b0
--- /dev/null
+++ b/configs/v1-inpainting-inference.yaml
@@ -0,0 +1,71 @@
+model:
+  base_learning_rate: 7.5e-05
+  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: hybrid # important
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    finetune_keys: null
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 9 # 4 data + 4 downscaled image + 1 mask
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
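The inpainting UNet's in_channels: 9 (together with conditioning_key: hybrid) means the denoiser input is the noised latent concatenated channel-wise with the conditioning; per the config comment the 9 channels are 4 latent + 4 encoded masked image + 1 downscaled mask, though the exact concat order is an implementation detail of LatentInpaintDiffusion. Dummy-tensor sketch:

import torch

b = 1
noisy_latent = torch.randn(b, 4, 64, 64)         # x_t ("4 data")
masked_image_latent = torch.randn(b, 4, 64, 64)  # VAE encoding of image * (1 - mask)
mask = torch.rand(b, 1, 64, 64)                  # mask resized to latent resolution

unet_input = torch.cat([noisy_latent, masked_image_latent, mask], dim=1)
assert unet_input.shape[1] == 9  # matches in_channels above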
diff --git a/configs/v2-inference-v.yaml b/configs/v2-inference-v.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8ec8dfbfefe94ae8522c93017668fea78d580acf
--- /dev/null
+++ b/configs/v2-inference-v.yaml
@@ -0,0 +1,68 @@
+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
diff --git a/configs/v2-inference-v_fp32.yaml b/configs/v2-inference-v_fp32.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d5c9b9cb29ca162ade44a7c922f59e75d7d57813
--- /dev/null
+++ b/configs/v2-inference-v_fp32.yaml
@@ -0,0 +1,68 @@
+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: False
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
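parameterization: "v" is what separates these two files from the plain v2-inference configs: the UNet predicts v = sqrt(alpha_bar) * eps - sqrt(1 - alpha_bar) * x0 rather than the noise eps itself. A self-checking sketch of the algebra with dummy tensors:

import torch

alpha_bar = torch.tensor(0.7)                 # cumulative alpha at some timestep
a, s = alpha_bar.sqrt(), (1.0 - alpha_bar).sqrt()

x0 = torch.randn(4, 64, 64)                   # clean latent
eps = torch.randn(4, 64, 64)                  # noise
x_t = a * x0 + s * eps                        # forward diffusion
v = a * eps - s * x0                          # target a v-model is trained on

x0_hat = a * x_t - s * v                      # sampler's recovery of x0 from v
assert torch.allclose(x0_hat, x0, atol=1e-5)  # holds because a**2 + s**2 == 1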
diff --git a/configs/v2-inference.yaml b/configs/v2-inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..152c4f3c2b36c3b246a9cb10eb8166134b0d2e1c
--- /dev/null
+++ b/configs/v2-inference.yaml
@@ -0,0 +1,67 @@
+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
diff --git a/configs/v2-inference_fp32.yaml b/configs/v2-inference_fp32.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0d03231f3f2c2e8ef8fbe0d781e5f3d65409ef3a
--- /dev/null
+++ b/configs/v2-inference_fp32.yaml
@@ -0,0 +1,67 @@
+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: False
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
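All of these configs share scale_factor: 0.18215: first-stage latents are multiplied by it after encoding and divided by it before decoding, so the diffusion model sees roughly unit-variance inputs. Sketch, assuming vae is the AutoencoderKL built from first_stage_config:

scale_factor = 0.18215

def encode_to_latent(vae, image):
    # ldm's AutoencoderKL.encode returns a DiagonalGaussianDistribution
    return scale_factor * vae.encode(image).sample()

def decode_from_latent(vae, z):
    return vae.decode(z / scale_factor)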
diff --git a/configs/v2-inpainting-inference.yaml b/configs/v2-inpainting-inference.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..32a9471d71b828c51bcbbabfe34c5f6c8282c803
--- /dev/null
+++ b/configs/v2-inpainting-inference.yaml
@@ -0,0 +1,158 @@
+model:
+  base_learning_rate: 5.0e-05
+  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: hybrid
+    scale_factor: 0.18215
+    monitor: val/loss_simple_ema
+    finetune_keys: null
+    use_ema: False
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        image_size: 32 # unused
+        in_channels: 9
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: [ ]
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
+
+
+data:
+  target: ldm.data.laion.WebDataModuleFromConfig
+  params:
+    tar_base: null # for concat as in LAION-A
+    p_unsafe_threshold: 0.1
+    filter_word_list: "data/filters.yaml"
+    max_pwatermark: 0.45
+    batch_size: 8
+    num_workers: 6
+    multinode: True
+    min_size: 512
+    train:
+      shards:
+        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
+        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
+        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
+        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
+        - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
+      shuffle: 10000
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.RandomCrop
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddMask
+        params:
+          mode: "512train-large"
+          p_drop: 0.25
+    # NOTE use enough shards to avoid empty validation loops in workers
+    validation:
+      shards:
+        - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
+      shuffle: 0
+      image_key: jpg
+      image_transforms:
+      - target: torchvision.transforms.Resize
+        params:
+          size: 512
+          interpolation: 3
+      - target: torchvision.transforms.CenterCrop
+        params:
+          size: 512
+      postprocess:
+        target: ldm.data.laion.AddMask
+        params:
+          mode: "512train-large"
+          p_drop: 0.25
+
+lightning:
+  find_unused_parameters: True
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+
+  callbacks:
+    metrics_over_trainsteps_checkpoint:
+      params:
+        every_n_train_steps: 10000
+
+    image_logger:
+      target: main.ImageLogger
+      params:
+        enable_autocast: False
+        disabled: False
+        batch_frequency: 1000
+        max_images: 4
+        increase_log_steps: False
+        log_first_step: False
+        log_images_kwargs:
+          use_ema_scope: False
+          inpaint: False
+          plot_progressive_rows: False
+          plot_diffusion_rows: False
+          N: 4
+          unconditional_guidance_scale: 5.0
+          unconditional_guidance_label: [""]
+          ddim_steps: 50 # todo check these out for depth2img,
+          ddim_eta: 0.0 # todo check these out for depth2img,
+
+  trainer:
+    benchmark: True
+    val_check_interval: 5000000
+    num_sanity_val_steps: 0
+    accumulate_grad_batches: 1
diff --git a/controlnet/put_controlnets_and_t2i_here b/controlnet/put_controlnets_and_t2i_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detection/vitpose_h_wholebody_data.bin b/detection/vitpose_h_wholebody_data.bin
new file mode 100644
index 0000000000000000000000000000000000000000..309a258c01a80402762b1f57208b90c2f2803ae6
--- /dev/null
+++ b/detection/vitpose_h_wholebody_data.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6a9e7cb3a87ed65a098b096029e70150408acfafc3d695019a66b289d7719e1
+size 2548958740
diff --git a/detection/vitpose_h_wholebody_model.onnx b/detection/vitpose_h_wholebody_model.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..26819d54d1113adb18c975435f4c2104094802cd
--- /dev/null
+++ b/detection/vitpose_h_wholebody_model.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f21466cd6c93d0066782ad5923c14a4e6569133def212dc2895c73596c2e553b
+size 420252
diff --git a/detection/yolov10m.onnx b/detection/yolov10m.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..b781a65ff19ca23d2db19972c662f71104a4ecc8
--- /dev/null
+++ b/detection/yolov10m.onnx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89b526498a6d55f869a6ab52e3a2eb20ad45b3711c1f7de3dd9ca0b399dfd6d7
+size 61659339
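The ONNX detectors under detection/ can be exercised with onnxruntime; the 640x640 NCHW float input assumed below is typical for YOLO exports but is not stated by the diff, so inspect the session's inputs first:

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("detection/yolov10m.onnx",
                               providers=["CPUExecutionProvider"])
inp = session.get_inputs()[0]
print(inp.name, inp.shape)                     # confirm the real input spec

image = np.random.rand(1, 3, 640, 640).astype(np.float32)  # dummy frame
outputs = session.run(None, {inp.name: image})
print([o.shape for o in outputs])              # raw detections; layout is export-specific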
diff --git a/diffusers/put_diffusers_models_here b/diffusers/put_diffusers_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusion_models/put_diffusion_model_files_here b/diffusion_models/put_diffusion_model_files_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/diffusion_models/wan2.2_animate_14B_bf16.safetensors b/diffusion_models/wan2.2_animate_14B_bf16.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..77ddd725cd59a9b6efe7c269a8ab207825cc7be9
--- /dev/null
+++ b/diffusion_models/wan2.2_animate_14B_bf16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d37cb0120488dec2a061135aa3018426925318a60838c3de935c829abc88667
+size 34549787368
diff --git a/diffusion_models/wan2.2_t2v_low_noise_14B_fp16.safetensors b/diffusion_models/wan2.2_t2v_low_noise_14B_fp16.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0b9aae7cbf1245a23edc3a2a03f32e2ae43c272c
--- /dev/null
+++ b/diffusion_models/wan2.2_t2v_low_noise_14B_fp16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:431d1613ffa809ae1f735b661a01788c6d74991f51efd01f45d5aee955ccd224
+size 28577095592
diff --git a/embeddings/put_embeddings_or_textual_inversion_concepts_here b/embeddings/put_embeddings_or_textual_inversion_concepts_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/gligen/put_gligen_models_here b/gligen/put_gligen_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hypernetworks/put_hypernetworks_here b/hypernetworks/put_hypernetworks_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/latent_upscale_models/put_latent_upscale_models_here b/latent_upscale_models/put_latent_upscale_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/loras/lightx2v_I2V_14B_480p_cfg_step_distill_rank256_bf16.safetensors b/loras/lightx2v_I2V_14B_480p_cfg_step_distill_rank256_bf16.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ce8dfe3127d8fd1080e3b1e8fb37793b84ba0245
--- /dev/null
+++ b/loras/lightx2v_I2V_14B_480p_cfg_step_distill_rank256_bf16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f974aced60f1d833eca0f5d6851346fbb6eacc307c6371555b565f5617fe3859
+size 2923503944
diff --git a/loras/lightx2v_elite_it2v_animate_face.safetensors b/loras/lightx2v_elite_it2v_animate_face.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..414d9578ae2aba7230710a66091f9107c82ea92e
--- /dev/null
+++ b/loras/lightx2v_elite_it2v_animate_face.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d85056b5a2df1f211a42c232ce71c0c23636313c84460df21765eee211cb518
+size 3257907064
diff --git a/loras/put_loras_here b/loras/put_loras_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/loras/wan2.2_animate_14B_relight_lora_bf16.safetensors b/loras/wan2.2_animate_14B_relight_lora_bf16.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b0d09b9692f7a7f27bff2682d98e2558a2d44d23
--- /dev/null
+++ b/loras/wan2.2_animate_14B_relight_lora_bf16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f4b6b9d3bc745a86e7bfd511f3880d90cf59c3bded9584d92c028f583fa74a3
+size 1436673432
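A quick way to see what a LoRA such as wan2.2_animate_14B_relight_lora_bf16.safetensors patches is to read just the safetensors header; safe_open lists keys and shapes without loading the ~1.4 GB of tensors (the lora_A/lora_B key naming below is an expectation, not something the diff guarantees):

from safetensors import safe_open

path = "loras/wan2.2_animate_14B_relight_lora_bf16.safetensors"
with safe_open(path, framework="pt", device="cpu") as f:
    for key in sorted(f.keys())[:8]:           # e.g. ...lora_A / ...lora_B pairs
        print(key, f.get_slice(key).get_shape())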
diff --git a/model_patches/put_model_patches_here b/model_patches/put_model_patches_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/photomaker/put_photomaker_models_here b/photomaker/put_photomaker_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sam2/.cache/huggingface/.gitignore b/sam2/.cache/huggingface/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f59ec20aabf5842d237244ece8c81ab184faeac1
--- /dev/null
+++ b/sam2/.cache/huggingface/.gitignore
@@ -0,0 +1 @@
+*
\ No newline at end of file
diff --git a/sam2/.cache/huggingface/download/sam2.1_hiera_base_plus-fp16.safetensors.lock b/sam2/.cache/huggingface/download/sam2.1_hiera_base_plus-fp16.safetensors.lock
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sam2/.cache/huggingface/download/sam2.1_hiera_base_plus-fp16.safetensors.metadata b/sam2/.cache/huggingface/download/sam2.1_hiera_base_plus-fp16.safetensors.metadata
new file mode 100644
index 0000000000000000000000000000000000000000..d7618916a28c1414a688fd64bd3950a8016daf20
--- /dev/null
+++ b/sam2/.cache/huggingface/download/sam2.1_hiera_base_plus-fp16.safetensors.metadata
@@ -0,0 +1,3 @@
+f885607d88bb3f9145efa49c3e3c50a9e5bf13eb
+a2693628452963a5f17e73a70a90b5faa112109307c828dec36e5fb407061005
+1775155236.8529332
diff --git a/sam2/sam2.1_hiera_base_plus-fp16.safetensors b/sam2/sam2.1_hiera_base_plus-fp16.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2f30b72892eacb5500ab87cfc2c3308bc5f03a70
--- /dev/null
+++ b/sam2/sam2.1_hiera_base_plus-fp16.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2693628452963a5f17e73a70a90b5faa112109307c828dec36e5fb407061005
+size 161773292
diff --git a/sams/sam_vit_b_01ec64.pth b/sams/sam_vit_b_01ec64.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ab7d111e57bd052a76fe669986560e3555e9c8f6
--- /dev/null
+++ b/sams/sam_vit_b_01ec64.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2df62732614e57411cdcf32a23ffdf28910380d03139ee0f4fcbe91eb8c912
+size 375042383
diff --git a/sams/sam_vit_l_0b3195.pth b/sams/sam_vit_l_0b3195.pth
new file mode 100644
index 0000000000000000000000000000000000000000..87a638d6b789dd2b10fc7414a88dacc34a50769a
--- /dev/null
+++ b/sams/sam_vit_l_0b3195.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3adcc4315b642a4d2101128f611684e8734c41232a17c648ed1693702a49a622
+size 1249524607
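The sams/ checkpoints are the original Segment Anything weights; a minimal loading sketch with Meta's segment-anything package, where the registry key must match the variant encoded in the filename (vit_b for sam_vit_b_01ec64.pth, vit_l for sam_vit_l_0b3195.pth):

import numpy as np
from segment_anything import SamPredictor, sam_model_registry

sam = sam_model_registry["vit_b"](checkpoint="sams/sam_vit_b_01ec64.pth")
predictor = SamPredictor(sam)

image = np.zeros((512, 512, 3), dtype=np.uint8)  # stand-in for a real RGB image
predictor.set_image(image)
masks, scores, _ = predictor.predict(point_coords=np.array([[256, 256]]),
                                     point_labels=np.array([1]))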
diff --git a/style_models/put_t2i_style_model_here b/style_models/put_t2i_style_model_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/text_encoders/put_text_encoder_files_here b/text_encoders/put_text_encoder_files_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ultralytics/bbox/face_yolov8m.pt b/ultralytics/bbox/face_yolov8m.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dbfa5813f1ecf8c0b80c12fc5951a706afdeaf30
--- /dev/null
+++ b/ultralytics/bbox/face_yolov8m.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3893a92c5c1907136b6cc75404094db767c1e0cfefe1b43e87dad72af2e4c9f
+size 51996128
diff --git a/ultralytics/bbox/hand_yolov8n.pt b/ultralytics/bbox/hand_yolov8n.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c2ba2d9d01df9bd230fd4a110de7e0ce06875463
--- /dev/null
+++ b/ultralytics/bbox/hand_yolov8n.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3991202eb69e9ddcb3b9ba80cdeb41e734ffaf844403d6c9f47d515cd88c6f29
+size 6237883
diff --git a/ultralytics/bbox/nails_seg_s_yolov8_v1.pt b/ultralytics/bbox/nails_seg_s_yolov8_v1.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e2412ddd16f6f8102ce8e7b26a62e2eadb10c2c8
--- /dev/null
+++ b/ultralytics/bbox/nails_seg_s_yolov8_v1.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99b7d1c6ceb4bde32d80fe7ae8c8eb809c27d99b55cf9db54b6692afe68f4070
+size 23860899
diff --git a/ultralytics/bbox/nipple.pt b/ultralytics/bbox/nipple.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9884b138d6134a048a9b801bbb6cd9ad2a75a2a0
--- /dev/null
+++ b/ultralytics/bbox/nipple.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67e04f8d23cb3e56f94a2b892657e500184a9cceabd7cfad2adeb313c19c5b5c
+size 36613559
diff --git a/ultralytics/bbox/phone_v01.pt b/ultralytics/bbox/phone_v01.pt
new file mode 100644
index 0000000000000000000000000000000000000000..30f8a9e8c0352ff1a20f84fdd410e1e733f45000
--- /dev/null
+++ b/ultralytics/bbox/phone_v01.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c69b2060fcbd530665cc20be3f95dee4762359d71906e5599e6816c7bc30ccf1
+size 6328174
diff --git a/ultralytics/bbox/pussy.pt b/ultralytics/bbox/pussy.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6c885b229320bd28cec3ece88038efccd1544581
--- /dev/null
+++ b/ultralytics/bbox/pussy.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:431f9bb029ca8a06905054cd56659268e165b6f98651fd651fd23d4dc4408a88
+size 6241198
diff --git a/ultralytics/bbox/yolov8m.pt b/ultralytics/bbox/yolov8m.pt
new file mode 100644
index 0000000000000000000000000000000000000000..eea29123e0ae14239bf731ef2c912b8afa2cbd14
--- /dev/null
+++ b/ultralytics/bbox/yolov8m.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d4a90cdc7a21786cc59cd19778e9eafff836df9e2da32524737c7ee6efe4fe5
+size 52136884
diff --git a/ultralytics/bbox/yolov8n-pose.pt b/ultralytics/bbox/yolov8n-pose.pt
new file mode 100644
index 0000000000000000000000000000000000000000..fc480842e923acaa94cec02a5335edbef298a0e1
--- /dev/null
+++ b/ultralytics/bbox/yolov8n-pose.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6fa93dd1ee4a2c18c900a45c1d864a1c6f7aba75d84f91648a30b7fb641d212
+size 6832633
diff --git a/ultralytics/bbox/yolov8n.pt b/ultralytics/bbox/yolov8n.pt
new file mode 100644
index 0000000000000000000000000000000000000000..719e6f1dbdfe7c560e5933fc8b0c5a7e857d0234
--- /dev/null
+++ b/ultralytics/bbox/yolov8n.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f59b3d833e2ff32e194b5bb8e08d211dc7c5bdf144b90d2c8412c47ccfc83b36
+size 6549796
diff --git a/ultralytics/segm/person_yolov8m-seg.pt b/ultralytics/segm/person_yolov8m-seg.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7b072e035e0c72e7b62767f4ba6d53be332da820
--- /dev/null
+++ b/ultralytics/segm/person_yolov8m-seg.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8ab26f517173b1fe8342d336a09f443eb61cb08dcbfc78d53fff4c2547ae81e
+size 54827683
diff --git a/ultralytics/segm/yolov8m-seg.pt b/ultralytics/segm/yolov8m-seg.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e9103e1eb76addbd7cf20ac60170e315f5a6eccd
--- /dev/null
+++ b/ultralytics/segm/yolov8m-seg.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51fa7e5ef385efa6d5b1d8e31b73399be6ed5d7ca71bda4bd4b2794bb445c4f4
+size 54921020
diff --git a/ultralytics/segm/yolov8n-seg.pt b/ultralytics/segm/yolov8n-seg.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5a24abfad237ab25fef7484a351fd667b9e1e601
--- /dev/null
+++ b/ultralytics/segm/yolov8n-seg.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7cd8f929e1903d78a12a48efecab430209f18dc46cb96c3599a5980c63c423c
+size 7071756
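The bbox/ vs segm/ split mirrors the layout used by ADetailer and the ComfyUI Impact Pack for ultralytics detectors. Usage sketch with the ultralytics package (input path and confidence threshold are illustrative):

from ultralytics import YOLO

model = YOLO("ultralytics/bbox/face_yolov8m.pt")
results = model("input.png", conf=0.3)         # input.png is a placeholder path

for box in results[0].boxes:                   # one Results object per input image
    print(box.xyxy.tolist(), float(box.conf))  # corner coordinates + confidence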
diff --git a/unet/put_unet_files_here b/unet/put_unet_files_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/upscale_models/1xSkinContrast-High-SuperUltraCompact.pth b/upscale_models/1xSkinContrast-High-SuperUltraCompact.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5a4e6fa7b6974d6fa8568094386d9eaaf1d48bc8
--- /dev/null
+++ b/upscale_models/1xSkinContrast-High-SuperUltraCompact.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bbea62563203f099ea1cf60ce483ac60d22eb93d30f11be4cb1a229e3585e9e
+size 181474
diff --git a/upscale_models/put_esrgan_and_other_upscale_models_here b/upscale_models/put_esrgan_and_other_upscale_models_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vae/put_vae_here b/vae/put_vae_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/vae/wan_2.1_vae.safetensors b/vae/wan_2.1_vae.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5ff32b52dac309662138ca42604696d34eaad7b8
--- /dev/null
+++ b/vae/wan_2.1_vae.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fc39d31359a4b0a64f55876d8ff7fa8d780956ae2cb13463b0223e15148976b
+size 253815318
diff --git a/vae_approx/put_taesd_encoder_pth_and_taesd_decoder_pth_here b/vae_approx/put_taesd_encoder_pth_and_taesd_decoder_pth_here
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391