# Diffusion model configuration.
#
# Layout convention used throughout this file: each component is a mapping with
# a `target` (dotted path naming the class) and an optional `params` mapping.
# NOTE(review): presumably `params` is passed as keyword arguments to `target`
# by the loader — confirm against the code that consumes this file.
#
# NOTE(review): the nesting below was reconstructed from key order (every
# `target:`/`params:` pair is placed under the `*_config:` key or list item
# that precedes it). Keys, values and their order are unchanged. Confirm the
# placement of `weighting_config`, `scaling_config`, `discretization_config`
# (here: inside the denoiser's params) and `lossconfig` (here: sibling of
# `ddconfig`) against the constructors of the target classes.
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    # Denoiser wrapper and its weighting / scaling / discretization parts.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    # Denoising network (UNet).
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        # channel_mult and transformer_depth both have three entries.
        # NOTE(review): presumably one entry per resolution level — confirm.
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        # NOTE(review): names an xformers attention variant; presumably needs
        # the xformers package to be installed — confirm.
        spatial_transformer_attn_type: softmax-xformers
        legacy: False

    # Conditioning: a list of embedders, each bound to one input key.
    # None of the embedders is trainable.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # Two text embedders follow; both read the same `txt` input key.
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11

          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False

          # Three embedders of the same class, one per size/crop input key,
          # each configured with outdim 256.
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    # First-stage autoencoder.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          # NOTE(review): names an xformers attention variant; presumably needs
          # the xformers package to be installed — confirm.
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        # The loss slot is filled with torch.nn.Identity.
        lossconfig:
          target: torch.nn.Identity