{
  "linear_start": 0.0015,
  "linear_end": 0.0295,
  "num_timesteps_cond": 1,
  "log_every_t": 200,
  "timesteps": 1000,
  "first_stage_key": "target_img",
  "cond_stage_key": "table, prev_img",
  "image_size": 64,
  "channels": 3,
  "cond_stage_trainable": true,
  "conditioning_key": "crossattn",
  "monitor": "val/loss_simple_ema",
  "scale_factor": 0.18215,
  "use_ema": true,
  "load_ema": false,
  "unet_config": {
    "target": "cheff.ldm.modules.diffusionmodules.openaimodel.UNetModel",
    "params": {
      "image_size": 64,
      "in_channels": 3,
      "out_channels": 3,
      "model_channels": 224,
      "attention_resolutions": [8, 4, 2],
      "num_res_blocks": 2,
      "channel_mult": [1, 2, 4, 4],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    }
  },
  "first_stage_config": {
    "target": "cheff.ldm.models.autoencoder.AutoencoderKL",
    "params": {
      "embed_dim": 3,
      "ckpt_path": null,
      "ddconfig": {
        "double_z": true,
        "z_channels": 3,
        "resolution": 256,
        "in_channels": 3,
        "out_ch": 3,
        "ch": 128,
        "ch_mult": [1, 2, 4],
        "num_res_blocks": 2,
        "attn_resolutions": [],
        "dropout": 0.0
      },
      "lossconfig": {
        "target": "torch.nn.Identity"
      }
    }
  },
  "cond_stage_config": {
    "target": "cheff.ldm.modules.encoders.modules.MultiModalTransformerAdaptor",
    "params": {
      "autoencoder_config": {
        "embed_dim": 3,
        "ckpt_path": null,
        "ddconfig": {
          "double_z": true,
          "z_channels": 3,
          "resolution": 256,
          "in_channels": 3,
          "out_ch": 3,
          "ch": 128,
          "ch_mult": [1, 2, 4],
          "num_res_blocks": 2,
          "attn_resolutions": [],
          "dropout": 0.0
        },
        "lossconfig": {
          "target": "torch.nn.Identity"
        }
      },
      "clip_visual_enc_config": {
        "input_resolution": 256,
        "layers": 12,
        "width": 768,
        "patch_size": 32,
        "heads": 12
      },
      "clip_enc_checkpoint": null,
      "context_dim": 768,
      "condition_feat_dim": 1024,
      "clip_trainable": true
    }
  }
}