{
  "linear_start": 0.0015,
  "linear_end": 0.0295,
  "num_timesteps_cond": 1,
  "log_every_t": 200,
  "timesteps": 1000,
  "first_stage_key": "target_img",
  "cond_stage_key": "table, prev_img",
  "image_size": 64,
  "channels": 3,
  "cond_stage_trainable": true,
  "conditioning_key": "crossattn",
  "monitor": "val/loss_simple_ema",
  "scale_factor": 0.18215,
  "use_ema": true,
  "load_ema": false,
  "unet_config": {
    "target": "cheff.ldm.modules.diffusionmodules.openaimodel.UNetModel",
    "params": {
      "image_size": 64,
      "in_channels": 3,
      "out_channels": 3,
      "model_channels": 224,
      "attention_resolutions": [8, 4, 2],
      "num_res_blocks": 2,
      "channel_mult": [1, 2, 4, 4],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    }
  },
  "first_stage_config": {
    "target": "cheff.ldm.models.autoencoder.AutoencoderKL",
    "params": {
      "embed_dim": 3,
      "ckpt_path": null,
      "ddconfig": {
        "double_z": true,
        "z_channels": 3,
        "resolution": 256,
        "in_channels": 3,
        "out_ch": 3,
        "ch": 128,
        "ch_mult": [1, 2, 4],
        "num_res_blocks": 2,
        "attn_resolutions": [],
        "dropout": 0.0
      },
      "lossconfig": {
        "target": "torch.nn.Identity"
      }
    }
  },
  "cond_stage_config": {
    "target": "cheff.ldm.modules.encoders.modules.MultiModalTransformerAdaptor",
    "params": {
      "autoencoder_config": {
        "embed_dim": 3,
        "ckpt_path": null,
        "ddconfig": {
          "double_z": true,
          "z_channels": 3,
          "resolution": 256,
          "in_channels": 3,
          "out_ch": 3,
          "ch": 128,
          "ch_mult": [1, 2, 4],
          "num_res_blocks": 2,
          "attn_resolutions": [],
          "dropout": 0.0
        },
        "lossconfig": {
          "target": "torch.nn.Identity"
        }
      },
      "clip_visual_enc_config": {
        "input_resolution": 256,
        "layers": 12,
        "width": 768,
        "patch_size": 32,
        "heads": 12
      },
      "clip_enc_checkpoint": null,
      "context_dim": 768,
      "condition_feat_dim": 1024,
      "clip_trainable": true
    }
  }
}