{
  "model": {
    "base_learning_rate": 5e-07,
    "target": "ldm.models.diffusion.ddpm.LatentDiffusion",
    "params": {
      "ckpt_path": null,
      "linear_start": 0.0015,
      "linear_end": 0.0195,
      "num_timesteps_cond": 1,
      "log_every_t": 200,
      "timesteps": 1000,
      "first_stage_key": "image",
      "image_size": 64,
      "channels": 3,
      "cond_stage_trainable": true,
      "cond_stage_key": "faceattr",
      "conditioning_key": "crossattn",
      "monitor": "val/loss_simple_ema",
      "unet_config": {
        "target": "ldm.modules.diffusionmodules.openaimodel.UNetModel",
        "params": {
          "image_size": 64,
          "in_channels": 3,
          "out_channels": 3,
          "model_channels": 224,
          "attention_resolutions": [
            8,
            4,
            2
          ],
          "num_res_blocks": 2,
          "channel_mult": [
            1,
            2,
            3,
            4
          ],
          "num_head_channels": 32,
          "use_spatial_transformer": true,
          "transformer_depth": 1,
          "context_dim": 256
        }
      },
      "first_stage_config": {
        "target": "ldm.models.autoencoder.VQModelInterface",
        "params": {
          "embed_dim": 3,
          "n_embed": 8192,
          "ddconfig": {
            "double_z": false,
            "z_channels": 3,
            "resolution": 256,
            "in_channels": 3,
            "out_ch": 3,
            "ch": 128,
            "ch_mult": [
              1,
              2,
              4
            ],
            "num_res_blocks": 2,
            "attn_resolutions": [],
            "dropout": 0.0
          },
          "lossconfig": {
            "target": "torch.nn.Identity"
          }
        }
      },
      "cond_stage_config": {
        "target": "ldm.modules.encoders.modules.FaceEmbedder",
        "params": {
          "lmk_dim": 256,
          "comb_mode": "stack",
          "keys": [
            "image",
            "landmark"
          ],
          "attention": true,
          "merge_eyes": true,
          "face_model": "r100",
          "affine_crop": true
        }
      }
    }
  }
}