{
  "linear_start": 0.0015,
  "linear_end": 0.0295,
  "num_timesteps_cond": 1,
  "log_every_t": 200,
  "timesteps": 1000,
  "first_stage_key": "target_img",
  "cond_stage_key": "table, prev_img",
  "image_size": 64,
  "channels": 3,
  "cond_stage_trainable": true,
  "conditioning_key": "crossattn",
  "monitor": "val/loss_simple_ema",
  "scale_factor": 0.18215,
  "use_ema": true,
  "load_ema": false,
  "unet_config": {
    "target": "cheff.ldm.modules.diffusionmodules.openaimodel.UNetModel",
    "params": {
      "image_size": 64,
      "in_channels": 3,
      "out_channels": 3,
      "model_channels": 224,
      "attention_resolutions": [
        8,
        4,
        2
      ],
      "num_res_blocks": 2,
      "channel_mult": [
        1,
        2,
        4,
        4
      ],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    }
  },
  "first_stage_config": {
    "target": "cheff.ldm.models.autoencoder.AutoencoderKL",
    "params": {
      "embed_dim": 3,
      "ckpt_path": null,
      "ddconfig": {
        "double_z": true,
        "z_channels": 3,
        "resolution": 256,
        "in_channels": 3,
        "out_ch": 3,
        "ch": 128,
        "ch_mult": [
          1,
          2,
          4
        ],
        "num_res_blocks": 2,
        "attn_resolutions": [],
        "dropout": 0.0
      },
      "lossconfig": {
        "target": "torch.nn.Identity"
      }
    }
  },
  "cond_stage_config": {
    "target": "cheff.ldm.modules.encoders.modules.MultiModalTransformerAdaptor",
    "params": {
      "autoencoder_config": {
        "embed_dim": 3,
        "ckpt_path": null,
        "ddconfig": {
          "double_z": true,
          "z_channels": 3,
          "resolution": 256,
          "in_channels": 3,
          "out_ch": 3,
          "ch": 128,
          "ch_mult": [
            1,
            2,
            4
          ],
          "num_res_blocks": 2,
          "attn_resolutions": [],
          "dropout": 0.0
        },
        "lossconfig": {
          "target": "torch.nn.Identity"
        }
      },
      "clip_visual_enc_config": {
        "input_resolution": 256,
        "layers": 12,
        "width": 768,
        "patch_size": 32,
        "heads": 12
      },
      "clip_enc_checkpoint": null,
      "context_dim": 768,
      "condition_feat_dim": 1024,
      "clip_trainable": true
    }
  }
}