File size: 3,241 Bytes
b708a95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
model:
  cldm:
    target: model.cldm.ControlLDM
    params:
      latent_scale_factor: 0.18215
      unet_cfg:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False
      vae_cfg:
        embed_dim: 4
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
      clip_cfg:
        embed_dim: 1024
        vision_cfg:
          image_size: 224
          layers: 32
          width: 1280
          head_width: 80
          patch_size: 14
        text_cfg:
          context_length: 77
          vocab_size: 49408
          width: 1024
          heads: 16
          layers: 24
        layer: "penultimate"
      controlnet_cfg:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        hint_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

  swinir:
    target: model.swinir.SwinIR
    params:
      img_size: 64
      patch_size: 1
      in_chans: 3
      embed_dim: 180
      depths: [6, 6, 6, 6, 6, 6, 6, 6]
      num_heads: [6, 6, 6, 6, 6, 6, 6, 6]
      window_size: 8
      mlp_ratio: 2
      sf: 8
      img_range: 1.0
      upsampler: "nearest+conv"
      resi_connection: "1conv"
      unshuffle: True
      unshuffle_scale: 8

  diffusion:
    target: model.gaussian_diffusion.Diffusion
    params:
      linear_start: 0.00085
      linear_end: 0.0120
      timesteps: 1000

dataset:
  train:
    target: dataset.codeformer.CodeformerDataset
    params:
      # training file list path
      file_list: 
      file_backend_cfg:
        target: dataset.file_backend.HardDiskBackend
      out_size: 512
      crop_type: center
      blur_kernel_size: 41
      kernel_list: ['iso', 'aniso']
      kernel_prob: [0.5, 0.5]
      blur_sigma: [0.1, 12]
      downsample_range: [1, 12]
      noise_range: [0, 15]
      jpeg_range: [30, 100]

train:
  # pretrained sd v2.1 path
  sd_path: 
  # experiment directory path
  exp_dir: 
  # stage 1 swinir path
  swinir_path: 
  learning_rate: 1e-4
  # ImageNet 1k (1.3M images)
  # batch size = 192, lr = 1e-4, total training steps = 25k
  # Our filtered laion2b-en (15M images)
  # batch size = 256, lr = 1e-4 (first 30k), 1e-5 (next 50k), total training steps = 80k
  batch_size: 256
  num_workers: 
  train_steps: 30000
  log_every: 50
  ckpt_every: 10000
  image_every: 1000
  resume: ~