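# Training configuration for a ControlLDM (SD 2.1 UNet + VAE + OpenCLIP text
# encoder + ControlNet) together with a stage-1 SwinIR restoration module and
# a CodeFormer-style synthetic-degradation dataset.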
model:
  cldm:
    target: model.cldm.ControlLDM
    params:
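      # 0.18215 is the standard Stable Diffusion scaling constant applied to
      # VAE latents so that they have roughly unit variance.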
      latent_scale_factor: 0.18215
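      # UNet hyperparameters matching the SD 2.1 denoiser; context_dim: 1024
      # corresponds to the text-encoder width in clip_cfg below.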
      unet_cfg:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False
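      # AutoencoderKL config: four resolution stages (ch_mult has length 4),
      # i.e. 8x spatial downsampling, so a 512x512 image becomes a 64x64
      # latent with 4 channels.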
      vae_cfg:
        embed_dim: 4
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
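      # These dimensions match OpenCLIP ViT-H/14 (32-layer, width-1280 vision
      # tower; 24-layer, width-1024 text tower); layer: "penultimate" takes
      # text features from the second-to-last block, as SD 2.x does.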
      clip_cfg:
        embed_dim: 1024
        vision_cfg:
          image_size: 224
          layers: 32
          width: 1280
          head_width: 80
          patch_size: 14
        text_cfg:
          context_length: 77
          vocab_size: 49408
          width: 1024
          heads: 16
          layers: 24
        layer: "penultimate"
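      # ControlNet branch mirroring the UNet encoder blocks; hint_channels: 4
      # (rather than 3) indicates the condition is supplied in the 4-channel
      # latent space, not as an RGB image.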
      controlnet_cfg:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        hint_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False
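
  # Stage-1 SwinIR restoration module. With unshuffle: True the input is
  # pixel-unshuffled by unshuffle_scale: 8 before the Swin body, and sf: 8
  # upsamples by the same factor, so output size matches input size.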
  swinir:
    target: model.swinir.SwinIR
    params:
      img_size: 64
      patch_size: 1
      in_chans: 3
      embed_dim: 180
      depths: [6, 6, 6, 6, 6, 6, 6, 6]
      num_heads: [6, 6, 6, 6, 6, 6, 6, 6]
      window_size: 8
      mlp_ratio: 2
      sf: 8
      img_range: 1.0
      upsampler: "nearest+conv"
      resi_connection: "1conv"
      unshuffle: True
      unshuffle_scale: 8
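
  # Gaussian diffusion with the standard Stable Diffusion noise schedule:
  # linear_start/linear_end of 0.00085/0.0120 over 1000 timesteps.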
  diffusion:
    target: model.gaussian_diffusion.Diffusion
    params:
      linear_start: 0.00085
      linear_end: 0.0120
      timesteps: 1000
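
# CodeFormer-style degradation pipeline: 512x512 center crops degraded with
# random isotropic/anisotropic Gaussian blur, downsampling, noise, and JPEG
# compression; the ranges below are the intervals each factor is sampled from.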
dataset:
  train:
    target: dataset.codeformer.CodeformerDataset
    params:
      # training file list path
      file_list:
      file_backend_cfg:
        target: dataset.file_backend.HardDiskBackend
      out_size: 512
      crop_type: center
      blur_kernel_size: 41
      kernel_list: ['iso', 'aniso']
      kernel_prob: [0.5, 0.5]
      blur_sigma: [0.1, 12]
      downsample_range: [1, 12]
      noise_range: [0, 15]
      jpeg_range: [30, 100]
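
# Training loop settings. The blank fields (sd_path, exp_dir, swinir_path,
# num_workers) are left for the user to fill in; resume: ~ is YAML null,
# i.e. no checkpoint to resume from.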
train:
  # pretrained sd v2.1 path
  sd_path:
  # experiment directory path
  exp_dir:
  # stage 1 swinir path
  swinir_path:
  learning_rate: 1e-4
  # ImageNet 1k (1.3M images):
  #   batch size = 192, lr = 1e-4, total training steps = 25k
  # Our filtered laion2b-en (15M images):
  #   batch size = 256, lr = 1e-4 (first 30k), 1e-5 (next 50k), total training steps = 80k
  batch_size: 256
  num_workers:
  train_steps: 30000
  log_every: 50
  ckpt_every: 10000
  image_every: 1000
  resume: ~
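
# A minimal loading sketch (assumes OmegaConf and an ldm-style
# instantiate_from_config helper; the config path is illustrative):
#
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("path/to/this_config.yaml")
#   cldm = instantiate_from_config(cfg.model.cldm)      # model.cldm.ControlLDM
#   swinir = instantiate_from_config(cfg.model.swinir)  # model.swinir.SwinIR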