# File size: 5,901 Bytes
# 497d0e3
# Dataset settings: which directory is read and how samples are loaded.
data:
    # One dataset root, given as an absolute, user-specific path.
    # NOTE(review): not portable to other machines — confirm before reuse.
    data_dir:
    - /home/work/shared-fi-datasets-01/users/hsiang.chen/Project/Datasets/IR
    # Only one caption source ("prompt") is listed.
    caption_proportion:
        prompt: 1
    # Both external-suffix lists are explicitly empty.
    external_caption_suffixes: []
    external_clipscore_suffixes: []
    clip_thr_temperature: 0.1
    clip_thr: 25.0
    del_img_clip_thr: 0.0
    sort_dataset: false
    # Loading of precomputed text / VAE features is switched off.
    load_text_feat: false
    load_vae_feat: false
    transform: default_train
    type: IRImgDataset
    # Same value as model.image_size (256).
    image_size: 256
    hq_only: false
    valid_num: 0
    data: null
    extra: null
    # NOTE(review): presumably selects a dataset split/listing — verify against
    # the IRImgDataset implementation.
    dset: train_brief
    max_samples: null
# Network settings: architecture name, pretrained weights, precision and
# guidance-related options.
model:
    model: SD35M_D2C
    # Same safetensors file as vae.vae_pretrained.
    model_pretrained: ./checkpoints/stable-diffusion-3.5-medium/sd3.5_medium.safetensors
    # Same value as scheduler.flow_shift (3.0).
    shift: 3.0
    teacher: null
    # Same value as vae.vae_latent_dim (16).
    input_channel: 16
    # Same value as data.image_size (256).
    image_size: 256
    mixed_precision: bf16
    fp32_attention: true
    load_from: null
    # No discriminator or teacher model is configured in this section.
    discriminator_model: null
    teacher_model: null
    teacher_model_weight_dtype: null
    # Resume options; a separate top-level resume_from key is also set to
    # "latest".
    resume_from:
        checkpoint: latest
        load_ema: false
        resume_optimizer: true
        resume_lr_scheduler: true
    # NOTE(review): ASPECT_RATIO_1024 alongside image_size 256; presumably
    # unused while multi_scale is false — confirm.
    aspect_ratio_type: ASPECT_RATIO_1024
    multi_scale: false
    pe_interpolation: 1.0
    micro_condition: false
    attn_type: linear
    autocast_linear_attn: false
    ffn_type: glumbconv
    # Three entries; the last one is a YAML null, not the string "null".
    mlp_acts:
    - silu
    - silu
    - null
    mlp_ratio: 2.5
    use_pe: false
    pos_embed_type: sincos
    qk_norm: false
    class_dropout_prob: 0.1
    linear_head_dim: 32
    cross_norm: false
    cross_attn_type: flash
    logvar: false
    cfg_scale: 4
    cfg_embed: false
    cfg_embed_scale: 1.0
    guidance_type: classifier-free
    # Single-entry list.
    pag_applied_layers:
    - 8
    ladd_multi_scale: true
    head_block_ids: null
    extra: null
# VAE settings: type, weights, dtype and latent geometry.
vae:
    vae_type: SDVAE
    # Same safetensors file as model.model_pretrained.
    vae_pretrained: ./checkpoints/stable-diffusion-3.5-medium/sd3.5_medium.safetensors
    # The VAE uses float32 while model.mixed_precision is bf16.
    weight_dtype: float32
    scale_factor: 0.41407
    # Same value as model.input_channel (16).
    vae_latent_dim: 16
    vae_downsample_rate: 8
    sample_posterior: true
    extra: null
# Text-encoder settings: encoder name, weights directory, embedding
# normalisation and prompt list.
text_encoder:
    text_encoder_name: sd35-text
    # Directory under the same checkpoint root used by model and vae.
    text_encoder_pretrained: ./checkpoints/stable-diffusion-3.5-medium/text_encoders
    caption_channels: 4096
    y_norm: true
    y_norm_scale_factor: 0.01
    model_max_length: 300
    # Three entries. The second and third are multi-line plain scalars: YAML
    # folds their continuation lines into one string joined by single spaces.
    # NOTE(review): the entries read like example captions — confirm this is
    # what the consumer of chi_prompt expects.
    chi_prompt:
    - a photo of a cat
    - Convenience store entrance at night. On the glass door, a vinyl decal reads
        'OPEN FOR QUALITY'. Inside, shelves and fluorescent lights; outside, a cyclist
        passing by
    - Sunrise beach, shallow tide washing over smooth sand. A piece of weathered driftwood
        lies near the shoreline with a subtle branded text [SOS] on its surface; wet
        sand reflections, micro-ripples, sun flare at horizon.
    extra: null
# Noise-schedule and timestep-weighting settings.
scheduler:
    train_sampling_steps: 1000
    predict_flow_v: true
    noise_schedule: linear_flow
    # NOTE(review): pred_sigma is false while learn_sigma is true — confirm
    # this combination is intended.
    pred_sigma: false
    learn_sigma: true
    vis_sampler: flow_dpm-solver
    # Same value as model.shift (3.0).
    flow_shift: 3.0
    # Separate weighting schemes are named for the main loss and for the
    # discriminator.
    weighting_scheme: logit_normal
    weighting_scheme_discriminator: logit_normal_trigflow
    # Single entry; same value as train.largest_timestep (1.5708, close to
    # pi/2).
    add_noise_timesteps:
    - 1.5708
    # The logit mean/std pairs are identical for both weighting schemes.
    logit_mean: 0.0
    logit_std: 1.0
    logit_mean_discriminator: 0.0
    logit_std_discriminator: 1.0
    sigma_data: 0.5
    timestep_norm_scale_factor: 1.0
    extra: null
# Training-loop settings: batching, optimizers, LR schedule, checkpointing,
# validation prompts and loss options.
train:
    num_workers: 10
    seed: 1229
    # 4 samples per step with 8 accumulation steps.
    # NOTE(review): presumably 32 samples per optimizer update per process —
    # confirm against the trainer.
    train_batch_size: 4
    num_epochs: 100
    gradient_accumulation_steps: 8
    grad_checkpointing: true
    gradient_clip: 0.1
    gc_step: 1
    # Main optimizer: type CAMEWrapper, configured with three betas and two
    # eps values.
    optimizer:
        betas:
        - 0.9
        - 0.999
        - 0.9999
        eps:
        - 1.0e-30
        - 1.0e-16
        lr: 5.0e-05
        type: CAMEWrapper
        weight_decay: 0.0
    # Second optimizer block (AdamW).
    # NOTE(review): presumably for a discriminator, but
    # model.discriminator_model is null — confirm whether this block is used.
    optimizer_D:
        eps: 1.0e-10
        lr: 0.0001
        type: AdamW
        weight_decay: 0.03
    load_from_optimizer: false
    load_from_lr_scheduler: false
    # Also set (to the same value) under model.resume_from.
    resume_lr_scheduler: true
    lr_schedule: cosine
    lr_schedule_args:
        num_warmup_steps: 2000
    auto_lr:
        rule: sqrt
    eval_batch_size: 16
    use_fsdp: false
    use_flash_attn: false
    eval_sampling_steps: 500
    lora_rank: 4
    log_interval: 1
    # Quoted, so this is the string "null" rather than a YAML null.
    # NOTE(review): confirm the consumer expects the string form.
    mask_type: 'null'
    mask_loss_coef: 0.0
    load_mask_index: false
    snr_loss: false
    real_prompt_ratio: 1.0
    early_stop_hours: 10000.0
    # Epoch-based and step-based save intervals are both configured.
    save_image_epochs: 1
    save_model_epochs: 5
    save_model_steps: 500
    visualize: true
    null_embed_root: output/pretrained_models/
    valid_prompt_embed_root: output/tmp_embed/
    # Five prompts; the last is a multi-line plain scalar folded into one
    # string.
    validation_prompts:
    - dog
    - portrait photo of a girl, photograph, highly detailed face, depth of field
    - Self-portrait oil painting, a beautiful cyborg with golden hair, 8k
    - Astronaut in a jungle, cold color palette, muted colors, detailed, 8k
    - A photo of beautiful mountain with realistic sunset and blue lake, highly detailed,
        masterpiece
    local_save_vis: true
    deterministic_validation: true
    online_metric: false
    eval_metric_step: 2000
    online_metric_dir: metric_helper
    # NOTE(review): differs from the top-level work_dir
    # (output/sd35m_d2c_breif) — confirm which one the trainer uses.
    work_dir: output/sd35m_d2c
    skip_step: 0
    loss_type: huber
    huber_c: 0.001
    num_ddim_timesteps: 50
    # NOTE(review): ema_decay and ema_rate are both present while ema_update
    # is false — presumably neither is applied; confirm.
    ema_decay: 0.95
    debug_nan: false
    ema_update: false
    ema_rate: 0.9999
    tangent_warmup_steps: 10000
    # Single-entry list.
    scm_cfg_scale:
    - 1.0
    cfg_interval: null
    scm_logvar_loss: true
    norm_invariant_to_spatial_dim: true
    norm_same_as_512_scale: false
    g_norm_constant: 0.1
    g_norm_r: 1.0
    show_gradient: false
    lr_scale: null
    # The adversarial and scm terms both carry weight 1.0.
    adv_lambda: 1.0
    scm_loss: true
    scm_lambda: 1.0
    loss_scale: 1.0
    # The R1 penalty is disabled; its weight is still listed.
    r1_penalty: false
    r1_penalty_weight: 1.0e-05
    diff_timesteps_D: true
    suffix_checkpoints: disc
    misaligned_pairs_D: false
    # Plain scalar containing a space; parsed as the single string
    # "cross entropy".
    discriminator_loss: cross entropy
    # Same value as the scheduler.add_noise_timesteps entry (1.5708).
    largest_timestep: 1.5708
    train_largest_timestep: false
    largest_timestep_prob: 0.5
    extra: null
# Top-level run settings: optional sub-configs, output directory, resume
# behaviour and logging.
# Neither optional sub-config is set.
controlnet: null
model_growth: null
# NOTE(review): "breif" looks like a misspelling of "brief" (compare
# data.dset: train_brief). It is part of a path, so it is left unchanged;
# renaming it would presumably change where outputs are written and where
# resume_from: latest looks. It also differs from train.work_dir
# (output/sd35m_d2c).
work_dir: output/sd35m_d2c_breif
# model.resume_from.checkpoint is also "latest".
resume_from: latest
# model.load_from is also null.
load_from: null
# NOTE(review): debug is enabled — confirm this is intended for real
# training runs.
debug: true
caching: false
report_to: tensorboard
tracker_project_name: sana-baseline
name: tmp
loss_report_name: loss