# File-viewer header (not part of the configuration): File size: 5,901 Bytes
# Dataset settings. Every key below is a child of `data`.
data:
  # Single-entry list. The path is absolute and user-specific, so it will
  # not resolve on another machine without editing.
  data_dir:
    - /home/work/shared-fi-datasets-01/users/hsiang.chen/Project/Datasets/IR
  # NOTE(review): `prompt` is taken to be the only key nested here because
  # `caption_proportion:` carries no value of its own - confirm.
  caption_proportion:
    prompt: 1
  external_caption_suffixes: []
  external_clipscore_suffixes: []
  clip_thr_temperature: 0.1
  clip_thr: 25.0
  del_img_clip_thr: 0.0
  sort_dataset: false
  load_text_feat: false
  load_vae_feat: false
  transform: default_train
  type: IRImgDataset
  # Same value as model.image_size (256).
  image_size: 256
  hq_only: false
  valid_num: 0
  data: null
  extra: null
  dset: train_brief
  max_samples: null
# Network settings. Every key below is a child of `model`.
model:
  model: SD35M_D2C
  # Same checkpoint file that vae.vae_pretrained points at.
  model_pretrained: ./checkpoints/stable-diffusion-3.5-medium/sd3.5_medium.safetensors
  # Same value as scheduler.flow_shift (3.0).
  shift: 3.0
  teacher: null
  # Same value as vae.vae_latent_dim (16).
  input_channel: 16
  image_size: 256
  mixed_precision: bf16
  fp32_attention: true
  load_from: null
  discriminator_model: null
  teacher_model: null
  teacher_model_weight_dtype: null
  # NOTE(review): the four keys below are assumed to be the children of
  # `resume_from` (it carries no value of its own, and `train` has a
  # separate `resume_lr_scheduler`) - confirm against the consumer's schema.
  # The top-level `resume_from: latest` is a different key.
  resume_from:
    checkpoint: latest
    load_ema: false
    resume_optimizer: true
    resume_lr_scheduler: true
  # NOTE(review): names a 1024 aspect-ratio table while image_size is 256;
  # presumably unused because multi_scale is false - confirm.
  aspect_ratio_type: ASPECT_RATIO_1024
  multi_scale: false
  pe_interpolation: 1.0
  micro_condition: false
  attn_type: linear
  autocast_linear_attn: false
  ffn_type: glumbconv
  # Three entries; the last one is a YAML null, not the string "null".
  mlp_acts:
    - silu
    - silu
    - null
  mlp_ratio: 2.5
  use_pe: false
  pos_embed_type: sincos
  qk_norm: false
  class_dropout_prob: 0.1
  linear_head_dim: 32
  cross_norm: false
  cross_attn_type: flash
  logvar: false
  cfg_scale: 4
  cfg_embed: false
  cfg_embed_scale: 1.0
  guidance_type: classifier-free
  pag_applied_layers:
    - 8
  ladd_multi_scale: true
  head_block_ids: null
  extra: null
# Autoencoder settings. Every key below is a child of `vae`.
vae:
  vae_type: SDVAE
  # Same checkpoint file that model.model_pretrained points at.
  vae_pretrained: ./checkpoints/stable-diffusion-3.5-medium/sd3.5_medium.safetensors
  weight_dtype: float32
  scale_factor: 0.41407
  # Same value as model.input_channel (16).
  vae_latent_dim: 16
  vae_downsample_rate: 8
  sample_posterior: true
  extra: null
# Text-encoder settings. Every key below is a child of `text_encoder`.
text_encoder:
  text_encoder_name: sd35-text
  text_encoder_pretrained: ./checkpoints/stable-diffusion-3.5-medium/text_encoders
  caption_channels: 4096
  y_norm: true
  y_norm_scale_factor: 0.01
  model_max_length: 300
  # Three prompts. The long ones use folded block scalars (`>-`): their lines
  # are joined with single spaces and no trailing newline is added, so each
  # entry is still one single-line string.
  chi_prompt:
    - a photo of a cat
    - >-
      Convenience store entrance at night. On the glass door, a vinyl decal reads
      'OPEN FOR QUALITY'. Inside, shelves and fluorescent lights; outside, a cyclist
      passing by
    - >-
      Sunrise beach, shallow tide washing over smooth sand. A piece of weathered driftwood
      lies near the shoreline with a subtle branded text [SOS] on its surface; wet
      sand reflections, micro-ripples, sun flare at horizon.
  extra: null
# Noise-schedule settings. Every key below is a child of `scheduler`.
scheduler:
  train_sampling_steps: 1000
  predict_flow_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  learn_sigma: true
  vis_sampler: flow_dpm-solver
  # Same value as model.shift (3.0).
  flow_shift: 3.0
  weighting_scheme: logit_normal
  weighting_scheme_discriminator: logit_normal_trigflow
  # Single-entry list; 1.5708 is pi/2 rounded to four decimals and matches
  # train.largest_timestep.
  add_noise_timesteps:
    - 1.5708
  logit_mean: 0.0
  logit_std: 1.0
  logit_mean_discriminator: 0.0
  logit_std_discriminator: 1.0
  sigma_data: 0.5
  timestep_norm_scale_factor: 1.0
  extra: null
# Training-loop settings. Every key below is a child of `train`.
train:
  num_workers: 10
  seed: 1229
  train_batch_size: 4
  num_epochs: 100
  # 4 (train_batch_size) x 8 accumulation steps = 32 samples per
  # accumulation cycle - presumably per process; confirm.
  gradient_accumulation_steps: 8
  grad_checkpointing: true
  gradient_clip: 0.1
  gc_step: 1
  # NOTE(review): betas/eps/lr/type/weight_decay are assumed to be the
  # children of `optimizer` (value-less key, alphabetically sorted
  # sub-keys) - confirm.
  optimizer:
    betas:
      - 0.9
      - 0.999
      - 0.9999
    eps:
      - 1.0e-30
      - 1.0e-16
    lr: 5.0e-05
    type: CAMEWrapper
    weight_decay: 0.0
  # Second optimizer block; `_D` presumably stands for the discriminator
  # (model.discriminator_model is null in this file) - confirm.
  optimizer_D:
    eps: 1.0e-10
    lr: 0.0001
    type: AdamW
    weight_decay: 0.03
  load_from_optimizer: false
  load_from_lr_scheduler: false
  # model.resume_from also carries a `resume_lr_scheduler: true`.
  resume_lr_scheduler: true
  lr_schedule: cosine
  lr_schedule_args:
    num_warmup_steps: 2000
  auto_lr:
    rule: sqrt
  eval_batch_size: 16
  use_fsdp: false
  use_flash_attn: false
  eval_sampling_steps: 500
  lora_rank: 4
  log_interval: 1
  # Quoted on purpose: this is the string "null", not a YAML null.
  mask_type: 'null'
  mask_loss_coef: 0.0
  load_mask_index: false
  snr_loss: false
  real_prompt_ratio: 1.0
  early_stop_hours: 10000.0
  save_image_epochs: 1
  save_model_epochs: 5
  save_model_steps: 500
  visualize: true
  null_embed_root: output/pretrained_models/
  valid_prompt_embed_root: output/tmp_embed/
  # Five prompts; the last one is a folded block scalar (`>-`) that yields
  # a single-line string with no trailing newline.
  validation_prompts:
    - dog
    - portrait photo of a girl, photograph, highly detailed face, depth of field
    - Self-portrait oil painting, a beautiful cyborg with golden hair, 8k
    - Astronaut in a jungle, cold color palette, muted colors, detailed, 8k
    - >-
      A photo of beautiful mountain with realistic sunset and blue lake, highly detailed,
      masterpiece
  local_save_vis: true
  deterministic_validation: true
  online_metric: false
  eval_metric_step: 2000
  online_metric_dir: metric_helper
  # Differs from the top-level work_dir (output/sd35m_d2c_breif).
  work_dir: output/sd35m_d2c
  skip_step: 0
  loss_type: huber
  huber_c: 0.001
  num_ddim_timesteps: 50
  ema_decay: 0.95
  debug_nan: false
  # ema_update is false while ema_decay and ema_rate are both set.
  ema_update: false
  ema_rate: 0.9999
  tangent_warmup_steps: 10000
  scm_cfg_scale:
    - 1.0
  cfg_interval: null
  scm_logvar_loss: true
  norm_invariant_to_spatial_dim: true
  norm_same_as_512_scale: false
  g_norm_constant: 0.1
  g_norm_r: 1.0
  show_gradient: false
  lr_scale: null
  adv_lambda: 1.0
  scm_loss: true
  scm_lambda: 1.0
  loss_scale: 1.0
  r1_penalty: false
  r1_penalty_weight: 1.0e-05
  diff_timesteps_D: true
  suffix_checkpoints: disc
  misaligned_pairs_D: false
  discriminator_loss: cross entropy
  # Matches the single scheduler.add_noise_timesteps entry (pi/2 rounded).
  largest_timestep: 1.5708
  train_largest_timestep: false
  largest_timestep_prob: 0.5
  extra: null
# Run-level settings. These keys sit at the top level of the document,
# alongside data/model/vae/text_encoder/scheduler/train.
controlnet: null
model_growth: null
# NOTE(review): differs from train.work_dir (output/sd35m_d2c), and "breif"
# looks like a typo for "brief" (data.dset is train_brief). Left unchanged
# because it is a runtime path that existing outputs may live under - confirm.
work_dir: output/sd35m_d2c_breif
# Plain string here; model.resume_from is a mapping with its own
# `checkpoint: latest`.
resume_from: latest
load_from: null
# NOTE(review): debug is enabled - confirm this is intended for real runs.
debug: true
caching: false
report_to: tensorboard
tracker_project_name: sana-baseline
name: tmp
loss_report_name: loss