# File size: 5,901 Bytes

# 497d0e3

# data: dataset location, caption sources and loading options.
data:
    # --- source ---
    # Absolute path under one user's shared home directory; it is not
    # portable and has to be changed on any other machine.
    data_dir:
    - /home/work/shared-fi-datasets-01/users/hsiang.chen/Project/Datasets/IR
    type: IRImgDataset
    dset: train_brief
    data: null
    max_samples: null

    # --- captions / CLIP-score filtering ---
    # Only one caption source ("prompt") is listed.
    caption_proportion: {prompt: 1}
    external_caption_suffixes: []
    external_clipscore_suffixes: []
    clip_thr_temperature: 0.1
    clip_thr: 25.0
    del_img_clip_thr: 0.0

    # --- loading / preprocessing ---
    sort_dataset: false
    load_text_feat: false
    load_vae_feat: false
    transform: default_train
    # Same value as model.image_size.
    image_size: 256
    hq_only: false
    valid_num: 0

    extra: null
# model: network selection, weights to load and architecture switches.
model:
    model: SD35M_D2C
    # Same checkpoint file that vae.vae_pretrained points at.
    model_pretrained: ./checkpoints/stable-diffusion-3.5-medium/sd3.5_medium.safetensors
    # Same value as scheduler.flow_shift.
    shift: 3.0
    teacher: null
    # Same value as vae.vae_latent_dim.
    input_channel: 16
    # Same value as data.image_size.
    image_size: 256
    mixed_precision: bf16
    fp32_attention: true

    # --- weight loading / resuming ---
    load_from: null
    discriminator_model: null
    teacher_model: null
    teacher_model_weight_dtype: null
    resume_from:
        checkpoint: latest
        load_ema: false
        resume_optimizer: true
        resume_lr_scheduler: true

    # --- resolution handling ---
    # NOTE(review): this names a 1024 table while image_size is 256;
    # presumably it is unused while multi_scale is false - confirm.
    aspect_ratio_type: ASPECT_RATIO_1024
    multi_scale: false
    pe_interpolation: 1.0
    micro_condition: false

    # --- block architecture ---
    attn_type: linear
    autocast_linear_attn: false
    ffn_type: glumbconv
    # Three entries; the last one is a YAML null, not the string "null".
    mlp_acts: [silu, silu, null]
    mlp_ratio: 2.5
    use_pe: false
    pos_embed_type: sincos
    qk_norm: false
    class_dropout_prob: 0.1
    linear_head_dim: 32
    cross_norm: false
    cross_attn_type: flash
    logvar: false

    # --- guidance ---
    cfg_scale: 4
    cfg_embed: false
    cfg_embed_scale: 1.0
    guidance_type: classifier-free
    pag_applied_layers: [8]

    ladd_multi_scale: true
    head_block_ids: null
    extra: null
# vae: autoencoder type, weights and latent geometry.
vae:
    vae_type: SDVAE
    # Same checkpoint file that model.model_pretrained points at.
    vae_pretrained: ./checkpoints/stable-diffusion-3.5-medium/sd3.5_medium.safetensors
    weight_dtype: float32
    # NOTE(review): the published SD3 VAE config lists scaling_factor 1.5305
    # (with shift_factor 0.0609), whereas 0.41407 is the value shipped with
    # Sana's DC-AE. Confirm this is the intended scale for an SDVAE.
    scale_factor: 0.41407
    # Same value as model.input_channel.
    vae_latent_dim: 16
    vae_downsample_rate: 8
    sample_posterior: true
    extra: null
# text_encoder: text encoder selection, weights and prompt settings.
text_encoder:
    text_encoder_name: sd35-text
    text_encoder_pretrained: ./checkpoints/stable-diffusion-3.5-medium/text_encoders
    caption_channels: 4096
    y_norm: true
    y_norm_scale_factor: 0.01
    model_max_length: 300
    # Three prompt strings. The long ones use folded scalars (>-), which join
    # the wrapped lines with single spaces and drop the trailing newline.
    chi_prompt:
    - a photo of a cat
    - >-
        Convenience store entrance at night. On the glass door, a vinyl decal reads
        'OPEN FOR QUALITY'. Inside, shelves and fluorescent lights; outside, a cyclist
        passing by
    - >-
        Sunrise beach, shallow tide washing over smooth sand. A piece of weathered driftwood
        lies near the shoreline with a subtle branded text [SOS] on its surface; wet
        sand reflections, micro-ripples, sun flare at horizon.
    extra: null
# scheduler: noise schedule, sampler and timestep-weighting settings.
scheduler:
    train_sampling_steps: 1000
    predict_flow_v: true
    noise_schedule: linear_flow
    # NOTE(review): learn_sigma is true while pred_sigma is false; confirm the
    # consumer treats that combination as intended.
    pred_sigma: false
    learn_sigma: true
    vis_sampler: flow_dpm-solver
    # Same value as model.shift.
    flow_shift: 3.0

    # --- timestep weighting (generator / discriminator) ---
    weighting_scheme: logit_normal
    weighting_scheme_discriminator: logit_normal_trigflow
    # Single entry, 1.5708 (approximately pi/2); same value as
    # train.largest_timestep.
    add_noise_timesteps: [1.5708]
    logit_mean: 0.0
    logit_std: 1.0
    logit_mean_discriminator: 0.0
    logit_std_discriminator: 1.0

    sigma_data: 0.5
    timestep_norm_scale_factor: 1.0
    extra: null
# train: loop sizing, optimizers, evaluation/checkpoint cadence and loss options.
train:
    # --- loop sizing ---
    num_workers: 10
    seed: 1229
    # 4 samples x 8 accumulation steps = 32 samples per optimizer step per
    # process, assuming standard gradient-accumulation semantics.
    train_batch_size: 4
    num_epochs: 100
    gradient_accumulation_steps: 8
    grad_checkpointing: true
    gradient_clip: 0.1
    gc_step: 1

    # --- optimizers ---
    # betas has three entries and eps has two for this optimizer type.
    optimizer:
        betas: [0.9, 0.999, 0.9999]
        eps: [1.0e-30, 1.0e-16]
        lr: 5.0e-05
        type: CAMEWrapper
        weight_decay: 0.0
    optimizer_D:
        eps: 1.0e-10
        lr: 0.0001
        type: AdamW
        weight_decay: 0.03
    load_from_optimizer: false
    load_from_lr_scheduler: false
    resume_lr_scheduler: true

    # --- learning-rate schedule ---
    lr_schedule: cosine
    lr_schedule_args: {num_warmup_steps: 2000}
    auto_lr: {rule: sqrt}

    # --- runtime switches ---
    eval_batch_size: 16
    use_fsdp: false
    use_flash_attn: false
    eval_sampling_steps: 500
    lora_rank: 4
    log_interval: 1

    # --- masking / auxiliary loss inputs ---
    # Quoted on purpose: this is the string "null", not a YAML null.
    mask_type: 'null'
    mask_loss_coef: 0.0
    load_mask_index: false
    snr_loss: false
    real_prompt_ratio: 1.0
    early_stop_hours: 10000.0

    # --- checkpointing / visualisation cadence ---
    save_image_epochs: 1
    save_model_epochs: 5
    save_model_steps: 500
    visualize: true
    null_embed_root: output/pretrained_models/
    valid_prompt_embed_root: output/tmp_embed/
    # Five prompts; the last one is a folded scalar (>-) whose wrapped lines
    # are joined with a single space.
    validation_prompts:
    - dog
    - portrait photo of a girl, photograph, highly detailed face, depth of field
    - Self-portrait oil painting, a beautiful cyborg with golden hair, 8k
    - Astronaut in a jungle, cold color palette, muted colors, detailed, 8k
    - >-
        A photo of beautiful mountain with realistic sunset and blue lake, highly detailed,
        masterpiece
    local_save_vis: true
    deterministic_validation: true
    online_metric: false
    eval_metric_step: 2000
    online_metric_dir: metric_helper
    # NOTE(review): differs from the top-level work_dir
    # (output/sd35m_d2c_breif); confirm which one the trainer actually uses.
    work_dir: output/sd35m_d2c
    skip_step: 0

    # --- loss ---
    loss_type: huber
    huber_c: 0.001
    num_ddim_timesteps: 50

    # --- EMA ---
    # NOTE(review): ema_update is false; presumably ema_decay / ema_rate are
    # inactive in that case - confirm.
    ema_decay: 0.95
    debug_nan: false
    ema_update: false
    ema_rate: 0.9999

    # --- sCM / gradient-normalisation options ---
    tangent_warmup_steps: 10000
    scm_cfg_scale: [1.0]
    cfg_interval: null
    scm_logvar_loss: true
    norm_invariant_to_spatial_dim: true
    norm_same_as_512_scale: false
    g_norm_constant: 0.1
    g_norm_r: 1.0
    show_gradient: false
    lr_scale: null

    # --- adversarial / discriminator options ---
    adv_lambda: 1.0
    scm_loss: true
    scm_lambda: 1.0
    loss_scale: 1.0
    r1_penalty: false
    r1_penalty_weight: 1.0e-05
    diff_timesteps_D: true
    suffix_checkpoints: disc
    misaligned_pairs_D: false
    # Plain scalar containing a space; parsed as the single string
    # "cross entropy".
    discriminator_loss: cross entropy
    # 1.5708 is approximately pi/2; same value as the entry in
    # scheduler.add_noise_timesteps.
    largest_timestep: 1.5708
    train_largest_timestep: false
    largest_timestep_prob: 0.5
    extra: null
# Top-level run settings (siblings of the data/model/vae/... sections).
controlnet: null
model_growth: null
# NOTE(review): "breif" looks like a misspelling of "brief" (compare
# data.dset: train_brief), and this path differs from train.work_dir
# (output/sd35m_d2c). Renaming it would leave checkpoints already written here
# behind, which matters because resume_from is "latest" - fix deliberately.
work_dir: output/sd35m_d2c_breif
# Same value as model.resume_from.checkpoint.
resume_from: latest
load_from: null
# NOTE(review): debug is enabled; confirm that is intended for a real
# training run.
debug: true
caching: false
report_to: tensorboard
tracker_project_name: sana-baseline
name: tmp
loss_report_name: loss