# Dataset inputs and loader settings.
data:
  # Relative paths; presumably resolved against the launch directory —
  # TODO confirm with the loading code.
  train_lmdb_path: ./iam_lmdbclear
  vocab_path: ./char_vocab.json

  # batch_size agrees with the "batch64" suffix used in training.run_name
  # and training.output_dir.
  batch_size: 64
  num_workers: 8
# Model architecture: noise scheduler, text encoder, UNet and VAE settings.
model:
  # Same numbers as unet.in_channels (4) followed by unet.sample_size
  # (16, 64); presumably (channels, height, width) — TODO confirm ordering.
  latent_shape: [4, 16, 64]

  scheduler:
    num_train_timesteps: 1000
    prediction_type: epsilon
    # NOTE(review): 0.00085 / 0.012 are the endpoints commonly published with
    # a `scaled_linear` schedule for Stable Diffusion; confirm that `linear`
    # is the intended schedule here.
    beta_schedule: linear
    beta_start: 0.00085
    beta_end: 0.012

  text_encoder:
    max_length: 32
    num_layers: 4
    num_heads: 8
    d_model: 512
    d_ff: 1024
    dropout: 0.1
    # Equals unet.cross_attention_dim (512); presumably the two must stay in
    # sync — verify against the model code before changing either.
    output_dim: 512

  # Key names match constructor arguments of diffusers' UNet2DConditionModel;
  # presumably forwarded to it — verify against the model-building code.
  unet:
    act_fn: silu
    attention_head_dim: 8
    # Every width below is a multiple of norm_num_groups (32).
    block_out_channels: [192, 384, 768, 768]
    cross_attention_dim: 512
    down_block_types:
    - DownBlock2D
    - CrossAttnDownBlock2D
    - CrossAttnDownBlock2D
    - DownBlock2D
    in_channels: 4
    layers_per_block: 2
    mid_block_type: UNetMidBlock2DCrossAttn
    norm_num_groups: 32
    # 657 matches the writer count of the IAM handwriting database;
    # presumably writer-ID conditioning — TODO confirm.
    num_class_embeds: 657
    out_channels: 4
    sample_size: [16, 64]
    up_block_types:
    - UpBlock2D
    - CrossAttnUpBlock2D
    - CrossAttnUpBlock2D
    - UpBlock2D

  vae:
    # Looks like a Hugging Face Hub repository id — confirm how it is loaded.
    model_name: stabilityai/sd-vae-ft-mse
# Training-loop settings, grouped by concern.
training:
  # --- Run identity and checkpoint locations ---
  run_name: hf_conditional_latent_batch64
  output_dir: ./experiments/hf_conditional_latent_batch64
  resume_from_checkpoint: null
  seed: 42

  # --- Loop shape ---
  mode: latent
  num_epochs: 300
  gradient_accumulation_steps: 1
  max_grad_norm: 1.0

  # --- Precision / performance switches ---
  mixed_precision: bf16
  compile_model: false
  use_channels_last: false

  # --- Optimizer ---
  optimizer:
    type: adamw
    lr: 0.0001
    weight_decay: 0.01
    beta1: 0.9
    beta2: 0.999
    eps: 1.0e-08

  # --- Learning-rate schedule ---
  lr_scheduler:
    type: cosine
    warmup_steps: 2000
    min_lr: 1.0e-07

  # --- Exponential moving average of weights ---
  use_ema: true
  ema_decay: 0.999
  ema_min_decay: 0.0
  ema_inv_gamma: 1.0
  ema_power: 1.0

  # --- Logging, sampling and checkpoint cadence ---
  log_every_n_steps: 10
  sample_every_n_steps: 18000
  # Equals model.scheduler.num_train_timesteps (1000).
  num_inference_steps: 1000
  save_every_n_epochs: 10
# Weights & Biases run metadata.
wandb:
  # Left unset (explicit null rather than a bare empty value). Keep the
  # credential out of version control; the wandb client can read it from the
  # WANDB_API_KEY environment variable.
  # NOTE(review): confirm the training script falls back to that variable
  # when this field is null.
  api_key: null
  entity: null
  notes: Hugging Face UNet with EMA and latent diffusion training.
  project: handwriting-diffusion
  tags:
  - hf
  - conditional
  - latent