---
# Training configuration for run `hf_conditional_latent_batch64`.
# Structure restored to block style (2-space indent); every key and scalar
# value is unchanged except `wandb.api_key`, which is now an explicit `null`
# instead of a bare empty value (both parse to null).

# Data loading settings.
data:
  batch_size: 64
  num_workers: 8
  train_lmdb_path: ./iam_lmdbclear
  vocab_path: ./char_vocab.json

# Model components.
model:
  # First entry equals unet.in_channels / unet.out_channels (4); the last two
  # entries equal unet.sample_size (16, 64).
  latent_shape:
    - 4
    - 16
    - 64

  scheduler:
    beta_end: 0.012
    beta_schedule: linear
    beta_start: 0.00085
    num_train_timesteps: 1000
    prediction_type: epsilon

  text_encoder:
    d_ff: 1024
    d_model: 512
    dropout: 0.1
    max_length: 32
    num_heads: 8
    num_layers: 4
    # Same value as unet.cross_attention_dim (512).
    output_dim: 512

  unet:
    act_fn: silu
    attention_head_dim: 8
    # One entry per down/up block (4 each); every value is a multiple of
    # norm_num_groups (32).
    block_out_channels:
      - 192
      - 384
      - 768
      - 768
    cross_attention_dim: 512
    down_block_types:
      - DownBlock2D
      - CrossAttnDownBlock2D
      - CrossAttnDownBlock2D
      - DownBlock2D
    in_channels: 4
    layers_per_block: 2
    mid_block_type: UNetMidBlock2DCrossAttn
    norm_num_groups: 32
    # NOTE(review): presumably the number of conditioning classes in the
    # dataset -- confirm against the data pipeline.
    num_class_embeds: 657
    out_channels: 4
    sample_size:
      - 16
      - 64
    up_block_types:
      - UpBlock2D
      - CrossAttnUpBlock2D
      - CrossAttnUpBlock2D
      - UpBlock2D

  vae:
    model_name: stabilityai/sd-vae-ft-mse

# Optimisation, logging and checkpointing settings.
training:
  compile_model: false
  ema_decay: 0.999
  ema_inv_gamma: 1.0
  ema_min_decay: 0.0
  ema_power: 1.0
  gradient_accumulation_steps: 1
  log_every_n_steps: 10
  lr_scheduler:
    min_lr: 1.0e-07
    type: cosine
    warmup_steps: 2000
  max_grad_norm: 1.0
  mixed_precision: bf16
  mode: latent
  num_epochs: 300
  # Same value as model.scheduler.num_train_timesteps (1000).
  num_inference_steps: 1000
  optimizer:
    beta1: 0.9
    beta2: 0.999
    eps: 1.0e-08
    lr: 0.0001
    type: adamw
    weight_decay: 0.01
  output_dir: ./experiments/hf_conditional_latent_batch64
  resume_from_checkpoint: null
  run_name: hf_conditional_latent_batch64
  sample_every_n_steps: 18000
  save_every_n_epochs: 10
  seed: 42
  use_channels_last: false
  use_ema: true

# Weights & Biases experiment tracking.
# NOTE(review): nesting was reconstructed from key order, which fits `wandb`
# either as a top-level key or as the last key under `training`; confirm
# which one the config loader expects.
wandb:
  # Explicit null so no credential is committed. Supply the key outside
  # version control (the wandb client reads the WANDB_API_KEY environment
  # variable). NOTE(review): confirm how the training script treats null.
  api_key: null
  entity: null
  notes: Hugging Face UNet with EMA and latent diffusion training.
  project: handwriting-diffusion
  tags:
    - hf
    - conditional
    - latent