| data: | |
| batch_size: 64 | |
| num_workers: 8 | |
| train_lmdb_path: ./iam_lmdbclear | |
| vocab_path: ./char_vocab.json | |
| model: | |
| latent_shape: | |
| - 4 | |
| - 16 | |
| - 64 | |
| scheduler: | |
| beta_end: 0.012 | |
| beta_schedule: linear | |
| beta_start: 0.00085 | |
| num_train_timesteps: 1000 | |
| prediction_type: epsilon | |
| text_encoder: | |
| d_ff: 1024 | |
| d_model: 512 | |
| dropout: 0.1 | |
| max_length: 32 | |
| num_heads: 8 | |
| num_layers: 4 | |
| output_dim: 512 | |
| unet: | |
| act_fn: silu | |
| attention_head_dim: 8 | |
| block_out_channels: | |
| - 192 | |
| - 384 | |
| - 768 | |
| - 768 | |
| cross_attention_dim: 512 | |
| down_block_types: | |
| - DownBlock2D | |
| - CrossAttnDownBlock2D | |
| - CrossAttnDownBlock2D | |
| - DownBlock2D | |
| in_channels: 4 | |
| layers_per_block: 2 | |
| mid_block_type: UNetMidBlock2DCrossAttn | |
| norm_num_groups: 32 | |
| num_class_embeds: 657 | |
| out_channels: 4 | |
| sample_size: | |
| - 16 | |
| - 64 | |
| up_block_types: | |
| - UpBlock2D | |
| - CrossAttnUpBlock2D | |
| - CrossAttnUpBlock2D | |
| - UpBlock2D | |
| vae: | |
| model_name: stabilityai/sd-vae-ft-mse | |
| training: | |
| compile_model: false | |
| ema_decay: 0.999 | |
| ema_inv_gamma: 1.0 | |
| ema_min_decay: 0.0 | |
| ema_power: 1.0 | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 10 | |
| lr_scheduler: | |
| min_lr: 1.0e-07 | |
| type: cosine | |
| warmup_steps: 2000 | |
| max_grad_norm: 1.0 | |
| mixed_precision: bf16 | |
| mode: latent | |
| num_epochs: 300 | |
| num_inference_steps: 1000 | |
| optimizer: | |
| beta1: 0.9 | |
| beta2: 0.999 | |
| eps: 1.0e-08 | |
| lr: 0.0001 | |
| type: adamw | |
| weight_decay: 0.01 | |
| output_dir: ./experiments/hf_conditional_latent_batch64 | |
| resume_from_checkpoint: null | |
| run_name: hf_conditional_latent_batch64 | |
| sample_every_n_steps: 18000 | |
| save_every_n_epochs: 10 | |
| seed: 42 | |
| use_channels_last: false | |
| use_ema: true | |
| wandb: | |
| api_key: | |
| entity: null | |
| notes: Hugging Face UNet with EMA and latent diffusion training. | |
| project: handwriting-diffusion | |
| tags: | |
| - hf | |
| - conditional | |
| - latent | |