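# Provenance (appears to be commit metadata captured along with the file):
#   author:  Ahadhassan-2003
#   message: deploy: update HF Space
#   commit:  dc4e6da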
# Dataset and data-loading settings.
data:
  # 64 agrees with the "batch64" suffix in training.run_name / output_dir.
  batch_size: 64
  num_workers: 8
  # Both paths are relative (leading "./"); how they are resolved depends on
  # the loader — TODO confirm (working directory vs. config location).
  # NOTE(review): "iam_lmdbclear" may be a fused/truncated name — verify that
  # the directory exists under this exact spelling.
  train_lmdb_path: ./iam_lmdbclear
  vocab_path: ./char_vocab.json
# Model definition: latent shape, noise scheduler, text encoder, UNet, VAE.
model:
  # First entry equals unet.in_channels (4); the remaining two equal
  # unet.sample_size (16, 64). Keep these in sync when changing either.
  latent_shape:
    - 4
    - 16
    - 64

  # Noise-schedule settings.
  # NOTE(review): key names match the constructor arguments of diffusers
  # schedulers (e.g. DDPMScheduler) — presumably forwarded as-is; confirm.
  scheduler:
    beta_end: 0.012
    beta_schedule: linear
    beta_start: 0.00085
    # training.num_inference_steps is set to the same value (1000).
    num_train_timesteps: 1000
    prediction_type: epsilon

  # Text-conditioning encoder hyperparameters.
  text_encoder:
    d_ff: 1024
    d_model: 512
    dropout: 0.1
    max_length: 32
    num_heads: 8
    num_layers: 4
    # Equals unet.cross_attention_dim (512); change both together.
    output_dim: 512

  # UNet hyperparameters.
  # NOTE(review): key names match diffusers' UNet2DConditionModel arguments —
  # presumably passed straight through; confirm in the model builder.
  unet:
    act_fn: silu
    attention_head_dim: 8
    # Four entries, one per entry in down_block_types / up_block_types.
    # Every value is a multiple of norm_num_groups (32).
    block_out_channels:
      - 192
      - 384
      - 768
      - 768
    # Equals text_encoder.output_dim (512).
    cross_attention_dim: 512
    down_block_types:
      - DownBlock2D
      - CrossAttnDownBlock2D
      - CrossAttnDownBlock2D
      - DownBlock2D
    # Equals latent_shape[0] and out_channels (4).
    in_channels: 4
    layers_per_block: 2
    mid_block_type: UNetMidBlock2DCrossAttn
    norm_num_groups: 32
    # NOTE(review): 657 is presumably the number of distinct class labels
    # (e.g. writer IDs) in the dataset — verify against the data pipeline.
    num_class_embeds: 657
    out_channels: 4
    # Equals latent_shape[1:] (16, 64).
    sample_size:
      - 16
      - 64
    up_block_types:
      - UpBlock2D
      - CrossAttnUpBlock2D
      - CrossAttnUpBlock2D
      - UpBlock2D

  # Autoencoder identifier.
  # NOTE(review): looks like a Hugging Face Hub repo id — confirm how it is
  # loaded (Hub download vs. local cache).
  vae:
    model_name: stabilityai/sd-vae-ft-mse
# Training-loop settings: EMA, LR schedule, optimizer, logging, checkpoints.
training:
  compile_model: false

  # EMA parameters; only relevant while use_ema (below) is true.
  ema_decay: 0.999
  ema_inv_gamma: 1.0
  ema_min_decay: 0.0
  ema_power: 1.0

  gradient_accumulation_steps: 1
  log_every_n_steps: 10

  # Learning-rate schedule. `type` here is distinct from optimizer.type;
  # the two must stay nested under their own parents or they collide.
  lr_scheduler:
    min_lr: 1.0e-07
    type: cosine
    warmup_steps: 2000

  max_grad_norm: 1.0
  mixed_precision: bf16
  mode: latent
  num_epochs: 300
  # Same value as model.scheduler.num_train_timesteps (1000).
  num_inference_steps: 1000

  optimizer:
    beta1: 0.9
    beta2: 0.999
    eps: 1.0e-08
    lr: 0.0001
    type: adamw
    weight_decay: 0.01

  # The last path component is identical to run_name.
  output_dir: ./experiments/hf_conditional_latent_batch64
  # null: no checkpoint path is configured.
  resume_from_checkpoint: null
  run_name: hf_conditional_latent_batch64
  # Note the differing units: sampling is counted in steps, saving in epochs.
  sample_every_n_steps: 18000
  save_every_n_epochs: 10
  seed: 42
  use_channels_last: false
  use_ema: true
# Experiment-tracking settings.
# NOTE(review): placed at the top level; the flattened source does not show
# whether this section belongs under `training:` — confirm against the
# config loader before relying on this nesting.
wandb:
  # Explicit null instead of a bare empty value (both parse as null).
  # Do not commit a real key here; the W&B client can read it from the
  # WANDB_API_KEY environment variable — confirm the training script does
  # not require a value in this file.
  api_key: null
  entity: null
  notes: Hugging Face UNet with EMA and latent diffusion training.
  project: handwriting-diffusion
  tags:
    - hf
    - conditional
    - latent