|
|
defaults:
  - _self_
  - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
  - /data: openwebtext
  - /model: small
  - /strategy: ddp
  - /noise: loglinear
  - /lr_scheduler: constant_warmup
|
|
|
|
|
mode: train
diffusion: absorbing_state
backbone: dit
ebm_backbone: null
parameterization: subs
time_conditioning: True
T: 0
subs_masking: False

seed: 1
|
|
|
|
|
loader:
  global_batch_size: 512
  eval_global_batch_size: ${.global_batch_size}
  batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  eval_batch_size: ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
  pin_memory: True
|
|
|
|
|
sampling:
  predictor: ddpm_cache
  steps: 128
  noise_removal: True
  num_sample_batches: 2
  num_sample_log: 2
  semi_ar: False
  stride_length: 1
  num_strides: 1
  is_size: 2
  is_start: 0.6
  is_end: 0.4
  is_temp: 1
  ar_carry_over: True
|
|
|
|
|
training:
  ema: 0.9999
  antithetic_sampling: True
  importance_sampling: False
  sampling_eps: 1e-3
  change_of_variables: False
|
|
|
|
|
eval:
  checkpoint_path: ''
  disable_ema: False
  compute_generative_perplexity: False
  perplexity_batch_size: 8
  compute_perplexity_on_sanity: False
  gen_ppl_eval_model_name_or_path: gpt2-large
  generate_samples: True
|
|
|
|
|
optim:
  weight_decay: 0
  lr: 3e-4
  beta1: 0.9
  beta2: 0.999
  eps: 1e-8
|
|
|
|
|
trainer:
  _target_: lightning.Trainer
  accelerator: cuda
  num_nodes: 1
  devices: ${device_count:}
  accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
  gradient_clip_val: 1.0
  precision: 'bf16'
  num_sanity_val_steps: 2
  max_steps: 1_000_000
  log_every_n_steps: 10
  limit_train_batches: 1.0
  limit_val_batches: 1.0
  val_check_interval: 10000
|
|
|
|
|
wandb:
  project: text-diffusion
  notes: Mulan for text
  group: null
  job_type: null
  name: null
  id: ${.name}_${seed}
  tags:
    - ${noise.type}
    - ${data.train}
    - ${data.valid}
|
|
|
|
|
hydra:
  run:
    dir: ./outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
  job:
    chdir: true
|
|
|
|
|
checkpointing:
  save_dir: ${cwd:}
  resume_from_ckpt: true
  resume_ckpt_path: ${.save_dir}/checkpoints/last.ckpt
|
|
|