ApexOracle / config.json
Kiria-Nozan's picture
initial release
c57b96e verified
{
"ckpt_path": null,
"config": {
"T": 0,
"backbone": "dit",
"callbacks": {
"checkpoint_every_n_steps": {
"_target_": "lightning.pytorch.callbacks.ModelCheckpoint",
"auto_insert_metric_name": false,
"dirpath": "${checkpointing.save_dir}/checkpoints",
"every_n_train_steps": 500,
"save_last": true,
"save_top_k": -1,
"verbose": true
},
"checkpoint_monitor": {
"_target_": "lightning.pytorch.callbacks.ModelCheckpoint",
"auto_insert_metric_name": false,
"dirpath": "${checkpointing.save_dir}/checkpoints",
"filename": "best",
"mode": "min",
"monitor": "val/nll",
"save_last": false,
"save_top_k": 1,
"verbose": true
},
"learning_rate_monitor": {
"_target_": "lightning.pytorch.callbacks.LearningRateMonitor",
"logging_interval": "step"
}
},
"checkpointing": {
"resume_ckpt_path": "${.save_dir}/checkpoints/last.ckpt",
"resume_from_ckpt": true,
"save_dir": "${cwd:}"
},
"data": {
"cache_dir": "/share/kuleshov/ssahoo/textdiffusion/data",
"streaming": false,
"tokenizer_name_or_path": "ibm-research/materials.selfies-ted",
"train": "openwebtext",
"valid": "wikitext103",
"wrap": true
},
"diffusion": "absorbing_state",
"eval": {
"checkpoint_path": "/data2/tianang/projects/mdlm/Checkpoints_fangping/1-255000-fine-tune.ckpt",
"compute_generative_perplexity": false,
"compute_perplexity_on_sanity": false,
"disable_ema": false,
"gen_ppl_eval_model_name_or_path": "gpt2-large",
"generate_samples": true,
"perplexity_batch_size": 8
},
"loader": {
"batch_size": "${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}",
"eval_batch_size": "${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}",
"eval_global_batch_size": 512,
"global_batch_size": 512,
"num_workers": "${eval:\"len(__import__('os').sched_getaffinity(0))\"}",
"pin_memory": true
},
"lr_scheduler": {
"_target_": "transformers.get_constant_schedule_with_warmup",
"num_warmup_steps": 2500
},
"mode": "sample_eval",
"model": {
"cond_dim": 128,
"dropout": 0.1,
"hidden_size": 768,
"length": 1024,
"n_blocks": 12,
"n_heads": 12,
"name": "small",
"scale_by_sigma": true,
"tie_word_embeddings": false,
"type": "ddit"
},
"noise": {
"sigma_max": 20,
"sigma_min": 0.0001,
"type": "loglinear"
},
"optim": {
"beta1": 0.9,
"beta2": 0.999,
"eps": 1e-08,
"lr": 0.0003,
"weight_decay": 0
},
"parameterization": "subs",
"sampling": {
"noise_removal": true,
"num_sample_batches": 2,
"num_sample_log": 2,
"num_strides": 1,
"predictor": "ddpm_cache",
"semi_ar": false,
"steps": 128,
"stride_length": 1
},
"seed": 1,
"strategy": {
"_target_": "lightning.pytorch.strategies.DDPStrategy",
"find_unused_parameters": false
},
"subs_masking": false,
"time_conditioning": false,
"trainer": {
"_target_": "lightning.Trainer",
"accelerator": "cuda",
"accumulate_grad_batches": "${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}",
"devices": "${device_count:}",
"gradient_clip_val": 1.0,
"limit_train_batches": 1.0,
"limit_val_batches": 1.0,
"log_every_n_steps": 10,
"max_steps": 1000000,
"num_nodes": 1,
"num_sanity_val_steps": 2,
"precision": "bf16",
"val_check_interval": 10000
},
"training": {
"antithetic_sampling": true,
"change_of_variables": false,
"ema": 0.9999,
"importance_sampling": false,
"sampling_eps": 0.001
},
"wandb": {
"group": null,
"id": "None_1",
"job_type": null,
"name": null,
"notes": "Mulan for text",
"project": "text-diffusion",
"tags": [
"loglinear",
"openwebtext",
"wikitext103"
]
}
},
"hidden_size": 768,
"mask_index": 4,
"max_position_embeddings": 1024,
"model_type": "mol_emb_raw",
"n_blocks": 12,
"n_heads": 12,
"noise_schedule_type": "loglinear",
"parameterization": "subs",
"sigma_max": 20,
"sigma_min": 0.0001,
"time_conditioning": false,
"tokenizer_name_or_path": "ibm-research/materials.selfies-ted",
"vocab_size": 3160
}