| { | |
| "ckpt_path": null, | |
| "config": { | |
| "T": 0, | |
| "backbone": "dit", | |
| "callbacks": { | |
| "checkpoint_every_n_steps": { | |
| "_target_": "lightning.pytorch.callbacks.ModelCheckpoint", | |
| "auto_insert_metric_name": false, | |
| "dirpath": "${checkpointing.save_dir}/checkpoints", | |
| "every_n_train_steps": 500, | |
| "save_last": true, | |
| "save_top_k": -1, | |
| "verbose": true | |
| }, | |
| "checkpoint_monitor": { | |
| "_target_": "lightning.pytorch.callbacks.ModelCheckpoint", | |
| "auto_insert_metric_name": false, | |
| "dirpath": "${checkpointing.save_dir}/checkpoints", | |
| "filename": "best", | |
| "mode": "min", | |
| "monitor": "val/nll", | |
| "save_last": false, | |
| "save_top_k": 1, | |
| "verbose": true | |
| }, | |
| "learning_rate_monitor": { | |
| "_target_": "lightning.pytorch.callbacks.LearningRateMonitor", | |
| "logging_interval": "step" | |
| } | |
| }, | |
| "checkpointing": { | |
| "resume_ckpt_path": "${.save_dir}/checkpoints/last.ckpt", | |
| "resume_from_ckpt": true, | |
| "save_dir": "${cwd:}" | |
| }, | |
| "data": { | |
| "cache_dir": "/share/kuleshov/ssahoo/textdiffusion/data", | |
| "streaming": false, | |
| "tokenizer_name_or_path": "ibm-research/materials.selfies-ted", | |
| "train": "openwebtext", | |
| "valid": "wikitext103", | |
| "wrap": true | |
| }, | |
| "diffusion": "absorbing_state", | |
| "eval": { | |
| "checkpoint_path": "/data2/tianang/projects/mdlm/Checkpoints_fangping/1-255000-fine-tune.ckpt", | |
| "compute_generative_perplexity": false, | |
| "compute_perplexity_on_sanity": false, | |
| "disable_ema": false, | |
| "gen_ppl_eval_model_name_or_path": "gpt2-large", | |
| "generate_samples": true, | |
| "perplexity_batch_size": 8 | |
| }, | |
| "loader": { | |
| "batch_size": "${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}", | |
| "eval_batch_size": "${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}", | |
| "eval_global_batch_size": 512, | |
| "global_batch_size": 512, | |
| "num_workers": "${eval:\"len(__import__('os').sched_getaffinity(0))\"}", | |
| "pin_memory": true | |
| }, | |
| "lr_scheduler": { | |
| "_target_": "transformers.get_constant_schedule_with_warmup", | |
| "num_warmup_steps": 2500 | |
| }, | |
| "mode": "sample_eval", | |
| "model": { | |
| "cond_dim": 128, | |
| "dropout": 0.1, | |
| "hidden_size": 768, | |
| "length": 1024, | |
| "n_blocks": 12, | |
| "n_heads": 12, | |
| "name": "small", | |
| "scale_by_sigma": true, | |
| "tie_word_embeddings": false, | |
| "type": "ddit" | |
| }, | |
| "noise": { | |
| "sigma_max": 20, | |
| "sigma_min": 0.0001, | |
| "type": "loglinear" | |
| }, | |
| "optim": { | |
| "beta1": 0.9, | |
| "beta2": 0.999, | |
| "eps": 1e-08, | |
| "lr": 0.0003, | |
| "weight_decay": 0 | |
| }, | |
| "parameterization": "subs", | |
| "sampling": { | |
| "noise_removal": true, | |
| "num_sample_batches": 2, | |
| "num_sample_log": 2, | |
| "num_strides": 1, | |
| "predictor": "ddpm_cache", | |
| "semi_ar": false, | |
| "steps": 128, | |
| "stride_length": 1 | |
| }, | |
| "seed": 1, | |
| "strategy": { | |
| "_target_": "lightning.pytorch.strategies.DDPStrategy", | |
| "find_unused_parameters": false | |
| }, | |
| "subs_masking": false, | |
| "time_conditioning": false, | |
| "trainer": { | |
| "_target_": "lightning.Trainer", | |
| "accelerator": "cuda", | |
| "accumulate_grad_batches": "${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}", | |
| "devices": "${device_count:}", | |
| "gradient_clip_val": 1.0, | |
| "limit_train_batches": 1.0, | |
| "limit_val_batches": 1.0, | |
| "log_every_n_steps": 10, | |
| "max_steps": 1000000, | |
| "num_nodes": 1, | |
| "num_sanity_val_steps": 2, | |
| "precision": "bf16", | |
| "val_check_interval": 10000 | |
| }, | |
| "training": { | |
| "antithetic_sampling": true, | |
| "change_of_variables": false, | |
| "ema": 0.9999, | |
| "importance_sampling": false, | |
| "sampling_eps": 0.001 | |
| }, | |
| "wandb": { | |
| "group": null, | |
| "id": "None_1", | |
| "job_type": null, | |
| "name": null, | |
| "notes": "Mulan for text", | |
| "project": "text-diffusion", | |
| "tags": [ | |
| "loglinear", | |
| "openwebtext", | |
| "wikitext103" | |
| ] | |
| } | |
| }, | |
| "hidden_size": 768, | |
| "mask_index": 4, | |
| "max_position_embeddings": 1024, | |
| "model_type": "mol_emb_raw", | |
| "n_blocks": 12, | |
| "n_heads": 12, | |
| "noise_schedule_type": "loglinear", | |
| "parameterization": "subs", | |
| "sigma_max": 20, | |
| "sigma_min": 0.0001, | |
| "time_conditioning": false, | |
| "tokenizer_name_or_path": "ibm-research/materials.selfies-ted", | |
| "vocab_size": 3160 | |
| } |