trixyL
dump: train artifacts
a81731e
{
"checkpointing": {
"best_metric_name": "val_loss",
"best_mode": "min",
"ckpting_save_iter": 1000,
"enabled": true,
"resume_from": null,
"resume_optimizer": true,
"run_id": null
},
"compile": null,
"data": {
"cache_all": true,
"dataset_config": null,
"dataset_name": "ylecun/mnist",
"megatron_train_prefix": null,
"megatron_val_prefix": null,
"pad_random_shift": false,
"pad_token_id": null,
"pipeline_mode": "mnist",
"runs_path": "runs",
"shuffle_buffer_size": 0,
"shuffle_seed": 3407,
"text_field": "image",
"tokenizer": null,
"train_split": "train",
"val_split": "test"
},
"ddp": {
"backend": "nccl",
"bucket_size_mb": 200,
"master_addr": "127.0.0.1",
"master_port": "29500",
"nccl_p2p_disable": true,
"node_rank": 0,
"num_gpus_per_node": 1,
"num_nodes": 1
},
"logging": {
"architecture": "TransformerImage",
"backend": "wandb",
"dataset": "MNIST",
"log_activation_norms": false,
"log_grad_norms": true,
"log_p_mask_bucket_loss": false,
"log_weight_norms": true,
"p_mask_bucket_edges": null,
"run_name": null,
"val_log_every": 8,
"val_log_samples": 1
},
"model": {
"attention_backend": "torch_sdpa",
"attention_sdp_backend": "auto",
"context_length": 784,
"d_ff": 1024,
"d_model": 256,
"device": "cuda",
"dtype": "float32",
"eot_token_id": null,
"label_vocab_size": 11,
"mask_token_id": 32,
"model_type": "image",
"noise_epsilon": 0.001,
"null_label_id": 10,
"num_heads": 16,
"num_layers": 8,
"pixel_bins": 32,
"random_trunc_prob": 0.0,
"rope_theta": 10000.0,
"vocab_size": 33
},
"optimizer": {
"betas": [
0.9,
0.95
],
"cosine_cycle_iters": 60000,
"eps": 1e-08,
"grad_clip_max_l2_norm": 3.0,
"initial_learning_rate": 0.0001,
"lr_schedule": "constant_with_warmup",
"max_learning_rate": 0.003,
"min_learning_rate": 0.0003,
"muon": null,
"optimizer_name": "adamw",
"warmup_iters": 200,
"weight_decay": 0.1
},
"train_infer": null,
"training": {
"amp_dtype": "bfloat16",
"amp_enabled": true,
"batch_size": 256,
"deterministic_mask": false,
"eot_mask_loss": false,
"grad_accum_steps": 1,
"max_train_iteration": 120000,
"max_val_iteration": 10,
"objective": "diffusion",
"p_mask_override": null,
"repeat_masking_seed": null,
"seed": 3407,
"skip_validation": false,
"train_loss_ema_decay": 0.99,
"uncond_label_dropout_prob": 0.1,
"val_freq_iteration": 250
},
"wandb": {
"architecture": null,
"dataset": null,
"entity": "yiltro8-org",
"project": "mnist_diffusion"
}
}