| { |
| "checkpointing": { |
| "best_metric_name": "val_loss", |
| "best_mode": "min", |
| "ckpting_save_iter": 1000, |
| "enabled": true, |
| "resume_from": null, |
| "resume_optimizer": true, |
| "run_id": null |
| }, |
| "compile": null, |
| "data": { |
| "cache_all": true, |
| "dataset_config": null, |
| "dataset_name": "ylecun/mnist", |
| "megatron_train_prefix": null, |
| "megatron_val_prefix": null, |
| "pad_random_shift": false, |
| "pad_token_id": null, |
| "pipeline_mode": "mnist", |
| "runs_path": "runs", |
| "shuffle_buffer_size": 0, |
| "shuffle_seed": 3407, |
| "text_field": "image", |
| "tokenizer": null, |
| "train_split": "train", |
| "val_split": "test" |
| }, |
| "ddp": { |
| "backend": "nccl", |
| "bucket_size_mb": 200, |
| "master_addr": "127.0.0.1", |
| "master_port": "29500", |
| "nccl_p2p_disable": true, |
| "node_rank": 0, |
| "num_gpus_per_node": 1, |
| "num_nodes": 1 |
| }, |
| "logging": { |
| "architecture": "TransformerImage", |
| "backend": "wandb", |
| "dataset": "MNIST", |
| "log_activation_norms": false, |
| "log_grad_norms": true, |
| "log_p_mask_bucket_loss": false, |
| "log_weight_norms": true, |
| "p_mask_bucket_edges": null, |
| "run_name": null, |
| "val_log_every": 8, |
| "val_log_samples": 1 |
| }, |
| "model": { |
| "attention_backend": "torch_sdpa", |
| "attention_sdp_backend": "auto", |
| "context_length": 784, |
| "d_ff": 1024, |
| "d_model": 256, |
| "device": "cuda", |
| "dtype": "float32", |
| "eot_token_id": null, |
| "label_vocab_size": 11, |
| "mask_token_id": 32, |
| "model_type": "image", |
| "noise_epsilon": 0.001, |
| "null_label_id": 10, |
| "num_heads": 16, |
| "num_layers": 8, |
| "pixel_bins": 32, |
| "random_trunc_prob": 0.0, |
| "rope_theta": 10000.0, |
| "vocab_size": 33 |
| }, |
| "optimizer": { |
| "betas": [ |
| 0.9, |
| 0.95 |
| ], |
| "cosine_cycle_iters": 60000, |
| "eps": 1e-08, |
| "grad_clip_max_l2_norm": 3.0, |
| "initial_learning_rate": 0.0001, |
| "lr_schedule": "constant_with_warmup", |
| "max_learning_rate": 0.003, |
| "min_learning_rate": 0.0003, |
| "muon": null, |
| "optimizer_name": "adamw", |
| "warmup_iters": 200, |
| "weight_decay": 0.1 |
| }, |
| "train_infer": null, |
| "training": { |
| "amp_dtype": "bfloat16", |
| "amp_enabled": true, |
| "batch_size": 256, |
| "deterministic_mask": false, |
| "eot_mask_loss": false, |
| "grad_accum_steps": 1, |
| "max_train_iteration": 120000, |
| "max_val_iteration": 10, |
| "objective": "diffusion", |
| "p_mask_override": null, |
| "repeat_masking_seed": null, |
| "seed": 3407, |
| "skip_validation": false, |
| "train_loss_ema_decay": 0.99, |
| "uncond_label_dropout_prob": 0.1, |
| "val_freq_iteration": 250 |
| }, |
| "wandb": { |
| "architecture": null, |
| "dataset": null, |
| "entity": "yiltro8-org", |
| "project": "mnist_diffusion" |
| } |
| } |