trixyL
/

transformerlm-diff-32-mnist

Model card Files Files and versions

transformerlm-diff-32-mnist / config /config.json

trixyL

dump: train artifacts

a81731e 3 months ago

history blame contribute delete

2.78 kB

	{
	"checkpointing": {
	"best_metric_name": "val_loss",
	"best_mode": "min",
	"ckpting_save_iter": 1000,
	"enabled": true,
	"resume_from": null,
	"resume_optimizer": true,
	"run_id": null
	},
	"compile": null,
	"data": {
	"cache_all": true,
	"dataset_config": null,
	"dataset_name": "ylecun/mnist",
	"megatron_train_prefix": null,
	"megatron_val_prefix": null,
	"pad_random_shift": false,
	"pad_token_id": null,
	"pipeline_mode": "mnist",
	"runs_path": "runs",
	"shuffle_buffer_size": 0,
	"shuffle_seed": 3407,
	"text_field": "image",
	"tokenizer": null,
	"train_split": "train",
	"val_split": "test"
	},
	"ddp": {
	"backend": "nccl",
	"bucket_size_mb": 200,
	"master_addr": "127.0.0.1",
	"master_port": "29500",
	"nccl_p2p_disable": true,
	"node_rank": 0,
	"num_gpus_per_node": 1,
	"num_nodes": 1
	},
	"logging": {
	"architecture": "TransformerImage",
	"backend": "wandb",
	"dataset": "MNIST",
	"log_activation_norms": false,
	"log_grad_norms": true,
	"log_p_mask_bucket_loss": false,
	"log_weight_norms": true,
	"p_mask_bucket_edges": null,
	"run_name": null,
	"val_log_every": 8,
	"val_log_samples": 1
	},
	"model": {
	"attention_backend": "torch_sdpa",
	"attention_sdp_backend": "auto",
	"context_length": 784,
	"d_ff": 1024,
	"d_model": 256,
	"device": "cuda",
	"dtype": "float32",
	"eot_token_id": null,
	"label_vocab_size": 11,
	"mask_token_id": 32,
	"model_type": "image",
	"noise_epsilon": 0.001,
	"null_label_id": 10,
	"num_heads": 16,
	"num_layers": 8,
	"pixel_bins": 32,
	"random_trunc_prob": 0.0,
	"rope_theta": 10000.0,
	"vocab_size": 33
	},
	"optimizer": {
	"betas": [
	0.9,
	0.95
	],
	"cosine_cycle_iters": 60000,
	"eps": 1e-08,
	"grad_clip_max_l2_norm": 3.0,
	"initial_learning_rate": 0.0001,
	"lr_schedule": "constant_with_warmup",
	"max_learning_rate": 0.003,
	"min_learning_rate": 0.0003,
	"muon": null,
	"optimizer_name": "adamw",
	"warmup_iters": 200,
	"weight_decay": 0.1
	},
	"train_infer": null,
	"training": {
	"amp_dtype": "bfloat16",
	"amp_enabled": true,
	"batch_size": 256,
	"deterministic_mask": false,
	"eot_mask_loss": false,
	"grad_accum_steps": 1,
	"max_train_iteration": 120000,
	"max_val_iteration": 10,
	"objective": "diffusion",
	"p_mask_override": null,
	"repeat_masking_seed": null,
	"seed": 3407,
	"skip_validation": false,
	"train_loss_ema_decay": 0.99,
	"uncond_label_dropout_prob": 0.1,
	"val_freq_iteration": 250
	},
	"wandb": {
	"architecture": null,
	"dataset": null,
	"entity": "yiltro8-org",
	"project": "mnist_diffusion"
	}
	}