{
  "data": {
    "cache_dir": "data",
    "group": true,
    "train": "openwebtext",
    "train_name": null,
    "valid": "wikitext103",
    "valid_name": null
  },
  "eval": {
    "batch_size": 512,
    "perplexity": true,
    "perplexity_batch_size": 32
  },
  "graph": {
    "file": "data",
    "report_all": false,
    "type": "uniform"
  },
  "model": {
    "cond_dim": 128,
    "dropout": 0.1,
    "hidden_size": 768,
    "length": 1024,
    "n_blocks": 12,
    "n_heads": 12,
    "name": "small",
    "scale_by_sigma": false,
    "type": "ddit"
  },
  "ngpus": 8,
  "noise": {
    "sigma_max": 20,
    "sigma_min": 0.0001,
    "type": "loglinear"
  },
  "optim": {
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "grad_clip": 1.0,
    "lr": 0.0003,
    "optimizer": "AdamW",
    "warmup": 2500,
    "weight_decay": 0
  },
  "sampling": {
    "noise_removal": true,
    "predictor": "euler",
    "steps": 128
  },
  "tokens": 50257,
  "training": {
    "accum": 4,
    "batch_size": 512,
    "ema": 0.9999,
    "eval_freq": 100,
    "log_freq": 50,
    "n_iters": 400000,
    "snapshot_freq": 4000,
    "snapshot_freq_for_preemption": 1000,
    "snapshot_sampling": true,
    "weight": "standard"
  },
  "wandb_name": "m_small-g_uniform-pretrain"
}