| {"project": "gated-10B", "seed": 42, "dim": 768, "n_layers": 12, "n_heads": 12, "n_kv_heads": null, "vocab_size": 50304, "multiple_of": 256, "ffn_dim_multiplier": 4.0, "norm_eps": 1e-05, "rope_theta": 10000, "use_scaled_rope": false, "train_files": "data/fineweb10B/fineweb_train_*.bin", "train_tokens": 5000134656, "val_files": "data/fineweb10B/fineweb_val_*.bin", "val_tokens": 10485760, "batch_size": 512, "device_batch_size": 4, "max_seq_len": 1024, "frac_warmdown_steps": 0.25, "log_every_steps": 1, "val_every_steps": 191, "save_every_steps": 1908, "gate_trainer": {"enable_gate_loss_mean": true, "enable_gate_loss_var": true, "init_gate_coef_mean": 0, "init_gate_coef_var": 0, "enable_coef_multiply": false, "coef_multiply": null, "enable_coef_add": true, "coef_add": 0.001, "coef_max": Infinity, "coef_min": -Infinity, "enable_mask_targets": false, "mask_targets": null, "mask_target_mean_start": null, "mask_target_mean_end": null, "mask_target_eps": null, "enable_gate_targets": true, "gate_targets": [1, 0.9, 0.8, 0.7, 0.6, 0.5], "gate_target_mean_start": null, "gate_target_mean_end": null, "gate_target_eps": 0.001, "enable_zero_targets": false, "zero_targets": null, "zero_target_mean_start": null, "zero_target_mean_end": null, "zero_target_eps": null}, "mask_zero_init": false, "zero_eps": 1e-08, "adam_8bit": false, "adam_beta1": 0.8, "adam_beta2": 0.95, "adam_eps": 1e-10, "adam_lr_output": 0.001, "adam_lr_embed": 0.001, "adam_lr_layers": 0.001, "adam_lr_masks": 0.001, "adam_lr_norms": 0.01, "muon_lr": 0.001, "muon_momentum": 0.95} |