spmp / config.json
aemack's picture
Upload folder
be5b5fe verified
{
"data_dir": "data/fineweb-edu-10B",
"data_seed": 1337,
"batch_size": 20,
"seq_len": 1024,
"max_iters": 15267,
"gradient_accumulation_steps": 4,
"optimizer": "muon",
"learning_rate": 0.0006,
"min_lr": 6e-05,
"weight_decay": 0.1,
"beta1": 0.9,
"beta2": 0.95,
"grad_clip": 1.0,
"warmup_iters": 200,
"lr_decay_iters": 15267,
"muon_lr": 0.004,
"muon_min_lr": 0.002,
"adamw_lr": 0.004,
"adamw_min_lr": 0.00036,
"muon_momentum": 0.95,
"muon_nesterov": true,
"muon_ns_steps": 5,
"muon_weight_decay": 0.0,
"sparse_mlp_ambient_muon_lr": 0.004,
"sparse_mlp_ambient_muon_min_lr": 0.0004,
"sparse_mlp_ambient_muon_momentum": 0.95,
"sparse_mlp_ambient_muon_nesterov": true,
"sparse_mlp_ambient_muon_ns_steps": 5,
"sparse_mlp_ambient_muon_weight_decay": 0.0,
"sparse_mlp_row_norm_lr": 0.004,
"sparse_mlp_row_norm_min_lr": 0.0004,
"sparse_mlp_row_norm_momentum": 0.9,
"sparse_mlp_row_norm_weight_decay": 0.0,
"sparse_mlp_active_time_lr": 0.004,
"sparse_mlp_active_time_min_lr": 0.0004,
"sparse_mlp_active_time_beta1": 0.9,
"sparse_mlp_active_time_beta2": 0.95,
"sparse_mlp_active_time_k_ref": 10240.0,
"sparse_mlp_active_time_c_max": 4.0,
"sparse_mlp_active_time_weight_decay": 0.0,
"sparse_mlp_adamw_lr": 0.004,
"sparse_mlp_adamw_min_lr": 0.00036,
"sparse_mlp_beta1": 0.9,
"sparse_mlp_beta2": 0.95,
"sparse_mlp_equalize_lr": true,
"lr_schedule": "cosine",
"warmdown_frac": 0.1,
"device": "cuda",
"dtype": "bfloat16",
"compile": true,
"n_layer": 24,
"n_head": 8,
"n_embd": 2048,
"block_size": 1024,
"vocab_size": 50304,
"pos_encoding": "rope",
"rope_theta": 10000.0,
"attn_alpha": 1.0,
"attn_zero_init_proj": false,
"zero_init_proj_targets": [],
"tied_embedding": true,
"sparsity_mode": "cayley_sparse_gather_dense",
"rmsnorm_affine": true,
"disable_mlp": false,
"cayley_levels": [
[
11,
16,
0
],
[
15,
32,
240
],
[
19,
64,
240
],
[
23,
128,
240
]
],
"cayley_per_parent_budget": true,
"cayley_leaf_runner_j": 0,
"cayley_seed": 42,
"cayley_balanced_generators": true,
"cayley_backend": "auto",
"cayley_locations": [
"mlp_in"
],
"cayley_score_standardize": true,
"cayley_score_std_scope": "per_vertex",
"cayley_score_std_batch_size": 64,
"cayley_activation_std_levels": [
"all"
],
"cayley_activation_tanh": false,
"cayley_matching_pursuit": false,
"cayley_additive_parent_bias": false,
"cayley_alpha_init_override": null,
"cayley_alpha_weight_decay": 0.1,
"sparse_mlp_K_in": 56,
"sparse_mlp_H": 32,
"sparse_mlp_K_out": 128,
"sparse_mlp_F_up": 32,
"sparse_mlp_F_down": 32,
"sparse_mlp_Delta": 32,
"sparse_mlp_rewire_every": 50,
"sparse_mlp_rewire_components": [],
"sparse_mlp_seed": 43,
"sparse_mlp_init_mode": "supported",
"sparse_mlp_tied_adjacency": false,
"sparse_mlp_tied_rewire": false,
"sparse_mlp_init_boost": 1.0,
"sparse_mlp_up_init_scale": 1.0,
"sparse_mlp_down_init_scale": 1.0,
"sparse_mlp_up_norm_constraint": false,
"sparse_mlp_up_per_level_init": true,
"matryoshka_train": false,
"matryoshka_cut_power": 0.0,
"sparse_mlp_weight_decay": 0.0,
"sparse_mlp_optimizer": "active_time_adam",
"tied_block_init": false,
"feature_parallel": true,
"sparse_mlp_zero_l1plus": true,
"profile_iter": -1,
"profile_microbatches": 2,
"dead_threshold_c": 0.1,
"resume": false,
"out_dir": "/workspace/sparse-nanogpt-private/run_logs/sweep_alt/4L_24L_d2048_K16326412_bal_f16_m23_seq1024_10B_fp_zero_l1plus_notanh",
"eval_interval": 500,
"eval_iters": 100,
"log_interval": 10,
"wandb_project": "sparse-mlp-mscaling",
"wandb_run_name": "4L-24L-d2048-K16326412-bal-f16-m23-seq1024-10B-fp-zero-l1plus-notanh"
}