{
  "data_dir": "data/fineweb-edu-10B",
  "data_seed": 1337,
  "batch_size": 20,
  "seq_len": 1024,
  "max_iters": 15267,
  "gradient_accumulation_steps": 4,
  "optimizer": "muon",
  "learning_rate": 0.0006,
  "min_lr": 6e-05,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "warmup_iters": 200,
  "lr_decay_iters": 15267,
  "muon_lr": 0.004,
  "muon_min_lr": 0.002,
  "adamw_lr": 0.004,
  "adamw_min_lr": 0.00036,
  "muon_momentum": 0.95,
  "muon_nesterov": true,
  "muon_ns_steps": 5,
  "muon_weight_decay": 0.0,
  "sparse_mlp_ambient_muon_lr": 0.004,
  "sparse_mlp_ambient_muon_min_lr": 0.0004,
  "sparse_mlp_ambient_muon_momentum": 0.95,
  "sparse_mlp_ambient_muon_nesterov": true,
  "sparse_mlp_ambient_muon_ns_steps": 5,
  "sparse_mlp_ambient_muon_weight_decay": 0.0,
  "sparse_mlp_row_norm_lr": 0.004,
  "sparse_mlp_row_norm_min_lr": 0.0004,
  "sparse_mlp_row_norm_momentum": 0.9,
  "sparse_mlp_row_norm_weight_decay": 0.0,
  "sparse_mlp_active_time_lr": 0.004,
  "sparse_mlp_active_time_min_lr": 0.0004,
  "sparse_mlp_active_time_beta1": 0.9,
  "sparse_mlp_active_time_beta2": 0.95,
  "sparse_mlp_active_time_k_ref": 10240.0,
  "sparse_mlp_active_time_c_max": 4.0,
  "sparse_mlp_active_time_weight_decay": 0.0,
  "sparse_mlp_adamw_lr": 0.004,
  "sparse_mlp_adamw_min_lr": 0.00036,
  "sparse_mlp_beta1": 0.9,
  "sparse_mlp_beta2": 0.95,
  "sparse_mlp_equalize_lr": true,
  "lr_schedule": "cosine",
  "warmdown_frac": 0.1,
  "device": "cuda",
  "dtype": "bfloat16",
  "compile": true,
  "n_layer": 24,
  "n_head": 8,
  "n_embd": 2048,
  "block_size": 1024,
  "vocab_size": 50304,
  "pos_encoding": "rope",
  "rope_theta": 10000.0,
  "attn_alpha": 1.0,
  "attn_zero_init_proj": false,
  "zero_init_proj_targets": [],
  "tied_embedding": true,
  "sparsity_mode": "cayley_sparse_gather_dense",
  "rmsnorm_affine": true,
  "disable_mlp": false,
  "cayley_levels": [
    [
      11,
      16,
      0
    ],
    [
      15,
      32,
      240
    ],
    [
      19,
      64,
      240
    ],
    [
      23,
      128,
      240
    ]
  ],
  "cayley_per_parent_budget": true,
  "cayley_leaf_runner_j": 0,
  "cayley_seed": 42,
  "cayley_balanced_generators": true,
  "cayley_backend": "auto",
  "cayley_locations": [
    "mlp_in"
  ],
  "cayley_score_standardize": true,
  "cayley_score_std_scope": "per_vertex",
  "cayley_score_std_batch_size": 64,
  "cayley_activation_std_levels": [
    "all"
  ],
  "cayley_activation_tanh": false,
  "cayley_matching_pursuit": false,
  "cayley_additive_parent_bias": false,
  "cayley_alpha_init_override": null,
  "cayley_alpha_weight_decay": 0.1,
  "sparse_mlp_K_in": 56,
  "sparse_mlp_H": 32,
  "sparse_mlp_K_out": 128,
  "sparse_mlp_F_up": 32,
  "sparse_mlp_F_down": 32,
  "sparse_mlp_Delta": 32,
  "sparse_mlp_rewire_every": 50,
  "sparse_mlp_rewire_components": [],
  "sparse_mlp_seed": 43,
  "sparse_mlp_init_mode": "supported",
  "sparse_mlp_tied_adjacency": false,
  "sparse_mlp_tied_rewire": false,
  "sparse_mlp_init_boost": 1.0,
  "sparse_mlp_up_init_scale": 1.0,
  "sparse_mlp_down_init_scale": 1.0,
  "sparse_mlp_up_norm_constraint": false,
  "sparse_mlp_up_per_level_init": true,
  "matryoshka_train": false,
  "matryoshka_cut_power": 0.0,
  "sparse_mlp_weight_decay": 0.0,
  "sparse_mlp_optimizer": "active_time_adam",
  "tied_block_init": false,
  "feature_parallel": true,
  "sparse_mlp_zero_l1plus": true,
  "profile_iter": -1,
  "profile_microbatches": 2,
  "dead_threshold_c": 0.1,
  "resume": false,
  "out_dir": "/workspace/sparse-nanogpt-private/run_logs/sweep_alt/4L_24L_d2048_K16326412_bal_f16_m23_seq1024_10B_fp_zero_l1plus_notanh",
  "eval_interval": 500,
  "eval_iters": 100,
  "log_interval": 10,
  "wandb_project": "sparse-mlp-mscaling",
  "wandb_run_name": "4L-24L-d2048-K16326412-bal-f16-m23-seq1024-10B-fp-zero-l1plus-notanh"
}